@@ -180,8 +180,8 @@ class CopyvioMixIn: | |||||
self._logger.info(result.get_log_message(self.title)) | self._logger.info(result.get_log_message(self.title)) | ||||
return result | return result | ||||
def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5): | |||||
"""Check the page like :py:meth:`copyvio_check` against a specific URL. | |||||
def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5): | |||||
"""Check the page like :py:meth:`copyvio_check` against specific URLs. | |||||
This is essentially a reduced version of :meth:`copyvio_check` - a | This is essentially a reduced version of :meth:`copyvio_check` - a | ||||
copyivo comparison is made using Markov chains and the result is | copyivo comparison is made using Markov chains and the result is | ||||
@@ -201,9 +201,11 @@ class CopyvioMixIn: | |||||
Since no searching is done, neither :exc:`.UnknownSearchEngineError` | Since no searching is done, neither :exc:`.UnknownSearchEngineError` | ||||
nor :exc:`.SearchQueryError` will be raised. | nor :exc:`.SearchQueryError` will be raised. | ||||
""" | """ | ||||
if not isinstance(urls, list): | |||||
urls = [urls] | |||||
log = "Starting copyvio compare for [[{0}]] against {1}" | log = "Starting copyvio compare for [[{0}]] against {1}" | ||||
self._logger.info(log.format(self.title, url)) | |||||
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=5) | |||||
self._logger.info(log.format(self.title, ", ".join(urls))) | |||||
article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) | |||||
workspace = CopyvioWorkspace( | workspace = CopyvioWorkspace( | ||||
article, | article, | ||||
min_confidence, | min_confidence, | ||||
@@ -211,11 +213,12 @@ class CopyvioMixIn: | |||||
self._logger, | self._logger, | ||||
self._addheaders, | self._addheaders, | ||||
max_time, | max_time, | ||||
num_workers=1, | |||||
num_workers=min(len(urls), 8), | |||||
short_circuit=False, | |||||
config=self._search_config, | config=self._search_config, | ||||
degree=degree, | degree=degree, | ||||
) | ) | ||||
workspace.enqueue([url]) | |||||
workspace.enqueue(urls) | |||||
workspace.wait() | workspace.wait() | ||||
result = workspace.get_result() | result = workspace.get_result() | ||||
self._logger.info(result.get_log_message(self.title)) | self._logger.info(result.get_log_message(self.title)) | ||||
@@ -93,5 +93,35 @@ class MarkovChainIntersection(MarkovChain): | |||||
return res.format(self.size, self.mc1, self.mc2) | return res.format(self.size, self.mc1, self.mc2) | ||||
class MarkovChainUnion(MarkovChain):
    """Implements the union of multiple Markov chains.

    The union's phrase table holds every phrase that occurs in any of the
    input chains; when a phrase occurs in several chains, its counts are
    added together.
    """

    def __init__(self, chains):
        """Build the union of *chains*.

        *chains* may be any iterable (including a generator) of chain
        objects exposing a ``chain`` mapping of phrase -> count. The base
        class initializer is not called; ``chain`` and ``size`` are set
        directly here.
        """
        # Materialize the iterable: it is walked again by _build(),
        # __repr__() and __str__(), which a one-shot generator can't survive.
        self.chains = list(chains)
        self.chain = self._build()
        # _get_size() is not defined in this class; presumably inherited
        # from MarkovChain and derived from self.chain.
        self.size = self._get_size()

    def _build(self):
        """Build and return the Markov chain from the input chains."""
        union = {}
        for chain in self.chains:
            # .items() rather than .iteritems(): the latter only exists on
            # Python 2 dicts and raises AttributeError on Python 3.
            for phrase, count in chain.chain.items():
                union[phrase] = union.get(phrase, 0) + count
        return union

    def __repr__(self):
        """Return the canonical string representation of the union."""
        res = "MarkovChainUnion(chains={!r})"
        return res.format(self.chains)

    def __str__(self):
        """Return a nice string representation of the union."""
        res = "<MarkovChainUnion of size {} ({})>"
        return res.format(self.size, "| ".join(str(chain) for chain in self.chains))
EMPTY = MarkovChain("") | EMPTY = MarkovChain("") | ||||
EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY) | EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY) |
@@ -273,7 +273,7 @@ class _HTMLParser(_BaseTextParser): | |||||
for element in soup.find_all(tag): | for element in soup.find_all(tag): | ||||
element.extract() | element.extract() | ||||
return "\n".join(soup.stripped_strings) | |||||
return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings) | |||||
def _open(self, url, **kwargs): | def _open(self, url, **kwargs): | ||||
"""Try to read a URL. Return None if it couldn't be read.""" | """Try to read a URL. Return None if it couldn't be read.""" | ||||
@@ -21,6 +21,8 @@ | |||||
from threading import Event | from threading import Event | ||||
from time import time | from time import time | ||||
try:
    import urlparse
except ImportError:  # Python 3: the module was moved to urllib.parse
    from urllib import parse as urlparse
from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION | from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION | ||||
__all__ = ["CopyvioSource", "CopyvioCheckResult"] | __all__ = ["CopyvioSource", "CopyvioCheckResult"] | ||||
@@ -84,6 +86,11 @@ class CopyvioSource: | |||||
res = "<CopyvioSource ({0} with {1} conf)>" | res = "<CopyvioSource ({0} with {1} conf)>" | ||||
return res.format(self.url, self.confidence) | return res.format(self.url, self.confidence) | ||||
@property
def domain(self):
    """The source URL's domain name, or None.

    This is the network-location part of ``self.url`` as split by the
    standard URL parser. ``None`` is returned when that part is empty
    (for example, when the URL has no ``scheme://host`` prefix).
    """
    # Resolve the parser locally so the property works on both Python 3
    # (urllib.parse) and Python 2 (urlparse); repeated imports are cached.
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse
    return urlparse(self.url).netloc or None
def start_work(self): | def start_work(self): | ||||
"""Mark this source as being worked on right now.""" | """Mark this source as being worked on right now.""" | ||||
self._event2.clear() | self._event2.clear() | ||||
@@ -137,14 +144,25 @@ class CopyvioCheckResult: | |||||
""" | """ | ||||
def __init__(
    self,
    violation,
    sources,
    queries,
    check_time,
    article_chain,
    possible_miss,
    included_sources=None,
    unified_confidence=None,
):
    """Store the outcome of a copyvio check.

    Every argument is saved on the instance under the same name, except
    *check_time*, which is stored as ``self.time``.

    *sources* must be a ``list`` (checked with an assertion).
    *included_sources* is optional; ``None`` or an empty value is stored
    as a fresh empty list. *unified_confidence* is optional; when it is
    ``None``, the ``confidence`` property falls back to the best source.
    """
    # NOTE(review): assertions are stripped under ``python -O``; kept as-is
    # so callers see the same AssertionError as before.
    assert isinstance(sources, list)
    self.violation = violation
    self.sources = sources
    self.queries = queries
    self.time = check_time
    self.article_chain = article_chain
    self.possible_miss = possible_miss
    # Never share a caller-supplied falsy value; default to a new list.
    self.included_sources = included_sources if included_sources else []
    self.unified_confidence = unified_confidence
def __repr__(self): | def __repr__(self): | ||||
"""Return the canonical string representation of the result.""" | """Return the canonical string representation of the result.""" | ||||
@@ -164,7 +182,13 @@ class CopyvioCheckResult: | |||||
@property
def confidence(self):
    """The overall confidence of this result.

    Returns ``unified_confidence`` when one was supplied (including a
    value of ``0.0``); otherwise the confidence of the best source; or
    ``0.0`` if there is no best source.
    """
    # Compare against None explicitly: a unified confidence of 0.0 is a
    # real value and must not fall through to the best source.
    if self.unified_confidence is not None:
        return self.unified_confidence
    if self.best:
        return self.best.confidence
    return 0.0
@property | @property | ||||
def url(self): | def url(self): | ||||
@@ -37,7 +37,11 @@ from urllib.request import Request, build_opener | |||||
from earwigbot import importer | from earwigbot import importer | ||||
from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | from earwigbot.exceptions import ParserExclusionError, ParserRedirectError | ||||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||||
from earwigbot.wiki.copyvios.markov import ( | |||||
MarkovChain, | |||||
MarkovChainIntersection, | |||||
MarkovChainUnion, | |||||
) | |||||
from earwigbot.wiki.copyvios.parsers import get_parser | from earwigbot.wiki.copyvios.parsers import get_parser | ||||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | ||||
@@ -45,6 +49,8 @@ tldextract = importer.new("tldextract") | |||||
__all__ = ["globalize", "localize", "CopyvioWorkspace"] | __all__ = ["globalize", "localize", "CopyvioWorkspace"] | ||||
INCLUDE_THRESHOLD = 0.15 | |||||
_MAX_REDIRECTS = 3 | _MAX_REDIRECTS = 3 | ||||
_MAX_RAW_SIZE = 20 * 1024**2 | _MAX_RAW_SIZE = 20 * 1024**2 | ||||
@@ -501,15 +507,26 @@ class CopyvioWorkspace: | |||||
def get_result(self, num_queries=0): | def get_result(self, num_queries=0): | ||||
"""Return a CopyvioCheckResult containing the results of this check.""" | """Return a CopyvioCheckResult containing the results of this check.""" | ||||
self.sources.sort( | |||||
key=lambda s: ( | |||||
s.confidence, | |||||
not s.excluded, | |||||
not s.skipped, | |||||
s.chains[0].size, | |||||
), | |||||
reverse=True, | |||||
) | |||||
def cmpfunc(s1, s2): | |||||
if s2.confidence != s1.confidence: | |||||
return 1 if s2.confidence > s1.confidence else -1 | |||||
if s2.excluded != s1.excluded: | |||||
return 1 if s1.excluded else -1 | |||||
return int(s1.skipped) - int(s2.skipped) | |||||
included_sources = [ | |||||
source for source in self.sources if source.confidence >= INCLUDE_THRESHOLD | |||||
] | |||||
if included_sources: | |||||
unified = MarkovChainUnion(source.chains[0] for source in included_sources) | |||||
delta = MarkovChainIntersection(self._article, unified) | |||||
unified_confidence = self._calculate_confidence(delta) | |||||
else: | |||||
unified_confidence = None | |||||
self.sources.sort(cmpfunc) | |||||
return CopyvioCheckResult( | return CopyvioCheckResult( | ||||
self.finished, | self.finished, | ||||
self.sources, | self.sources, | ||||
@@ -517,4 +534,6 @@ class CopyvioWorkspace: | |||||
time.time() - self._start_time, | time.time() - self._start_time, | ||||
self._article, | self._article, | ||||
self.possible_miss, | self.possible_miss, | ||||
included_sources, | |||||
unified_confidence, | |||||
) | ) |