diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index af8cc6f..9ecfe30 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -162,8 +162,8 @@ class CopyvioMixIn(object): self._logger.info(result.get_log_message(self.title)) return result - def copyvio_compare(self, url, min_confidence=0.75, max_time=30, degree=5): - """Check the page like :py:meth:`copyvio_check` against a specific URL. + def copyvio_compare(self, urls, min_confidence=0.75, max_time=30, degree=5): + """Check the page like :py:meth:`copyvio_check` against specific URLs. This is essentially a reduced version of :meth:`copyvio_check` - a copyivo comparison is made using Markov chains and the result is @@ -183,13 +183,16 @@ class CopyvioMixIn(object): Since no searching is done, neither :exc:`.UnknownSearchEngineError` nor :exc:`.SearchQueryError` will be raised. """ + if not isinstance(urls, list): + urls = [urls] log = u"Starting copyvio compare for [[{0}]] against {1}" - self._logger.info(log.format(self.title, url)) + self._logger.info(log.format(self.title, ", ".join(urls))) article = MarkovChain(ArticleTextParser(self.get()).strip(), degree=degree) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - max_time, num_workers=1, config=self._search_config, degree=degree) - workspace.enqueue([url]) + max_time, num_workers=min(len(urls), 8), short_circuit=False, + config=self._search_config, degree=degree) + workspace.enqueue(urls) workspace.wait() result = workspace.get_result() self._logger.info(result.get_log_message(self.title)) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index e3db1a7..692340c 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -94,5 +94,35 @@ class MarkovChainIntersection(MarkovChain): return res.format(self.size, self.mc1, self.mc2) +class MarkovChainUnion(MarkovChain): + 
"""Implements the union of multiple chains.""" + + def __init__(self, chains): + self.chains = list(chains) + self.chain = self._build() + self.size = self._get_size() + + def _build(self): + """Build and return the Markov chain from the input chains.""" + union = {} + for chain in self.chains: + for phrase, count in chain.chain.iteritems(): + if phrase in union: + union[phrase] += count + else: + union[phrase] = count + return union + + def __repr__(self): + """Return the canonical string representation of the union.""" + res = "MarkovChainUnion(chains={!r})" + return res.format(self.chains) + + def __str__(self): + """Return a nice string representation of the union.""" + res = "<MarkovChainUnion of size {0} ({1})>" + return res.format(self.size, "| ".join(str(chain) for chain in self.chains)) + + + EMPTY = MarkovChain("") EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 5d694aa..b67a15f 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -265,7 +265,7 @@ class _HTMLParser(_BaseTextParser): for element in soup.find_all(tag): element.extract() - return "\n".join(soup.stripped_strings) + return "\n".join(s.replace("\n", " ") for s in soup.stripped_strings) def _open(self, url, **kwargs): """Try to read a URL. 
Return None if it couldn't be read.""" diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py index ca73036..275047d 100644 --- a/earwigbot/wiki/copyvios/result.py +++ b/earwigbot/wiki/copyvios/result.py @@ -22,6 +22,7 @@ from threading import Event from time import time +import urlparse from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION @@ -77,6 +78,11 @@ class CopyvioSource(object): res = "<CopyvioSource ({0} with {1} conf)>" return res.format(self.url, self.confidence) + @property + def domain(self): + """The source URL's domain name, or None.""" + return urlparse.urlparse(self.url).netloc or None + def start_work(self): """Mark this source as being worked on right now.""" self._event2.clear() @@ -130,13 +136,16 @@ class CopyvioCheckResult(object): """ def __init__(self, violation, sources, queries, check_time, article_chain, - possible_miss): + possible_miss, included_sources=None, unified_confidence=None): + assert isinstance(sources, list) self.violation = violation self.sources = sources self.queries = queries self.time = check_time self.article_chain = article_chain self.possible_miss = possible_miss + self.included_sources = included_sources if included_sources else [] + self.unified_confidence = unified_confidence def __repr__(self): """Return the canonical string representation of the result.""" @@ -157,7 +166,11 @@ class CopyvioCheckResult(object): @property def confidence(self): """The confidence of the best source, or 0 if no sources exist.""" - return self.best.confidence if self.best else 0.0 + return ( + self.unified_confidence if self.unified_confidence is not None else + self.best.confidence if self.best else + 0.0 + ) @property def url(self): diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 9ec33eb..d2e74e5 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -39,7 +39,7 @@ import urlparse from earwigbot import importer from earwigbot.exceptions import 
ParserExclusionError, ParserRedirectError -from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection +from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection, MarkovChainUnion from earwigbot.wiki.copyvios.parsers import get_parser from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource @@ -47,6 +47,8 @@ tldextract = importer.new("tldextract") __all__ = ["globalize", "localize", "CopyvioWorkspace"] +INCLUDE_THRESHOLD = 0.15 + _MAX_REDIRECTS = 3 _MAX_RAW_SIZE = 20 * 1024 ** 2 @@ -476,14 +478,21 @@ class CopyvioWorkspace(object): def get_result(self, num_queries=0): """Return a CopyvioCheckResult containing the results of this check.""" - def cmpfunc(s1, s2): - if s2.confidence != s1.confidence: - return 1 if s2.confidence > s1.confidence else -1 - if s2.excluded != s1.excluded: - return 1 if s1.excluded else -1 - return int(s1.skipped) - int(s2.skipped) - - self.sources.sort(cmpfunc) + self.sources.sort( + key=lambda s: (s.confidence, not s.excluded, not s.skipped, s.chains[0].size), + reverse=True, + ) + + included_sources = [ + source for source in self.sources if source.confidence >= INCLUDE_THRESHOLD + ] + if included_sources: + unified = MarkovChainUnion(source.chains[0] for source in included_sources) + delta = MarkovChainIntersection(self._article, unified) + unified_confidence = self._calculate_confidence(delta) + else: + unified_confidence = None + return CopyvioCheckResult(self.finished, self.sources, num_queries, time.time() - self._start_time, self._article, - self.possible_miss) + self.possible_miss, included_sources, unified_confidence)