diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 087f299..419c84e 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -78,7 +78,7 @@ class CopyvioMixIn(object): raise exceptions.UnknownSearchEngineError(engine) def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1, - no_searches=False, no_links=False): + no_searches=False, no_links=False, short_circuit=True): """Check the page for copyright violations. Returns a :class:`.CopyvioCheckResult` object with information on the @@ -102,6 +102,11 @@ class CopyvioMixIn(object): in the wikitext will be ignored; search engine queries will be made only. Setting both of these to ``True`` is pointless. + By default, the checker short-circuits as soon as it finds a URL that + meets *min_confidence*, skipping any remaining URLs and web + queries. Setting *short_circuit* to ``False`` disables this, so the + check continues even after a match is found. + Raises :exc:`.CopyvioCheckError` or subclasses (:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on errors. 
@@ -111,8 +116,9 @@ class CopyvioMixIn(object): searcher = self._get_search_engine() parser = ArticleTextParser(self.get()) article = MarkovChain(parser.strip()) - workspace = CopyvioWorkspace(article, min_confidence, max_time, - self._logger, self._addheaders) + workspace = CopyvioWorkspace( + article, min_confidence, max_time, self._logger, self._addheaders, + short_circuit=short_circuit) if self._exclusions_db: self._exclusions_db.sync(self.site.name) exclude = lambda u: self._exclusions_db.check(self.site.name, u) @@ -130,7 +136,7 @@ class CopyvioMixIn(object): if not no_searches: chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) for chunk in chunks: - if workspace.finished: + if short_circuit and workspace.finished: break log = u"[[{0}]] -> querying {1} for {2!r}" self._logger.debug(log.format(self.title, searcher.name, chunk)) diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 88c0c38..f76d6c0 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -234,7 +234,7 @@ class CopyvioWorkspace(object): """Manages a single copyvio check distributed across threads.""" def __init__(self, article, min_confidence, max_time, logger, headers, - url_timeout=5, num_workers=8): + url_timeout=5, num_workers=8, short_circuit=True): self.sources = [] self.finished = False @@ -245,6 +245,7 @@ class CopyvioWorkspace(object): self._until = (self._start_time + max_time) if max_time > 0 else None self._handled_urls = [] self._finish_lock = Lock() + self._short_circuit = short_circuit self._source_args = {"workspace": self, "headers": headers, "timeout": url_timeout} @@ -309,7 +310,7 @@ class CopyvioWorkspace(object): """ for url in urls: with self._queues.lock: - if self.finished: + if self._short_circuit and self.finished: break if url in self._handled_urls: continue @@ -343,7 +344,10 @@ class CopyvioWorkspace(object): with self._finish_lock: source.finish_work(conf, source_chain, delta) 
if not self.finished and conf >= self._min_confidence: - self._finish_early() + if self._short_circuit: + self._finish_early() + else: + self.finished = True def wait(self): """Wait for the workers to finish handling the sources."""