From 5194525a329bff3e99abc567660020426b7f0639 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 13 Oct 2014 16:33:46 -0500 Subject: [PATCH] Note when sources might have been missed. --- earwigbot/wiki/copyvios/__init__.py | 1 + earwigbot/wiki/copyvios/result.py | 5 ++++- earwigbot/wiki/copyvios/workers.py | 19 ++++++++++++------- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 419c84e..2888815 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -137,6 +137,7 @@ class CopyvioMixIn(object): chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) for chunk in chunks: if short_circuit and workspace.finished: + workspace.possible_miss = True break log = u"[[{0}]] -> querying {1} for {2!r}" self._logger.debug(log.format(self.title, searcher.name, chunk)) diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py index bbfc566..744253f 100644 --- a/earwigbot/wiki/copyvios/result.py +++ b/earwigbot/wiki/copyvios/result.py @@ -116,14 +116,17 @@ class CopyvioCheckResult(object): - :py:attr:`queries`: the number of queries used to reach a result - :py:attr:`time`: the amount of time the check took to complete - :py:attr:`article_chain`: the MarkovChain of the article text + - :py:attr:`possible_miss`: whether some URLs might have been missed """ - def __init__(self, violation, sources, queries, check_time, article_chain): + def __init__(self, violation, sources, queries, check_time, article_chain, + possible_miss): self.violation = violation self.sources = sources self.queries = queries self.time = check_time self.article_chain = article_chain + self.possible_miss = possible_miss def __repr__(self): """Return the canonical string representation of the result.""" diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index f143117..59e68b0 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -235,13 +235,14 @@ class CopyvioWorkspace(object): url_timeout=5, num_workers=8, short_circuit=True): self.sources = [] self.finished = False + self.possible_miss = False self._article = article self._logger = logger.getChild("copyvios") self._min_confidence = min_confidence self._start_time = time() self._until = (self._start_time + max_time) if max_time > 0 else None - self._handled_urls = [] + self._handled_urls = set() self._finish_lock = Lock() self._short_circuit = short_circuit self._source_args = {"workspace": self, "headers": headers, @@ -308,22 +309,25 @@ class CopyvioWorkspace(object): """ for url in urls: with self._queues.lock: - if self._short_circuit and self.finished: - break if url in self._handled_urls: continue - self._handled_urls.append(url) + self._handled_urls.add(url) if exclude_check and exclude_check(url): continue + source = CopyvioSource(url=url, **self._source_args) + self.sources.append(source) + if self._short_circuit and self.finished: + self._logger.debug(u"enqueue(): auto-skip {0}".format(url)) + source.skip() + continue + try: key = tldextract.extract(url).registered_domain except ImportError: # Fall back on very naive method from urlparse import urlparse key = u".".join(urlparse(url).netloc.split(".")[-2:]) - source = CopyvioSource(url=url, **self._source_args) - self.sources.append(source) logmsg = u"enqueue(): {0} {1} -> {2}" if key in self._queues.sites: self._logger.debug(logmsg.format("append", key, url)) @@ -372,4 +376,5 @@ class CopyvioWorkspace(object): self.sources.sort(cmpfunc) return CopyvioCheckResult(self.finished, self.sources, num_queries, - time() - self._start_time, self._article) + time() - self._start_time, self._article, + self.possible_miss)