From 5194525a329bff3e99abc567660020426b7f0639 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@gmail.com>
Date: Mon, 13 Oct 2014 16:33:46 -0500
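Subject: [PATCH] Note when sources might have been missed.

Add a possible_miss flag to CopyvioWorkspace (default False) and pass
it through to CopyvioCheckResult. The flag is set when the search loop
stops early because short-circuiting is enabled and the workspace has
already finished, meaning the remaining chunks were never queried.

When enqueue() receives URLs after a short-circuited workspace has
finished, it now still records each one as a source and marks it as
skipped, rather than breaking out of the loop. Also store handled URLs
in a set instead of a list.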

---
 earwigbot/wiki/copyvios/__init__.py |  1 +
 earwigbot/wiki/copyvios/result.py   |  5 ++++-
 earwigbot/wiki/copyvios/workers.py  | 19 ++++++++++++-------
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index 419c84e..2888815 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -137,6 +137,7 @@ class CopyvioMixIn(object):
             chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
             for chunk in chunks:
                 if short_circuit and workspace.finished:
+                    workspace.possible_miss = True
                     break
                 log = u"[[{0}]] -> querying {1} for {2!r}"
                 self._logger.debug(log.format(self.title, searcher.name, chunk))
diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py
index bbfc566..744253f 100644
--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -116,14 +116,17 @@ class CopyvioCheckResult(object):
     - :py:attr:`queries`:       the number of queries used to reach a result
     - :py:attr:`time`:          the amount of time the check took to complete
     - :py:attr:`article_chain`: the MarkovChain of the article text
+    - :py:attr:`possible_miss`: whether some URLs might have been missed
     """
 
-    def __init__(self, violation, sources, queries, check_time, article_chain):
+    def __init__(self, violation, sources, queries, check_time, article_chain,
+                 possible_miss):
         self.violation = violation
         self.sources = sources
         self.queries = queries
         self.time = check_time
         self.article_chain = article_chain
+        self.possible_miss = possible_miss
 
     def __repr__(self):
         """Return the canonical string representation of the result."""
diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py
index f143117..59e68b0 100644
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -235,13 +235,14 @@ class CopyvioWorkspace(object):
                  url_timeout=5, num_workers=8, short_circuit=True):
         self.sources = []
         self.finished = False
+        self.possible_miss = False
 
         self._article = article
         self._logger = logger.getChild("copyvios")
         self._min_confidence = min_confidence
         self._start_time = time()
         self._until = (self._start_time + max_time) if max_time > 0 else None
-        self._handled_urls = []
+        self._handled_urls = set()
         self._finish_lock = Lock()
         self._short_circuit = short_circuit
         self._source_args = {"workspace": self, "headers": headers,
@@ -308,22 +309,25 @@ class CopyvioWorkspace(object):
         """
         for url in urls:
             with self._queues.lock:
-                if self._short_circuit and self.finished:
-                    break
                 if url in self._handled_urls:
                     continue
-                self._handled_urls.append(url)
+                self._handled_urls.add(url)
                 if exclude_check and exclude_check(url):
                     continue
 
+                source = CopyvioSource(url=url, **self._source_args)
+                self.sources.append(source)
+                if self._short_circuit and self.finished:
+                    self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
+                    source.skip()
+                    continue
+
                 try:
                     key = tldextract.extract(url).registered_domain
                 except ImportError:  # Fall back on very naive method
                     from urlparse import urlparse
                     key = u".".join(urlparse(url).netloc.split(".")[-2:])
 
-                source = CopyvioSource(url=url, **self._source_args)
-                self.sources.append(source)
                 logmsg = u"enqueue(): {0} {1} -> {2}"
                 if key in self._queues.sites:
                     self._logger.debug(logmsg.format("append", key, url))
@@ -372,4 +376,5 @@ class CopyvioWorkspace(object):
 
         self.sources.sort(cmpfunc)
         return CopyvioCheckResult(self.finished, self.sources, num_queries,
-                                  time() - self._start_time, self._article)
+                                  time() - self._start_time, self._article,
+                                  self.possible_miss)