Note when sources might have been missed.

10 years ago · 5194525a32
--- a/earwigbot/wiki/copyvios/init.py
+++ b/earwigbot/wiki/copyvios/init.py
@@ -137,6 +137,7 @@ class CopyvioMixIn(object):
            chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
            for chunk in chunks:
                if short_circuit and workspace.finished:
                    workspace.possible_miss = True
                    break
                log = u"[[{0}]] -> querying {1} for {2!r}"
                self._logger.debug(log.format(self.title, searcher.name, chunk))
--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -116,14 +116,17 @@ class CopyvioCheckResult(object):
    - :py:attr:`queries`:       the number of queries used to reach a result
    - :py:attr:`time`:          the amount of time the check took to complete
    - :py:attr:`article_chain`: the MarkovChain of the article text
    - :py:attr:`possible_miss`: whether some URLs might have been missed
    """

    # NOTE(review): this text looks like a diff with its +/- markers stripped;
    # the one-line signature directly below appears to be the pre-change
    # version, superseded by the two-line signature that adds possible_miss.
    def __init__(self, violation, sources, queries, check_time, article_chain):
    def __init__(self, violation, sources, queries, check_time, article_chain,
                 possible_miss):
        """Store the given check results as attributes of this object."""
        self.violation = violation
        self.sources = sources
        self.queries = queries
        # Exposed as ``time`` (not ``check_time``), matching the class docstring.
        self.time = check_time
        self.article_chain = article_chain
        # Whether some URLs might have been missed (see the class docstring).
        self.possible_miss = possible_miss

    def __repr__(self):
        """Return the canonical string representation of the result."""
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -235,13 +235,14 @@ class CopyvioWorkspace(object):
                 url_timeout=5, num_workers=8, short_circuit=True):
        self.sources = []
        self.finished = False
        self.possible_miss = False

        self._article = article
        self._logger = logger.getChild("copyvios")
        self._min_confidence = min_confidence
        self._start_time = time()
        self._until = (self._start_time + max_time) if max_time > 0 else None
        self._handled_urls = []
        self._handled_urls = set()
        self._finish_lock = Lock()
        self._short_circuit = short_circuit
        self._source_args = {"workspace": self, "headers": headers,
@@ -308,22 +309,25 @@ class CopyvioWorkspace(object):
        """
        for url in urls:
            with self._queues.lock:
                if self._short_circuit and self.finished:
                    break
                if url in self._handled_urls:
                    continue
                self._handled_urls.append(url)
                self._handled_urls.add(url)
                if exclude_check and exclude_check(url):
                    continue

                source = CopyvioSource(url=url, **self._source_args)
                self.sources.append(source)
                if self._short_circuit and self.finished:
                    self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
                    source.skip()
                    continue

                try:
                    key = tldextract.extract(url).registered_domain
                except ImportError:  # Fall back on very naive method
                    from urlparse import urlparse
                    key = u".".join(urlparse(url).netloc.split(".")[-2:])

                source = CopyvioSource(url=url, **self._source_args)
                self.sources.append(source)
                logmsg = u"enqueue(): {0} {1} -> {2}"
                if key in self._queues.sites:
                    self._logger.debug(logmsg.format("append", key, url))
@@ -372,4 +376,5 @@ class CopyvioWorkspace(object):

        self.sources.sort(cmpfunc)
        return CopyvioCheckResult(self.finished, self.sources, num_queries,
                                  time() - self._start_time, self._article)
                                  time() - self._start_time, self._article,
                                  self.possible_miss)