Browse Source

Note when sources might have been missed.

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
5194525a32
3 changed files with 17 additions and 8 deletions
  1. earwigbot/wiki/copyvios/__init__.py (+1, -0)
  2. earwigbot/wiki/copyvios/result.py (+4, -1)
  3. earwigbot/wiki/copyvios/workers.py (+12, -7)

+ 1
- 0
earwigbot/wiki/copyvios/__init__.py View File

@@ -137,6 +137,7 @@ class CopyvioMixIn(object):
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
for chunk in chunks:
if short_circuit and workspace.finished:
workspace.possible_miss = True
break
log = u"[[{0}]] -> querying {1} for {2!r}"
self._logger.debug(log.format(self.title, searcher.name, chunk))


+ 4
- 1
earwigbot/wiki/copyvios/result.py View File

@@ -116,14 +116,17 @@ class CopyvioCheckResult(object):
- :py:attr:`queries`: the number of queries used to reach a result
- :py:attr:`time`: the amount of time the check took to complete
- :py:attr:`article_chain`: the MarkovChain of the article text
- :py:attr:`possible_miss`: whether some URLs might have been missed
"""

def __init__(self, violation, sources, queries, check_time, article_chain):
def __init__(self, violation, sources, queries, check_time, article_chain,
possible_miss):
self.violation = violation
self.sources = sources
self.queries = queries
self.time = check_time
self.article_chain = article_chain
self.possible_miss = possible_miss

def __repr__(self):
"""Return the canonical string representation of the result."""


+ 12
- 7
earwigbot/wiki/copyvios/workers.py View File

@@ -235,13 +235,14 @@ class CopyvioWorkspace(object):
url_timeout=5, num_workers=8, short_circuit=True):
self.sources = []
self.finished = False
self.possible_miss = False

self._article = article
self._logger = logger.getChild("copyvios")
self._min_confidence = min_confidence
self._start_time = time()
self._until = (self._start_time + max_time) if max_time > 0 else None
self._handled_urls = []
self._handled_urls = set()
self._finish_lock = Lock()
self._short_circuit = short_circuit
self._source_args = {"workspace": self, "headers": headers,
@@ -308,22 +309,25 @@ class CopyvioWorkspace(object):
"""
for url in urls:
with self._queues.lock:
if self._short_circuit and self.finished:
break
if url in self._handled_urls:
continue
self._handled_urls.append(url)
self._handled_urls.add(url)
if exclude_check and exclude_check(url):
continue

source = CopyvioSource(url=url, **self._source_args)
self.sources.append(source)
if self._short_circuit and self.finished:
self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
source.skip()
continue

try:
key = tldextract.extract(url).registered_domain
except ImportError: # Fall back on very naive method
from urlparse import urlparse
key = u".".join(urlparse(url).netloc.split(".")[-2:])

source = CopyvioSource(url=url, **self._source_args)
self.sources.append(source)
logmsg = u"enqueue(): {0} {1} -> {2}"
if key in self._queues.sites:
self._logger.debug(logmsg.format("append", key, url))
@@ -372,4 +376,5 @@ class CopyvioWorkspace(object):

self.sources.sort(cmpfunc)
return CopyvioCheckResult(self.finished, self.sources, num_queries,
time() - self._start_time, self._article)
time() - self._start_time, self._article,
self.possible_miss)

Loading…
Cancel
Save