@property
def url(self):
    """Return the best source's URL; ``None`` when there is no best source."""
    # Guard first: with no best source there is nothing to read a URL from.
    if not self.best:
        return None
    return self.best.url
def get_result(self, num_queries=0):
    """Return a CopyvioCheckResult containing the results of this check.

    The workspace's sources are sorted in place before the result is built:
    highest confidence first and, among sources with equal confidence,
    those that were not skipped ahead of those that were.

    *num_queries* is passed through to the result as the number of queries
    used; the elapsed time is measured from ``self._start_time``.
    """
    def sort_key(source):
        # Negating the confidence gives a descending order without
        # ``reverse=True``, which would also flip the skipped tie-breaker.
        return (-source.confidence, int(source.skipped))

    # A key function replaces the positional cmp function, which
    # list.sort() only accepts on Python 2. The resulting order is the
    # same, and because the sort is stable, fully tied sources keep their
    # relative order.
    self.sources.sort(key=sort_key)
    return CopyvioCheckResult(self.finished, self.sources, num_queries,
                              time() - self._start_time, self._article)