diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py index afcb7f9..12c6b80 100644 --- a/earwigbot/tasks/afc_copyvios.py +++ b/earwigbot/tasks/afc_copyvios.py @@ -70,17 +70,17 @@ class AFCCopyvios(Task): """Detect copyvios in 'page' and add a note if any are found.""" title = page.title if title in self.ignore_list: - msg = "Skipping page in ignore list: [[{0}]]" + msg = u"Skipping page in ignore list: [[{0}]]" self.logger.info(msg.format(title)) return pageid = page.pageid if self.has_been_processed(pageid): - msg = "Skipping check on already processed page [[{0}]]" + msg = u"Skipping check on already processed page [[{0}]]" self.logger.info(msg.format(title)) return - self.logger.info("Checking [[{0}]]".format(title)) + self.logger.info(u"Checking [[{0}]]".format(title)) result = page.copyvio_check(self.min_confidence, self.max_queries) url = result.url confidence = "{0}%".format(round(result.confidence * 100, 2)) @@ -94,11 +94,11 @@ class AFCCopyvios(Task): page.edit(newtext, self.summary.format(url=url)) else: page.edit(newtext, self.summary) - msg = "Found violation: [[{0}]] -> {1} ({2} confidence)" - self.logger.warn(msg.format(title, url, confidence)) + msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)" + self.logger.info(msg.format(title, url, confidence)) else: - msg = "No violations detected (best: {1} at {2} confidence)" - self.logger.debug(msg.format(url, confidence)) + msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)" + self.logger.info(msg.format(title, url, confidence)) self.log_processed(pageid) if self.cache_results: diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 0f29403..e89a322 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -155,7 +155,10 @@ class CopyvioMixIn(object): while (chunks and best_confidence < min_confidence and (max_queries < 0 or num_queries < max_queries)): - urls = searcher.search(chunks.pop(0)) + chunk = chunks.pop(0) + log = u"[[{0}]] -> querying {1} for {2!r}" + self._logger.debug(log.format(self.title, searcher.name, chunk)) + urls = searcher.search(chunk) urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) @@ -172,12 +175,19 @@ class CopyvioMixIn(object): sleep(interquery_sleep - diff) last_query = time() - if best_confidence >= min_confidence: # violation? - v = True + if best_confidence >= min_confidence: + is_violation = True + log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries)" + self._logger.debug(log.format(self.title, best_confidence, + best_match, num_queries)) else: - v = False - return CopyvioCheckResult(v, best_confidence, best_match, num_queries, - article_chain, best_chains) + is_violation = False + log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries)" + self._logger.debug(log.format(self.title, best_confidence, + num_queries)) + + return CopyvioCheckResult(is_violation, best_confidence, best_match, + num_queries, article_chain, best_chains) def copyvio_compare(self, url, min_confidence=0.5): """Check the page like :py:meth:`copyvio_check` against a specific URL. @@ -208,7 +218,12 @@ class CopyvioMixIn(object): if confidence >= min_confidence: is_violation = True + log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2})" + self._logger.debug(log.format(self.title, confidence, url)) else: is_violation = False + log = u"No violation for [[{0}]] (confidence: {1}; URL: {2})" + self._logger.debug(log.format(self.title, confidence, url)) + return CopyvioCheckResult(is_violation, confidence, url, 0, article_chain, chains) diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 7eb6a80..4640b1f 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -138,11 +138,11 @@ class ExclusionsDB(object): max_staleness = 60 * 60 * 24 * 30 time_since_update = int(time() - self._get_last_update()) if time_since_update > max_staleness: - log = "Updating stale database: {0} (last updated {1} seconds ago)" + log = u"Updating stale database: {0} (last updated {1} seconds ago)" self._logger.info(log.format(sitename, time_since_update)) self._update(sitename) else: - log = "Database for {0} is still fresh (last updated {1} seconds ago)" + log = u"Database for {0} is still fresh (last updated {1} seconds ago)" self._logger.debug(log.format(sitename, time_since_update)) def check(self, sitename, url): @@ -155,10 +155,10 @@ class ExclusionsDB(object): with sqlite.connect(self._dbfile) as conn, self._db_access_lock: for row in conn.execute(query, (sitename,)): if normalized.startswith(row[0]): - log = "Exclusion detected in {0} for {1}" + log = u"Exclusion detected in {0} for {1}" self._logger.debug(log.format(sitename, url)) return True - log = "No exclusions in {0} for {1}".format(sitename, url) + log = u"No exclusions in {0} for {1}".format(sitename, url) self._logger.debug(log) return False diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index cf2edb4..0ccd62e 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -34,6 +34,7 @@ __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] class BaseSearchEngine(object): """Base class for a simple search engine interface.""" + name = "Base" def __init__(self, cred): """Store credentials *cred* for searching later on.""" @@ -57,6 +58,7 @@ class BaseSearchEngine(object): class YahooBOSSSearchEngine(BaseSearchEngine): """A search engine interface with Yahoo! BOSS.""" + name = "Yahoo! BOSS" def search(self, query): """Do a Yahoo! BOSS web search for *query*.