@@ -70,17 +70,17 @@ class AFCCopyvios(Task):
         """Detect copyvios in 'page' and add a note if any are found."""
         title = page.title
         if title in self.ignore_list:
-            msg = "Skipping page in ignore list: [[{0}]]"
+            msg = u"Skipping page in ignore list: [[{0}]]"
             self.logger.info(msg.format(title))
             return

         pageid = page.pageid
         if self.has_been_processed(pageid):
-            msg = "Skipping check on already processed page [[{0}]]"
+            msg = u"Skipping check on already processed page [[{0}]]"
             self.logger.info(msg.format(title))
             return

-        self.logger.info("Checking [[{0}]]".format(title))
+        self.logger.info(u"Checking [[{0}]]".format(title))
         result = page.copyvio_check(self.min_confidence, self.max_queries)
         url = result.url
         confidence = "{0}%".format(round(result.confidence * 100, 2))
@@ -94,11 +94,11 @@ class AFCCopyvios(Task):
                 page.edit(newtext, self.summary.format(url=url))
             else:
                 page.edit(newtext, self.summary)
-            msg = "Found violation: [[{0}]] -> {1} ({2} confidence)"
-            self.logger.warn(msg.format(title, url, confidence))
+            msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)"
+            self.logger.info(msg.format(title, url, confidence))
         else:
-            msg = "No violations detected (best: {1} at {2} confidence)"
-            self.logger.debug(msg.format(url, confidence))
+            msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)"
+            self.logger.info(msg.format(title, url, confidence))

         self.log_processed(pageid)
         if self.cache_results:
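Note on the second hunk above: it is more than a log-level change. The old template used placeholders {1} and {2} but `msg.format(url, confidence)` supplied only two positional arguments, so the no-violation branch raised IndexError instead of logging anything. A minimal sketch of the failure and the fix:

```python
# Old template: {2} indexes past the two arguments supplied to format().
msg = "No violations detected (best: {1} at {2} confidence)"
try:
    msg.format("http://example.com/", "12.34%")
except IndexError as exc:
    print("old code raised IndexError: " + str(exc))

# New template: {0} takes the title, and all three arguments are passed.
msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)"
print(msg.format(u"Example", "http://example.com/", "12.34%"))
```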
@@ -155,7 +155,10 @@ class CopyvioMixIn(object):

         while (chunks and best_confidence < min_confidence and
                (max_queries < 0 or num_queries < max_queries)):
-            urls = searcher.search(chunks.pop(0))
+            chunk = chunks.pop(0)
+            log = u"[[{0}]] -> querying {1} for {2!r}"
+            self._logger.debug(log.format(self.title, searcher.name, chunk))
+            urls = searcher.search(chunk)
             urls = [url for url in urls if url not in handled_urls]
             for url in urls:
                 handled_urls.append(url)
@@ -172,12 +175,19 @@ class CopyvioMixIn(object):
                 sleep(interquery_sleep - diff)
             last_query = time()

-        if best_confidence >= min_confidence:  # violation?
-            v = True
+        if best_confidence >= min_confidence:
+            is_violation = True
+            log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries)"
+            self._logger.debug(log.format(self.title, best_confidence,
+                                          best_match, num_queries))
         else:
-            v = False
-        return CopyvioCheckResult(v, best_confidence, best_match, num_queries,
-                                  article_chain, best_chains)
+            is_violation = False
+            log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries)"
+            self._logger.debug(log.format(self.title, best_confidence,
+                                          num_queries))
+
+        return CopyvioCheckResult(is_violation, best_confidence, best_match,
+                                  num_queries, article_chain, best_chains)

     def copyvio_compare(self, url, min_confidence=0.5):
         """Check the page like :py:meth:`copyvio_check` against a specific URL.
@@ -208,7 +218,12 @@ class CopyvioMixIn(object):

         if confidence >= min_confidence:
             is_violation = True
+            log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2})"
+            self._logger.debug(log.format(self.title, confidence, url))
         else:
             is_violation = False
+            log = u"No violation for [[{0}]] (confidence: {1}; URL: {2})"
+            self._logger.debug(log.format(self.title, confidence, url))
+
         return CopyvioCheckResult(is_violation, confidence, url, 0,
                                   article_chain, chains)
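To make the new return shape concrete, here is a self-contained sketch of consuming the result. The real CopyvioCheckResult class lives elsewhere in earwigbot; this namedtuple only mirrors the positional arguments visible in the return statements above, and the field names `violation` and `queries` are assumptions (AFCCopyvios reads `.url` and `.confidence` earlier in this diff):

```python
from collections import namedtuple

# Stand-in mirroring CopyvioCheckResult's positional arguments above; the
# real class is defined elsewhere in earwigbot.
CopyvioCheckResult = namedtuple("CopyvioCheckResult", [
    "violation", "confidence", "url", "queries", "article_chain", "chains"])

best_confidence, min_confidence = 0.81, 0.75  # hypothetical values
result = CopyvioCheckResult(best_confidence >= min_confidence,
                            best_confidence, "http://example.com/page",
                            6, None, None)
if result.violation:
    confidence = "{0}%".format(round(result.confidence * 100, 2))
    print(u"Violation: {0} at {1} confidence".format(result.url, confidence))
```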
@@ -138,11 +138,11 @@ class ExclusionsDB(object):
         max_staleness = 60 * 60 * 24 * 30
         time_since_update = int(time() - self._get_last_update())
         if time_since_update > max_staleness:
-            log = "Updating stale database: {0} (last updated {1} seconds ago)"
+            log = u"Updating stale database: {0} (last updated {1} seconds ago)"
             self._logger.info(log.format(sitename, time_since_update))
             self._update(sitename)
         else:
-            log = "Database for {0} is still fresh (last updated {1} seconds ago)"
+            log = u"Database for {0} is still fresh (last updated {1} seconds ago)"
             self._logger.debug(log.format(sitename, time_since_update))

     def check(self, sitename, url):
@@ -155,10 +155,10 @@ class ExclusionsDB(object):
         with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
             for row in conn.execute(query, (sitename,)):
                 if normalized.startswith(row[0]):
-                    log = "Exclusion detected in {0} for {1}"
+                    log = u"Exclusion detected in {0} for {1}"
                     self._logger.debug(log.format(sitename, url))
                     return True

-        log = "No exclusions in {0} for {1}".format(sitename, url)
+        log = u"No exclusions in {0} for {1}".format(sitename, url)
         self._logger.debug(log)
         return False
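For context on check(): a stored exclusion row acts as a URL prefix, and a page URL is excluded when its normalized form starts with any entry for that site. A self-contained sketch of that test; the normalization shown here (lower-casing, stripping the scheme and "www.") is an assumption, since the real method builds `normalized` before the lines shown:

```python
def is_excluded(url, exclusions):
    """Return True if the normalized URL starts with any exclusion entry."""
    normalized = url.lower()  # assumed normalization, for illustration only
    for prefix in ("http://", "https://"):
        if normalized.startswith(prefix):
            normalized = normalized[len(prefix):]
    if normalized.startswith("www."):
        normalized = normalized[len("www."):]
    return any(normalized.startswith(entry) for entry in exclusions)

print(is_excluded("http://www.example.com/mirror/page",
                  ["example.com/mirror"]))  # -> True
```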
@@ -34,6 +34,7 @@ __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"]

 class BaseSearchEngine(object):
     """Base class for a simple search engine interface."""
+    name = "Base"

     def __init__(self, cred):
         """Store credentials *cred* for searching later on."""
@@ -57,6 +58,7 @@ class BaseSearchEngine(object):

 class YahooBOSSSearchEngine(BaseSearchEngine):
     """A search engine interface with Yahoo! BOSS."""
+    name = "Yahoo! BOSS"

     def search(self, query):
         """Do a Yahoo! BOSS web search for *query*.