@@ -70,17 +70,17 @@ class AFCCopyvios(Task):
         """Detect copyvios in 'page' and add a note if any are found."""
         title = page.title
         if title in self.ignore_list:
-            msg = "Skipping page in ignore list: [[{0}]]"
+            msg = u"Skipping page in ignore list: [[{0}]]"
             self.logger.info(msg.format(title))
             return
 
         pageid = page.pageid
         if self.has_been_processed(pageid):
-            msg = "Skipping check on already processed page [[{0}]]"
+            msg = u"Skipping check on already processed page [[{0}]]"
             self.logger.info(msg.format(title))
             return
 
-        self.logger.info("Checking [[{0}]]".format(title))
+        self.logger.info(u"Checking [[{0}]]".format(title))
         result = page.copyvio_check(self.min_confidence, self.max_queries)
         url = result.url
         confidence = "{0}%".format(round(result.confidence * 100, 2))
@@ -94,11 +94,11 @@ class AFCCopyvios(Task):
                 page.edit(newtext, self.summary.format(url=url))
             else:
                 page.edit(newtext, self.summary)
-            msg = "Found violation: [[{0}]] -> {1} ({2} confidence)"
-            self.logger.warn(msg.format(title, url, confidence))
+            msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)"
+            self.logger.info(msg.format(title, url, confidence))
         else:
-            msg = "No violations detected (best: {1} at {2} confidence)"
-            self.logger.debug(msg.format(url, confidence))
+            msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)"
+            self.logger.info(msg.format(title, url, confidence))
 
         self.log_processed(pageid)
         if self.cache_results:
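
Note on the hunk above: besides the unicode conversion, it fixes a latent
format-string bug. The old "No violations" message referenced positions {1}
and {2}, but format() received only two arguments, filling {0} and {1}, so
rendering the message would raise an exception. A minimal reproduction:

    >>> "No violations detected (best: {1} at {2} confidence)".format("http://example.com/", "12.3%")
    Traceback (most recent call last):
      ...
    IndexError: tuple index out of range

The new message takes the title as {0} and passes all three arguments.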
@@ -155,7 +155,10 @@ class CopyvioMixIn(object):
         while (chunks and best_confidence < min_confidence and
                (max_queries < 0 or num_queries < max_queries)):
-            urls = searcher.search(chunks.pop(0))
+            chunk = chunks.pop(0)
+            log = u"[[{0}]] -> querying {1} for {2!r}"
+            self._logger.debug(log.format(self.title, searcher.name, chunk))
+            urls = searcher.search(chunk)
             urls = [url for url in urls if url not in handled_urls]
             for url in urls:
                 handled_urls.append(url)
@@ -172,12 +175,19 @@ class CopyvioMixIn(object):
                 sleep(interquery_sleep - diff)
             last_query = time()
 
-        if best_confidence >= min_confidence:  # violation?
-            v = True
+        if best_confidence >= min_confidence:
+            is_violation = True
+            log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries)"
+            self._logger.debug(log.format(self.title, best_confidence,
+                                          best_match, num_queries))
         else:
-            v = False
-        return CopyvioCheckResult(v, best_confidence, best_match, num_queries,
-                                  article_chain, best_chains)
+            is_violation = False
+            log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries)"
+            self._logger.debug(log.format(self.title, best_confidence,
+                                          num_queries))
+
+        return CopyvioCheckResult(is_violation, best_confidence, best_match,
+                                  num_queries, article_chain, best_chains)
 
     def copyvio_compare(self, url, min_confidence=0.5):
         """Check the page like :py:meth:`copyvio_check` against a specific URL.
@@ -208,7 +218,12 @@ class CopyvioMixIn(object):
         if confidence >= min_confidence:
             is_violation = True
+            log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2})"
+            self._logger.debug(log.format(self.title, confidence, url))
         else:
             is_violation = False
+            log = u"No violation for [[{0}]] (confidence: {1}; URL: {2})"
+            self._logger.debug(log.format(self.title, confidence, url))
 
         return CopyvioCheckResult(is_violation, confidence, url, 0,
                                   article_chain, chains)
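
A sketch of how a caller might hit this path; the result attribute names
here are assumptions inferred from the CopyvioCheckResult constructor
arguments above, not confirmed API:

    # Hypothetical usage: compare a page against one suspected source URL.
    result = page.copyvio_compare("http://example.com/article", min_confidence=0.5)
    if result.violation:  # assumed attribute holding the is_violation flag
        print "Possible copyvio of", result.url

With the added debug calls, both branches now log a verdict, so a compare
that finds nothing is no longer silent.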
@@ -138,11 +138,11 @@ class ExclusionsDB(object):
         max_staleness = 60 * 60 * 24 * 30
         time_since_update = int(time() - self._get_last_update())
         if time_since_update > max_staleness:
-            log = "Updating stale database: {0} (last updated {1} seconds ago)"
+            log = u"Updating stale database: {0} (last updated {1} seconds ago)"
             self._logger.info(log.format(sitename, time_since_update))
             self._update(sitename)
         else:
-            log = "Database for {0} is still fresh (last updated {1} seconds ago)"
+            log = u"Database for {0} is still fresh (last updated {1} seconds ago)"
             self._logger.debug(log.format(sitename, time_since_update))
 
     def check(self, sitename, url):
@@ -155,10 +155,10 @@ class ExclusionsDB(object):
         with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
             for row in conn.execute(query, (sitename,)):
                 if normalized.startswith(row[0]):
-                    log = "Exclusion detected in {0} for {1}"
+                    log = u"Exclusion detected in {0} for {1}"
                     self._logger.debug(log.format(sitename, url))
                     return True
 
-        log = "No exclusions in {0} for {1}".format(sitename, url)
+        log = u"No exclusions in {0} for {1}".format(sitename, url)
         self._logger.debug(log)
         return False
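
The exclusion test above is a plain prefix match between the normalized URL
and each stored entry. Illustratively, with made-up values, since the actual
normalization happens elsewhere:

    normalized = "en.wikipedia.org/wiki/mirrors_and_forks"
    row = ("en.wikipedia.org/wiki/mirrors",)
    normalized.startswith(row[0])  # True -> URL is excluded from checking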
@@ -34,6 +34,7 @@ __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"]
 
 class BaseSearchEngine(object):
     """Base class for a simple search engine interface."""
+    name = "Base"
 
     def __init__(self, cred):
         """Store credentials *cred* for searching later on."""
@@ -57,6 +58,7 @@ class BaseSearchEngine(object):
 
 class YahooBOSSSearchEngine(BaseSearchEngine):
     """A search engine interface with Yahoo! BOSS."""
+    name = "Yahoo! BOSS"
 
     def search(self, query):
         """Do a Yahoo! BOSS web search for *query*.