From 56e614028470a209b920fdb7c91877c662f4ba0c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 17 Dec 2011 22:00:09 -0500 Subject: [PATCH] More work on copyright violation detection code. --- earwigbot/tasks/afc_copyvios.py | 32 +++++++++++++---- earwigbot/wiki/copyright.py | 77 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 98 insertions(+), 11 deletions(-) diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py index 2d881df..c90043e 100644 --- a/earwigbot/tasks/afc_copyvios.py +++ b/earwigbot/tasks/afc_copyvios.py @@ -42,6 +42,8 @@ class Task(BaseTask): cfg = config.tasks.get(self.name, {}) self.template = cfg.get("template", "AfC suspected copyvio") self.ignore_list = cfg.get("ignoreList", []) + self.min_confidence = cfg.get("minConfidence", 0.75) + self.max_queries = cfg.get("maxQueries", 10) default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}" self.summary = self.make_summary(cfg.get("summary", default_summary)) @@ -74,28 +76,40 @@ class Task(BaseTask): def process(self, page): """Detect copyvios in 'page' and add a note if any are found.""" + title = page.title() + if title in self.ignore_list: + msg = "Skipping page in ignore list: [[{0}]]" + self.logger.info(msg.format(title)) + return + pageid = page.pageid() if self.has_been_processed(pageid): msg = "Skipping check on already processed page [[{0}]]" - self.logger.info(msg.format(page.title())) + self.logger.info(msg.format(title)) return - self.logger.info("Checking [[{0}]]".format(page.title())) + self.logger.info("Checking [[{0}]]".format(title)) content = page.get() - result = page.copyvio_check(self.engine, self.credentials) - if result: + result = page.copyvio_check(self.engine, self.credentials, + self.min_confidence, self.max_queries) + if result.url: + url = result.url content = page.get() - template = "\{\{{0}|url={1}\}\}".format(self.template, result) + template = "{{{{{0}|url={1}}}}}".format(self.template, url) + newtext = 
"\n".join((template, content)) - page.edit(newtext, self.summary.format(url=result)) + if "{url}" in self.summary: + page.edit(newtext, self.summary.format(url=url)) + else: + page.edit(newtext, self.summary) msg = "Found violation: [[{0}]] -> {1}" - self.logger.info(msg.format(page.title(), result)) + self.logger.warn(msg.format(title, url)) else: self.logger.debug("No violations detected") self.log_processed(pageid) def has_been_processed(self, pageid): + """Returns True if pageid was processed before, otherwise False.""" query = "SELECT 1 FROM processed WHERE page_id = ?" with self.conn.cursor() as cursor: cursor.execute(query, (pageid,)) @@ -105,6 +119,10 @@ class Task(BaseTask): return False def log_processed(self, pageid): + """Adds pageid to our database of processed pages. + + Raises an exception if the page has already been processed. + """ query = "INSERT INTO processed VALUES (?)" with self.conn.cursor() as cursor: cursor.execute(query, (pageid,)) diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyright.py index c5c7a64..0a86a9d 100644 --- a/earwigbot/wiki/copyright.py +++ b/earwigbot/wiki/copyright.py @@ -21,6 +21,7 @@ # SOFTWARE. from json import loads +from time import sleep, time from urllib import quote_plus, urlencode try: @@ -30,6 +31,17 @@ except ImportError: from earwigbot.wiki.exceptions import * +class CopyvioCheckResult(object): + def __init__(self, confidence, url, queries): + self.confidence = confidence + self.url = url + self.queries = queries + + def __repr__(self): + r = "CopyvioCheckResult(confidence={0!r}, url={1!r}, queries={2!r})" + return r.format(self.confidence, self.url, self.queries) + + class CopyrightMixin(object): """ EarwigBot's Wiki Toolset: Copyright Violation Mixin @@ -45,7 +57,8 @@ class CopyrightMixin(object): determined by Yahoo). Raises SearchQueryError() on errors. 
""" base_url = "http://yboss.yahooapis.com/ysearch/web" - params = {"q": quote_plus(query), "style": "raw", "format": "json"} + query = quote_plus(query.join(('"', '"'))) + params = {"q": query, "style": "raw", "format": "json"} url = "{0}?{1}".format(base_url, urlencode(params)) consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"]) @@ -68,8 +81,40 @@ class CopyrightMixin(object): return [] return [result["url"] for result in results] - def copyvio_check(self, engine, credentials, force=False): - """Check the page for copyright violations.""" + def _copyvio_strip_content(self, content): + return content + + def _copyvio_explode_content(self, content): + return content + + def _copyvio_compare_content(self, content, url): + return 0 + + def copyvio_check(self, engine, credentials, min_confidence=0.5, + max_queries=-1, interquery_sleep=1, force=False): + """Check the page for copyright violations. + + Returns a CopyvioCheckResult object, with three useful attributes: + "confidence", "url", and "queries". "confidence" is a number between + 0 and 1; if it is less than min_confidence, we could not find any + indication of a violation (so "url" will be None), otherwise it + indicates the relative faith in our results, and "url" will be the + place the article is suspected of being copied from. "queries" is the + number of queries used to determine the results. + + "max_queries" is self-explanatory; we will never make more than this + number of queries in a given check. If it's less than 0, we will not + limit our number of queries. + + "interquery_sleep" is the minimum amount of time we will sleep between + search engine queries, in seconds. + + "force" is simply passed to page.get() - it has the same behavior there + as it does here. + + Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, + SearchQueryError, ...) on errors. + """ if engine == "Yahoo! 
BOSS": if not oauth: e = "The package 'oauth2' could not be imported" @@ -77,5 +122,29 @@ class CopyrightMixin(object): querier = self._yahoo_boss_query else: raise UnknownSearchEngineError(engine) + + handled_urls = [] + best_confidence = 0 + best_match = None + num_queries = 0 content = self.get(force) - return querier(content, credentials) + clean = self._copyvio_strip_content(content) + fragments = self._copyvio_explode_content(clean) + last_query = time() + + while (fragments and best_confidence < min_confidence and + (max_queries < 0 or num_queries < max_queries)): + urls = querier(fragments.pop(0), credentials) + urls = [url for url in urls if url not in handled_urls] + for url in urls: + confidence = self._copyvio_compare_content(content, url) + if confidence > best_confidence: + best_confidence = confidence + best_match = url + num_queries += 1 + diff = time() - last_query + if diff < interquery_sleep: + sleep(interquery_sleep - diff) + last_query = time() + + return CopyvioCheckResult(best_confidence, best_match, num_queries)