From 56e614028470a209b920fdb7c91877c662f4ba0c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 17 Dec 2011 22:00:09 -0500 Subject: [PATCH] More work on copyright violation detection code. --- earwigbot/tasks/afc_copyvios.py | 32 +++++++++++++---- earwigbot/wiki/copyright.py | 77 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 98 insertions(+), 11 deletions(-) diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py index 2d881df..c90043e 100644 --- a/earwigbot/tasks/afc_copyvios.py +++ b/earwigbot/tasks/afc_copyvios.py @@ -42,6 +42,8 @@ class Task(BaseTask): cfg = config.tasks.get(self.name, {}) self.template = cfg.get("template", "AfC suspected copyvio") self.ignore_list = cfg.get("ignoreList", []) + self.min_confidence = cfg.get("minConfidence", 0.75) + self.max_queries = cfg.get("maxQueries", 10) default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}" self.summary = self.make_summary(cfg.get("summary", default_summary)) @@ -74,28 +76,40 @@ class Task(BaseTask): def process(self, page): """Detect copyvios in 'page' and add a note if any are found.""" + title = page.title() + if title in self.ignore_list: + msg = "Skipping page in ignore list: [[{0}]]" + self.logger.info(msg.format(title)) + return + pageid = page.pageid() if self.has_been_processed(pageid): msg = "Skipping check on already processed page [[{0}]]" - self.logger.info(msg.format(page.title())) + self.logger.info(msg.format(title)) return - self.logger.info("Checking [[{0}]]".format(page.title())) + self.logger.info("Checking [[{0}]]".format(title)) content = page.get() - result = page.copyvio_check(self.engine, self.credentials) - if result: + result = page.copyvio_check(self.engine, self.credentials, + self.min_confidence, self.max_queries) + if result.url: + url = result.url content = page.get() - template = "\{\{{0}|url={1}\}\}".format(self.template, result) + template = "{{{{{0}|url={1}}}}}".format(self.template, url) + newtext = 
"\n".join((template, content)) - page.edit(newtext, self.summary.format(url=result)) + if "{url}" in self.summary: + page.edit(newtext, self.summary.format(url=url)) + else: + page.edit(newtext, self.summary) msg = "Found violation: [[{0}]] -> {1}" - self.logger.info(msg.format(page.title(), result)) + self.logger.warn(msg.format(title, url)) else: self.logger.debug("No violations detected") self.log_processed(pageid) def has_been_processed(self, pageid): + """Returns True if pageid was processed before, otherwise False.""" query = "SELECT 1 FROM processed WHERE page_id = ?" with self.conn.cursor() as cursor: cursor.execute(query, (pageid,)) @@ -105,6 +119,10 @@ class Task(BaseTask): return False def log_processed(self, pageid): + """Adds pageid to our database of processed pages. + + Raises an exception if the page has already been processed. + """ query = "INSERT INTO processed VALUES (?)" with self.conn.cursor() as cursor: cursor.execute(query, (pageid,)) diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyright.py index c5c7a64..0a86a9d 100644 --- a/earwigbot/wiki/copyright.py +++ b/earwigbot/wiki/copyright.py @@ -21,6 +21,7 @@ # SOFTWARE. from json import loads +from time import sleep, time from urllib import quote_plus, urlencode try: @@ -30,6 +31,17 @@ except ImportError: from earwigbot.wiki.exceptions import * +class CopyvioCheckResult(object): + def __init__(self, confidence, url, queries): + self.confidence = confidence + self.url = url + self.queries = queries + + def __repr__(self): + r = "CopyvioCheckResult(confidence={0!r}, url={1!r}, queries={2!r})" + return r.format(self.confidence, self.url, self.queries) + + class CopyrightMixin(object): """ EarwigBot's Wiki Toolset: Copyright Violation Mixin @@ -45,7 +57,8 @@ class CopyrightMixin(object): determined by Yahoo). Raises SearchQueryError() on errors. 
""" base_url = "http://yboss.yahooapis.com/ysearch/web" - params = {"q": quote_plus(query), "style": "raw", "format": "json"} + query = quote_plus(query.join(('"', '"'))) + params = {"q": query, "style": "raw", "format": "json"} url = "{0}?{1}".format(base_url, urlencode(params)) consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"]) @@ -68,8 +81,40 @@ class CopyrightMixin(object): return [] return [result["url"] for result in results] - def copyvio_check(self, engine, credentials, force=False): - """Check the page for copyright violations.""" + def _copyvio_strip_content(self, content): + return content + + def _copyvio_explode_content(self, content): + return content + + def _copyvio_compare_content(self, content, url): + return 0 + + def copyvio_check(self, engine, credentials, min_confidence=0.5, + max_queries=-1, interquery_sleep=1, force=False): + """Check the page for copyright violations. + + Returns a CopyvioCheckResult object, with three useful attributes: + "confidence", "url", and "queries". "confidence" is a number between + 0 and 1; if it is less than min_confidence, we could not find any + indication of a violation (so "url" will be None), otherwise it + indicates the relative faith in our results, and "url" will be the + place the article is suspected of being copied from. "queries" is the + number of queries used to determine the results. + + "max_queries" is self-explanatory; we will never make more than this + number of queries in a given check. If it's less than 0, we will not + limit our number of queries. + + "interquery_sleep" is the minimum amount of time we will sleep between + search engine queries, in seconds. + + "force" is simply passed to page.get() - it has the same behavior there + as it does here. + + Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, + SearchQueryError, ...) on errors. + """ if engine == "Yahoo! 
BOSS": if not oauth: e = "The package 'oauth2' could not be imported" @@ -77,5 +122,29 @@ class CopyrightMixin(object): querier = self._yahoo_boss_query else: raise UnknownSearchEngineError(engine) + + handled_urls = [] + best_confidence = 0 + best_match = None + num_queries = 0 content = self.get(force) - return querier(content, credentials) + clean = self._copyvio_strip_content(content) + fragments = self._copyvio_explode_content(clean) + last_query = time() + + while (fragments and best_confidence < min_confidence and + (max_queries < 0 or num_queries < max_queries)): + urls = querier(fragments.pop(0), credentials) + urls = [url for url in urls if url not in handled_urls] + for url in urls: + confidence = self._copyvio_compare_content(content, url) + if confidence > best_confidence: + best_confidence = confidence + best_match = url + num_queries += 1 + diff = time() - last_query + if diff < interquery_sleep: + sleep(interquery_sleep - diff) + last_query = time() + + return CopyvioCheckResult(best_confidence, best_match, num_queries)