From 0b6d5eac5e29192d77c2049fb30d23eeabb13999 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@verizon.net>
Date: Sun, 11 Dec 2011 17:31:34 -0500
Subject: [PATCH] Some code for copyvio detection, including querying Yahoo!
 BOSS correctly.

---
 README.md                         |  8 +++-
 earwigbot/rules.py                | 22 +++++------
 earwigbot/tasks/afc_copyvios.py   | 76 +++++++++++++++++++++++++++++++++++-
 earwigbot/tasks/afc_statistics.py |  2 +-
 earwigbot/wiki/copyright.py       | 81 +++++++++++++++++++++++++++++++++++++++
 earwigbot/wiki/exceptions.py      | 37 +++++++++++++++++-
 earwigbot/wiki/functions.py       |  3 +-
 earwigbot/wiki/page.py            |  6 ++-
 8 files changed, 215 insertions(+), 20 deletions(-)
 create mode 100644 earwigbot/wiki/copyright.py

diff --git a/README.md b/README.md
index fdb0207..3ecc02b 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-[EarwigBot](http://toolserver.org/~earwig/earwigbot/) is a
+[EarwigBot](http://en.wikipedia.org/wiki/User:EarwigBot) is a
 [Python](http://python.org/) robot that edits
 [Wikipedia](http://en.wikipedia.org/) and interacts with people over
 [IRC](http://en.wikipedia.org/wiki/Internet_Relay_Chat).
@@ -31,3 +31,9 @@ Additionally, the afc_history task uses
 [matplotlib](http://matplotlib.sourceforge.net/) and
 [numpy](http://numpy.scipy.org/) for graphing AfC statistics. Neither of these
 modules are required for the main bot itself.
+
+`earwigbot.wiki.copyright` requires access to a search engine for detecting
+copyright violations. Currently,
+[Yahoo! BOSS](http://developer.yahoo.com/search/boss/) is the only engine
+supported, and this requires
+[oauth2](https://github.com/simplegeo/python-oauth2).
diff --git a/earwigbot/rules.py b/earwigbot/rules.py
index ce851cc..715be2e 100644
--- a/earwigbot/rules.py
+++ b/earwigbot/rules.py
@@ -51,36 +51,34 @@ def process(rc):
     chans = set()  # channels to report this message to
     page_name = rc.page.lower()
     comment = rc.comment.lower()
-    
+
     if "!earwigbot" in rc.msg.lower():
         chans.update(("##earwigbot", "#wikipedia-en-afc"))
-        
+
     if r_page.search(page_name):
-        #tasks.start("afc_copyvios", action="edit", page=rc.page)
+        tasks.start("afc_copyvios", page=rc.page)
         chans.add("#wikipedia-en-afc")
-        
+
     elif r_ffu.match(page_name):
         chans.add("#wikipedia-en-afc")
-        
+
     elif page_name.startswith("template:afc submission"):
         chans.add("#wikipedia-en-afc")
-    
+
     elif rc.flags == "move" and (r_move1.match(comment) or
                                  r_move2.match(comment)):
         p = r_moved_pages.findall(rc.comment)[0]
-        #tasks.start("afc_copyvios", action="move", page=p)
         chans.add("#wikipedia-en-afc")
-    
+
     elif rc.flags == "delete" and r_delete.match(comment):
         p = r_deleted_page.findall(rc.comment)[0]
-        #tasks.start("afc_copyvios", action="delete", page=p)
         chans.add("#wikipedia-en-afc")
-    
+
     elif rc.flags == "restore" and r_restore.match(comment):
         p = r_restored_page.findall(rc.comment)[0]
-        #tasks.start("afc_copyvios", action="restore", page=p)
+        tasks.start("afc_copyvios", page=p)
         chans.add("#wikipedia-en-afc")
-    
+
     elif rc.flags == "protect" and r_protect.match(comment):
         chans.add("#wikipedia-en-afc")
 
diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py
index 2868cfe..2d881df 100644
--- a/earwigbot/tasks/afc_copyvios.py
+++ b/earwigbot/tasks/afc_copyvios.py
@@ -20,6 +20,12 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+from os.path import expanduser
+from threading import Lock
+
+import oursql
+
+from earwigbot import wiki
 from earwigbot.classes import BaseTask
 from earwigbot.config import config
 
@@ -30,9 +36,75 @@ class Task(BaseTask):
     number = 1
 
     def __init__(self):
-        self.cfg = cfg = config.tasks.get(self.name, {})
         config.decrypt(config.tasks, self.name, "search", "credentials", "key")
         config.decrypt(config.tasks, self.name, "search", "credentials", "secret")
 
+        cfg = config.tasks.get(self.name, {})
+        self.template = cfg.get("template", "AfC suspected copyvio")
+        self.ignore_list = cfg.get("ignoreList", [])
+        default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}"
+        self.summary = self.make_summary(cfg.get("summary", default_summary))
+
+        # Search engine name and the credentials for its API:
+        search = cfg.get("search", {})
+        self.engine = search.get("engine")
+        self.credentials = search.get("credentials", {})
+
+        # Connection data for our SQL database (copied: cfg stays unmodified):
+        kwargs = dict(cfg.get("sql", {}))
+        kwargs["read_default_file"] = expanduser("~/.my.cnf")
+        self.conn_data = kwargs
+        self.db_access_lock = Lock()
+
     def run(self, **kwargs):
-        pass
+        """Entry point for the bot task.
+
+        Takes a page title in kwargs["page"] and checks it for copyvios, adding
+        {{self.template}} at the top if a copyvio has been detected. A page is
+        only checked once (processed pages are stored by page_id in an SQL
+        database). Does nothing if the task's shutoff is enabled.
+        """
+        if self.shutoff_enabled():
+            return
+        title = kwargs["page"]
+        page = wiki.get_site().get_page(title)
+        with self.db_access_lock:
+            self.conn = oursql.connect(**self.conn_data)  # NOTE(review): not closed here
+            self.process(page)
+
+    def process(self, page):
+        """Detect copyvios in 'page' and add a note if any are found."""
+        pageid = page.pageid()
+        if self.has_been_processed(pageid):
+            msg = "Skipping check on already processed page [[{0}]]"
+            self.logger.info(msg.format(page.title()))
+            return
+
+        self.logger.info("Checking [[{0}]]".format(page.title()))
+        content = page.get()
+        result = page.copyvio_check(self.engine, self.credentials)
+        if result:
+            # Doubled braces are literal in str.format(): yields {{tpl|url=...}}
+            template = "{{{{{0}|url={1}}}}}".format(self.template, result)
+            newtext = "\n".join((template, content))
+            page.edit(newtext, self.summary.format(url=result))
+            msg = "Found violation: [[{0}]] -> {1}"
+            self.logger.info(msg.format(page.title(), result))
+        else:
+            self.logger.debug("No violations detected")
+
+        self.log_processed(pageid)
+
+    def has_been_processed(self, pageid):
+        """Return whether 'pageid' is already in the 'processed' table."""
+        sql = "SELECT 1 FROM processed WHERE page_id = ?"
+        with self.conn.cursor() as cur:
+            cur.execute(sql, (pageid,))
+            rows = cur.fetchall()
+        # Any returned row means the page was logged by log_processed().
+        return bool(rows)
+
+    def log_processed(self, pageid):
+        """Record 'pageid' in the 'processed' table."""
+        with self.conn.cursor() as cur:
+            cur.execute("INSERT INTO processed VALUES (?)", (pageid,))
diff --git a/earwigbot/tasks/afc_statistics.py b/earwigbot/tasks/afc_statistics.py
index 9f5e319..3c8c885 100644
--- a/earwigbot/tasks/afc_statistics.py
+++ b/earwigbot/tasks/afc_statistics.py
@@ -185,7 +185,7 @@ class Task(BaseTask):
 
         This is used by the template as a hidden sortkey.
         """
-        return (dt - datetime(1970, 1, 1)).total_seconds()
+        return int((dt - datetime(1970, 1, 1)).total_seconds())
 
     def sync(self, **kwargs):
         """Synchronize our local statistics database with the site.
diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyright.py
new file mode 100644
index 0000000..c5c7a64
--- /dev/null
+++ b/earwigbot/wiki/copyright.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8  -*-
+#
+# Copyright (C) 2009, 2010, 2011 by Ben Kurtovic <ben.kurtovic@verizon.net>
+# 
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is 
+# furnished to do so, subject to the following conditions:
+# 
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+# 
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from json import loads
+from urllib import quote_plus, urlencode
+
+try:
+    import oauth2 as oauth
+except ImportError:
+    oauth = None
+
+from earwigbot.wiki.exceptions import *
+
+class CopyrightMixin(object):
+    """
+    EarwigBot's Wiki Toolset: Copyright Violation Mixin
+
+    This is a mixin that provides one public method, copyvio_check(), which
+    checks the page for copyright violations using a search engine API. The
+    API keys must be provided to the method as arguments.
+    """
+    def _yahoo_boss_query(self, query, cred):
+        """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials.
+
+        Returns a list of URLs, no more than fifty, ranked by relevance (as
+        determined by Yahoo). Raises SearchQueryError() on errors.
+        """
+        base_url = "http://yboss.yahooapis.com/ysearch/web"
+        params = {"q": quote_plus(query), "style": "raw", "format": "json"}
+        url = "{0}?{1}".format(base_url, urlencode(params))
+
+        consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"])
+        client = oauth.Client(consumer)
+        headers, body = client.request(url, "GET")
+
+        if headers["status"] != "200":
+            e = "Yahoo! BOSS Error: got response code '{0}':\n{1}"
+            raise SearchQueryError(e.format(headers["status"], body))
+
+        try:
+            res = loads(body)
+        except ValueError:
+            e = "Yahoo! BOSS Error: JSON could not be decoded"
+            raise SearchQueryError(e)
+
+        try:
+            results = res["bossresponse"]["web"]["results"]
+        except KeyError:  # response carries no result list: report no hits
+            return []
+        return [result["url"] for result in results]
+
+    def copyvio_check(self, engine, credentials, force=False):
+        """Check the page for copyvios; returns URLs found by 'engine'."""
+        if engine == "Yahoo! BOSS":
+            if oauth is None:
+                e = "The package 'oauth2' could not be imported"
+                raise UnsupportedSearchEngineError(e)
+            querier = self._yahoo_boss_query
+        else:
+            raise UnknownSearchEngineError(engine)
+        content = self.get(force)
+        return querier(content, credentials)
diff --git a/earwigbot/wiki/exceptions.py b/earwigbot/wiki/exceptions.py
index c23a743..e88158f 100644
--- a/earwigbot/wiki/exceptions.py
+++ b/earwigbot/wiki/exceptions.py
@@ -23,7 +23,29 @@
 """
 EarwigBot's Wiki Toolset: Exceptions
 
-This module contains all exceptions used by the wiki package. There are a lot.
+This module contains all exceptions used by the wiki package. There are a lot:
+
+-- WikiToolsetError
+        -- SiteNotFoundError
+        -- SiteAPIError
+        -- LoginError
+        -- NamespaceNotFoundError
+        -- PageNotFoundError
+        -- InvalidPageError
+        -- RedirectError
+        -- UserNotFoundError
+        -- EditError
+                -- PermissionsError
+                -- EditConflictError
+                -- NoContentError
+                -- ContentTooBigError
+                -- SpamDetectedError
+                -- FilteredError
+        -- SQLError
+        -- CopyvioCheckError
+                -- UnknownSearchEngineError
+                -- UnsupportedSearchEngineError
+                -- SearchQueryError
 """
 
 class WikiToolsetError(Exception):
@@ -87,3 +109,16 @@ class FilteredError(EditError):
 
 class SQLError(WikiToolsetError):
     """Some error involving SQL querying occurred."""
+
+class CopyvioCheckError(WikiToolsetError):
+    """An error occurred when checking a page for copyright violations."""
+
+class UnknownSearchEngineError(CopyvioCheckError):
+    """CopyrightMixin.copyvio_check() was called with an unknown engine."""
+
+class UnsupportedSearchEngineError(CopyvioCheckError):
+    """The requested engine is not available, e.g., because a required
+    package is missing."""
+
+class SearchQueryError(CopyvioCheckError):
+    """Some error occurred while doing a search query."""
diff --git a/earwigbot/wiki/functions.py b/earwigbot/wiki/functions.py
index 5648976..16f6f3c 100644
--- a/earwigbot/wiki/functions.py
+++ b/earwigbot/wiki/functions.py
@@ -54,7 +54,8 @@ def _load_config():
     is_encrypted = config.load()
     if is_encrypted:  # Passwords in the config file are encrypted
         key = getpass("Enter key to unencrypt bot passwords: ")
-        config.decrypt(key)
+        config._decryption_key = key
+        config.decrypt(config.wiki, "password")
 
 def _get_cookiejar():
     """Returns a LWPCookieJar object loaded from our .cookies file. The same
diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py
index 5fca120..5b359e1 100644
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -25,9 +25,10 @@ import re
 from time import gmtime, strftime
 from urllib import quote
 
+from earwigbot.wiki.copyright import CopyrightMixin
 from earwigbot.wiki.exceptions import *
 
-class Page(object):
+class Page(CopyrightMixin):
     """
     EarwigBot's Wiki Toolset: Page Class
 
@@ -49,7 +50,8 @@ class Page(object):
     get                 -- returns page content
     get_redirect_target -- if the page is a redirect, returns its destination
     edit                -- replaces the page's content or creates a new page
-    add_section         -- add a new section at the bottom of the page
+    add_section         -- adds a new section at the bottom of the page
+    copyvio_check       -- checks the page for copyright violations
     """
 
     re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"