From a074da853bd8956803b9f0061e12d4ca1d32cff0 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@verizon.net>
Date: Sun, 8 Jul 2012 14:44:15 -0400
Subject: [PATCH] More work on copyvios, including an exclusions database (#5)

* Added exclusions module with a fully implemented ExclusionsDB that can pull
  from multiple sources for different sites.
* Moved CopyvioCheckResult to its own module, to be imported by __init__.
* Some other related changes.
---
 docs/api/earwigbot.wiki.copyvios.rst  |  14 +++
 docs/toolset.rst                      |   6 +-
 earwigbot/wiki/copyvios/__init__.py   |  56 +++---------
 earwigbot/wiki/copyvios/exclusions.py | 155 ++++++++++++++++++++++++++++++++++
 earwigbot/wiki/copyvios/result.py     |  60 +++++++++++++
 earwigbot/wiki/sitesdb.py             |   8 ++
 6 files changed, 252 insertions(+), 47 deletions(-)
 create mode 100644 earwigbot/wiki/copyvios/exclusions.py
 create mode 100644 earwigbot/wiki/copyvios/result.py

diff --git a/docs/api/earwigbot.wiki.copyvios.rst b/docs/api/earwigbot.wiki.copyvios.rst
index 7dbcf39..abddf7a 100644
--- a/docs/api/earwigbot.wiki.copyvios.rst
+++ b/docs/api/earwigbot.wiki.copyvios.rst
@@ -8,6 +8,13 @@ copyvios Package
     :members:
     :undoc-members:
 
+:mod:`exclusions` Module
+------------------------
+
+.. automodule:: earwigbot.wiki.copyvios.exclusions
+    :members:
+    :undoc-members:
+
 :mod:`markov` Module
 --------------------
 
@@ -24,6 +31,13 @@ copyvios Package
     :undoc-members:
     :show-inheritance:
 
+:mod:`result` Module
+--------------------
+
+.. automodule:: earwigbot.wiki.copyvios.result
+    :members:
+    :undoc-members:
+
 :mod:`search` Module
 --------------------
 
diff --git a/docs/toolset.rst b/docs/toolset.rst
index fcdfc6d..e2258c8 100644
--- a/docs/toolset.rst
+++ b/docs/toolset.rst
@@ -48,9 +48,9 @@ wikis, you can usually use code like this::
 
 This works because EarwigBot assumes that the URL for the site is
 ``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL
-connection info (if any) are stored as ``config.wiki["sql"]``. This might
-change if you're dealing with non-WMF wikis, where the code might look
-something more like::
+connection info (if any) is stored as ``config.wiki["sql"]``. This might change
+if you're dealing with non-WMF wikis, where the code might look something more
+like::
 
     project, lang = "mywiki", "it"
     try:
diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index cf2ddde..0f29403 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -33,47 +33,10 @@ except ImportError:
 from earwigbot import exceptions
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser
+from earwigbot.wiki.copyvios.result import CopyvioCheckResult
 from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
 
-__all__ = ["CopyvioCheckResult", "CopyvioMixIn"]
-
-class CopyvioCheckResult(object):
-    """
-    **EarwigBot: Wiki Toolset: Copyvio Check Result**
-
-    A class holding information about the results of a copyvio check.
-
-    *Attributes:*
-
-    - :py:attr:`violation`:     ``True`` if this is a violation, else ``False``
-    - :py:attr:`confidence`:    a float between 0 and 1 indicating accuracy
-    - :py:attr:`url`:           the URL of the violated page
-    - :py:attr:`queries`:       the number of queries used to reach a result
-    - :py:attr:`article_chain`: the MarkovChain of the article text
-    - :py:attr:`source_chain`:  the MarkovChain of the violated page text
-    - :py:attr:`delta_chain`:   the MarkovChainIntersection comparing the two
-    """
-
-    def __init__(self, violation, confidence, url, queries, article, chains):
-        self.violation = violation
-        self.confidence = confidence
-        self.url = url
-        self.queries = queries
-        self.article_chain = article
-        self.source_chain = chains[0]
-        self.delta_chain = chains[1]
-
-    def __repr__(self):
-        """Return the canonical string representation of the result."""
-        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
-        return res.format(self.violation, self.confidence, self.url,
-                          self.queries)
-
-    def __str__(self):
-        """Return a nice string representation of the result."""
-        res = "<CopyvioCheckResult ({0} with {1} conf)>"
-        return res.format(self.violation, self.confidence)
-
+__all__ = ["CopyvioMixIn"]
 
 class CopyvioMixIn(object):
     """
@@ -88,6 +51,7 @@ class CopyvioMixIn(object):
 
     def __init__(self, site):
         self._search_config = site._search_config
+        self._exclusions_db = self._search_config.get("exclusions_db")
         self._opener = build_opener()
         self._opener.addheaders = site._opener.addheaders
 
@@ -156,8 +120,9 @@ class CopyvioMixIn(object):
                       interquery_sleep=1):
         """Check the page for copyright violations.
 
-        Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult`
-        object with information on the results of the check.
+        Returns a
+        :py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object
+        with information on the results of the check.
 
         *max_queries* is self-explanatory; we will never make more than this
         number of queries in a given check. If it's lower than 0, we will not
@@ -171,6 +136,7 @@ class CopyvioMixIn(object):
         :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
         """
         searcher = self._select_search_engine()
+        self._exclusions_db.sync(self.site.name)
         handled_urls = []
         best_confidence = 0
         best_match = None
@@ -193,6 +159,8 @@ class CopyvioMixIn(object):
             urls = [url for url in urls if url not in handled_urls]
             for url in urls:
                 handled_urls.append(url)
+                if self._exclusions_db.check(self.site.name, url):
+                    continue
                 conf, chains = self._copyvio_compare_content(article_chain, url)
                 if conf > best_confidence:
                     best_confidence = conf
@@ -216,9 +184,9 @@ class CopyvioMixIn(object):
 
         This is essentially a reduced version of the above - a copyivo
         comparison is made using Markov chains and the result is returned in a
-        :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but
-        without using a search engine, since the suspected "violated" URL is
-        supplied from the start.
+        :py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object -
+        but without using a search engine, since the suspected "violated" URL
+        is supplied from the start.
 
         Its primary use is to generate a result when the URL is retrieved from
         a cache, like the one used in EarwigBot's Toolserver site. After a
diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py
new file mode 100644
index 0000000..fdbaa39
--- /dev/null
+++ b/earwigbot/wiki/copyvios/exclusions.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8  -*-
+#
+# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import re
+import sqlite3 as sqlite
+from threading import Lock
+from time import time
+
+from earwigbot import exceptions
+
+__all__ = ["ExclusionsDB"]
+
+default_sources = {
+    "enwiki": [
+        "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def",
+        "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl",
+        "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr",
+        "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz"
+    ]
+}
+
+class ExclusionsDB(object):
+    """
+    **EarwigBot: Wiki Toolset: Exclusions Database Manager**
+
+    Controls the :file:`exclusions.db` file, which stores URLs excluded from
+    copyright violation checks on account of being known mirrors, for example.
+    """
+
+    def __init__(self, sitesdb, dbfile, logger):
+        self._sitesdb = sitesdb
+        self._dbfile = dbfile
+        self._logger = logger
+        self._db_access_lock = Lock()
+
+    def _create(self):
+        """Create the database tables and seed them with default sources."""
+        script = """
+            CREATE TABLE sources (source_sitename, source_page);
+            CREATE TABLE updates (update_sitename, update_time);
+            CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
+        """
+        query = "INSERT INTO sources VALUES (?, ?);"
+        sources = []
+        for sitename, pages in default_sources.iteritems():
+            sources.extend((sitename, page) for page in pages)
+
+        with sqlite.connect(self._dbfile) as conn:
+            conn.executescript(script)
+            conn.executemany(query, sources)
+
+    def _load_source(self, site, source):
+        """Load the *source* page from *site* and return a set of its URLs."""
+        urls = set()
+        try:
+            data = site.get_page(source).get()
+        except exceptions.PageNotFoundError:
+            return urls
+
+        regexes = [
+            r"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
+            r"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
+        ]
+        for regex in regexes:
+            urls.update(url.lower() for url in re.findall(regex, data, re.I))
+        return urls
+
+    def _update(self, sitename):
+        """Update the database from listed sources in the index."""
+        query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;"
+        query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
+        query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?"
+        query4 = "INSERT INTO exclusions VALUES (?, ?);"
+        query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;"
+        query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;"
+        query7 = "INSERT INTO updates VALUES (?, ?);"
+
+        site = self._sitesdb.get_site(sitename)
+        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
+            urls = set()
+            for (source,) in conn.execute(query1, (sitename,)):
+                urls |= self._load_source(site, source)
+            for (url,) in conn.execute(query2, (sitename,)):
+                if url in urls:
+                    urls.remove(url)
+                else:
+                    conn.execute(query3, (sitename, url))
+            conn.executemany(query4, [(sitename, url) for url in urls])
+            if conn.execute(query5, (sitename,)).fetchone():
+                conn.execute(query6, (time(), sitename))
+            else:
+                conn.execute(query7, (sitename, time()))
+
+    def _get_last_update(self, sitename):
+        """Return the UNIX time of the site's last update, or 0 if never."""
+        query = "SELECT update_time FROM updates WHERE update_sitename = ?;"
+        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
+            try:
+                result = conn.execute(query, (sitename,)).fetchone()
+            except sqlite.OperationalError:
+                self._create()
+                return 0
+            return result[0] if result else 0
+
+    def sync(self, sitename):
+        """Update the database if it hasn't been updated in the past month.
+
+        This only updates the exclusions database for the *sitename* site.
+        """
+        max_staleness = 60 * 60 * 24 * 30
+        time_since_update = int(time() - self._get_last_update(sitename))
+        if time_since_update > max_staleness:
+            log = "Updating stale database: {0} (last updated {1} seconds ago)"
+            self._logger.info(log.format(sitename, time_since_update))
+            self._update(sitename)
+        else:
+            log = "Database for {0} is still fresh (last updated {1} seconds ago)"
+            self._logger.debug(log.format(sitename, time_since_update))
+
+    def check(self, sitename, url):
+        """Check whether a given URL is in the exclusions database.
+
+        Return ``True`` if the URL is in the database, or ``False`` otherwise.
+        """
+        normalized = re.sub(r"^https?://", "", url.lower())
+        query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
+        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
+            for row in conn.execute(query, (sitename,)):
+                if row[0] and normalized.startswith(row[0]):
+                    log = "Exclusion detected in {0} for {1}"
+                    self._logger.debug(log.format(sitename, url))
+                    return True
+
+        log = "No exclusions in {0} for {1}".format(sitename, url)
+        self._logger.debug(log)
+        return False
diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py
new file mode 100644
index 0000000..0c3e98f
--- /dev/null
+++ b/earwigbot/wiki/copyvios/result.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8  -*-
+#
+# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+__all__ = ["CopyvioCheckResult"]
+
+class CopyvioCheckResult(object):
+    """
+    **EarwigBot: Wiki Toolset: Copyvio Check Result**
+
+    A class holding information about the results of a copyvio check.
+
+    *Attributes:*
+
+    - :py:attr:`violation`:     ``True`` if this is a violation, else ``False``
+    - :py:attr:`confidence`:    a float between 0 and 1 indicating accuracy
+    - :py:attr:`url`:           the URL of the violated page
+    - :py:attr:`queries`:       the number of queries used to reach a result
+    - :py:attr:`article_chain`: the MarkovChain of the article text
+    - :py:attr:`source_chain`:  the MarkovChain of the violated page text
+    - :py:attr:`delta_chain`:   the MarkovChainIntersection comparing the two
+    """
+
+    def __init__(self, violation, confidence, url, queries, article, chains):
+        self.violation = violation
+        self.confidence = confidence
+        self.url = url
+        self.queries = queries
+        self.article_chain = article
+        self.source_chain = chains[0]
+        self.delta_chain = chains[1]
+
+    def __repr__(self):
+        """Return the canonical string representation of the result."""
+        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
+        return res.format(self.violation, self.confidence, self.url,
+                          self.queries)
+
+    def __str__(self):
+        """Return a nice string representation of the result."""
+        res = "<CopyvioCheckResult ({0} with {1} conf)>"
+        return res.format(self.violation, self.confidence)
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index cdff1fe..9d2c828 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -29,6 +29,7 @@ import sqlite3 as sqlite
 
 from earwigbot import __version__
 from earwigbot.exceptions import SiteNotFoundError
+from earwigbot.wiki.copyvios.exclusions import ExclusionsDB
 from earwigbot.wiki.site import Site
 
 __all__ = ["SitesDB"]
@@ -58,11 +59,16 @@ class SitesDB(object):
         """Set up the manager with an attribute for the base Bot object."""
         self.config = bot.config
         self._logger = bot.logger.getChild("wiki")
+
         self._sites = {}  # Internal site cache
         self._sitesdb = path.join(bot.config.root_dir, "sites.db")
         self._cookie_file = path.join(bot.config.root_dir, ".cookies")
         self._cookiejar = None
 
+        excl_db = path.join(bot.config.root_dir, "exclusions.db")
+        excl_logger = self._logger.getChild("exclusionsdb")
+        self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger)
+
     def __repr__(self):
         """Return the canonical string representation of the SitesDB."""
         res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})"
@@ -195,6 +201,7 @@ class SitesDB(object):
         if search_config:
             nltk_dir = path.join(self.config.root_dir, ".nltk")
             search_config["nltk_dir"] = nltk_dir
+            search_config["exclusions_db"] = self._exclusions_db
 
         if not sql:
             sql = config.wiki.get("sql", {})
@@ -379,6 +386,7 @@ class SitesDB(object):
         if search_config:
             nltk_dir = path.join(self.config.root_dir, ".nltk")
             search_config["nltk_dir"] = nltk_dir
+            search_config["exclusions_db"] = self._exclusions_db
 
         if not sql:
             sql = config.wiki.get("sql", {})