From e6a381f3f7eb4ac37171c44b5ee930bdf4dda354 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 20 Mar 2012 12:25:45 -0400 Subject: [PATCH 01/19] Restructuring copyvio stuff as its own package. --- .../wiki/{copyright.py => copyvios/__init__.py} | 83 ++++++---------------- earwigbot/wiki/copyvios/markov.py | 63 ++++++++++++++++ earwigbot/wiki/page.py | 18 ++--- 3 files changed, 93 insertions(+), 71 deletions(-) rename earwigbot/wiki/{copyright.py => copyvios/__init__.py} (82%) create mode 100644 earwigbot/wiki/copyvios/markov.py diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyvios/__init__.py similarity index 82% rename from earwigbot/wiki/copyright.py rename to earwigbot/wiki/copyvios/__init__.py index c003ebb..68b4134 100644 --- a/earwigbot/wiki/copyright.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -1,17 +1,17 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2009-2012 by Ben Kurtovic -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is +# copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -20,11 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-from collections import defaultdict from functools import partial from gzip import GzipFile from json import loads -from re import sub, UNICODE from StringIO import StringIO from time import sleep, time from urllib import quote_plus, urlencode @@ -36,8 +34,9 @@ except ImportError: oauth = None from earwigbot.wiki.exceptions import * +from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection -class _CopyvioCheckResult(object): +class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): self.violation = violation self.confidence = confidence @@ -48,51 +47,11 @@ class _CopyvioCheckResult(object): self.delta_chain = chains[1] def __repr__(self): - r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" + r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" return r.format(self.violation, self.confidence, self.url, self.queries) -class _MarkovChain(object): - START = -1 - END = -2 - - def __init__(self, text): - self.text = text - self.chain = defaultdict(lambda: defaultdict(lambda: 0)) - words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() - prev = self.START - for word in words: - self.chain[prev][word] += 1 - prev = word - try: # This won't work if the source text is completely blank - self.chain[word][self.END] += 1 - except KeyError: - pass - - def size(self): - count = 0 - for node in self.chain.itervalues(): - for hits in node.itervalues(): - count += hits - return count - - -class _MarkovChainIntersection(_MarkovChain): - def __init__(self, mc1, mc2): - self.chain = defaultdict(lambda: defaultdict(lambda: 0)) - c1 = mc1.chain - c2 = mc2.chain - - for word, nodes1 in c1.iteritems(): - if word in c2: - nodes2 = c2[word] - for node, count1 in nodes1.iteritems(): - if node in nodes2: - count2 = nodes2[node] - self.chain[word][node] = min(count1, count2) - - -class CopyrightMixin(object): +class CopyvioMixin(object): """ EarwigBot's Wiki Toolset: Copyright Violation Mixin @@ -220,8 +179,8 @@ class CopyrightMixin(object): if not html: return 0 - source = _MarkovChain(self._copyvio_strip_html(html)) - delta = _MarkovChainIntersection(article, source) + source = MarkovChain(self._copyvio_strip_html(html)) + delta = MarkovChainIntersection(article, source) return float(delta.size()) / article.size(), (source, delta) def copyvio_check(self, min_confidence=0.5, max_queries=-1, @@ -255,17 +214,17 @@ class CopyrightMixin(object): best_confidence = 0 best_match = None num_queries = 0 - empty = _MarkovChain("") - best_chains = (empty, _MarkovChainIntersection(empty, empty)) + empty = MarkovChain("") + best_chains = (empty, MarkovChainIntersection(empty, empty)) content = self.get(force) clean = self._copyvio_strip_article(content) chunks = self._copyvio_chunk_article(clean, max_queries) - article_chain = _MarkovChain(clean) + article_chain = MarkovChain(clean) last_query = time() if article_chain.size() < 20: # Auto-fail very small articles - return _CopyvioCheckResult(False, best_confidence, best_match, - num_queries, article_chain, best_chains) + return CopyvioCheckResult(False, best_confidence, best_match, + num_queries, article_chain, best_chains) while (chunks and best_confidence < min_confidence and (max_queries < 0 or num_queries < max_queries)): @@ -288,8 +247,8 @@ class CopyrightMixin(object): v = True else: v = False - return _CopyvioCheckResult(v, best_confidence, best_match, num_queries, - article_chain, best_chains) + return 
CopyvioCheckResult(v, best_confidence, best_match, num_queries, + article_chain, best_chains) def copyvio_compare(self, url, min_confidence=0.5, force=False): """Check the page like copyvio_check(), but against a specific URL. @@ -298,7 +257,7 @@ class CopyrightMixin(object): comparison is made using Markov chains and the result is returned in a _CopyvioCheckResult object - without using a search engine, as the suspected "violated" URL is supplied from the start. - + Its primary use is to generate a result when the URL is retrieved from a cache, like the one used in EarwigBot's Toolserver site. After a search is done, the resulting URL is stored in a cache for 24 hours so @@ -313,12 +272,12 @@ class CopyrightMixin(object): """ content = self.get(force) clean = self._copyvio_strip_article(content) - article_chain = _MarkovChain(clean) + article_chain = MarkovChain(clean) confidence, chains = self._copyvio_compare_content(article_chain, url) if confidence >= min_confidence: is_violation = True else: is_violation = False - return _CopyvioCheckResult(is_violation, confidence, url, 0, - article_chain, chains) + return CopyvioCheckResult(is_violation, confidence, url, 0, + article_chain, chains) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py new file mode 100644 index 0000000..4e77ebc --- /dev/null +++ b/earwigbot/wiki/copyvios/markov.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 by Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from collections import defaultdict +from re import sub, UNICODE + +class MarkovChain(object): + START = -1 + END = -2 + + def __init__(self, text): + self.text = text + self.chain = defaultdict(lambda: defaultdict(lambda: 0)) + words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() + prev = self.START + for word in words: + self.chain[prev][word] += 1 + prev = word + try: # This won't work if the source text is completely blank + self.chain[word][self.END] += 1 + except KeyError: + pass + + def size(self): + count = 0 + for node in self.chain.itervalues(): + for hits in node.itervalues(): + count += hits + return count + + +class MarkovChainIntersection(MarkovChain): + def __init__(self, mc1, mc2): + self.chain = defaultdict(lambda: defaultdict(lambda: 0)) + c1 = mc1.chain + c2 = mc2.chain + + for word, nodes1 in c1.iteritems(): + if word in c2: + nodes2 = c2[word] + for node, count1 in nodes1.iteritems(): + if node in nodes2: + count2 = nodes2[node] + self.chain[word][node] = min(count1, count2) diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py index 8407108..0d266b7 100644 --- a/earwigbot/wiki/page.py +++ b/earwigbot/wiki/page.py @@ -1,17 +1,17 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2009-2012 by Ben Kurtovic -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is +# copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -25,10 +25,10 @@ import re from time import gmtime, strftime from urllib import quote -from earwigbot.wiki.copyright import CopyrightMixin +from earwigbot.wiki.copyvios import CopyvioMixin from earwigbot.wiki.exceptions import * -class Page(CopyrightMixin): +class Page(CopyvioMixin): """ EarwigBot's Wiki Toolset: Page Class @@ -264,7 +264,7 @@ class Page(CopyrightMixin): If `params` is given, we'll use it as our API query parameters. Otherwise, we'll build params using the given kwargs via _build_edit_params(). - + We'll then try to do the API query, and catch any errors the API raises in _handle_edit_errors(). We'll then throw these back as subclasses of EditError. @@ -275,7 +275,7 @@ class Page(CopyrightMixin): if not self._token: e = "You don't have permission to edit this page." 
raise PermissionsError(e) - + # Weed out invalid pages before we get too far: self._force_validity() @@ -336,7 +336,7 @@ class Page(CopyrightMixin): # Page does not exist; don't edit if it already exists: params["createonly"] = "true" else: - params["recreate"] = "true" + params["recreate"] = "true" return params From d4e947b98bffc3ef156a0308f7856530d61cb987 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Mar 2012 00:51:32 -0400 Subject: [PATCH 02/19] earwigbot.wiki.copyvios.search module split --- earwigbot/wiki/copyvios/__init__.py | 45 +++------------------- earwigbot/wiki/copyvios/search.py | 75 +++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 40 deletions(-) create mode 100644 earwigbot/wiki/copyvios/search.py diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 68b4134..0aaa9b5 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -20,12 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from functools import partial from gzip import GzipFile -from json import loads from StringIO import StringIO from time import sleep, time -from urllib import quote_plus, urlencode from urllib2 import build_opener, URLError try: @@ -35,6 +32,7 @@ except ImportError: from earwigbot.wiki.exceptions import * from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection +from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): @@ -107,42 +105,9 @@ class CopyvioMixin(object): if not oauth: e = "The package 'oauth2' could not be imported" raise UnsupportedSearchEngineError(e) - searcher = self._yahoo_boss_query - else: - raise UnknownSearchEngineError(engine) - - return partial(searcher, credentials) - - def _yahoo_boss_query(self, cred, query): - """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials. - - Returns a list of URLs, no more than fifty, ranked by relevance (as - determined by Yahoo). Raises SearchQueryError() on errors. - """ - base_url = "http://yboss.yahooapis.com/ysearch/web" - query = quote_plus(query.join('"', '"')) - params = {"q": query, "style": "raw", "format": "json"} - url = "{0}?{1}".format(base_url, urlencode(params)) + return YahooBOSSSearchEngine(credentials) - consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"]) - client = oauth.Client(consumer) - headers, body = client.request(url, "GET") - - if headers["status"] != "200": - e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" - raise SearchQueryError(e.format(headers["status"], body)) - - try: - res = loads(body) - except ValueError: - e = "Yahoo! BOSS Error: JSON could not be decoded" - raise SearchQueryError(e) - - try: - results = res["bossresponse"]["web"]["results"] - except KeyError: - return [] - return [result["url"] for result in results] + raise UnknownSearchEngineError(engine) def _copyvio_strip_html(self, html): """ @@ -209,7 +174,7 @@ class CopyvioMixin(object): Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, SearchQueryError, ...) on errors. 
""" - search = self._select_search_engine() + searcher = self._select_search_engine() handled_urls = [] best_confidence = 0 best_match = None @@ -228,7 +193,7 @@ class CopyvioMixin(object): while (chunks and best_confidence < min_confidence and (max_queries < 0 or num_queries < max_queries)): - urls = search(chunks.pop(0)) + urls = searcher.search(chunks.pop(0)) urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py new file mode 100644 index 0000000..59287cc --- /dev/null +++ b/earwigbot/wiki/copyvios/search.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 by Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from json import loads +from urllib import quote_plus, urlencode + +try: + import oauth2 as oauth +except ImportError: + oauth = None + +from earwigbot.wiki.exceptions import SearchQueryError + +class BaseSearchEngine(object): + def __init__(self, cred): + """Store credentials 'cred' for searching later on.""" + self.cred = cred + + def search(self, query): + """Use this engine to search for 'query'. + + Not implemented in this base class; overridden in subclasses.""" + raise NotImplementedError() + + +class YahooBOSSSearchEngine(BaseSearchEngine): + def search(self, query): + """Do a Yahoo! BOSS web search for 'query'. + + Returns a list of URLs, no more than fifty, ranked by relevance (as + determined by Yahoo). Raises SearchQueryError() on errors. + """ + base_url = "http://yboss.yahooapis.com/ysearch/web" + query = quote_plus(query.join('"', '"')) + params = {"q": query, "style": "raw", "format": "json"} + url = "{0}?{1}".format(base_url, urlencode(params)) + + consumer = oauth.Consumer(key=self.cred["key"], secret=self.cred["secret"]) + client = oauth.Client(consumer) + headers, body = client.request(url, "GET") + + if headers["status"] != "200": + e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" + raise SearchQueryError(e.format(headers["status"], body)) + + try: + res = loads(body) + except ValueError: + e = "Yahoo! BOSS Error: JSON could not be decoded" + raise SearchQueryError(e) + + try: + results = res["bossresponse"]["web"]["results"] + except KeyError: + return [] + return [result["url"] for result in results] From 86a84407304aef0850d98ef8e5b5e10a35d41b13 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 28 Mar 2012 15:23:09 -0400 Subject: [PATCH 03/19] Moving parsers to own file. 
--- earwigbot/wiki/constants.py | 9 +++-- earwigbot/wiki/copyvios/__init__.py | 38 +++--------------- earwigbot/wiki/copyvios/parsers.py | 80 +++++++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 36 deletions(-) create mode 100644 earwigbot/wiki/copyvios/parsers.py diff --git a/earwigbot/wiki/constants.py b/earwigbot/wiki/constants.py index 22aef9c..2431884 100644 --- a/earwigbot/wiki/constants.py +++ b/earwigbot/wiki/constants.py @@ -1,17 +1,17 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2009-2012 by Ben Kurtovic -# +# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is +# copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: -# +# # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. -# +# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -34,6 +34,7 @@ Import with `from earwigbot.wiki import constants` or `from earwigbot.wiki.const from earwigbot import __version__ as _v from platform import python_version as _p USER_AGENT = "EarwigBot/{0} (Python/{1}; https://github.com/earwig/earwigbot)".format(_v, _p()) +del _v, _p # Default namespace IDs: NS_MAIN = 0 diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 0aaa9b5..46b27e2 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -30,9 +30,10 @@ try: except ImportError: oauth = None -from earwigbot.wiki.exceptions import * from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection +from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine +from earwigbot.wiki.exceptions import * class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): @@ -109,33 +110,6 @@ class CopyvioMixin(object): raise UnknownSearchEngineError(engine) - def _copyvio_strip_html(self, html): - """ - STUB - """ - return html - - def _copyvio_strip_article(self, content): - """Clean the page's raw text by removing templates and formatting. - - Returns the page's text with all HTML and wikicode formatting removed, - including templates, tables, references, and the Bibliography/ - References/Sources/See also section(s). It retains punctuation - (spacing, paragraphs, periods, commas, (semi)-colons, parentheses, - quotes) and original capitalization, but not brackets (square and - angular), abnormal spacing, nor anything else. HTML entities are - replaced by their unicode equivalents. 
- - STUB - """ - return content - - def _copyvio_chunk_article(self, content, max_chunks): - """ - STUB - """ - return [content] - def _copyvio_compare_content(self, article, url): """ DOCSTRING NEEDED @@ -144,7 +118,7 @@ class CopyvioMixin(object): if not html: return 0 - source = MarkovChain(self._copyvio_strip_html(html)) + source = MarkovChain(HTMLTextParser(html).strip()) delta = MarkovChainIntersection(article, source) return float(delta.size()) / article.size(), (source, delta) @@ -182,8 +156,8 @@ class CopyvioMixin(object): empty = MarkovChain("") best_chains = (empty, MarkovChainIntersection(empty, empty)) content = self.get(force) - clean = self._copyvio_strip_article(content) - chunks = self._copyvio_chunk_article(clean, max_queries) + clean = ArticleTextParser(content).strip() + chunks = ArticleTextParser(clean).chunk(max_queries) article_chain = MarkovChain(clean) last_query = time() @@ -236,7 +210,7 @@ class CopyvioMixin(object): SearchQueryError will be raised. """ content = self.get(force) - clean = self._copyvio_strip_article(content) + clean = ArticleTextParser(content).strip() article_chain = MarkovChain(clean) confidence, chains = self._copyvio_compare_content(article_chain, url) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py new file mode 100644 index 0000000..f9bb4c2 --- /dev/null +++ b/earwigbot/wiki/copyvios/parsers.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 by Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +class BaseTextParser(object): + def __init__(self, text): + self.text = text + + +class ArticleTextParser(BaseTextParser): + def strip(self): + """Clean the page's raw text by removing templates and formatting. + + Returns the page's text with all HTML and wikicode formatting removed, + including templates, tables, references, and the Bibliography/ + References/Sources/See also section(s). It retains punctuation + (spacing, paragraphs, periods, commas, (semi)-colons, parentheses, + quotes) and original capitalization, but not brackets (square and + angular), abnormal spacing, nor anything else. HTML entities are + replaced by their unicode equivalents. + + The actual replacement is handled by a few private methods within this + class. 
+ """ + text = self._strip_tags(self.text) + text = self._strip_templates(text) + text = self._strip_sections(text) + text = self._strip_wikicode(text) + text = self._normalize(text) + return text + + def chunk(self, max_chunks): + """Convert the article text into a list of web-searchable chunks. + + No greater than max_chunks will be returned. Each chunk will only be a + couple sentences long at most. The idea here is to return a + representative sample of the article text rather than the entire + article, so we'll probably pick and choose from its introduction, body, + and conclusion, especially if the article is large and max_chunks are + few, so we don't end up just searching for the first paragraph. + """ + return [self.text] + + def _strip_tags(self, text): + return text + + def _strip_templates(self, text): + return text + + def _strip_sections(self, text): + return text + + def _strip_wikicode(self, text): + return text + + def _normalize(self, text): + return text + + +class HTMLTextParser(BaseTextParser): + def strip(self): + return self.text From 5ca1d91f3e398545c476bd317826b04941e6f98d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Apr 2012 12:33:13 -0400 Subject: [PATCH 04/19] Use __all__ within e.w.copyvios and shorter imports --- earwigbot/wiki/copyvios/__init__.py | 8 +++++--- earwigbot/wiki/copyvios/markov.py | 2 ++ earwigbot/wiki/copyvios/parsers.py | 2 ++ earwigbot/wiki/copyvios/search.py | 2 ++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 46b27e2..a206a70 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -30,11 +30,13 @@ try: except ImportError: oauth = None -from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection -from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser -from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine +from earwigbot.wiki.copyvios.markov import * +from earwigbot.wiki.copyvios.parsers import * +from earwigbot.wiki.copyvios.search import * from earwigbot.wiki.exceptions import * +__all__ = ["CopyvioCheckResult", "CopyvioMixin"] + class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): self.violation = violation diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 4e77ebc..74783d0 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -23,6 +23,8 @@ from collections import defaultdict from re import sub, UNICODE +__all__ = ["MarkovChain", "MarkovChainIntersection"] + class MarkovChain(object): START = -1 END = -2 diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index f9bb4c2..9e97267 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,6 +20,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] + class BaseTextParser(object): def __init__(self, text): self.text = text diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 59287cc..bc9dfe4 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -30,6 +30,8 @@ except ImportError: from earwigbot.wiki.exceptions import SearchQueryError +__all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] + class BaseSearchEngine(object): def __init__(self, cred): """Store credentials 'cred' for searching later on.""" From 7dbbe9683cbe6799528e8a7d8f6c1104f9813e67 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 6 Jul 2012 21:22:22 -0400 Subject: [PATCH 05/19] Update imports and exceptions. --- earwigbot/wiki/copyvios/__init__.py | 16 +++++++++------- earwigbot/wiki/copyvios/search.py | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index f657f5b..30d4681 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -31,9 +31,9 @@ except ImportError: oauth = None from earwigbot import exceptions -from earwigbot.wiki.copyvios.markov import * -from earwigbot.wiki.copyvios.parsers import * -from earwigbot.wiki.copyvios.search import * +from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection +from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser +from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine __all__ = ["CopyvioCheckResult", "CopyvioMixIn"] @@ -107,14 +107,16 @@ class CopyvioMixIn(object): if engine == "Yahoo! BOSS": if not oauth: e = "The package 'oauth2' could not be imported" - raise UnsupportedSearchEngineError(e) + raise exceptions.UnsupportedSearchEngineError(e) return YahooBOSSSearchEngine(credentials) - raise UnknownSearchEngineError(engine) + raise exceptions.UnknownSearchEngineError(engine) def _copyvio_compare_content(self, article, url): - """ - DOCSTRING NEEDED + """Return a number comparing an article and a URL. + + The *article* is a Markov chain, whereas the URL is a string that we + will try to open ourselves. """ html = self._open_url_ignoring_errors(url) if not html: diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index bc9dfe4..d8091ee 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -28,7 +28,7 @@ try: except ImportError: oauth = None -from earwigbot.wiki.exceptions import SearchQueryError +from earwigbot.exceptions import SearchQueryError __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] From d87c226417f08716713e6f190f72d8b59d2eef35 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 6 Jul 2012 21:50:57 -0400 Subject: [PATCH 06/19] __repr__ and __str__ for everything per #5 and #22. 
--- earwigbot/wiki/copyvios/__init__.py | 11 +++++++++-- earwigbot/wiki/copyvios/markov.py | 19 +++++++++++++++++++ earwigbot/wiki/copyvios/parsers.py | 9 +++++++++ earwigbot/wiki/copyvios/search.py | 8 ++++++++ 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 30d4681..f85ab22 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -48,8 +48,15 @@ class CopyvioCheckResult(object): self.delta_chain = chains[1] def __repr__(self): - r = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" - return r.format(self.violation, self.confidence, self.url, self.queries) + """Return the canonical string representation of the result.""" + res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" + return res.format(self.violation, self.confidence, self.url, + self.queries) + + def __str__(self): + """Return a nice string representation of the result.""" + res = "" + return res.format(self.violation, self.confidence) class CopyvioMixIn(object): diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 74783d0..081469f 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -42,6 +42,14 @@ class MarkovChain(object): except KeyError: pass + def __repr__(self): + """Return the canonical string representation of the MarkovChain.""" + return "MarkovChain(text={0!r})".format(self.text) + + def __str__(self): + """Return a nice string representation of the MarkovChain.""" + return "".format(self.size()) + def size(self): count = 0 for node in self.chain.itervalues(): @@ -53,6 +61,7 @@ class MarkovChain(object): class MarkovChainIntersection(MarkovChain): def __init__(self, mc1, mc2): self.chain = defaultdict(lambda: defaultdict(lambda: 0)) + self.mc1, self.mc2 = mc1, mc2 c1 = mc1.chain c2 = mc2.chain @@ -63,3 +72,13 @@ class MarkovChainIntersection(MarkovChain): if node in nodes2: count2 = nodes2[node] self.chain[word][node] = min(count1, count2) + + def __repr__(self): + """Return the canonical string representation of the intersection.""" + res = "MarkovChainIntersection(mc1={0!r}, mc2={1!r})" + return res.format(self.mc1, self.mc2) + + def __str__(self): + """Return a nice string representation of the intersection.""" + res = "" + return res.format(self.size(), self.mc1, self.mc2) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 9e97267..0c3c17b 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -23,6 +23,15 @@ __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] class BaseTextParser(object): + def __repr__(self): + """Return the canonical string representation of the text parser.""" + return "{0}(text={1!r})".format(self.__class__.__name__, self.text) + + def __str__(self): + """Return a nice string representation of the text parser.""" + name = self.__class__.__name__ + return "<{0} of text with size {1}>".format(name, len(text)) + def __init__(self, text): self.text = text diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index d8091ee..4345b29 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -37,6 +37,14 @@ class BaseSearchEngine(object): """Store credentials 'cred' for searching later on.""" self.cred = cred + def __repr__(self): + """Return the canonical string representation of 
the search engine.""" + return "{0}()".format(self.__class__.__name__) + + def __str__(self): + """Return a nice string representation of the search engine.""" + return "<{0}>".format(self.__class__.__name__) + def search(self, query): """Use this engine to search for 'query'. From d45e342bac59c8587c8e34c2c794023452ef6fda Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 6 Jul 2012 22:55:23 -0400 Subject: [PATCH 07/19] DOCUMENT EVERYTHING (#5) Also implementing MWParserFromHell, plus some cleanup. --- docs/api/earwigbot.wiki.copyvios.rst | 33 +++++++++++++ docs/api/earwigbot.wiki.rst | 14 +++--- earwigbot/wiki/copyvios/__init__.py | 91 ++++++++++++++++++++---------------- earwigbot/wiki/copyvios/markov.py | 4 ++ earwigbot/wiki/copyvios/parsers.py | 66 +++++++++++--------------- earwigbot/wiki/copyvios/search.py | 19 +++++--- 6 files changed, 136 insertions(+), 91 deletions(-) create mode 100644 docs/api/earwigbot.wiki.copyvios.rst diff --git a/docs/api/earwigbot.wiki.copyvios.rst b/docs/api/earwigbot.wiki.copyvios.rst new file mode 100644 index 0000000..7dbcf39 --- /dev/null +++ b/docs/api/earwigbot.wiki.copyvios.rst @@ -0,0 +1,33 @@ +copyvios Package +================ + +:mod:`copyvios` Package +----------------------- + +.. automodule:: earwigbot.wiki.copyvios + :members: + :undoc-members: + +:mod:`markov` Module +-------------------- + +.. automodule:: earwigbot.wiki.copyvios.markov + :members: + :undoc-members: + :show-inheritance: + +:mod:`parsers` Module +--------------------- + +.. automodule:: earwigbot.wiki.copyvios.parsers + :members: + :undoc-members: + :show-inheritance: + +:mod:`search` Module +-------------------- + +.. automodule:: earwigbot.wiki.copyvios.search + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/earwigbot.wiki.rst b/docs/api/earwigbot.wiki.rst index 806b3eb..45b009b 100644 --- a/docs/api/earwigbot.wiki.rst +++ b/docs/api/earwigbot.wiki.rst @@ -22,13 +22,6 @@ wiki Package :members: :undoc-members: -:mod:`copyright` Module ------------------------ - -.. automodule:: earwigbot.wiki.copyright - :members: - :undoc-members: - :mod:`page` Module ------------------ @@ -57,3 +50,10 @@ wiki Package .. automodule:: earwigbot.wiki.user :members: :undoc-members: + +Subpackages +----------- + +.. toctree:: + + earwigbot.wiki.copyvios diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index f85ab22..2c2bb23 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -38,6 +38,22 @@ from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine __all__ = ["CopyvioCheckResult", "CopyvioMixIn"] class CopyvioCheckResult(object): + """ + **EarwigBot: Wiki Toolset: Copyvio Check Result** + + A class holding information about the results of a copyvio check. 
+ + *Attributes:* + + - :py:attr:`violation`: ``True`` if this is a violation, else ``False`` + - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy + - :py:attr:`url`: the URL of the violated page + - :py:attr:`queries`: the number of queries used to reach a result + - :py:attr:`article_chain`: the MarkovChain of the article text + - :py:attr:`source_chain`: the MarkovChain of the violated page text + - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two + """ + def __init__(self, violation, confidence, url, queries, article, chains): self.violation = violation self.confidence = confidence @@ -61,14 +77,15 @@ class CopyvioCheckResult(object): class CopyvioMixIn(object): """ - EarwigBot's Wiki Toolset: Copyright Violation Mixin + **EarwigBot: Wiki Toolset: Copyright Violation MixIn** - This is a mixin that provides two public methods, copyvio_check() and - copyvio_compare(). The former checks the page for copyright violations - using a search engine API, and the latter compares the page against a - specified URL. Credentials for the search engine API are stored in the - site's config. + This is a mixin that provides two public methods, :py:meth:`copyvio_check` + and :py:meth:`copyvio_compare`. The former checks the page for copyright + violations using a search engine API, and the latter compares the page + against a given URL. Credentials for the search engine API are stored in + the :py:class:`~earwigbot.wiki.site.Site`'s config. """ + def __init__(self, site): self._opener = build_opener() self._opener.addheaders = site._opener.addheaders @@ -100,10 +117,10 @@ class CopyvioMixIn(object): def _select_search_engine(self): """Return a function that can be called to do web searches. - The "function" is a functools.partial object that takes one argument, a - query, and returns a list of URLs, ranked by importance. The underlying - logic depends on the 'engine' argument; for example, if 'engine' is - "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying. + The function takes one argument, a search query, and returns a list of + URLs, ranked by importance. The underlying logic depends on the + *engine* argument within our config; for example, if *engine* is + "Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying. Raises UnknownSearchEngineError if the 'engine' listed in our config is unknown to us, and UnsupportedSearchEngineError if we are missing a @@ -122,8 +139,8 @@ class CopyvioMixIn(object): def _copyvio_compare_content(self, article, url): """Return a number comparing an article and a URL. - The *article* is a Markov chain, whereas the URL is a string that we - will try to open ourselves. + The *article* is a Markov chain, whereas the *url* is just a string + that we'll try to open and read ourselves. """ html = self._open_url_ignoring_errors(url) if not html: @@ -134,30 +151,22 @@ class CopyvioMixIn(object): return float(delta.size()) / article.size(), (source, delta) def copyvio_check(self, min_confidence=0.5, max_queries=-1, - interquery_sleep=1, force=False): + interquery_sleep=1): """Check the page for copyright violations. - Returns a _CopyvioCheckResult object with four useful attributes: - "violation", "confidence", "url", and "queries". 
"confidence" is a - number between 0 and 1; if it is less than "min_confidence", we could - not find any indication of a violation (so "violation" will be False - and "url" may or may not be None), otherwise it indicates the relative - faith in our results, "violation" will be True, and "url" will be the - place the article is suspected of being copied from. "queries" is the - number of queries used to determine the results. + Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` + object with information on the results of the check. - "max_queries" is self-explanatory; we will never make more than this - number of queries in a given check. If it's less than 0, we will not - limit our number of queries. + *max_queries* is self-explanatory; we will never make more than this + number of queries in a given check. If it's lower than 0, we will not + limit the number of queries. - "interquery_sleep" is the minimum amount of time we will sleep between + *interquery_sleep* is the minimum amount of time we will sleep between search engine queries, in seconds. - "force" is simply passed to page.get() - it has the same behavior there - as it does here. - - Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, - SearchQueryError, ...) on errors. + Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses + (:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`, + :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. """ searcher = self._select_search_engine() handled_urls = [] @@ -166,9 +175,9 @@ class CopyvioMixIn(object): num_queries = 0 empty = MarkovChain("") best_chains = (empty, MarkovChainIntersection(empty, empty)) - content = self.get(force) - clean = ArticleTextParser(content).strip() - chunks = ArticleTextParser(clean).chunk(max_queries) + parser = ArticleTextParser(self.get()) + clean = parser.strip() + chunks = parser.chunk(max_queries) article_chain = MarkovChain(clean) last_query = time() @@ -200,13 +209,14 @@ class CopyvioMixIn(object): return CopyvioCheckResult(v, best_confidence, best_match, num_queries, article_chain, best_chains) - def copyvio_compare(self, url, min_confidence=0.5, force=False): - """Check the page like copyvio_check(), but against a specific URL. + def copyvio_compare(self, url, min_confidence=0.5): + """Check the page like :py:meth:`copyvio_check` against a specific URL. This is essentially a reduced version of the above - a copyivo comparison is made using Markov chains and the result is returned in a - _CopyvioCheckResult object - without using a search engine, as the - suspected "violated" URL is supplied from the start. + :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but + without using a search engine, since the suspected "violated" URL is + supplied from the start. Its primary use is to generate a result when the URL is retrieved from a cache, like the one used in EarwigBot's Toolserver site. After a @@ -217,10 +227,11 @@ class CopyvioMixIn(object): be stored for data retention reasons, so a fresh comparison is made using this function. - Since no searching is done, neither UnknownSearchEngineError nor - SearchQueryError will be raised. + Since no searching is done, neither + :py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor + :py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised. 
""" - content = self.get(force) + content = self.get() clean = ArticleTextParser(content).strip() article_chain = MarkovChain(clean) confidence, chains = self._copyvio_compare_content(article_chain, url) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 081469f..657b4b9 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -26,6 +26,7 @@ from re import sub, UNICODE __all__ = ["MarkovChain", "MarkovChainIntersection"] class MarkovChain(object): + """Implements a basic bigram Markov chain of words.""" START = -1 END = -2 @@ -51,6 +52,7 @@ class MarkovChain(object): return "".format(self.size()) def size(self): + """Return the size of the Markov chain: the total number of nodes.""" count = 0 for node in self.chain.itervalues(): for hits in node.itervalues(): @@ -59,6 +61,8 @@ class MarkovChain(object): class MarkovChainIntersection(MarkovChain): + """Implements the intersection of two chains (i.e., their shared nodes).""" + def __init__(self, mc1, mc2): self.chain = defaultdict(lambda: defaultdict(lambda: 0)) self.mc1, self.mc2 = mc1, mc2 diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 0c3c17b..8a31127 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,9 +20,19 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +try: + import mwparserfromhell +except ImportError: + mwparserfromhell = None + __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] class BaseTextParser(object): + """Base class for a parser that handles text.""" + + def __init__(self, text): + self.text = text + def __repr__(self): """Return the canonical string representation of the text parser.""" return "{0}(text={1!r})".format(self.__class__.__name__, self.text) @@ -32,60 +42,40 @@ class BaseTextParser(object): name = self.__class__.__name__ return "<{0} of text with size {1}>".format(name, len(text)) - def __init__(self, text): - self.text = text - class ArticleTextParser(BaseTextParser): + """A parser that can strip and chunk wikicode article text.""" + def strip(self): """Clean the page's raw text by removing templates and formatting. - Returns the page's text with all HTML and wikicode formatting removed, - including templates, tables, references, and the Bibliography/ - References/Sources/See also section(s). It retains punctuation + Return the page's text with all HTML and wikicode formatting removed, + including templates, tables, and references. It retains punctuation (spacing, paragraphs, periods, commas, (semi)-colons, parentheses, - quotes) and original capitalization, but not brackets (square and - angular), abnormal spacing, nor anything else. HTML entities are + quotes), original capitalization, and so forth. HTML entities are replaced by their unicode equivalents. - The actual replacement is handled by a few private methods within this - class. + The actual stripping is handled by :py:mod:`mwparserfromhell`. """ - text = self._strip_tags(self.text) - text = self._strip_templates(text) - text = self._strip_sections(text) - text = self._strip_wikicode(text) - text = self._normalize(text) - return text + wikicode = mwparserfromhell.parse(self.text) + self.clean = u" ".join(wikicode.normalize().ifilter_text()) + return self.clean def chunk(self, max_chunks): - """Convert the article text into a list of web-searchable chunks. + """Convert the clean article text into a list of web-searchable chunks. 
- No greater than max_chunks will be returned. Each chunk will only be a - couple sentences long at most. The idea here is to return a + No greater than *max_chunks* will be returned. Each chunk will only be + a couple sentences long at most. The idea here is to return a representative sample of the article text rather than the entire article, so we'll probably pick and choose from its introduction, body, - and conclusion, especially if the article is large and max_chunks are - few, so we don't end up just searching for the first paragraph. + and conclusion, especially if the article is large and *max_chunks* is + low, so we don't end up just searching for the first paragraph. """ - return [self.text] - - def _strip_tags(self, text): - return text - - def _strip_templates(self, text): - return text - - def _strip_sections(self, text): - return text - - def _strip_wikicode(self, text): - return text - - def _normalize(self, text): - return text + return [self.text] # TODO: NotImplemented class HTMLTextParser(BaseTextParser): + """A parser that can extract the text from an HTML document.""" + def strip(self): - return self.text + return self.text # TODO: NotImplemented diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 4345b29..ac40613 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -33,8 +33,10 @@ from earwigbot.exceptions import SearchQueryError __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] class BaseSearchEngine(object): + """Base class for a simple search engine interface.""" + def __init__(self, cred): - """Store credentials 'cred' for searching later on.""" + """Store credentials *cred* for searching later on.""" self.cred = cred def __repr__(self): @@ -46,25 +48,30 @@ class BaseSearchEngine(object): return "<{0}>".format(self.__class__.__name__) def search(self, query): - """Use this engine to search for 'query'. + """Use this engine to search for *query*. - Not implemented in this base class; overridden in subclasses.""" + Not implemented in this base class; overridden in subclasses. + """ raise NotImplementedError() class YahooBOSSSearchEngine(BaseSearchEngine): + """A search engine interface with Yahoo! BOSS.""" + def search(self, query): - """Do a Yahoo! BOSS web search for 'query'. + """Do a Yahoo! BOSS web search for *query*. Returns a list of URLs, no more than fifty, ranked by relevance (as - determined by Yahoo). Raises SearchQueryError() on errors. + determined by Yahoo). Raises + :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. """ base_url = "http://yboss.yahooapis.com/ysearch/web" query = quote_plus(query.join('"', '"')) params = {"q": query, "style": "raw", "format": "json"} url = "{0}?{1}".format(base_url, urlencode(params)) - consumer = oauth.Consumer(key=self.cred["key"], secret=self.cred["secret"]) + consumer = oauth.Consumer(key=self.cred["key"], + secret=self.cred["secret"]) client = oauth.Client(consumer) headers, body = client.request(url, "GET") From 1af4217b63a10faf41547501a9d2ec688344945d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 00:16:54 -0400 Subject: [PATCH 08/19] Update copyright notices and some other improvements. 
--- docs/api/modules.rst | 2 +- earwigbot/wiki/copyvios/__init__.py | 2 +- earwigbot/wiki/copyvios/markov.py | 2 +- earwigbot/wiki/copyvios/parsers.py | 2 +- earwigbot/wiki/copyvios/search.py | 2 +- earwigbot/wiki/page.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/api/modules.rst b/docs/api/modules.rst index 7c4c110..3bf56b4 100644 --- a/docs/api/modules.rst +++ b/docs/api/modules.rst @@ -2,6 +2,6 @@ earwigbot ========= .. toctree:: - :maxdepth: 4 + :maxdepth: 6 earwigbot diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 2c2bb23..a17f800 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2012 by Ben Kurtovic +# Copyright (C) 2009-2012 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 657b4b9..28cdb97 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2012 by Ben Kurtovic +# Copyright (C) 2009-2012 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 8a31127..565acff 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2012 by Ben Kurtovic +# Copyright (C) 2009-2012 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index ac40613..a768141 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2012 by Ben Kurtovic +# Copyright (C) 2009-2012 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py index 248334e..3125b33 100644 --- a/earwigbot/wiki/page.py +++ b/earwigbot/wiki/page.py @@ -35,7 +35,7 @@ from earwigbot.wiki.copyvios import CopyvioMixIn __all__ = ["Page"] -class Page(CopyvioMixin): +class Page(CopyvioMixIn): """ **EarwigBot: Wiki Toolset: Page** From cb870041079843c521a65e74784d41e224ffadd9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 03:37:15 -0400 Subject: [PATCH 09/19] Primitive screen scraper for HTML using BeautifulSoup and LXML. Obviously this can and should be improved significantly later, but it seems good enough for now. --- earwigbot/wiki/copyvios/parsers.py | 27 ++++++++++++++++++++++++++- setup.py | 2 ++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 565acff..8b9655b 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,6 +20,13 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+import htmlentitydefs + +try: + from bs4 import BeautifulSoup +except ImportError: + BeautifulSoup = None + try: import mwparserfromhell except ImportError: @@ -76,6 +83,24 @@ class ArticleTextParser(BaseTextParser): class HTMLTextParser(BaseTextParser): """A parser that can extract the text from an HTML document.""" + hidden_tags = [ + "script", "style" + ] def strip(self): - return self.text # TODO: NotImplemented + """Return the actual text contained within an HTML document. + + Implemented using :py:mod:`BeautifulSoup ` + (http://www.crummy.com/software/BeautifulSoup/). + """ + try: + soup = BeautifulSoup(self.text, "lxml").body + except ValueError: + soup = BeautifulSoup(self.text).body + + is_comment = lambda text: isinstance(text, bs4.element.Comment) + [comment.extract() for comment in soup.find_all(text=is_comment)] + for tag in self.hidden_tags: + [element.extract() for element in soup.find_all(tag)] + + return "\n".join(soup.stripped_strings) diff --git a/setup.py b/setup.py index 9db6676..3c3c7cd 100644 --- a/setup.py +++ b/setup.py @@ -34,6 +34,8 @@ setup( entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]}, install_requires = ["GitPython >= 0.3.2.RC1", # Interfacing with git "PyYAML >= 3.10", # Config parsing + "beautifulsoup4 >= 4.1.1", # HTML parsing/scraping + "lxml >= 2.3.4", # Faster parser for BeautifulSoup "mwparserfromhell >= 0.1", # Wikicode parsing "oursql >= 0.9.3", # Talking with MediaWiki databases "oauth2 >= 1.5.211", # Talking with Yahoo BOSS Search From bf1ad08dc66ce4cc8e3e0110a7a9e311ef95f44b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 04:30:42 -0400 Subject: [PATCH 10/19] Make Markov chain degree-independent. Testing trigrams. --- earwigbot/wiki/copyvios/markov.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 28cdb97..00567b2 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -26,22 +26,21 @@ from re import sub, UNICODE __all__ = ["MarkovChain", "MarkovChainIntersection"] class MarkovChain(object): - """Implements a basic bigram Markov chain of words.""" + """Implements a basic ngram Markov chain of words.""" START = -1 END = -2 + degree = 3 # 2 for bigrams, 3 for trigrams, etc. def __init__(self, text): self.text = text self.chain = defaultdict(lambda: defaultdict(lambda: 0)) words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() - prev = self.START - for word in words: - self.chain[prev][word] += 1 - prev = word - try: # This won't work if the source text is completely blank - self.chain[word][self.END] += 1 - except KeyError: - pass + + padding = self.degree - 1 + words = ([self.START] * padding) + words + ([self.END] * padding) + for i in range(len(words) - self.degree + 1): + last = i + self.degree - 1 + self.chain[words[i:last]][last] += 1 def __repr__(self): """Return the canonical string representation of the MarkovChain.""" From 17eee28a4bf42b01df072daddfefba611eb4171f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 04:32:52 -0400 Subject: [PATCH 11/19] Whoops, got the slicing wrong. 
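
Two notes before the fix below. First, patch 09's HTMLTextParser.strip()
references bs4.element.Comment inside its lambda, but only BeautifulSoup is
imported from bs4, so the comment-stripping line raises NameError as written.
A sketch with the import made explicit (the fix and the function name are my
assumptions, not something this series ships):

    from bs4 import BeautifulSoup
    from bs4.element import Comment

    HIDDEN_TAGS = ["script", "style"]

    def strip_html(html):
        try:
            soup = BeautifulSoup(html, "lxml").body
        except ValueError:  # lxml missing; fall back to the default parser
            soup = BeautifulSoup(html).body
        for comment in soup.find_all(text=lambda t: isinstance(t, Comment)):
            comment.extract()
        for tag in HIDDEN_TAGS:  # drop non-rendered content
            for element in soup.find_all(tag):
                element.extract()
        return "\n".join(soup.stripped_strings)

    text = strip_html("<body><p>kept</p><script>dropped()</script></body>")
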
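
Second, the one-line fix below gets the indexing right, but words[i:last] is
still a list, and a list cannot key a dict, so the generalized chain raises
TypeError on its first insert until the slice is converted to a tuple (the
conversion is an assumption on my part, not part of this patch). A sketch of
the degree-independent construction with that conversion applied:

    from collections import defaultdict

    START, END = -1, -2
    DEGREE = 3  # 2 for bigrams, 3 for trigrams, etc.

    def build_chain(words):
        chain = defaultdict(lambda: defaultdict(int))
        padding = DEGREE - 1
        words = ([START] * padding) + words + ([END] * padding)
        for i in range(len(words) - DEGREE + 1):
            last = i + DEGREE - 1
            # tuple() makes the (n-1)-word context hashable
            chain[tuple(words[i:last])][words[last]] += 1
        return chain

    chain = build_chain("the cat sat on the mat".split())
    # sample transitions: (START, START) -> "the", ("the", "cat") -> "sat"
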
--- earwigbot/wiki/copyvios/markov.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 00567b2..7813f61 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -40,7 +40,7 @@ class MarkovChain(object): words = ([self.START] * padding) + words + ([self.END] * padding) for i in range(len(words) - self.degree + 1): last = i + self.degree - 1 - self.chain[words[i:last]][last] += 1 + self.chain[words[i:last]][words[last]] += 1 def __repr__(self): """Return the canonical string representation of the MarkovChain.""" From 569c815d994587347c8734e01751523eab193bf4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 16:40:27 -0400 Subject: [PATCH 12/19] Implement NLTK for chunking article content (#5). --- earwigbot/wiki/copyvios/__init__.py | 6 ++++-- earwigbot/wiki/copyvios/parsers.py | 34 ++++++++++++++++++++++++++-------- earwigbot/wiki/copyvios/search.py | 2 +- earwigbot/wiki/site.py | 2 +- earwigbot/wiki/sitesdb.py | 15 ++++++++++++++- setup.py | 31 ++++++++++++++++++++----------- 6 files changed, 66 insertions(+), 24 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index a17f800..5fb7bf2 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -87,6 +87,7 @@ class CopyvioMixIn(object): """ def __init__(self, site): + self._search_config = site._search_config self._opener = build_opener() self._opener.addheaders = site._opener.addheaders @@ -126,7 +127,8 @@ class CopyvioMixIn(object): unknown to us, and UnsupportedSearchEngineError if we are missing a required package or module, like oauth2 for "Yahoo! BOSS". """ - engine, credentials = self._site._search_config + engine = self._search_config["engine"] + credentials = self._search_config["credentials"] if engine == "Yahoo! BOSS": if not oauth: @@ -177,7 +179,7 @@ class CopyvioMixIn(object): best_chains = (empty, MarkovChainIntersection(empty, empty)) parser = ArticleTextParser(self.get()) clean = parser.strip() - chunks = parser.chunk(max_queries) + chunks = parser.chunk(max_queries, self._search_config["nltk_dir"]) article_chain = MarkovChain(clean) last_query = time() diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 8b9655b..a00369d 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -import htmlentitydefs +from os import path try: from bs4 import BeautifulSoup @@ -32,6 +32,11 @@ try: except ImportError: mwparserfromhell = None +try: + import nltk +except ImportError: + nltk = None + __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] class BaseTextParser(object): @@ -68,17 +73,30 @@ class ArticleTextParser(BaseTextParser): self.clean = u" ".join(wikicode.normalize().ifilter_text()) return self.clean - def chunk(self, max_chunks): + def chunk(self, max_chunks, nltk_dir): """Convert the clean article text into a list of web-searchable chunks. No greater than *max_chunks* will be returned. Each chunk will only be - a couple sentences long at most. 
The idea here is to return a - representative sample of the article text rather than the entire - article, so we'll probably pick and choose from its introduction, body, - and conclusion, especially if the article is large and *max_chunks* is - low, so we don't end up just searching for the first paragraph. + a sentence or two long at most. The idea here is to return a + representative sample of the article text rather than the whole, so + we'll probably pick and choose from its introduction, body, and + conclusion, especially if the article is large and *max_chunks* is low, + so we don't end up just searching for the first paragraph. + + This is implemented using :py:mod:`nltk` (http://nltk.org/). A base + directory (*nltk_dir*) is required to store nltk's punctuation + database. This is typically located in the bot's working directory. """ - return [self.text] # TODO: NotImplemented + datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") + try: + tokenizer = nltk.data.load(datafile) + except LookupError: + nltk.download("punkt", nltk_dir) + tokenizer = nltk.data.load(datafile) + + sentences = tokenizer.tokenize(self.clean) + #if max_chunks >= len(sentences): + # return sentences class HTMLTextParser(BaseTextParser): diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index a768141..cf2edb4 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -67,7 +67,7 @@ class YahooBOSSSearchEngine(BaseSearchEngine): """ base_url = "http://yboss.yahooapis.com/ysearch/web" query = quote_plus(query.join('"', '"')) - params = {"q": query, "style": "raw", "format": "json"} + params = {"q": query, "type": "html,text", "format": "json"} url = "{0}?{1}".format(base_url, urlencode(params)) consumer = oauth.Consumer(key=self.cred["key"], diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 4d88505..f627a02 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -92,7 +92,7 @@ class Site(object): namespaces=None, login=(None, None), cookiejar=None, user_agent=None, use_https=False, assert_edit=None, maxlag=None, wait_between_queries=3, logger=None, - search_config=(None, None)): + search_config=None): """Constructor for new Site instances. 
This probably isn't necessary to call yourself unless you're building a
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index 1f3265b..5af7e3a 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -192,6 +192,10 @@ class SitesDB(object):
             user_agent = user_agent.replace("$1", __version__)
             user_agent = user_agent.replace("$2", python_version())
 
+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+
         return Site(name=name, project=project, lang=lang, base_url=base_url,
                     article_path=article_path, script_path=script_path,
                     sql=sql, namespaces=namespaces, login=login,
@@ -360,14 +364,23 @@ class SitesDB(object):
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
         wait_between_queries = config.wiki.get("waitTime", 5)
+        logger = self._logger.getChild(name)
         search_config = config.wiki.get("search")
 
+        if user_agent:
+            user_agent = user_agent.replace("$1", __version__)
+            user_agent = user_agent.replace("$2", python_version())
+
+        if search_config:
+            nltk_dir = path.join(self.config.root_dir, ".nltk")
+            search_config["nltk_dir"] = nltk_dir
+
         # Create a Site object to log in and load the other attributes:
         site = Site(base_url=base_url, script_path=script_path, sql=sql,
                     login=login, cookiejar=cookiejar, user_agent=user_agent,
                     use_https=use_https, assert_edit=assert_edit,
                     maxlag=maxlag, wait_between_queries=wait_between_queries,
-                    search_config=search_config)
+                    logger=logger, search_config=search_config)
 
         self._add_site_to_sitesdb(site)
         self._sites[site.name] = site
diff --git a/setup.py b/setup.py
index 3c3c7cd..b68ae4d 100644
--- a/setup.py
+++ b/setup.py
@@ -25,6 +25,25 @@ from setuptools import setup, find_packages
 
 from earwigbot import __version__
 
+# Not all of these dependencies are required, particularly the copyvio-specific
+# ones (bs4, lxml, nltk, and oauth2) or the command-specific ones (GitPython,
+# pytz). The bot should run fine without them, but will raise an exception if
+# you try to detect copyvios or run a command that requires one.
+
+dependencies = [
+    "GitPython >= 0.3.2.RC1",  # Interfacing with git for !git and __version__
+    "PyYAML >= 3.10",  # Parsing config files
+    "beautifulsoup4 >= 4.1.1",  # Parsing/scraping HTML for copyvios
+    "lxml >= 2.3.4",  # Faster parser for BeautifulSoup
+    "mwparserfromhell >= 0.1",  # Parsing wikicode for manipulation
+    "nltk >= 2.0.2",  # Parsing sentences to split article content for copyvios
+    "oursql >= 0.9.3",  # Interfacing with MediaWiki databases
+    "oauth2 >= 1.5.211",  # Interfacing with Yahoo!
BOSS Search for copyvios + "py-bcrypt >= 0.2", # Hashing the bot key in the config file + "pycrypto >= 2.5", # Storing bot passwords and keys in the config file + "pytz >= 2012c", # Handling timezones for the !time IRC command +] + with open("README.rst") as fp: long_docs = fp.read() @@ -32,17 +51,7 @@ setup( name = "earwigbot", packages = find_packages(exclude=("tests",)), entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]}, - install_requires = ["GitPython >= 0.3.2.RC1", # Interfacing with git - "PyYAML >= 3.10", # Config parsing - "beautifulsoup4 >= 4.1.1", # HTML parsing/scraping - "lxml >= 2.3.4", # Faster parser for BeautifulSoup - "mwparserfromhell >= 0.1", # Wikicode parsing - "oursql >= 0.9.3", # Talking with MediaWiki databases - "oauth2 >= 1.5.211", # Talking with Yahoo BOSS Search - "py-bcrypt >= 0.2", # Password hashing in config - "pycrypto >= 2.5", # Storing bot passwords and keys - "pytz >= 2012c", # Timezone handling - ], + install_requires = dependencies, test_suite = "tests", version = __version__, author = "Ben Kurtovic", From c260648bdb2a45a9c0a76f6e4df53889f28f270c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 7 Jul 2012 21:40:54 -0400 Subject: [PATCH 13/19] Finish chunking algorithm, improve !link, other fixes. --- earwigbot/commands/link.py | 14 ++++------- earwigbot/wiki/copyvios/__init__.py | 2 +- earwigbot/wiki/copyvios/parsers.py | 50 +++++++++++++++++++++++++++---------- earwigbot/wiki/site.py | 6 ++--- earwigbot/wiki/sitesdb.py | 2 +- 5 files changed, 47 insertions(+), 27 deletions(-) diff --git a/earwigbot/commands/link.py b/earwigbot/commands/link.py index 0b54554..ebe3669 100644 --- a/earwigbot/commands/link.py +++ b/earwigbot/commands/link.py @@ -30,6 +30,7 @@ class Link(Command): name = "link" def process(self, data): + self.site = self.bot.wiki.get_site() msg = data.msg if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg): @@ -41,8 +42,8 @@ class Link(Command): if not data.args: self.reply(data, "what do you want me to link to?") return - pagename = ' '.join(data.args) - link = self.parse_link(pagename) + pagename = " ".join(data.args) + link = self.site.get_page(pagename).url self.reply(data, link) def parse_line(self, line): @@ -56,8 +57,7 @@ class Link(Command): if links: # re.findall() returns a list of tuples, but we only want the 2nd # item in each tuple: - links = [i[1] for i in links] - results = map(self.parse_link, links) + results = [self.site.get_page(name[1]).url for name in links] # Find all {{templates}} templates = re.findall("(\{\{(.*?)(\||\}\}))", line) @@ -67,10 +67,6 @@ class Link(Command): return results - def parse_link(self, pagename): - link = quote(pagename.replace(" ", "_"), safe="/:") - return "".join(("http://enwp.org/", link)) - def parse_template(self, pagename): pagename = "".join(("Template:", pagename)) - return self.parse_link(pagename) + return self.site.get_page(pagename).url diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 5fb7bf2..cf2ddde 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -179,7 +179,7 @@ class CopyvioMixIn(object): best_chains = (empty, MarkovChainIntersection(empty, empty)) parser = ArticleTextParser(self.get()) clean = parser.strip() - chunks = parser.chunk(max_queries, self._search_config["nltk_dir"]) + chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) article_chain = MarkovChain(clean) last_query = time() diff --git a/earwigbot/wiki/copyvios/parsers.py 
b/earwigbot/wiki/copyvios/parsers.py index a00369d..b258730 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -70,18 +70,18 @@ class ArticleTextParser(BaseTextParser): The actual stripping is handled by :py:mod:`mwparserfromhell`. """ wikicode = mwparserfromhell.parse(self.text) - self.clean = u" ".join(wikicode.normalize().ifilter_text()) + self.clean = wikicode.strip_code(normalize=True) return self.clean - def chunk(self, max_chunks, nltk_dir): + def chunk(self, nltk_dir, max_chunks, max_query=256): """Convert the clean article text into a list of web-searchable chunks. No greater than *max_chunks* will be returned. Each chunk will only be - a sentence or two long at most. The idea here is to return a - representative sample of the article text rather than the whole, so - we'll probably pick and choose from its introduction, body, and - conclusion, especially if the article is large and *max_chunks* is low, - so we don't end up just searching for the first paragraph. + a sentence or two long at most (no more than *max_query*). The idea is + to return a sample of the article text rather than the whole, so we'll + pick and choose from parts of it, especially if the article is large + and *max_chunks* is low, so we don't end up just searching for just the + first paragraph. This is implemented using :py:mod:`nltk` (http://nltk.org/). A base directory (*nltk_dir*) is required to store nltk's punctuation @@ -89,14 +89,38 @@ class ArticleTextParser(BaseTextParser): """ datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") try: - tokenizer = nltk.data.load(datafile) + tokenizer = nltk.data.load("file:" + datafile) except LookupError: nltk.download("punkt", nltk_dir) - tokenizer = nltk.data.load(datafile) - - sentences = tokenizer.tokenize(self.clean) - #if max_chunks >= len(sentences): - # return sentences + tokenizer = nltk.data.load("file:" + datafile) + + sentences = [] + for sentence in tokenizer.tokenize(self.clean): + if len(sentence) > max_query: + words = sentence.split() + while len(" ".join(words)) > max_query: + words.pop() + sentence = " ".join(words) + sentences.append(sentence) + + if max_chunks >= len(sentences): + return sentences + + chunks = [] + while len(chunks) < max_chunks: + if len(chunks) % 5 == 0: + chunk = sentences.pop(0) # Pop from beginning + elif len(chunks) % 5 == 1: + chunk = sentences.pop() # Pop from end + elif len(chunks) % 5 == 2: + chunk = sentences.pop(len(sentences) / 2) # Pop from Q2 + elif len(chunks) % 5 == 3: + chunk = sentences.pop(len(sentences) / 4) # Pop from Q1 + else: + chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3 + chunks.append(chunk) + + return chunks class HTMLTextParser(BaseTextParser): diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index f627a02..8261703 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -560,10 +560,10 @@ class Site(object): return [self.SERVICE_API] sqllag = self._sql_info_cache["replag"] - if sqllag > 180: + if sqllag > 300: if not self._maxlag: return [self.SERVICE_API, self.SERVICE_SQL] - if now - self._api_info_cache["lastcheck"] > 120: + if now - self._api_info_cache["lastcheck"] > 300: self._api_info_cache["lastcheck"] = now try: self._api_info_cache["maxlag"] = apilag = self.get_maxlag() @@ -571,7 +571,7 @@ class Site(object): self._api_info_cache["maxlag"] = apilag = 0 else: apilag = self._api_info_cache["maxlag"] - if sqllag / (180.0 / self._maxlag) < apilag: + if apilag > self._maxlag: return 
[self.SERVICE_SQL, self.SERVICE_API]
         return [self.SERVICE_API, self.SERVICE_SQL]
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index 5af7e3a..fd3c521 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -363,7 +363,7 @@ class SitesDB(object):
         use_https = config.wiki.get("useHTTPS", False)
         assert_edit = config.wiki.get("assert")
         maxlag = config.wiki.get("maxlag")
-        wait_between_queries = config.wiki.get("waitTime", 5)
+        wait_between_queries = config.wiki.get("waitTime", 3)
         logger = self._logger.getChild(name)
         search_config = config.wiki.get("search")
 

From 3744a34f28f88c94f71aa79bc823ba20aca2b3c3 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 7 Jul 2012 22:59:15 -0400
Subject: [PATCH 14/19] Allow templated SQL connection info.

---
 docs/toolset.rst          |  3 ++-
 earwigbot/wiki/sitesdb.py | 25 ++++++++++++++++++-------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/docs/toolset.rst b/docs/toolset.rst
index c7808d2..fcdfc6d 100644
--- a/docs/toolset.rst
+++ b/docs/toolset.rst
@@ -47,7 +47,8 @@ wikis, you can usually use code like this::
     site = bot.wiki.add_site(project=project, lang=lang)
 
 This works because EarwigBot assumes that the URL for the site is
-``"//{lang}.{project}.org"`` and the API is at ``/w/api.php``; this might
+``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL
+connection info (if any) are stored as ``config.wiki["sql"]``. This might
 change if you're dealing with non-WMF wikis, where the code might look
 something more like::
 
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index fd3c521..cdff1fe 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -196,6 +196,12 @@ class SitesDB(object):
             nltk_dir = path.join(self.config.root_dir, ".nltk")
             search_config["nltk_dir"] = nltk_dir
 
+        if not sql:
+            sql = config.wiki.get("sql", {})
+            for key, value in sql.iteritems():
+                if "$1" in value:
+                    sql[key] = value.replace("$1", name)
+
         return Site(name=name, project=project, lang=lang, base_url=base_url,
                     article_path=article_path, script_path=script_path,
                     sql=sql, namespaces=namespaces, login=login,
@@ -336,13 +342,12 @@ class SitesDB(object):
         the script path (meaning the API is located at
         ``"{base_url}{script_path}/api.php"`` ->
         ``"//{lang}.{project}.org/w/api.php"``), so this is the default. If
-        your wiki is different, provide the script_path as an argument. The
-        only other argument to :py:class:`~earwigbot.wiki.site.Site` that we
-        can't get from config files or by querying the wiki itself is SQL
-        connection info, so provide a dict of kwargs as *sql* and Site will
-        pass it to :py:func:`oursql.connect(**sql) <oursql.connect>`, allowing
-        you to make queries with :py:meth:`site.sql_query
-        <earwigbot.wiki.site.Site.sql_query>`.
+        your wiki is different, provide the script_path as an argument. SQL
+        connection settings are guessed automatically using config's template
+        value. If this is wrong or not specified, provide a dict of kwargs as
+        *sql* and Site will pass it to :py:func:`oursql.connect(**sql)
+        <oursql.connect>`, allowing you to make queries with
+        :py:meth:`site.sql_query <earwigbot.wiki.site.Site.sql_query>`.
Returns ``True`` if the site was added successfully or ``False`` if the site is already in our sitesdb (this can be done purposefully to update @@ -375,6 +380,12 @@ class SitesDB(object): nltk_dir = path.join(self.config.root_dir, ".nltk") search_config["nltk_dir"] = nltk_dir + if not sql: + sql = config.wiki.get("sql", {}) + for key, value in sql.iteritems(): + if "$1" in value: + sql[key] = value.replace("$1", name) + # Create a Site object to log in and load the other attributes: site = Site(base_url=base_url, script_path=script_path, sql=sql, login=login, cookiejar=cookiejar, user_agent=user_agent, From a074da853bd8956803b9f0061e12d4ca1d32cff0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jul 2012 14:44:15 -0400 Subject: [PATCH 15/19] More work on copyvios, including an exclusions database (#5) * Added exclusions module with a fully implemented ExclusionsDB that can pull from multiple sources for different sites. * Moved CopyvioCheckResult to its own module, to be imported by __init__. * Some other related changes. --- docs/api/earwigbot.wiki.copyvios.rst | 14 +++ docs/toolset.rst | 6 +- earwigbot/wiki/copyvios/__init__.py | 56 +++--------- earwigbot/wiki/copyvios/exclusions.py | 155 ++++++++++++++++++++++++++++++++++ earwigbot/wiki/copyvios/result.py | 60 +++++++++++++ earwigbot/wiki/sitesdb.py | 8 ++ 6 files changed, 252 insertions(+), 47 deletions(-) create mode 100644 earwigbot/wiki/copyvios/exclusions.py create mode 100644 earwigbot/wiki/copyvios/result.py diff --git a/docs/api/earwigbot.wiki.copyvios.rst b/docs/api/earwigbot.wiki.copyvios.rst index 7dbcf39..abddf7a 100644 --- a/docs/api/earwigbot.wiki.copyvios.rst +++ b/docs/api/earwigbot.wiki.copyvios.rst @@ -8,6 +8,13 @@ copyvios Package :members: :undoc-members: +:mod:`exclusions` Module +------------------------ + +.. automodule:: earwigbot.wiki.copyvios.exclusions + :members: + :undoc-members: + :mod:`markov` Module -------------------- @@ -24,6 +31,13 @@ copyvios Package :undoc-members: :show-inheritance: +:mod:`result` Module +-------------------- + +.. automodule:: earwigbot.wiki.copyvios.result + :members: + :undoc-members: + :mod:`search` Module -------------------- diff --git a/docs/toolset.rst b/docs/toolset.rst index fcdfc6d..e2258c8 100644 --- a/docs/toolset.rst +++ b/docs/toolset.rst @@ -48,9 +48,9 @@ wikis, you can usually use code like this:: This works because EarwigBot assumes that the URL for the site is ``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL -connection info (if any) are stored as ``config.wiki["sql"]``. This might -change if you're dealing with non-WMF wikis, where the code might look -something more like:: +connection info (if any) is stored as ``config.wiki["sql"]``. 
This might change
+if you're dealing with non-WMF wikis, where the code might look something more
+like::
 
     project, lang = "mywiki", "it"
     try:
diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index cf2ddde..0f29403 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -33,47 +33,10 @@ except ImportError:
 from earwigbot import exceptions
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser
+from earwigbot.wiki.copyvios.result import CopyvioCheckResult
 from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
 
-__all__ = ["CopyvioCheckResult", "CopyvioMixIn"]
-
-class CopyvioCheckResult(object):
-    """
-    **EarwigBot: Wiki Toolset: Copyvio Check Result**
-
-    A class holding information about the results of a copyvio check.
-
-    *Attributes:*
-
-    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
-    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
-    - :py:attr:`url`: the URL of the violated page
-    - :py:attr:`queries`: the number of queries used to reach a result
-    - :py:attr:`article_chain`: the MarkovChain of the article text
-    - :py:attr:`source_chain`: the MarkovChain of the violated page text
-    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
-    """
-
-    def __init__(self, violation, confidence, url, queries, article, chains):
-        self.violation = violation
-        self.confidence = confidence
-        self.url = url
-        self.queries = queries
-        self.article_chain = article
-        self.source_chain = chains[0]
-        self.delta_chain = chains[1]
-
-    def __repr__(self):
-        """Return the canonical string representation of the result."""
-        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
-        return res.format(self.violation, self.confidence, self.url,
-                          self.queries)
-
-    def __str__(self):
-        """Return a nice string representation of the result."""
-        res = "<CopyvioCheckResult ({0} with {1} confidence)>"
-        return res.format(self.violation, self.confidence)
-
+__all__ = ["CopyvioMixIn"]
 
 class CopyvioMixIn(object):
     """
@@ -88,6 +51,7 @@ class CopyvioMixIn(object):
 
     def __init__(self, site):
         self._search_config = site._search_config
+        self._exclusions_db = self._search_config["exclusions_db"]
         self._opener = build_opener()
         self._opener.addheaders = site._opener.addheaders
 
@@ -156,8 +120,9 @@ class CopyvioMixIn(object):
                       interquery_sleep=1):
         """Check the page for copyright violations.
 
-        Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult`
-        object with information on the results of the check.
+        Returns a
+        :py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object
+        with information on the results of the check.
 
         *max_queries* is self-explanatory; we will never make more than this
         number of queries in a given check. If it's lower than 0, we will not
         :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
""" searcher = self._select_search_engine() + self._exclusions_db.sync(self.site.name) handled_urls = [] best_confidence = 0 best_match = None @@ -193,6 +159,8 @@ class CopyvioMixIn(object): urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) + if self._exclusions_db.check(self.site.name, url): + continue conf, chains = self._copyvio_compare_content(article_chain, url) if conf > best_confidence: best_confidence = conf @@ -216,9 +184,9 @@ class CopyvioMixIn(object): This is essentially a reduced version of the above - a copyivo comparison is made using Markov chains and the result is returned in a - :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but - without using a search engine, since the suspected "violated" URL is - supplied from the start. + :py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object - + but without using a search engine, since the suspected "violated" URL + is supplied from the start. Its primary use is to generate a result when the URL is retrieved from a cache, like the one used in EarwigBot's Toolserver site. After a diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py new file mode 100644 index 0000000..fdbaa39 --- /dev/null +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import re +import sqlite3 as sqlite +from threading import Lock +from time import time + +from earwigbot import exceptions + +__all__ = ["ExclusionsDB"] + +default_sources = { + "enwiki": [ + "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def", + "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl", + "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr", + "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz" + ] +} + +class ExclusionsDB(object): + """ + **EarwigBot: Wiki Toolset: Exclusions Database Manager** + + Controls the :file:`.exclusions.db` file, which stores URLs excluded from + copyright violation checks on account of being known mirrors, for example. 
+ """ + + def __init__(self, sitesdb, dbfile, logger): + self._sitesdb = sitesdb + self._dbfile = dbfile + self._logger = logger + self._db_access_lock = Lock() + + def _create(self): + """Initialize the exclusions database with its necessary tables.""" + script = """ + CREATE TABLE sources (source_sitename, source_page); + CREATE TABLE updates (update_sitename, update_time); + CREATE TABLE exclusions (exclusion_sitename, exclusion_url); + """ + query = "INSERT INTO sources VALUES (?, ?);" + sources = [] + for sitename, pages in default_sources.iteritems(): + [sources.append((sitename, page)) for page in pages] + + with sqlite.connect(self._dbfile) as conn: + conn.executescript(script) + conn.executemany(query, sources) + + def _load_source(self, site, source): + """Load from a specific source and return a set of URLs.""" + urls = set() + try: + data = site.get_page(source).get() + except exceptions.PageNotFoundError: + return urls + + regexes = [ + "url\s*=\s*(?:https?:)?(?://)?(.*)", + "\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?" + ] + for regex in regexes: + [urls.add(url.lower()) for (url,) in re.findall(regex, data, re.I)] + return urls + + def _update(self, sitename): + """Update the database from listed sources in the index.""" + query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;" + query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" + query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?" + query4 = "INSERT INTO exclusions VALUES (?, ?);" + query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;" + query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;" + query7 = "INSERT INTO updates VALUES (?, ?);" + + site = self._sitesdb.get_site(sitename) + with sqlite.connect(self._dbfile) as conn, self._db_access_lock: + urls = set() + for (source,) in conn.execute(query1, (sitename,)): + urls |= self._load_source(site, source) + for (url,) in conn.execute(query2, (sitename,)): + if url in urls: + urls.remove(url) + else: + conn.execute(query3, (sitename, url)) + conn.executemany(query4, [(sitename, url) for url in urls]) + if conn.execute(query5, (name,)).fetchone(): + conn.execute(query6, (time(), sitename)) + else: + conn.execute(query7, (sitename, time())) + + def _get_last_update(self, sitename): + """Return the UNIX timestamp of the last time the db was updated.""" + query = "SELECT update_time FROM updates WHERE update_sitename = ?;" + with sqlite.connect(self._dbfile) as conn, self._db_access_lock: + try: + result = conn.execute(query, (sitename,)).fetchone() + except sqlite.OperationalError: + self._create() + return 0 + return result[0] if result else 0 + + def sync(self, sitename): + """Update the database if it hasn't been updated in the past month. + + This only updates the exclusions database for the *sitename* site. + """ + max_staleness = 60 * 60 * 24 * 30 + time_since_update = int(time() - self._get_last_update()) + if time_since_update > max_staleness: + log = "Updating stale database: {0} (last updated {1} seconds ago)" + self._logger.info(log.format(sitename, time_since_update)) + self._update(sitename) + else: + log = "Database for {0} is still fresh (last updated {1} seconds ago)" + self._logger.debug(log.format(sitename, time_since_update)) + + def check(self, sitename, url): + """Check whether a given URL is in the exclusions database. + + Return ``True`` if the URL is in the database, or ``False`` otherwise. 
+ """ + normalized = re.sub("https?://", "", url.lower()) + query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" + with sqlite.connect(self._dbfile) as conn, self._db_access_lock: + for row in conn.execute(query, (sitename,)): + if normalized.startswith(row[0]): + log = "Exclusion detected in {0} for {1}" + self._logger.debug(log.format(sitename, url)) + return True + + log = "No exclusions in {0} for {1}".format(sitename, url) + self._logger.debug(log) + return False diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py new file mode 100644 index 0000000..0c3e98f --- /dev/null +++ b/earwigbot/wiki/copyvios/result.py @@ -0,0 +1,60 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +__all__ = ["CopyvioCheckResult"] + +class CopyvioCheckResult(object): + """ + **EarwigBot: Wiki Toolset: Copyvio Check Result** + + A class holding information about the results of a copyvio check. 
+
+    *Attributes:*
+
+    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
+    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
+    - :py:attr:`url`: the URL of the violated page
+    - :py:attr:`queries`: the number of queries used to reach a result
+    - :py:attr:`article_chain`: the MarkovChain of the article text
+    - :py:attr:`source_chain`: the MarkovChain of the violated page text
+    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
+    """
+
+    def __init__(self, violation, confidence, url, queries, article, chains):
+        self.violation = violation
+        self.confidence = confidence
+        self.url = url
+        self.queries = queries
+        self.article_chain = article
+        self.source_chain = chains[0]
+        self.delta_chain = chains[1]
+
+    def __repr__(self):
+        """Return the canonical string representation of the result."""
+        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
+        return res.format(self.violation, self.confidence, self.url,
+                          self.queries)
+
+    def __str__(self):
+        """Return a nice string representation of the result."""
+        res = "<CopyvioCheckResult ({0} with {1} confidence)>"
+        return res.format(self.violation, self.confidence)
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index cdff1fe..9d2c828 100644
--- a/earwigbot/wiki/sitesdb.py
+++ b/earwigbot/wiki/sitesdb.py
@@ -29,6 +29,7 @@ import sqlite3 as sqlite
 
 from earwigbot import __version__
 from earwigbot.exceptions import SiteNotFoundError
+from earwigbot.wiki.copyvios.exclusions import ExclusionsDB
 from earwigbot.wiki.site import Site
 
 __all__ = ["SitesDB"]
@@ -58,11 +59,16 @@ class SitesDB(object):
         """Set up the manager with an attribute for the base Bot object."""
         self.config = bot.config
         self._logger = bot.logger.getChild("wiki")
+
         self._sites = {}  # Internal site cache
         self._sitesdb = path.join(bot.config.root_dir, "sites.db")
         self._cookie_file = path.join(bot.config.root_dir, ".cookies")
         self._cookiejar = None
+        excl_db = path.join(bot.config.root_dir, "exclusions.db")
+        excl_logger = self._logger.getChild("exclusionsdb")
+        self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger)
+
     def __repr__(self):
         """Return the canonical string representation of the SitesDB."""
         res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})"
@@ -195,6 +201,7 @@ class SitesDB(object):
         if search_config:
             nltk_dir = path.join(self.config.root_dir, ".nltk")
             search_config["nltk_dir"] = nltk_dir
+            search_config["exclusions_db"] = self._exclusions_db
 
         if not sql:
             sql = config.wiki.get("sql", {})
@@ -379,6 +386,7 @@ class SitesDB(object):
         if search_config:
             nltk_dir = path.join(self.config.root_dir, ".nltk")
             search_config["nltk_dir"] = nltk_dir
+            search_config["exclusions_db"] = self._exclusions_db
 
         if not sql:
             sql = config.wiki.get("sql", {})

From 1c2dcc999a7ce8593630931f5a40fe5a317daff6 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 8 Jul 2012 14:53:21 -0400
Subject: [PATCH 16/19] __repr__ and __str__ for ExclusionsDB (#5).
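These follow the convention used throughout the toolset: __repr__ returns a
canonical, constructor-style string, while __str__ returns a short
angle-bracketed summary suitable for log messages. Given a SitesDB instance
and a logger, usage looks roughly like this (the path is a made-up example,
and the nested reprs are abbreviated):

    >>> db = ExclusionsDB(sitesdb, "/home/earwigbot/exclusions.db", logger)
    >>> repr(db)
    "ExclusionsDB(sitesdb=SitesDB(...), dbfile='/home/earwigbot/exclusions.db', logger=...)"
    >>> str(db)
    '<ExclusionsDB at /home/earwigbot/exclusions.db>'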
---
 earwigbot/wiki/copyvios/exclusions.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py
index fdbaa39..7eb6a80 100644
--- a/earwigbot/wiki/copyvios/exclusions.py
+++ b/earwigbot/wiki/copyvios/exclusions.py
@@ -52,6 +52,15 @@ class ExclusionsDB(object):
         self._logger = logger
         self._db_access_lock = Lock()
 
+    def __repr__(self):
+        """Return the canonical string representation of the ExclusionsDB."""
+        res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})"
+        return res.format(self._sitesdb, self._dbfile, self._logger)
+
+    def __str__(self):
+        """Return a nice string representation of the ExclusionsDB."""
+        return "<ExclusionsDB at {0}>".format(self._dbfile)
+
     def _create(self):
         """Initialize the exclusions database with its necessary tables."""
         script = """

From d07f0b5f9af88dd532815e8156bcf56955830af2 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 8 Jul 2012 15:04:44 -0400
Subject: [PATCH 17/19] Add loggers to Category, Page, and User.

---
 earwigbot/wiki/page.py | 12 +++++++++++-
 earwigbot/wiki/site.py |  9 +++++----
 earwigbot/wiki/user.py | 10 +++++++++-
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py
index 3125b33..92bb5b7 100644
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -21,6 +21,7 @@
 # SOFTWARE.
 
 from hashlib import md5
+from logging import getLogger, NullHandler
 import re
 from time import gmtime, strftime
 from urllib import quote
@@ -82,7 +83,8 @@ class Page(CopyvioMixIn):
     PAGE_MISSING = 2
     PAGE_EXISTS = 3
 
-    def __init__(self, site, title, follow_redirects=False, pageid=None):
+    def __init__(self, site, title, follow_redirects=False, pageid=None,
+                 logger=None):
         """Constructor for new Page instances.
 
         Takes four arguments: a Site object, the Page's title (or pagename),
@@ -101,6 +103,14 @@ class Page(CopyvioMixIn):
         self._follow_redirects = self._keep_following = follow_redirects
         self._pageid = pageid
 
+        # Set up our internal logger:
+        if logger:
+            self._logger = logger
+        else:  # Just set up a null logger to eat up our messages:
+            self._logger = getLogger("earwigbot.wiki")
+            self._logger.addHandler(NullHandler())
+
+        # Attributes to be loaded through the API:
         self._exists = self.PAGE_UNKNOWN
         self._is_redirect = None
         self._lastrevid = None
diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py
index 8261703..bd6c95b 100644
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -789,8 +789,9 @@ class Site(object):
         prefix = title.split(":", 1)[0]
         if prefix != title:  # Avoid a page that is simply "Category"
             if prefix in prefixes:
-                return Category(self, title, follow_redirects, pageid)
-        return Page(self, title, follow_redirects, pageid)
+                return Category(self, title, follow_redirects, pageid,
+                                self._logger)
+        return Page(self, title, follow_redirects, pageid, self._logger)
 
     def get_category(self, catname, follow_redirects=False, pageid=None):
         """Return a :py:class:`Category` object for the given category name.
@@ -802,7 +803,7 @@ class Site(object):
         catname = self._unicodeify(catname)
         prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
         pagename = u':'.join((prefix, catname))
-        return Category(self, pagename, follow_redirects, pageid)
+        return Category(self, pagename, follow_redirects, pageid, self._logger)
 
     def get_user(self, username=None):
         """Return a :py:class:`User` object for the given username.
@@ -815,7 +816,7 @@ class Site(object): username = self._unicodeify(username) else: username = self._get_username() - return User(self, username) + return User(self, username, self._logger) def delegate(self, services, args=None, kwargs=None): """Delegate a task to either the API or SQL depending on conditions. diff --git a/earwigbot/wiki/user.py b/earwigbot/wiki/user.py index b71b502..92da1e6 100644 --- a/earwigbot/wiki/user.py +++ b/earwigbot/wiki/user.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from logging import getLogger, NullHandler from time import gmtime, strptime from earwigbot.exceptions import UserNotFoundError @@ -60,7 +61,7 @@ class User(object): talkpage """ - def __init__(self, site, name): + def __init__(self, site, name, logger=None): """Constructor for new User instances. Takes two arguments, a Site object (necessary for doing API queries), @@ -76,6 +77,13 @@ class User(object): self._site = site self._name = name + # Set up our internal logger: + if logger: + self._logger = logger + else: # Just set up a null logger to eat up our messages: + self._logger = getLogger("earwigbot.wiki") + self._logger.addHandler(NullHandler()) + def __repr__(self): """Return the canonical string representation of the User.""" return "User(name={0!r}, site={1!r})".format(self._name, self._site) From 439b8552540f8253820a3ce5ffd4a47026dd79ce Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jul 2012 15:28:58 -0400 Subject: [PATCH 18/19] Fully implement logging; fix non-unicode log messages. --- earwigbot/tasks/afc_copyvios.py | 14 +++++++------- earwigbot/wiki/copyvios/__init__.py | 27 +++++++++++++++++++++------ earwigbot/wiki/copyvios/exclusions.py | 8 ++++---- earwigbot/wiki/copyvios/search.py | 2 ++ 4 files changed, 34 insertions(+), 17 deletions(-) diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py index afcb7f9..12c6b80 100644 --- a/earwigbot/tasks/afc_copyvios.py +++ b/earwigbot/tasks/afc_copyvios.py @@ -70,17 +70,17 @@ class AFCCopyvios(Task): """Detect copyvios in 'page' and add a note if any are found.""" title = page.title if title in self.ignore_list: - msg = "Skipping page in ignore list: [[{0}]]" + msg = u"Skipping page in ignore list: [[{0}]]" self.logger.info(msg.format(title)) return pageid = page.pageid if self.has_been_processed(pageid): - msg = "Skipping check on already processed page [[{0}]]" + msg = u"Skipping check on already processed page [[{0}]]" self.logger.info(msg.format(title)) return - self.logger.info("Checking [[{0}]]".format(title)) + self.logger.info(u"Checking [[{0}]]".format(title)) result = page.copyvio_check(self.min_confidence, self.max_queries) url = result.url confidence = "{0}%".format(round(result.confidence * 100, 2)) @@ -94,11 +94,11 @@ class AFCCopyvios(Task): page.edit(newtext, self.summary.format(url=url)) else: page.edit(newtext, self.summary) - msg = "Found violation: [[{0}]] -> {1} ({2} confidence)" - self.logger.warn(msg.format(title, url, confidence)) + msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)" + self.logger.info(msg.format(title, url, confidence)) else: - msg = "No violations detected (best: {1} at {2} confidence)" - self.logger.debug(msg.format(url, confidence)) + msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)" + self.logger.info(msg.format(title, url, confidence)) self.log_processed(pageid) if self.cache_results: diff --git a/earwigbot/wiki/copyvios/__init__.py 
b/earwigbot/wiki/copyvios/__init__.py index 0f29403..e89a322 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -155,7 +155,10 @@ class CopyvioMixIn(object): while (chunks and best_confidence < min_confidence and (max_queries < 0 or num_queries < max_queries)): - urls = searcher.search(chunks.pop(0)) + chunk = chunks.pop(0) + log = u"[[{0}]] -> querying {1} for {2!r}" + self._logger.debug(log.format(self.title, searcher.name, chunk)) + urls = searcher.search(chunk) urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) @@ -172,12 +175,19 @@ class CopyvioMixIn(object): sleep(interquery_sleep - diff) last_query = time() - if best_confidence >= min_confidence: # violation? - v = True + if best_confidence >= min_confidence: + is_violation = True + log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries)" + self._logger.debug(log.format(self.title, best_confidence, + best_match, num_queries)) else: - v = False - return CopyvioCheckResult(v, best_confidence, best_match, num_queries, - article_chain, best_chains) + is_violation = False + log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries)" + self._logger.debug(log.format(self.title, best_confidence, + num_queries)) + + return CopyvioCheckResult(is_violation, best_confidence, best_match, + num_queries, article_chain, best_chains) def copyvio_compare(self, url, min_confidence=0.5): """Check the page like :py:meth:`copyvio_check` against a specific URL. @@ -208,7 +218,12 @@ class CopyvioMixIn(object): if confidence >= min_confidence: is_violation = True + log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2})" + self._logger.debug(log.format(self.title, confidence, url)) else: is_violation = False + log = u"No violation for [[{0}]] (confidence: {1}; URL: {2})" + self._logger.debug(log.format(self.title, confidence, url)) + return CopyvioCheckResult(is_violation, confidence, url, 0, article_chain, chains) diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 7eb6a80..4640b1f 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -138,11 +138,11 @@ class ExclusionsDB(object): max_staleness = 60 * 60 * 24 * 30 time_since_update = int(time() - self._get_last_update()) if time_since_update > max_staleness: - log = "Updating stale database: {0} (last updated {1} seconds ago)" + log = u"Updating stale database: {0} (last updated {1} seconds ago)" self._logger.info(log.format(sitename, time_since_update)) self._update(sitename) else: - log = "Database for {0} is still fresh (last updated {1} seconds ago)" + log = u"Database for {0} is still fresh (last updated {1} seconds ago)" self._logger.debug(log.format(sitename, time_since_update)) def check(self, sitename, url): @@ -155,10 +155,10 @@ class ExclusionsDB(object): with sqlite.connect(self._dbfile) as conn, self._db_access_lock: for row in conn.execute(query, (sitename,)): if normalized.startswith(row[0]): - log = "Exclusion detected in {0} for {1}" + log = u"Exclusion detected in {0} for {1}" self._logger.debug(log.format(sitename, url)) return True - log = "No exclusions in {0} for {1}".format(sitename, url) + log = u"No exclusions in {0} for {1}".format(sitename, url) self._logger.debug(log) return False diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index cf2edb4..0ccd62e 100644 --- a/earwigbot/wiki/copyvios/search.py +++ 
b/earwigbot/wiki/copyvios/search.py @@ -34,6 +34,7 @@ __all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] class BaseSearchEngine(object): """Base class for a simple search engine interface.""" + name = "Base" def __init__(self, cred): """Store credentials *cred* for searching later on.""" @@ -57,6 +58,7 @@ class BaseSearchEngine(object): class YahooBOSSSearchEngine(BaseSearchEngine): """A search engine interface with Yahoo! BOSS.""" + name = "Yahoo! BOSS" def search(self, query): """Do a Yahoo! BOSS web search for *query*. From becd135c5242b1a093bdcf88026c7c7328d4e7d7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jul 2012 16:09:00 -0400 Subject: [PATCH 19/19] Minor cleanup for afc_copyvios, mainly Unicode fixes. --- earwigbot/tasks/afc_copyvios.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py index 12c6b80..3dc3902 100644 --- a/earwigbot/tasks/afc_copyvios.py +++ b/earwigbot/tasks/afc_copyvios.py @@ -23,6 +23,7 @@ from hashlib import sha256 from os.path import expanduser from threading import Lock +from urllib import quote import oursql @@ -86,9 +87,10 @@ class AFCCopyvios(Task): confidence = "{0}%".format(round(result.confidence * 100, 2)) if result.violation: + safeurl = quote(url.encode("utf8"), safe="/:").decode("utf8") content = page.get() - template = "\{\{{0}|url={1}|confidence={2}\}\}\n" - template = template.format(self.template, url, confidence) + template = u"\{\{{0}|url={1}|confidence={2}\}\}\n" + template = template.format(self.template, safeurl, confidence) newtext = template + content if "{url}" in self.summary: page.edit(newtext, self.summary.format(url=url)) @@ -110,9 +112,7 @@ class AFCCopyvios(Task): with self.conn.cursor() as cursor: cursor.execute(query, (pageid,)) results = cursor.fetchall() - if results: - return True - return False + return True if results else False def log_processed(self, pageid): """Adds pageid to our database of processed pages. @@ -138,8 +138,8 @@ class AFCCopyvios(Task): be) retained for one day; this task does not remove old entries (that is handled by the Toolserver component). - This will only be called if "cache_results" == True in the task's - config, which is False by default. + This will only be called if ``cache_results == True`` in the task's + config, which is ``False`` by default. """ pageid = page.pageid hash = sha256(page.get()).hexdigest()
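A closing note on the safeurl line in the hunk above: under Python 2,
urllib.quote() raises a KeyError when handed a unicode string containing
non-ASCII characters, so the URL must be encoded to UTF-8 bytes before
percent-escaping and then decoded back to unicode afterwards. A minimal
demonstration (the URL itself is a made-up example):

    # -*- coding: utf-8 -*-
    from urllib import quote

    url = u"http://example.com/caf\xe9"  # hypothetical URL with non-ASCII text
    # quote(url, safe="/:") would raise KeyError: u'\xe9' here
    safeurl = quote(url.encode("utf8"), safe="/:").decode("utf8")
    print safeurl  # http://example.com/caf%C3%A9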