OH MY GOD I'M FINALLY DONE.
@@ -0,0 +1,47 @@ | |||
copyvios Package | |||
================ | |||
:mod:`copyvios` Package | |||
----------------------- | |||
.. automodule:: earwigbot.wiki.copyvios | |||
:members: | |||
:undoc-members: | |||
:mod:`exclusions` Module | |||
------------------------ | |||
.. automodule:: earwigbot.wiki.copyvios.exclusions | |||
:members: | |||
:undoc-members: | |||
:mod:`markov` Module | |||
-------------------- | |||
.. automodule:: earwigbot.wiki.copyvios.markov | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
:mod:`parsers` Module | |||
--------------------- | |||
.. automodule:: earwigbot.wiki.copyvios.parsers | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
:mod:`result` Module | |||
-------------------- | |||
.. automodule:: earwigbot.wiki.copyvios.result | |||
:members: | |||
:undoc-members: | |||
:mod:`search` Module | |||
-------------------- | |||
.. automodule:: earwigbot.wiki.copyvios.search | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: |
@@ -22,13 +22,6 @@ wiki Package | |||
:members: | |||
:undoc-members: | |||
:mod:`copyright` Module | |||
.. automodule:: earwigbot.wiki.copyright | |||
:members: | |||
:undoc-members: | |||
:mod:`page` Module | |||
------------------ | |||
@@ -57,3 +50,10 @@ wiki Package | |||
.. automodule:: earwigbot.wiki.user | |||
:members: | |||
:undoc-members: | |||
Subpackages | |||
----------- | |||
.. toctree:: | |||
earwigbot.wiki.copyvios |
@@ -2,6 +2,6 @@ earwigbot | |||
========= | |||
.. toctree:: | |||
:maxdepth: 4 | |||
:maxdepth: 6 | |||
earwigbot |
@@ -47,9 +47,10 @@ wikis, you can usually use code like this:: | |||
site = bot.wiki.add_site(project=project, lang=lang) | |||
This works because EarwigBot assumes that the URL for the site is | |||
``"//{lang}.{project}.org"`` and the API is at ``/w/api.php``; this might | |||
change if you're dealing with non-WMF wikis, where the code might look | |||
something more like:: | |||
``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL | |||
connection info (if any) is stored as ``config.wiki["sql"]``. This might change | |||
if you're dealing with non-WMF wikis, where the code might look something more | |||
like:: | |||
project, lang = "mywiki", "it" | |||
try: | |||
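
The snippet above is cut off by the diff context, so here is a separate, purely hypothetical sketch of a non-WMF setup. The keyword arguments follow the ``add_site`` docstring later in this commit; the URL, script path, and SQL values are made up::

    from earwigbot import exceptions

    project, lang = "mywiki", "it"
    try:
        site = bot.wiki.get_site(project=project, lang=lang)
    except exceptions.SiteNotFoundError:
        # For a non-WMF wiki the base URL and script path can't be guessed,
        # so they are spelled out explicitly (placeholder values):
        site = bot.wiki.add_site(project=project, lang=lang,
                                 base_url="//it.mywiki.example.com",
                                 script_path="/w",
                                 sql={"host": "sql.example.com", "db": "mywiki"})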
@@ -30,6 +30,7 @@ class Link(Command): | |||
name = "link" | |||
def process(self, data): | |||
self.site = self.bot.wiki.get_site() | |||
msg = data.msg | |||
if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg): | |||
@@ -41,8 +42,8 @@ class Link(Command): | |||
if not data.args: | |||
self.reply(data, "what do you want me to link to?") | |||
return | |||
pagename = ' '.join(data.args) | |||
link = self.parse_link(pagename) | |||
pagename = " ".join(data.args) | |||
link = self.site.get_page(pagename).url | |||
self.reply(data, link) | |||
def parse_line(self, line): | |||
@@ -56,8 +57,7 @@ class Link(Command): | |||
if links: | |||
# re.findall() returns a list of tuples, but we only want the 2nd | |||
# item in each tuple: | |||
links = [i[1] for i in links] | |||
results = map(self.parse_link, links) | |||
results = [self.site.get_page(name[1]).url for name in links] | |||
# Find all {{templates}} | |||
templates = re.findall("(\{\{(.*?)(\||\}\}))", line) | |||
@@ -67,10 +67,6 @@ class Link(Command): | |||
return results | |||
def parse_link(self, pagename): | |||
link = quote(pagename.replace(" ", "_"), safe="/:") | |||
return "".join(("http://enwp.org/", link)) | |||
def parse_template(self, pagename): | |||
pagename = "".join(("Template:", pagename)) | |||
return self.parse_link(pagename) | |||
return self.site.get_page(pagename).url |
@@ -23,6 +23,7 @@ | |||
from hashlib import sha256 | |||
from os.path import expanduser | |||
from threading import Lock | |||
from urllib import quote | |||
import oursql | |||
@@ -70,35 +71,36 @@ class AFCCopyvios(Task): | |||
"""Detect copyvios in 'page' and add a note if any are found.""" | |||
title = page.title | |||
if title in self.ignore_list: | |||
msg = "Skipping page in ignore list: [[{0}]]" | |||
msg = u"Skipping page in ignore list: [[{0}]]" | |||
self.logger.info(msg.format(title)) | |||
return | |||
pageid = page.pageid | |||
if self.has_been_processed(pageid): | |||
msg = "Skipping check on already processed page [[{0}]]" | |||
msg = u"Skipping check on already processed page [[{0}]]" | |||
self.logger.info(msg.format(title)) | |||
return | |||
self.logger.info("Checking [[{0}]]".format(title)) | |||
self.logger.info(u"Checking [[{0}]]".format(title)) | |||
result = page.copyvio_check(self.min_confidence, self.max_queries) | |||
url = result.url | |||
confidence = "{0}%".format(round(result.confidence * 100, 2)) | |||
if result.violation: | |||
safeurl = quote(url.encode("utf8"), safe="/:").decode("utf8") | |||
content = page.get() | |||
template = "\{\{{0}|url={1}|confidence={2}\}\}\n" | |||
template = template.format(self.template, url, confidence) | |||
template = u"\{\{{0}|url={1}|confidence={2}\}\}\n" | |||
template = template.format(self.template, safeurl, confidence) | |||
newtext = template + content | |||
if "{url}" in self.summary: | |||
page.edit(newtext, self.summary.format(url=url)) | |||
else: | |||
page.edit(newtext, self.summary) | |||
msg = "Found violation: [[{0}]] -> {1} ({2} confidence)" | |||
self.logger.warn(msg.format(title, url, confidence)) | |||
msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)" | |||
self.logger.info(msg.format(title, url, confidence)) | |||
else: | |||
msg = "No violations detected (best: {1} at {2} confidence)" | |||
self.logger.debug(msg.format(url, confidence)) | |||
msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)" | |||
self.logger.info(msg.format(title, url, confidence)) | |||
self.log_processed(pageid) | |||
if self.cache_results: | |||
@@ -110,9 +112,7 @@ class AFCCopyvios(Task): | |||
with self.conn.cursor() as cursor: | |||
cursor.execute(query, (pageid,)) | |||
results = cursor.fetchall() | |||
if results: | |||
return True | |||
return False | |||
return True if results else False | |||
def log_processed(self, pageid): | |||
"""Adds pageid to our database of processed pages. | |||
@@ -138,8 +138,8 @@ class AFCCopyvios(Task): | |||
be) retained for one day; this task does not remove old entries (that | |||
is handled by the Toolserver component). | |||
This will only be called if "cache_results" == True in the task's | |||
config, which is False by default. | |||
This will only be called if ``cache_results == True`` in the task's | |||
config, which is ``False`` by default. | |||
""" | |||
pageid = page.pageid | |||
hash = sha256(page.get()).hexdigest() | |||
@@ -1,324 +0,0 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from collections import defaultdict | |||
from functools import partial | |||
from gzip import GzipFile | |||
from json import loads | |||
from re import sub, UNICODE | |||
from StringIO import StringIO | |||
from time import sleep, time | |||
from urllib import quote_plus, urlencode | |||
from urllib2 import build_opener, URLError | |||
try: | |||
import oauth2 as oauth | |||
except ImportError: | |||
oauth = None | |||
from earwigbot.exceptions import * | |||
class _CopyvioCheckResult(object): | |||
def __init__(self, violation, confidence, url, queries, article, chains): | |||
self.violation = violation | |||
self.confidence = confidence | |||
self.url = url | |||
self.queries = queries | |||
self.article_chain = article | |||
self.source_chain = chains[0] | |||
self.delta_chain = chains[1] | |||
def __repr__(self): | |||
r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" | |||
return r.format(self.violation, self.confidence, self.url, self.queries) | |||
class _MarkovChain(object): | |||
START = -1 | |||
END = -2 | |||
def __init__(self, text): | |||
self.text = text | |||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() | |||
prev = self.START | |||
for word in words: | |||
self.chain[prev][word] += 1 | |||
prev = word | |||
try: # This won't work if the source text is completely blank | |||
self.chain[word][self.END] += 1 | |||
except KeyError: | |||
pass | |||
def size(self): | |||
count = 0 | |||
for node in self.chain.itervalues(): | |||
for hits in node.itervalues(): | |||
count += hits | |||
return count | |||
class _MarkovChainIntersection(_MarkovChain): | |||
def __init__(self, mc1, mc2): | |||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||
c1 = mc1.chain | |||
c2 = mc2.chain | |||
for word, nodes1 in c1.iteritems(): | |||
if word in c2: | |||
nodes2 = c2[word] | |||
for node, count1 in nodes1.iteritems(): | |||
if node in nodes2: | |||
count2 = nodes2[node] | |||
self.chain[word][node] = min(count1, count2) | |||
class CopyrightMixIn(object): | |||
""" | |||
EarwigBot's Wiki Toolset: Copyright Violation Mixin | |||
This is a mixin that provides two public methods, copyvio_check() and | |||
copyvio_compare(). The former checks the page for copyright violations | |||
using a search engine API, and the latter compares the page against a | |||
specified URL. Credentials for the search engine API are stored in the | |||
site's config. | |||
""" | |||
def __init__(self, site): | |||
self._opener = build_opener() | |||
self._opener.addheaders = site._opener.addheaders | |||
def _open_url_ignoring_errors(self, url): | |||
"""Open a URL using self._opener and return its content, or None. | |||
Will decompress the content if the headers contain "gzip" as its | |||
content encoding, and will return None if URLError is raised while | |||
opening the URL. IOErrors while gunzipping a compressed response are | |||
ignored, and the original content is returned. | |||
""" | |||
try: | |||
response = self._opener.open(url) | |||
except URLError: | |||
return None | |||
result = response.read() | |||
if response.headers.get("Content-Encoding") == "gzip": | |||
stream = StringIO(result) | |||
gzipper = GzipFile(fileobj=stream) | |||
try: | |||
result = gzipper.read() | |||
except IOError: | |||
pass | |||
return result | |||
def _select_search_engine(self): | |||
"""Return a function that can be called to do web searches. | |||
The "function" is a functools.partial object that takes one argument, a | |||
query, and returns a list of URLs, ranked by importance. The underlying | |||
logic depends on the 'engine' argument; for example, if 'engine' is | |||
"Yahoo! BOSS", we'll use self._yahoo_boss_query for querying. | |||
Raises UnknownSearchEngineError if the 'engine' listed in our config is | |||
unknown to us, and UnsupportedSearchEngineError if we are missing a | |||
required package or module, like oauth2 for "Yahoo! BOSS". | |||
""" | |||
engine, credentials = self._site._search_config | |||
if engine == "Yahoo! BOSS": | |||
if not oauth: | |||
e = "The package 'oauth2' could not be imported" | |||
raise UnsupportedSearchEngineError(e) | |||
searcher = self._yahoo_boss_query | |||
else: | |||
raise UnknownSearchEngineError(engine) | |||
return partial(searcher, credentials) | |||
def _yahoo_boss_query(self, cred, query): | |||
"""Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials. | |||
Returns a list of URLs, no more than fifty, ranked by relevance (as | |||
determined by Yahoo). Raises SearchQueryError() on errors. | |||
""" | |||
base_url = "http://yboss.yahooapis.com/ysearch/web" | |||
query = quote_plus(query.join('"', '"')) | |||
params = {"q": query, "style": "raw", "format": "json"} | |||
url = "{0}?{1}".format(base_url, urlencode(params)) | |||
consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"]) | |||
client = oauth.Client(consumer) | |||
headers, body = client.request(url, "GET") | |||
if headers["status"] != "200": | |||
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" | |||
raise SearchQueryError(e.format(headers["status"], body)) | |||
try: | |||
res = loads(body) | |||
except ValueError: | |||
e = "Yahoo! BOSS Error: JSON could not be decoded" | |||
raise SearchQueryError(e) | |||
try: | |||
results = res["bossresponse"]["web"]["results"] | |||
except KeyError: | |||
return [] | |||
return [result["url"] for result in results] | |||
def _copyvio_strip_html(self, html): | |||
""" | |||
STUB | |||
""" | |||
return html | |||
def _copyvio_strip_article(self, content): | |||
"""Clean the page's raw text by removing templates and formatting. | |||
Returns the page's text with all HTML and wikicode formatting removed, | |||
including templates, tables, references, and the Bibliography/ | |||
References/Sources/See also section(s). It retains punctuation | |||
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses, | |||
quotes) and original capitalization, but not brackets (square and | |||
angular), abnormal spacing, nor anything else. HTML entities are | |||
replaced by their unicode equivalents. | |||
STUB | |||
""" | |||
return content | |||
def _copyvio_chunk_article(self, content, max_chunks): | |||
""" | |||
STUB | |||
""" | |||
return [content] | |||
def _copyvio_compare_content(self, article, url): | |||
""" | |||
DOCSTRING NEEDED | |||
""" | |||
html = self._open_url_ignoring_errors(url) | |||
if not html: | |||
return 0 | |||
source = _MarkovChain(self._copyvio_strip_html(html)) | |||
delta = _MarkovChainIntersection(article, source) | |||
return float(delta.size()) / article.size(), (source, delta) | |||
def copyvio_check(self, min_confidence=0.5, max_queries=-1, | |||
interquery_sleep=1, force=False): | |||
"""Check the page for copyright violations. | |||
Returns a _CopyvioCheckResult object with four useful attributes: | |||
"violation", "confidence", "url", and "queries". "confidence" is a | |||
number between 0 and 1; if it is less than "min_confidence", we could | |||
not find any indication of a violation (so "violation" will be False | |||
and "url" may or may not be None), otherwise it indicates the relative | |||
faith in our results, "violation" will be True, and "url" will be the | |||
place the article is suspected of being copied from. "queries" is the | |||
number of queries used to determine the results. | |||
"max_queries" is self-explanatory; we will never make more than this | |||
number of queries in a given check. If it's less than 0, we will not | |||
limit our number of queries. | |||
"interquery_sleep" is the minimum amount of time we will sleep between | |||
search engine queries, in seconds. | |||
"force" is simply passed to page.get() - it has the same behavior there | |||
as it does here. | |||
Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, | |||
SearchQueryError, ...) on errors. | |||
""" | |||
search = self._select_search_engine() | |||
handled_urls = [] | |||
best_confidence = 0 | |||
best_match = None | |||
num_queries = 0 | |||
empty = _MarkovChain("") | |||
best_chains = (empty, _MarkovChainIntersection(empty, empty)) | |||
content = self.get(force) | |||
clean = self._copyvio_strip_article(content) | |||
chunks = self._copyvio_chunk_article(clean, max_queries) | |||
article_chain = _MarkovChain(clean) | |||
last_query = time() | |||
if article_chain.size() < 20: # Auto-fail very small articles | |||
return _CopyvioCheckResult(False, best_confidence, best_match, | |||
num_queries, article_chain, best_chains) | |||
while (chunks and best_confidence < min_confidence and | |||
(max_queries < 0 or num_queries < max_queries)): | |||
urls = search(chunks.pop(0)) | |||
urls = [url for url in urls if url not in handled_urls] | |||
for url in urls: | |||
handled_urls.append(url) | |||
conf, chains = self._copyvio_compare_content(article_chain, url) | |||
if conf > best_confidence: | |||
best_confidence = conf | |||
best_match = url | |||
best_chains = chains | |||
num_queries += 1 | |||
diff = time() - last_query | |||
if diff < interquery_sleep: | |||
sleep(interquery_sleep - diff) | |||
last_query = time() | |||
if best_confidence >= min_confidence: # violation? | |||
v = True | |||
else: | |||
v = False | |||
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries, | |||
article_chain, best_chains) | |||
def copyvio_compare(self, url, min_confidence=0.5, force=False): | |||
"""Check the page like copyvio_check(), but against a specific URL. | |||
This is essentially a reduced version of the above - a copyivo | |||
comparison is made using Markov chains and the result is returned in a | |||
_CopyvioCheckResult object - without using a search engine, as the | |||
suspected "violated" URL is supplied from the start. | |||
Its primary use is to generate a result when the URL is retrieved from | |||
a cache, like the one used in EarwigBot's Toolserver site. After a | |||
search is done, the resulting URL is stored in a cache for 24 hours so | |||
future checks against that page will not require another set of | |||
time-and-money-consuming search engine queries. However, the comparison | |||
itself (which includes the article's and the source's content) cannot | |||
be stored for data retention reasons, so a fresh comparison is made | |||
using this function. | |||
Since no searching is done, neither UnknownSearchEngineError nor | |||
SearchQueryError will be raised. | |||
""" | |||
content = self.get(force) | |||
clean = self._copyvio_strip_article(content) | |||
article_chain = _MarkovChain(clean) | |||
confidence, chains = self._copyvio_compare_content(article_chain, url) | |||
if confidence >= min_confidence: | |||
is_violation = True | |||
else: | |||
is_violation = False | |||
return _CopyvioCheckResult(is_violation, confidence, url, 0, | |||
article_chain, chains) |
@@ -0,0 +1,229 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from gzip import GzipFile | |||
from StringIO import StringIO | |||
from time import sleep, time | |||
from urllib2 import build_opener, URLError | |||
try: | |||
import oauth2 as oauth | |||
except ImportError: | |||
oauth = None | |||
from earwigbot import exceptions | |||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult | |||
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine | |||
__all__ = ["CopyvioMixIn"] | |||
class CopyvioMixIn(object): | |||
""" | |||
**EarwigBot: Wiki Toolset: Copyright Violation MixIn** | |||
This is a mixin that provides two public methods, :py:meth:`copyvio_check` | |||
and :py:meth:`copyvio_compare`. The former checks the page for copyright | |||
violations using a search engine API, and the latter compares the page | |||
against a given URL. Credentials for the search engine API are stored in | |||
the :py:class:`~earwigbot.wiki.site.Site`'s config. | |||
""" | |||
def __init__(self, site): | |||
self._search_config = site._search_config | |||
self._exclusions_db = self._search_config["exclusions_db"] | |||
self._opener = build_opener() | |||
self._opener.addheaders = site._opener.addheaders | |||
def _open_url_ignoring_errors(self, url): | |||
"""Open a URL using self._opener and return its content, or None. | |||
Will decompress the content if the headers contain "gzip" as its | |||
content encoding, and will return None if URLError is raised while | |||
opening the URL. IOErrors while gunzipping a compressed response are | |||
ignored, and the original content is returned. | |||
""" | |||
try: | |||
response = self._opener.open(url) | |||
except URLError: | |||
return None | |||
result = response.read() | |||
if response.headers.get("Content-Encoding") == "gzip": | |||
stream = StringIO(result) | |||
gzipper = GzipFile(fileobj=stream) | |||
try: | |||
result = gzipper.read() | |||
except IOError: | |||
pass | |||
return result | |||
def _select_search_engine(self): | |||
"""Return a function that can be called to do web searches. | |||
The function takes one argument, a search query, and returns a list of | |||
URLs, ranked by importance. The underlying logic depends on the | |||
*engine* argument within our config; for example, if *engine* is | |||
"Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying. | |||
Raises UnknownSearchEngineError if the 'engine' listed in our config is | |||
unknown to us, and UnsupportedSearchEngineError if we are missing a | |||
required package or module, like oauth2 for "Yahoo! BOSS". | |||
""" | |||
engine = self._search_config["engine"] | |||
credentials = self._search_config["credentials"] | |||
if engine == "Yahoo! BOSS": | |||
if not oauth: | |||
e = "The package 'oauth2' could not be imported" | |||
raise exceptions.UnsupportedSearchEngineError(e) | |||
return YahooBOSSSearchEngine(credentials) | |||
raise exceptions.UnknownSearchEngineError(engine) | |||
def _copyvio_compare_content(self, article, url): | |||
"""Return a number comparing an article and a URL. | |||
The *article* is a Markov chain, whereas the *url* is just a string | |||
that we'll try to open and read ourselves. | |||
""" | |||
html = self._open_url_ignoring_errors(url) | |||
if not html: | |||
return 0 | |||
source = MarkovChain(HTMLTextParser(html).strip()) | |||
delta = MarkovChainIntersection(article, source) | |||
return float(delta.size()) / article.size(), (source, delta) | |||
def copyvio_check(self, min_confidence=0.5, max_queries=-1, | |||
interquery_sleep=1): | |||
"""Check the page for copyright violations. | |||
Returns a | |||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object | |||
with information on the results of the check. | |||
*max_queries* is self-explanatory; we will never make more than this | |||
number of queries in a given check. If it's lower than 0, we will not | |||
limit the number of queries. | |||
*interquery_sleep* is the minimum amount of time we will sleep between | |||
search engine queries, in seconds. | |||
Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses | |||
(:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`, | |||
:py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. | |||
""" | |||
searcher = self._select_search_engine() | |||
self._exclusions_db.sync(self.site.name) | |||
handled_urls = [] | |||
best_confidence = 0 | |||
best_match = None | |||
num_queries = 0 | |||
empty = MarkovChain("") | |||
best_chains = (empty, MarkovChainIntersection(empty, empty)) | |||
parser = ArticleTextParser(self.get()) | |||
clean = parser.strip() | |||
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) | |||
article_chain = MarkovChain(clean) | |||
last_query = time() | |||
if article_chain.size() < 20: # Auto-fail very small articles | |||
return CopyvioCheckResult(False, best_confidence, best_match, | |||
num_queries, article_chain, best_chains) | |||
while (chunks and best_confidence < min_confidence and | |||
(max_queries < 0 or num_queries < max_queries)): | |||
chunk = chunks.pop(0) | |||
log = u"[[{0}]] -> querying {1} for {2!r}" | |||
self._logger.debug(log.format(self.title, searcher.name, chunk)) | |||
urls = searcher.search(chunk) | |||
urls = [url for url in urls if url not in handled_urls] | |||
for url in urls: | |||
handled_urls.append(url) | |||
if self._exclusions_db.check(self.site.name, url): | |||
continue | |||
conf, chains = self._copyvio_compare_content(article_chain, url) | |||
if conf > best_confidence: | |||
best_confidence = conf | |||
best_match = url | |||
best_chains = chains | |||
num_queries += 1 | |||
diff = time() - last_query | |||
if diff < interquery_sleep: | |||
sleep(interquery_sleep - diff) | |||
last_query = time() | |||
if best_confidence >= min_confidence: | |||
is_violation = True | |||
log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries)" | |||
self._logger.debug(log.format(self.title, best_confidence, | |||
best_match, num_queries)) | |||
else: | |||
is_violation = False | |||
log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries)" | |||
self._logger.debug(log.format(self.title, best_confidence, | |||
num_queries)) | |||
return CopyvioCheckResult(is_violation, best_confidence, best_match, | |||
num_queries, article_chain, best_chains) | |||
def copyvio_compare(self, url, min_confidence=0.5): | |||
"""Check the page like :py:meth:`copyvio_check` against a specific URL. | |||
This is essentially a reduced version of the above - a copyvio | |||
comparison is made using Markov chains and the result is returned in a | |||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object - | |||
but without using a search engine, since the suspected "violated" URL | |||
is supplied from the start. | |||
Its primary use is to generate a result when the URL is retrieved from | |||
a cache, like the one used in EarwigBot's Toolserver site. After a | |||
search is done, the resulting URL is stored in a cache for 24 hours so | |||
future checks against that page will not require another set of | |||
time-and-money-consuming search engine queries. However, the comparison | |||
itself (which includes the article's and the source's content) cannot | |||
be stored for data retention reasons, so a fresh comparison is made | |||
using this function. | |||
Since no searching is done, neither | |||
:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor | |||
:py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised. | |||
""" | |||
content = self.get() | |||
clean = ArticleTextParser(content).strip() | |||
article_chain = MarkovChain(clean) | |||
confidence, chains = self._copyvio_compare_content(article_chain, url) | |||
if confidence >= min_confidence: | |||
is_violation = True | |||
log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2})" | |||
self._logger.debug(log.format(self.title, confidence, url)) | |||
else: | |||
is_violation = False | |||
log = u"No violation for [[{0}]] (confidence: {1}; URL: {2})" | |||
self._logger.debug(log.format(self.title, confidence, url)) | |||
return CopyvioCheckResult(is_violation, confidence, url, 0, | |||
article_chain, chains) |
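
As a rough usage sketch of the two public methods described in the :py:class:`CopyvioMixIn` docstrings above (the page title, URL, and thresholds are placeholders)::

    site = bot.wiki.get_site()
    page = site.get_page("Some draft article")

    result = page.copyvio_check(min_confidence=0.75, max_queries=15)
    if result.violation:
        print "Best match:", result.url
        print "Confidence:", result.confidence, "after", result.queries, "queries"

    # When the suspect URL is already known (e.g. pulled from a cache), skip
    # the search engine and compare against it directly:
    result = page.copyvio_compare("http://example.com/mirror", min_confidence=0.75)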
@@ -0,0 +1,164 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
import re | |||
import sqlite3 as sqlite | |||
from threading import Lock | |||
from time import time | |||
from earwigbot import exceptions | |||
__all__ = ["ExclusionsDB"] | |||
default_sources = { | |||
"enwiki": [ | |||
"Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def", | |||
"Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl", | |||
"Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr", | |||
"Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz" | |||
] | |||
} | |||
class ExclusionsDB(object): | |||
""" | |||
**EarwigBot: Wiki Toolset: Exclusions Database Manager** | |||
Controls the :file:`.exclusions.db` file, which stores URLs excluded from | |||
copyright violation checks on account of being known mirrors, for example. | |||
""" | |||
def __init__(self, sitesdb, dbfile, logger): | |||
self._sitesdb = sitesdb | |||
self._dbfile = dbfile | |||
self._logger = logger | |||
self._db_access_lock = Lock() | |||
def __repr__(self): | |||
"""Return the canonical string representation of the ExclusionsDB.""" | |||
res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})" | |||
return res.format(self._sitesdb, self._dbfile, self._logger) | |||
def __str__(self): | |||
"""Return a nice string representation of the ExclusionsDB.""" | |||
return "<ExclusionsDB at {0}>".format(self._dbfile) | |||
def _create(self): | |||
"""Initialize the exclusions database with its necessary tables.""" | |||
script = """ | |||
CREATE TABLE sources (source_sitename, source_page); | |||
CREATE TABLE updates (update_sitename, update_time); | |||
CREATE TABLE exclusions (exclusion_sitename, exclusion_url); | |||
""" | |||
query = "INSERT INTO sources VALUES (?, ?);" | |||
sources = [] | |||
for sitename, pages in default_sources.iteritems(): | |||
[sources.append((sitename, page)) for page in pages] | |||
with sqlite.connect(self._dbfile) as conn: | |||
conn.executescript(script) | |||
conn.executemany(query, sources) | |||
def _load_source(self, site, source): | |||
"""Load from a specific source and return a set of URLs.""" | |||
urls = set() | |||
try: | |||
data = site.get_page(source).get() | |||
except exceptions.PageNotFoundError: | |||
return urls | |||
regexes = [ | |||
"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>", | |||
"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?" | |||
] | |||
for regex in regexes: | |||
[urls.add(url.lower()) for (url,) in re.findall(regex, data, re.I)] | |||
return urls | |||
def _update(self, sitename): | |||
"""Update the database from listed sources in the index.""" | |||
query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;" | |||
query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | |||
query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?" | |||
query4 = "INSERT INTO exclusions VALUES (?, ?);" | |||
query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;" | |||
query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;" | |||
query7 = "INSERT INTO updates VALUES (?, ?);" | |||
site = self._sitesdb.get_site(sitename) | |||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||
urls = set() | |||
for (source,) in conn.execute(query1, (sitename,)): | |||
urls |= self._load_source(site, source) | |||
for (url,) in conn.execute(query2, (sitename,)): | |||
if url in urls: | |||
urls.remove(url) | |||
else: | |||
conn.execute(query3, (sitename, url)) | |||
conn.executemany(query4, [(sitename, url) for url in urls]) | |||
if conn.execute(query5, (sitename,)).fetchone(): | |||
conn.execute(query6, (time(), sitename)) | |||
else: | |||
conn.execute(query7, (sitename, time())) | |||
def _get_last_update(self, sitename): | |||
"""Return the UNIX timestamp of the last time the db was updated.""" | |||
query = "SELECT update_time FROM updates WHERE update_sitename = ?;" | |||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||
try: | |||
result = conn.execute(query, (sitename,)).fetchone() | |||
except sqlite.OperationalError: | |||
self._create() | |||
return 0 | |||
return result[0] if result else 0 | |||
def sync(self, sitename): | |||
"""Update the database if it hasn't been updated in the past month. | |||
This only updates the exclusions database for the *sitename* site. | |||
""" | |||
max_staleness = 60 * 60 * 24 * 30 | |||
time_since_update = int(time() - self._get_last_update(sitename)) | |||
if time_since_update > max_staleness: | |||
log = u"Updating stale database: {0} (last updated {1} seconds ago)" | |||
self._logger.info(log.format(sitename, time_since_update)) | |||
self._update(sitename) | |||
else: | |||
log = u"Database for {0} is still fresh (last updated {1} seconds ago)" | |||
self._logger.debug(log.format(sitename, time_since_update)) | |||
def check(self, sitename, url): | |||
"""Check whether a given URL is in the exclusions database. | |||
Return ``True`` if the URL is in the database, or ``False`` otherwise. | |||
""" | |||
normalized = re.sub("https?://", "", url.lower()) | |||
query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | |||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||
for row in conn.execute(query, (sitename,)): | |||
if normalized.startswith(row[0]): | |||
log = u"Exclusion detected in {0} for {1}" | |||
self._logger.debug(log.format(sitename, url)) | |||
return True | |||
log = u"No exclusions in {0} for {1}".format(sitename, url) | |||
self._logger.debug(log) | |||
return False |
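
A minimal sketch of how the rest of this commit uses the exclusions database; reaching it through the site's search config mirrors ``CopyvioMixIn.__init__`` (an internal detail), and the URL is a placeholder::

    exclusions_db = site._search_config["exclusions_db"]
    exclusions_db.sync(site.name)   # re-fetch the mirror lists if > 30 days old
    if exclusions_db.check(site.name, "http://example.com/mirror-of-foo"):
        # Known mirror or fork; don't treat it as a copyvio source.
        pass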
@@ -0,0 +1,87 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from collections import defaultdict | |||
from re import sub, UNICODE | |||
__all__ = ["MarkovChain", "MarkovChainIntersection"] | |||
class MarkovChain(object): | |||
"""Implements a basic ngram Markov chain of words.""" | |||
START = -1 | |||
END = -2 | |||
degree = 3 # 2 for bigrams, 3 for trigrams, etc. | |||
def __init__(self, text): | |||
self.text = text | |||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() | |||
padding = self.degree - 1 | |||
words = ([self.START] * padding) + words + ([self.END] * padding) | |||
for i in range(len(words) - self.degree + 1): | |||
last = i + self.degree - 1 | |||
self.chain[tuple(words[i:last])][words[last]] += 1 | |||
def __repr__(self): | |||
"""Return the canonical string representation of the MarkovChain.""" | |||
return "MarkovChain(text={0!r})".format(self.text) | |||
def __str__(self): | |||
"""Return a nice string representation of the MarkovChain.""" | |||
return "<MarkovChain of size {0}>".format(self.size()) | |||
def size(self): | |||
"""Return the size of the Markov chain: the total number of nodes.""" | |||
count = 0 | |||
for node in self.chain.itervalues(): | |||
for hits in node.itervalues(): | |||
count += hits | |||
return count | |||
class MarkovChainIntersection(MarkovChain): | |||
"""Implements the intersection of two chains (i.e., their shared nodes).""" | |||
def __init__(self, mc1, mc2): | |||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||
self.mc1, self.mc2 = mc1, mc2 | |||
c1 = mc1.chain | |||
c2 = mc2.chain | |||
for word, nodes1 in c1.iteritems(): | |||
if word in c2: | |||
nodes2 = c2[word] | |||
for node, count1 in nodes1.iteritems(): | |||
if node in nodes2: | |||
count2 = nodes2[node] | |||
self.chain[word][node] = min(count1, count2) | |||
def __repr__(self): | |||
"""Return the canonical string representation of the intersection.""" | |||
res = "MarkovChainIntersection(mc1={0!r}, mc2={1!r})" | |||
return res.format(self.mc1, self.mc2) | |||
def __str__(self): | |||
"""Return a nice string representation of the intersection.""" | |||
res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>" | |||
return res.format(self.size(), self.mc1, self.mc2) |
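
To make the confidence number in ``_copyvio_compare_content()`` concrete, here is a small sketch with made-up text: the score is the size of the intersection chain divided by the size of the article chain::

    from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection

    article = MarkovChain(u"The quick brown fox jumps over the lazy dog.")
    source = MarkovChain(u"The quick brown fox jumped over a lazy dog.")
    delta = MarkovChainIntersection(article, source)

    # Fraction of the article's n-grams that also appear in the source:
    confidence = float(delta.size()) / article.size()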
@@ -0,0 +1,148 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from os import path | |||
try: | |||
from bs4 import BeautifulSoup | |||
from bs4.element import Comment | |||
except ImportError: | |||
BeautifulSoup = Comment = None | |||
try: | |||
import mwparserfromhell | |||
except ImportError: | |||
mwparserfromhell = None | |||
try: | |||
import nltk | |||
except ImportError: | |||
nltk = None | |||
__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] | |||
class BaseTextParser(object): | |||
"""Base class for a parser that handles text.""" | |||
def __init__(self, text): | |||
self.text = text | |||
def __repr__(self): | |||
"""Return the canonical string representation of the text parser.""" | |||
return "{0}(text={1!r})".format(self.__class__.__name__, self.text) | |||
def __str__(self): | |||
"""Return a nice string representation of the text parser.""" | |||
name = self.__class__.__name__ | |||
return "<{0} of text with size {1}>".format(name, len(text)) | |||
class ArticleTextParser(BaseTextParser): | |||
"""A parser that can strip and chunk wikicode article text.""" | |||
def strip(self): | |||
"""Clean the page's raw text by removing templates and formatting. | |||
Return the page's text with all HTML and wikicode formatting removed, | |||
including templates, tables, and references. It retains punctuation | |||
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses, | |||
quotes), original capitalization, and so forth. HTML entities are | |||
replaced by their unicode equivalents. | |||
The actual stripping is handled by :py:mod:`mwparserfromhell`. | |||
""" | |||
wikicode = mwparserfromhell.parse(self.text) | |||
self.clean = wikicode.strip_code(normalize=True) | |||
return self.clean | |||
def chunk(self, nltk_dir, max_chunks, max_query=256): | |||
"""Convert the clean article text into a list of web-searchable chunks. | |||
No more than *max_chunks* chunks will be returned. Each chunk is at most | |||
a sentence or two long (no more than *max_query* characters). The idea is | |||
to return a sample of the article text rather than the whole thing: we | |||
pick and choose from different parts of it, especially if the article is | |||
large and *max_chunks* is low, so we don't end up searching for just the | |||
first paragraph. | |||
This is implemented using :py:mod:`nltk` (http://nltk.org/). A base | |||
directory (*nltk_dir*) is required to store nltk's punctuation | |||
database. This is typically located in the bot's working directory. | |||
""" | |||
datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") | |||
try: | |||
tokenizer = nltk.data.load("file:" + datafile) | |||
except LookupError: | |||
nltk.download("punkt", nltk_dir) | |||
tokenizer = nltk.data.load("file:" + datafile) | |||
sentences = [] | |||
for sentence in tokenizer.tokenize(self.clean): | |||
if len(sentence) > max_query: | |||
words = sentence.split() | |||
while len(" ".join(words)) > max_query: | |||
words.pop() | |||
sentence = " ".join(words) | |||
sentences.append(sentence) | |||
if max_chunks >= len(sentences): | |||
return sentences | |||
chunks = [] | |||
while len(chunks) < max_chunks: | |||
if len(chunks) % 5 == 0: | |||
chunk = sentences.pop(0) # Pop from beginning | |||
elif len(chunks) % 5 == 1: | |||
chunk = sentences.pop() # Pop from end | |||
elif len(chunks) % 5 == 2: | |||
chunk = sentences.pop(len(sentences) / 2) # Pop from Q2 | |||
elif len(chunks) % 5 == 3: | |||
chunk = sentences.pop(len(sentences) / 4) # Pop from Q1 | |||
else: | |||
chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3 | |||
chunks.append(chunk) | |||
return chunks | |||
class HTMLTextParser(BaseTextParser): | |||
"""A parser that can extract the text from an HTML document.""" | |||
hidden_tags = [ | |||
"script", "style" | |||
] | |||
def strip(self): | |||
"""Return the actual text contained within an HTML document. | |||
Implemented using :py:mod:`BeautifulSoup <bs4>` | |||
(http://www.crummy.com/software/BeautifulSoup/). | |||
""" | |||
try: | |||
soup = BeautifulSoup(self.text, "lxml").body | |||
except ValueError: | |||
soup = BeautifulSoup(self.text).body | |||
is_comment = lambda text: isinstance(text, Comment) | |||
[comment.extract() for comment in soup.find_all(text=is_comment)] | |||
for tag in self.hidden_tags: | |||
[element.extract() for element in soup.find_all(tag)] | |||
return "\n".join(soup.stripped_strings) |
@@ -0,0 +1,60 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
__all__ = ["CopyvioCheckResult"] | |||
class CopyvioCheckResult(object): | |||
""" | |||
**EarwigBot: Wiki Toolset: Copyvio Check Result** | |||
A class holding information about the results of a copyvio check. | |||
*Attributes:* | |||
- :py:attr:`violation`: ``True`` if this is a violation, else ``False`` | |||
- :py:attr:`confidence`: a float between 0 and 1 indicating accuracy | |||
- :py:attr:`url`: the URL of the violated page | |||
- :py:attr:`queries`: the number of queries used to reach a result | |||
- :py:attr:`article_chain`: the MarkovChain of the article text | |||
- :py:attr:`source_chain`: the MarkovChain of the violated page text | |||
- :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two | |||
""" | |||
def __init__(self, violation, confidence, url, queries, article, chains): | |||
self.violation = violation | |||
self.confidence = confidence | |||
self.url = url | |||
self.queries = queries | |||
self.article_chain = article | |||
self.source_chain = chains[0] | |||
self.delta_chain = chains[1] | |||
def __repr__(self): | |||
"""Return the canonical string representation of the result.""" | |||
res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" | |||
return res.format(self.violation, self.confidence, self.url, | |||
self.queries) | |||
def __str__(self): | |||
"""Return a nice string representation of the result.""" | |||
res = "<CopyvioCheckResult ({0} with {1} conf)>" | |||
return res.format(self.violation, self.confidence) |
@@ -0,0 +1,94 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from json import loads | |||
from urllib import quote_plus, urlencode | |||
try: | |||
import oauth2 as oauth | |||
except ImportError: | |||
oauth = None | |||
from earwigbot.exceptions import SearchQueryError | |||
__all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] | |||
class BaseSearchEngine(object): | |||
"""Base class for a simple search engine interface.""" | |||
name = "Base" | |||
def __init__(self, cred): | |||
"""Store credentials *cred* for searching later on.""" | |||
self.cred = cred | |||
def __repr__(self): | |||
"""Return the canonical string representation of the search engine.""" | |||
return "{0}()".format(self.__class__.__name__) | |||
def __str__(self): | |||
"""Return a nice string representation of the search engine.""" | |||
return "<{0}>".format(self.__class__.__name__) | |||
def search(self, query): | |||
"""Use this engine to search for *query*. | |||
Not implemented in this base class; overridden in subclasses. | |||
""" | |||
raise NotImplementedError() | |||
class YahooBOSSSearchEngine(BaseSearchEngine): | |||
"""A search engine interface with Yahoo! BOSS.""" | |||
name = "Yahoo! BOSS" | |||
def search(self, query): | |||
"""Do a Yahoo! BOSS web search for *query*. | |||
Returns a list of URLs, no more than fifty, ranked by relevance (as | |||
determined by Yahoo). Raises | |||
:py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | |||
""" | |||
base_url = "http://yboss.yahooapis.com/ysearch/web" | |||
query = quote_plus('"' + query + '"') | |||
params = {"q": query, "type": "html,text", "format": "json"} | |||
url = "{0}?{1}".format(base_url, urlencode(params)) | |||
consumer = oauth.Consumer(key=self.cred["key"], | |||
secret=self.cred["secret"]) | |||
client = oauth.Client(consumer) | |||
headers, body = client.request(url, "GET") | |||
if headers["status"] != "200": | |||
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" | |||
raise SearchQueryError(e.format(headers["status"], body)) | |||
try: | |||
res = loads(body) | |||
except ValueError: | |||
e = "Yahoo! BOSS Error: JSON could not be decoded" | |||
raise SearchQueryError(e) | |||
try: | |||
results = res["bossresponse"]["web"]["results"] | |||
except KeyError: | |||
return [] | |||
return [result["url"] for result in results] |
@@ -21,6 +21,7 @@ | |||
# SOFTWARE. | |||
from hashlib import md5 | |||
from logging import getLogger, NullHandler | |||
import re | |||
from time import gmtime, strftime | |||
from urllib import quote | |||
@@ -31,11 +32,11 @@ except ImportError: | |||
mwparserfromhell = None | |||
from earwigbot import exceptions | |||
from earwigbot.wiki.copyright import CopyrightMixIn | |||
from earwigbot.wiki.copyvios import CopyvioMixIn | |||
__all__ = ["Page"] | |||
class Page(CopyrightMixIn): | |||
class Page(CopyvioMixIn): | |||
""" | |||
**EarwigBot: Wiki Toolset: Page** | |||
@@ -81,7 +82,8 @@ class Page(CopyrightMixIn): | |||
PAGE_MISSING = 2 | |||
PAGE_EXISTS = 3 | |||
def __init__(self, site, title, follow_redirects=False, pageid=None): | |||
def __init__(self, site, title, follow_redirects=False, pageid=None, | |||
logger=None): | |||
"""Constructor for new Page instances. | |||
Takes four arguments: a Site object, the Page's title (or pagename), | |||
@@ -100,6 +102,14 @@ class Page(CopyrightMixIn): | |||
self._follow_redirects = self._keep_following = follow_redirects | |||
self._pageid = pageid | |||
# Set up our internal logger: | |||
if logger: | |||
self._logger = logger | |||
else: # Just set up a null logger to eat up our messages: | |||
self._logger = getLogger("earwigbot.wiki") | |||
self._logger.addHandler(NullHandler()) | |||
# Attributes to be loaded through the API: | |||
self._exists = self.PAGE_UNKNOWN | |||
self._is_redirect = None | |||
self._lastrevid = None | |||
@@ -92,7 +92,7 @@ class Site(object): | |||
namespaces=None, login=(None, None), cookiejar=None, | |||
user_agent=None, use_https=False, assert_edit=None, | |||
maxlag=None, wait_between_queries=3, logger=None, | |||
search_config=(None, None)): | |||
search_config=None): | |||
"""Constructor for new Site instances. | |||
This probably isn't necessary to call yourself unless you're building a | |||
@@ -560,10 +560,10 @@ class Site(object): | |||
return [self.SERVICE_API] | |||
sqllag = self._sql_info_cache["replag"] | |||
if sqllag > 180: | |||
if sqllag > 300: | |||
if not self._maxlag: | |||
return [self.SERVICE_API, self.SERVICE_SQL] | |||
if now - self._api_info_cache["lastcheck"] > 120: | |||
if now - self._api_info_cache["lastcheck"] > 300: | |||
self._api_info_cache["lastcheck"] = now | |||
try: | |||
self._api_info_cache["maxlag"] = apilag = self.get_maxlag() | |||
@@ -571,7 +571,7 @@ class Site(object): | |||
self._api_info_cache["maxlag"] = apilag = 0 | |||
else: | |||
apilag = self._api_info_cache["maxlag"] | |||
if sqllag / (180.0 / self._maxlag) < apilag: | |||
if apilag > self._maxlag: | |||
return [self.SERVICE_SQL, self.SERVICE_API] | |||
return [self.SERVICE_API, self.SERVICE_SQL] | |||
@@ -789,8 +789,9 @@ class Site(object): | |||
prefix = title.split(":", 1)[0] | |||
if prefix != title: # Avoid a page that is simply "Category" | |||
if prefix in prefixes: | |||
return Category(self, title, follow_redirects, pageid) | |||
return Page(self, title, follow_redirects, pageid) | |||
return Category(self, title, follow_redirects, pageid, | |||
self._logger) | |||
return Page(self, title, follow_redirects, pageid, self._logger) | |||
def get_category(self, catname, follow_redirects=False, pageid=None): | |||
"""Return a :py:class:`Category` object for the given category name. | |||
@@ -802,7 +803,7 @@ class Site(object): | |||
catname = self._unicodeify(catname) | |||
prefix = self.namespace_id_to_name(constants.NS_CATEGORY) | |||
pagename = u':'.join((prefix, catname)) | |||
return Category(self, pagename, follow_redirects, pageid) | |||
return Category(self, pagename, follow_redirects, pageid, self._logger) | |||
def get_user(self, username=None): | |||
"""Return a :py:class:`User` object for the given username. | |||
@@ -815,7 +816,7 @@ class Site(object): | |||
username = self._unicodeify(username) | |||
else: | |||
username = self._get_username() | |||
return User(self, username) | |||
return User(self, username, self._logger) | |||
def delegate(self, services, args=None, kwargs=None): | |||
"""Delegate a task to either the API or SQL depending on conditions. | |||
@@ -29,6 +29,7 @@ import sqlite3 as sqlite | |||
from earwigbot import __version__ | |||
from earwigbot.exceptions import SiteNotFoundError | |||
from earwigbot.wiki.copyvios.exclusions import ExclusionsDB | |||
from earwigbot.wiki.site import Site | |||
__all__ = ["SitesDB"] | |||
@@ -58,11 +59,16 @@ class SitesDB(object): | |||
"""Set up the manager with an attribute for the base Bot object.""" | |||
self.config = bot.config | |||
self._logger = bot.logger.getChild("wiki") | |||
self._sites = {} # Internal site cache | |||
self._sitesdb = path.join(bot.config.root_dir, "sites.db") | |||
self._cookie_file = path.join(bot.config.root_dir, ".cookies") | |||
self._cookiejar = None | |||
excl_db = path.join(bot.config.root_dir, "exclusions.db") | |||
excl_logger = self._logger.getChild("exclusionsdb") | |||
self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger) | |||
def __repr__(self): | |||
"""Return the canonical string representation of the SitesDB.""" | |||
res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})" | |||
@@ -192,6 +198,17 @@ class SitesDB(object): | |||
user_agent = user_agent.replace("$1", __version__) | |||
user_agent = user_agent.replace("$2", python_version()) | |||
if search_config: | |||
nltk_dir = path.join(self.config.root_dir, ".nltk") | |||
search_config["nltk_dir"] = nltk_dir | |||
search_config["exclusions_db"] = self._exclusions_db | |||
if not sql: | |||
sql = config.wiki.get("sql", {}) | |||
for key, value in sql.iteritems(): | |||
if "$1" in value: | |||
sql[key] = value.replace("$1", name) | |||
return Site(name=name, project=project, lang=lang, base_url=base_url, | |||
article_path=article_path, script_path=script_path, | |||
sql=sql, namespaces=namespaces, login=login, | |||
@@ -332,13 +349,12 @@ class SitesDB(object): | |||
the script path (meaning the API is located at | |||
``"{base_url}{script_path}/api.php"`` -> | |||
``"//{lang}.{project}.org/w/api.php"``), so this is the default. If | |||
your wiki is different, provide the script_path as an argument. The | |||
only other argument to :py:class:`~earwigbot.wiki.site.Site` that we | |||
can't get from config files or by querying the wiki itself is SQL | |||
connection info, so provide a dict of kwargs as *sql* and Site will | |||
pass it to :py:func:`oursql.connect(**sql) <oursql.connect>`, allowing | |||
you to make queries with :py:meth:`site.sql_query | |||
<earwigbot.wiki.site.Site.sql_query>`. | |||
your wiki is different, provide the script_path as an argument. SQL | |||
connection settings are guessed automatically using config's template | |||
value. If this is wrong or not specified, provide a dict of kwargs as | |||
*sql* and Site will pass it to :py:func:`oursql.connect(**sql) | |||
<oursql.connect>`, allowing you to make queries with | |||
:py:meth:`site.sql_query <earwigbot.wiki.site.Site.sql_query>`. | |||
Returns ``True`` if the site was added successfully or ``False`` if the | |||
site is already in our sitesdb (this can be done purposefully to update | |||
@@ -359,15 +375,31 @@ class SitesDB(object): | |||
use_https = config.wiki.get("useHTTPS", False) | |||
assert_edit = config.wiki.get("assert") | |||
maxlag = config.wiki.get("maxlag") | |||
wait_between_queries = config.wiki.get("waitTime", 5) | |||
wait_between_queries = config.wiki.get("waitTime", 3) | |||
logger = self._logger.getChild(name) | |||
search_config = config.wiki.get("search") | |||
if user_agent: | |||
user_agent = user_agent.replace("$1", __version__) | |||
user_agent = user_agent.replace("$2", python_version()) | |||
if search_config: | |||
nltk_dir = path.join(self.config.root_dir, ".nltk") | |||
search_config["nltk_dir"] = nltk_dir | |||
search_config["exclusions_db"] = self._exclusions_db | |||
if not sql: | |||
sql = config.wiki.get("sql", {}) | |||
for key, value in sql.iteritems(): | |||
if "$1" in value: | |||
sql[key] = value.replace("$1", name) | |||
# Create a Site object to log in and load the other attributes: | |||
site = Site(base_url=base_url, script_path=script_path, sql=sql, | |||
login=login, cookiejar=cookiejar, user_agent=user_agent, | |||
use_https=use_https, assert_edit=assert_edit, | |||
maxlag=maxlag, wait_between_queries=wait_between_queries, | |||
search_config=search_config) | |||
logger=logger, search_config=search_config) | |||
self._add_site_to_sitesdb(site) | |||
self._sites[site.name] = site | |||
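
The ``"$1"`` substitution added above is easiest to see with a hypothetical ``config.wiki["sql"]`` value (the host and database names are made up)::

    sql = {"host": "$1-p.example.org", "db": "$1_p"}
    name = "enwiki"

    # What _make_site_object() / add_site() now do with the template:
    for key, value in sql.iteritems():
        if "$1" in value:
            sql[key] = value.replace("$1", name)
    # sql == {"host": "enwiki-p.example.org", "db": "enwiki_p"}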
@@ -20,6 +20,7 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from logging import getLogger, NullHandler | |||
from time import gmtime, strptime | |||
from earwigbot.exceptions import UserNotFoundError | |||
@@ -60,7 +61,7 @@ class User(object): | |||
talkpage | |||
""" | |||
def __init__(self, site, name): | |||
def __init__(self, site, name, logger=None): | |||
"""Constructor for new User instances. | |||
Takes two arguments, a Site object (necessary for doing API queries), | |||
@@ -76,6 +77,13 @@ class User(object): | |||
self._site = site | |||
self._name = name | |||
# Set up our internal logger: | |||
if logger: | |||
self._logger = logger | |||
else: # Just set up a null logger to eat up our messages: | |||
self._logger = getLogger("earwigbot.wiki") | |||
self._logger.addHandler(NullHandler()) | |||
def __repr__(self): | |||
"""Return the canonical string representation of the User.""" | |||
return "User(name={0!r}, site={1!r})".format(self._name, self._site) | |||
@@ -25,6 +25,25 @@ from setuptools import setup, find_packages | |||
from earwigbot import __version__ | |||
# Not all of these dependencies are required, particularly the copyvio-specific | |||
# ones (bs4, lxml, nltk, and oauth2) or the command-specific ones (GitPython, | |||
# pytz). The bot should run fine without them, but will raise an exception if | |||
# you try to detect copyvios or run a command that requires one. | |||
dependencies = [ | |||
"GitPython >= 0.3.2.RC1", # Interfacing with git for !git and __version__ | |||
"PyYAML >= 3.10", # Parsing config files | |||
"beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML for copyvios | |||
"lxml >= 2.3.4", # Faster parser for BeautifulSoup | |||
"mwparserfromhell >= 0.1", # Parsing wikicode for manipulation | |||
"nltk >= 2.0.2", # Parsing sentences to split article content for copyvios | |||
"oursql >= 0.9.3", # Interfacing with MediaWiki databases | |||
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search for copyvios | |||
"py-bcrypt >= 0.2", # Hashing the bot key in the config file | |||
"pycrypto >= 2.5", # Storing bot passwords and keys in the config file | |||
"pytz >= 2012c", # Handling timezones for the !time IRC command | |||
] | |||
with open("README.rst") as fp: | |||
long_docs = fp.read() | |||
@@ -32,15 +51,7 @@ setup( | |||
name = "earwigbot", | |||
packages = find_packages(exclude=("tests",)), | |||
entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]}, | |||
install_requires = ["GitPython >= 0.3.2.RC1", # Interfacing with git | |||
"PyYAML >= 3.10", # Config parsing | |||
"mwparserfromhell >= 0.1", # Wikicode parsing | |||
"oursql >= 0.9.3", # Talking with MediaWiki databases | |||
"oauth2 >= 1.5.211", # Talking with Yahoo BOSS Search | |||
"py-bcrypt >= 0.2", # Password hashing in config | |||
"pycrypto >= 2.5", # Storing bot passwords and keys | |||
"pytz >= 2012c", # Timezone handling | |||
], | |||
install_requires = dependencies, | |||
test_suite = "tests", | |||
version = __version__, | |||
author = "Ben Kurtovic", | |||