Some code for copyvio detection, including querying Yahoo! BOSS correctly.

13 年之前 · 0b6d5eac5e
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
 [EarwigBot](http://toolserver.org/~earwig/earwigbot/) is a
 [EarwigBot](http://en.wikipedia.org/wiki/User:EarwigBot) is a
 [Python](http://python.org/) robot that edits
 [Wikipedia](http://en.wikipedia.org/) and interacts with people over
 [IRC](http://en.wikipedia.org/wiki/Internet_Relay_Chat).
@@ -31,3 +31,9 @@ Additionally, the afc_history task uses
 [matplotlib](http://matplotlib.sourceforge.net/) and
 [numpy](http://numpy.scipy.org/) for graphing AfC statistics. Neither of these
 modules is required for the main bot itself.

 `earwigbot.wiki.copyright` requires access to a search engine for detecting
 copyright violations. Currently,
 [Yahoo! BOSS](http://developer.yahoo.com/search/boss/) is the only engine
 supported, and this requires
 [oauth2](https://github.com/simplegeo/python-oauth2).
--- a/earwigbot/rules.py
+++ b/earwigbot/rules.py
@@ -51,36 +51,34 @@ def process(rc):
    chans = set()  # channels to report this message to
    page_name = rc.page.lower()
    comment = rc.comment.lower()
    

    if "!earwigbot" in rc.msg.lower():
        chans.update(("##earwigbot", "#wikipedia-en-afc"))
        

    if r_page.search(page_name):
        #tasks.start("afc_copyvios", action="edit", page=rc.page)
        tasks.start("afc_copyvios", page=rc.page)
        chans.add("#wikipedia-en-afc")
        

    elif r_ffu.match(page_name):
        chans.add("#wikipedia-en-afc")
        

    elif page_name.startswith("template:afc submission"):
        chans.add("#wikipedia-en-afc")
    

    elif rc.flags == "move" and (r_move1.match(comment) or
                                 r_move2.match(comment)):
        p = r_moved_pages.findall(rc.comment)[0]
        #tasks.start("afc_copyvios", action="move", page=p)
        chans.add("#wikipedia-en-afc")
    

    elif rc.flags == "delete" and r_delete.match(comment):
        p = r_deleted_page.findall(rc.comment)[0]
        #tasks.start("afc_copyvios", action="delete", page=p)
        chans.add("#wikipedia-en-afc")
    

    elif rc.flags == "restore" and r_restore.match(comment):
        p = r_restored_page.findall(rc.comment)[0]
        #tasks.start("afc_copyvios", action="restore", page=p)
        tasks.start("afc_copyvios", page=p)
        chans.add("#wikipedia-en-afc")
    

    elif rc.flags == "protect" and r_protect.match(comment):
        chans.add("#wikipedia-en-afc")

--- a/earwigbot/tasks/afc_copyvios.py
+++ b/earwigbot/tasks/afc_copyvios.py
@@ -20,6 +20,12 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 from os.path import expanduser
 from threading import Lock

 import oursql

 from earwigbot import wiki
 from earwigbot.classes import BaseTask
 from earwigbot.config import config

@@ -30,9 +36,75 @@ class Task(BaseTask):
    number = 1

    def __init__(self):
        """Read this task's configuration and prepare search and SQL settings.

        Sets the template name, ignore list, edit summary, search engine name
        and credentials, the keyword arguments stored for the SQL connection,
        and a lock stored as db_access_lock.
        """
        self.cfg = config.tasks.get(self.name, {})
        for field in ("key", "secret"):
            config.decrypt(config.tasks, self.name, "search", "credentials",
                           field)

        # Fetch the task section again after the decrypt() calls above.
        settings = config.tasks.get(self.name, {})
        self.template = settings.get("template", "AfC suspected copyvio")
        self.ignore_list = settings.get("ignoreList", [])
        fallback = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}"
        self.summary = self.make_summary(settings.get("summary", fallback))

        # Search engine name and API credentials:
        engine_cfg = settings.get("search", {})
        self.engine = engine_cfg.get("engine")
        self.credentials = engine_cfg.get("credentials", {})

        # Keyword arguments for the SQL connection, pointed at the user's
        # ~/.my.cnf:
        sql_args = settings.get("sql", {})
        sql_args["read_default_file"] = expanduser("~/.my.cnf")
        self.conn_data = sql_args
        self.db_access_lock = Lock()

    def run(self, **kwargs):
        """Entry point for the bot task.

        Takes a page title in kwargs and checks it for copyvios, adding
        {{self.template}} at the top if a copyvio has been detected. A page is
        only checked once (processed pages are stored by page_id in an SQL
        database).

        The title is read from kwargs["page"], so a KeyError is raised if the
        caller does not supply it. Nothing is done when the task's shutoff is
        enabled.
        """
        if self.shutoff_enabled():
            return
        title = kwargs["page"]
        page = wiki.get_site().get_page(title)
        # Hold the lock while connecting and processing, since self.conn is
        # replaced on every call.
        # NOTE(review): the connection opened here is never explicitly closed
        # in this method -- confirm whether that is intended.
        with self.db_access_lock:
            self.conn = oursql.connect(**self.conn_data)
            self.process(page)

    def process(self, page):
        """Detect copyvios in 'page' and add a note if any are found.

        Pages whose ID is already recorded as processed are skipped. Otherwise
        the page is checked with page.copyvio_check() using the configured
        engine and credentials; if that returns a truthy result,
        {{self.template}} is prepended to the page text and the edit is saved
        with self.summary. The page ID is then recorded as processed whether
        or not a violation was found.
        """
        pageid = page.pageid()
        if self.has_been_processed(pageid):
            msg = "Skipping check on already processed page [[{0}]]"
            self.logger.info(msg.format(page.title()))
            return

        self.logger.info("Checking [[{0}]]".format(page.title()))
        result = page.copyvio_check(self.engine, self.credentials)
        if result:
            # Fetch the text only when it is needed for the edit.
            content = page.get()
            # Literal braces in a str.format() template are written by
            # doubling them (backslashes do not escape them), so this renders
            # as "{{<template>|url=<result>}}".
            # NOTE(review): confirm that 'result' is a single URL here; it is
            # formatted as-is into both the template and the summary.
            template = "{{{{{0}|url={1}}}}}".format(self.template, result)
            newtext = "\n".join((template, content))
            page.edit(newtext, self.summary.format(url=result))
            msg = "Found violation: [[{0}]] -> {1}"
            self.logger.info(msg.format(page.title(), result))
        else:
            self.logger.debug("No violations detected")

        self.log_processed(pageid)

    def has_been_processed(self, pageid):
        """Return True if 'pageid' has a row in the processed table."""
        lookup = "SELECT 1 FROM processed WHERE page_id = ?"
        with self.conn.cursor() as cur:
            cur.execute(lookup, (pageid,))
            rows = cur.fetchall()
        # Any returned row means the page was logged before.
        return bool(rows)

    def log_processed(self, pageid):
        """Insert 'pageid' into the processed table."""
        with self.conn.cursor() as cur:
            cur.execute("INSERT INTO processed VALUES (?)", (pageid,))
--- a/earwigbot/tasks/afc_statistics.py
+++ b/earwigbot/tasks/afc_statistics.py
@@ -185,7 +185,7 @@ class Task(BaseTask):

        This is used by the template as a hidden sortkey.
        """
        return (dt - datetime(1970, 1, 1)).total_seconds()
        return int((dt - datetime(1970, 1, 1)).total_seconds())

    def sync(self, **kwargs):
        """Synchronize our local statistics database with the site.
--- a/earwigbot/wiki/copyright.py
+++ b/earwigbot/wiki/copyright.py
@@ -0,0 +1,81 @@
 # -*- coding: utf-8  -*-
 #
 # Copyright (C) 2009, 2010, 2011 by Ben Kurtovic <ben.kurtovic@verizon.net>
 # 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
 # in the Software without restriction, including without limitation the rights
 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 # copies of the Software, and to permit persons to whom the Software is 
 # furnished to do so, subject to the following conditions:
 # 
 # The above copyright notice and this permission notice shall be included in
 # all copies or substantial portions of the Software.
 # 
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 from json import loads
 from urllib import quote_plus, urlencode

 try:
    import oauth2 as oauth
 except ImportError:
    oauth = None

 from earwigbot.wiki.exceptions import *

 class CopyrightMixin(object):
    """
    EarwigBot's Wiki Toolset: Copyright Violation Mixin

    This is a mixin that provides one public method, copyvio_check(), which
    checks the page for copyright violations using a search engine API. The
    API keys must be provided to the method as arguments.

    The class using this mixin must provide a get() method returning the page
    text, since copyvio_check() calls self.get().
    """
    def _yahoo_boss_query(self, query, cred):
        """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials.

        'cred' must be a mapping with "key" and "secret" entries, which are
        used to build the oauth2 consumer that signs the request.

        Returns a list of the result URLs in the order the API gave them
        (ranked by relevance as determined by Yahoo, and expected to be no
        more than fifty; no limit is applied here). Returns an empty list if
        the response has no web results section. Raises SearchQueryError() if
        the HTTP status is not 200 or the body is not valid JSON.
        """
        base_url = "http://yboss.yahooapis.com/ysearch/web"
        # NOTE(review): the query is escaped by quote_plus() here and escaped
        # again by urlencode() below, so it ends up percent-encoded twice.
        # Left unchanged -- confirm that the API expects this.
        params = {"q": quote_plus(query), "style": "raw", "format": "json"}
        url = "{0}?{1}".format(base_url, urlencode(params))

        consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"])
        client = oauth.Client(consumer)
        headers, body = client.request(url, "GET")

        if headers["status"] != "200":
            # The message previously ended with a stray, unbalanced quote.
            e = "Yahoo! BOSS Error: got response code '{0}':\n{1}"
            raise SearchQueryError(e.format(headers["status"], body))

        try:
            res = loads(body)
        except ValueError:
            e = "Yahoo! BOSS Error: JSON could not be decoded"
            raise SearchQueryError(e)

        try:
            results = res["bossresponse"]["web"]["results"]
        except KeyError:
            # A response without this section is treated as "no results".
            return []
        return [result["url"] for result in results]

    def copyvio_check(self, engine, credentials, force=False):
        """Check the page for copyright violations.

        'engine' is the name of the search engine to use; "Yahoo! BOSS" is the
        only name accepted. 'credentials' is passed through to the engine's
        query function. 'force' is passed to self.get() when fetching the page
        text.

        Returns whatever the engine's query function returns for the page
        text (for Yahoo! BOSS, a list of URLs).

        Raises UnknownSearchEngineError if 'engine' is not recognized, and
        UnsupportedSearchEngineError if Yahoo! BOSS was requested but the
        oauth2 package could not be imported.
        """
        if engine == "Yahoo! BOSS":
            if not oauth:
                e = "The package 'oauth2' could not be imported"
                raise UnsupportedSearchEngineError(e)
            querier = self._yahoo_boss_query
        else:
            raise UnknownSearchEngineError(engine)
        content = self.get(force)
        return querier(content, credentials)
--- a/earwigbot/wiki/exceptions.py
+++ b/earwigbot/wiki/exceptions.py
@@ -23,7 +23,29 @@
 """
 EarwigBot's Wiki Toolset: Exceptions

 This module contains all exceptions used by the wiki package. There are a lot.
 This module contains all exceptions used by the wiki package. There are a lot:

 -- WikiToolsetError
        -- SiteNotFoundError
        -- SiteAPIError
        -- LoginError
        -- NamespaceNotFoundError
        -- PageNotFoundError
        -- InvalidPageError
        -- RedirectError
        -- UserNotFoundError
        -- EditError
                -- PermissionsError
                -- EditConflictError
                -- NoContentError
                -- ContentTooBigError
                -- SpamDetectedError
                -- FilteredError
        -- SQLError
        -- CopyvioCheckError
                -- UnknownSearchEngineError
                -- UnsupportedSearchEngineError
                -- SearchQueryError
 """

 class WikiToolsetError(Exception):
@@ -87,3 +109,16 @@ class FilteredError(EditError):

 class SQLError(WikiToolsetError):
    """Some error involving SQL querying occurred."""

 class CopyvioCheckError(WikiToolsetError):
    """An error occurred when checking a page for copyright violations."""

 class UnknownSearchEngineError(CopyvioCheckError):
    """CopyrightMixin().copyvio_check() was called with an unknown engine."""

 class UnsupportedSearchEngineError(CopyvioCheckError):
    """The requested engine is recognized but is not available, e.g., because
    a required package is missing."""

 class SearchQueryError(CopyvioCheckError):
    """Some error occurred while doing a search query."""
--- a/earwigbot/wiki/functions.py
+++ b/earwigbot/wiki/functions.py
@@ -54,7 +54,8 @@ def _load_config():
    is_encrypted = config.load()
    if is_encrypted:  # Passwords in the config file are encrypted
        key = getpass("Enter key to unencrypt bot passwords: ")
        config.decrypt(key)
        config._decryption_key = key
        config.decrypt(config.wiki, "password")

 def _get_cookiejar():
    """Returns a LWPCookieJar object loaded from our .cookies file. The same
--- a/earwigbot/wiki/page.py
+++ b/earwigbot/wiki/page.py
@@ -25,9 +25,10 @@ import re
 from time import gmtime, strftime
 from urllib import quote

 from earwigbot.wiki.copyright import CopyrightMixin
 from earwigbot.wiki.exceptions import *

 class Page(object):
 class Page(CopyrightMixin):
    """
    EarwigBot's Wiki Toolset: Page Class

@@ -49,7 +50,8 @@ class Page(object):
    get                 -- returns page content
    get_redirect_target -- if the page is a redirect, returns its destination
    edit                -- replaces the page's content or creates a new page
    add_section         -- add a new section at the bottom of the page
    add_section         -- adds a new section at the bottom of the page
    copyvio_check       -- checks the page for copyright violations
    """

    re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"