From 0b6d5eac5e29192d77c2049fb30d23eeabb13999 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 11 Dec 2011 17:31:34 -0500 Subject: [PATCH] Some code for copyvio detection, including querying Yahoo! BOSS correctly. --- README.md | 8 +++- earwigbot/rules.py | 22 +++++------ earwigbot/tasks/afc_copyvios.py | 76 +++++++++++++++++++++++++++++++++++- earwigbot/tasks/afc_statistics.py | 2 +- earwigbot/wiki/copyright.py | 81 +++++++++++++++++++++++++++++++++++++++ earwigbot/wiki/exceptions.py | 37 +++++++++++++++++- earwigbot/wiki/functions.py | 3 +- earwigbot/wiki/page.py | 6 ++- 8 files changed, 215 insertions(+), 20 deletions(-) create mode 100644 earwigbot/wiki/copyright.py diff --git a/README.md b/README.md index fdb0207..3ecc02b 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[EarwigBot](http://toolserver.org/~earwig/earwigbot/) is a +[EarwigBot](http://en.wikipedia.org/wiki/User:EarwigBot) is a [Python](http://python.org/) robot that edits [Wikipedia](http://en.wikipedia.org/) and interacts with people over [IRC](http://en.wikipedia.org/wiki/Internet_Relay_Chat). @@ -31,3 +31,9 @@ Additionally, the afc_history task uses [matplotlib](http://matplotlib.sourceforge.net/) and [numpy](http://numpy.scipy.org/) for graphing AfC statistics. Neither of these modules are required for the main bot itself. + +`earwigbot.wiki.copyright` requires access to a search engine for detecting +copyright violations. Currently, +[Yahoo! BOSS](http://developer.yahoo.com/search/boss/) is the only engine +supported, and this requires +[oauth2](https://github.com/simplegeo/python-oauth2). 
diff --git a/earwigbot/rules.py b/earwigbot/rules.py index ce851cc..715be2e 100644 --- a/earwigbot/rules.py +++ b/earwigbot/rules.py @@ -51,36 +51,34 @@ def process(rc): chans = set() # channels to report this message to page_name = rc.page.lower() comment = rc.comment.lower() - + if "!earwigbot" in rc.msg.lower(): chans.update(("##earwigbot", "#wikipedia-en-afc")) - + if r_page.search(page_name): - #tasks.start("afc_copyvios", action="edit", page=rc.page) + tasks.start("afc_copyvios", page=rc.page) chans.add("#wikipedia-en-afc") - + elif r_ffu.match(page_name): chans.add("#wikipedia-en-afc") - + elif page_name.startswith("template:afc submission"): chans.add("#wikipedia-en-afc") - + elif rc.flags == "move" and (r_move1.match(comment) or r_move2.match(comment)): p = r_moved_pages.findall(rc.comment)[0] - #tasks.start("afc_copyvios", action="move", page=p) chans.add("#wikipedia-en-afc") - + elif rc.flags == "delete" and r_delete.match(comment): p = r_deleted_page.findall(rc.comment)[0] - #tasks.start("afc_copyvios", action="delete", page=p) chans.add("#wikipedia-en-afc") - + elif rc.flags == "restore" and r_restore.match(comment): p = r_restored_page.findall(rc.comment)[0] - #tasks.start("afc_copyvios", action="restore", page=p) + tasks.start("afc_copyvios", page=p) chans.add("#wikipedia-en-afc") - + elif rc.flags == "protect" and r_protect.match(comment): chans.add("#wikipedia-en-afc") diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py index 2868cfe..2d881df 100644 --- a/earwigbot/tasks/afc_copyvios.py +++ b/earwigbot/tasks/afc_copyvios.py @@ -20,6 +20,12 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
+from os.path import expanduser +from threading import Lock + +import oursql + +from earwigbot import wiki from earwigbot.classes import BaseTask from earwigbot.config import config @@ -30,9 +36,75 @@ class Task(BaseTask): number = 1 def __init__(self): - self.cfg = cfg = config.tasks.get(self.name, {}) config.decrypt(config.tasks, self.name, "search", "credentials", "key") config.decrypt(config.tasks, self.name, "search", "credentials", "secret") + cfg = config.tasks.get(self.name, {}) + self.template = cfg.get("template", "AfC suspected copyvio") + self.ignore_list = cfg.get("ignoreList", []) + default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}" + self.summary = self.make_summary(cfg.get("summary", default_summary)) + + # Search API data: + search = cfg.get("search", {}) + self.engine = search.get("engine") + self.credentials = search.get("credentials", {}) + + # Connection data for our SQL database: + kwargs = cfg.get("sql", {}) + kwargs["read_default_file"] = expanduser("~/.my.cnf") + self.conn_data = kwargs + self.db_access_lock = Lock() + def run(self, **kwargs): - pass + """Entry point for the bot task. + + Takes a page title in kwargs and checks it for copyvios, adding + {{self.template}} at the top if a copyvio has been detected. A page is + only checked once (processed pages are stored by page_id in an SQL + database). 
+ """ + if self.shutoff_enabled(): + return + title = kwargs["page"] + page = wiki.get_site().get_page(title) + with self.db_access_lock: + self.conn = oursql.connect(**self.conn_data) + self.process(page) + + def process(self, page): + """Detect copyvios in 'page' and add a note if any are found.""" + pageid = page.pageid() + if self.has_been_processed(pageid): + msg = "Skipping check on already processed page [[{0}]]" + self.logger.info(msg.format(page.title())) + return + + self.logger.info("Checking [[{0}]]".format(page.title())) + content = page.get() + result = page.copyvio_check(self.engine, self.credentials) + if result: + content = page.get() + template = "\{\{{0}|url={1}\}\}".format(self.template, result) + newtext = "\n".join((template, content)) + page.edit(newtext, self.summary.format(url=result)) + msg = "Found violation: [[{0}]] -> {1}" + self.logger.info(msg.format(page.title(), result)) + else: + self.logger.debug("No violations detected") + + self.log_processed(pageid) + + def has_been_processed(self, pageid): + query = "SELECT 1 FROM processed WHERE page_id = ?" + with self.conn.cursor() as cursor: + cursor.execute(query, (pageid,)) + results = cursor.fetchall() + if results: + return True + return False + + def log_processed(self, pageid): + query = "INSERT INTO processed VALUES (?)" + with self.conn.cursor() as cursor: + cursor.execute(query, (pageid,)) diff --git a/earwigbot/tasks/afc_statistics.py b/earwigbot/tasks/afc_statistics.py index 9f5e319..3c8c885 100644 --- a/earwigbot/tasks/afc_statistics.py +++ b/earwigbot/tasks/afc_statistics.py @@ -185,7 +185,7 @@ class Task(BaseTask): This is used by the template as a hidden sortkey. """ - return (dt - datetime(1970, 1, 1)).total_seconds() + return int((dt - datetime(1970, 1, 1)).total_seconds()) def sync(self, **kwargs): """Synchronize our local statistics database with the site. 
diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyright.py new file mode 100644 index 0000000..c5c7a64 --- /dev/null +++ b/earwigbot/wiki/copyright.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009, 2010, 2011 by Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from json import loads +from urllib import quote_plus, urlencode + +try: + import oauth2 as oauth +except ImportError: + oauth = None + +from earwigbot.wiki.exceptions import * + +class CopyrightMixin(object): + """ + EarwigBot's Wiki Toolset: Copyright Violation Mixin + + This is a mixin that provides one public method, copyvio_check(), which + checks the page for copyright violations using a search engine API. The + API keys must be provided to the method as arguments. + """ + def _yahoo_boss_query(self, query, cred): + """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials. 
+ + Returns a list of URLs, no more than fifty, ranked by relevance (as + determined by Yahoo). Raises SearchQueryError() on errors. + """ + base_url = "http://yboss.yahooapis.com/ysearch/web" + params = {"q": quote_plus(query), "style": "raw", "format": "json"} + url = "{0}?{1}".format(base_url, urlencode(params)) + + consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"]) + client = oauth.Client(consumer) + headers, body = client.request(url, "GET") + + if headers["status"] != "200": + e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" + raise SearchQueryError(e.format(headers["status"], body)) + + try: + res = loads(body) + except ValueError: + e = "Yahoo! BOSS Error: JSON could not be decoded" + raise SearchQueryError(e) + + try: + results = res["bossresponse"]["web"]["results"] + except KeyError: + return [] + return [result["url"] for result in results] + + def copyvio_check(self, engine, credentials, force=False): + """Check the page for copyright violations.""" + if engine == "Yahoo! BOSS": + if not oauth: + e = "The package 'oauth2' could not be imported" + raise UnsupportedSearchEngineError(e) + querier = self._yahoo_boss_query + else: + raise UnknownSearchEngineError(engine) + content = self.get(force) + return querier(content, credentials) diff --git a/earwigbot/wiki/exceptions.py b/earwigbot/wiki/exceptions.py index c23a743..e88158f 100644 --- a/earwigbot/wiki/exceptions.py +++ b/earwigbot/wiki/exceptions.py @@ -23,7 +23,29 @@ """ EarwigBot's Wiki Toolset: Exceptions -This module contains all exceptions used by the wiki package. There are a lot. +This module contains all exceptions used by the wiki package. 
There are a lot: + +-- WikiToolsetError + -- SiteNotFoundError + -- SiteAPIError + -- LoginError + -- NamespaceNotFoundError + -- PageNotFoundError + -- InvalidPageError + -- RedirectError + -- UserNotFoundError + -- EditError + -- PermissionsError + -- EditConflictError + -- NoContentError + -- ContentTooBigError + -- SpamDetectedError + -- FilteredError + -- SQLError + -- CopyvioCheckError + -- UnknownSearchEngineError + -- UnsupportedSearchEngineError + -- SearchQueryError """ class WikiToolsetError(Exception): @@ -87,3 +109,16 @@ class FilteredError(EditError): class SQLError(WikiToolsetError): """Some error involving SQL querying occurred.""" + +class CopyvioCheckError(WikiToolsetError): + """An error occurred when checking a page for copyright violations.""" + +class UnknownSearchEngineError(CopyvioCheckError): + """CopyrightMixin().copyvio_check() called with an unknown engine.""" + +class UnsupportedSearchEngineError(CopyvioCheckError): + """The engine requested is not available, e.g., because a required package + is missing.""" + +class SearchQueryError(CopyvioCheckError): + """Some error occurred while doing a search query.""" diff --git a/earwigbot/wiki/functions.py b/earwigbot/wiki/functions.py index 5648976..16f6f3c 100644 --- a/earwigbot/wiki/functions.py +++ b/earwigbot/wiki/functions.py @@ -54,7 +54,8 @@ def _load_config(): is_encrypted = config.load() if is_encrypted: # Passwords in the config file are encrypted key = getpass("Enter key to unencrypt bot passwords: ") - config.decrypt(key) + config._decryption_key = key + config.decrypt(config.wiki, "password") def _get_cookiejar(): """Returns a LWPCookieJar object loaded from our .cookies file.
The same diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py index 5fca120..5b359e1 100644 --- a/earwigbot/wiki/page.py +++ b/earwigbot/wiki/page.py @@ -25,9 +25,10 @@ import re from time import gmtime, strftime from urllib import quote +from earwigbot.wiki.copyright import CopyrightMixin from earwigbot.wiki.exceptions import * -class Page(object): +class Page(CopyrightMixin): """ EarwigBot's Wiki Toolset: Page Class @@ -49,7 +50,8 @@ class Page(object): get -- returns page content get_redirect_target -- if the page is a redirect, returns its destination edit -- replaces the page's content or creates a new page - add_section -- add a new section at the bottom of the page + add_section -- adds a new section at the bottom of the page + copyvio_check -- checks the page for copyright violations """ re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"