Browse Source

Some code for copyvio detection, including querying Yahoo! BOSS correctly.

tags/v0.1^2
Ben Kurtovic 13 years ago
parent
commit
0b6d5eac5e
8 changed files with 215 additions and 20 deletions
  1. +7
    -1
      README.md
  2. +10
    -12
      earwigbot/rules.py
  3. +74
    -2
      earwigbot/tasks/afc_copyvios.py
  4. +1
    -1
      earwigbot/tasks/afc_statistics.py
  5. +81
    -0
      earwigbot/wiki/copyright.py
  6. +36
    -1
      earwigbot/wiki/exceptions.py
  7. +2
    -1
      earwigbot/wiki/functions.py
  8. +4
    -2
      earwigbot/wiki/page.py

+ 7
- 1
README.md View File

@@ -1,4 +1,4 @@
[EarwigBot](http://toolserver.org/~earwig/earwigbot/) is a
[EarwigBot](http://en.wikipedia.org/wiki/User:EarwigBot) is a
[Python](http://python.org/) robot that edits
[Wikipedia](http://en.wikipedia.org/) and interacts with people over
[IRC](http://en.wikipedia.org/wiki/Internet_Relay_Chat).
@@ -31,3 +31,9 @@ Additionally, the afc_history task uses
[matplotlib](http://matplotlib.sourceforge.net/) and
[numpy](http://numpy.scipy.org/) for graphing AfC statistics. Neither of these
modules is required for the main bot itself.

`earwigbot.wiki.copyright` requires access to a search engine for detecting
copyright violations. Currently,
[Yahoo! BOSS](http://developer.yahoo.com/search/boss/) is the only engine
supported, and this requires
[oauth2](https://github.com/simplegeo/python-oauth2).

+ 10
- 12
earwigbot/rules.py View File

@@ -51,36 +51,34 @@ def process(rc):
chans = set() # channels to report this message to
page_name = rc.page.lower()
comment = rc.comment.lower()
if "!earwigbot" in rc.msg.lower():
chans.update(("##earwigbot", "#wikipedia-en-afc"))
if r_page.search(page_name):
#tasks.start("afc_copyvios", action="edit", page=rc.page)
tasks.start("afc_copyvios", page=rc.page)
chans.add("#wikipedia-en-afc")
elif r_ffu.match(page_name):
chans.add("#wikipedia-en-afc")
elif page_name.startswith("template:afc submission"):
chans.add("#wikipedia-en-afc")
elif rc.flags == "move" and (r_move1.match(comment) or
r_move2.match(comment)):
p = r_moved_pages.findall(rc.comment)[0]
#tasks.start("afc_copyvios", action="move", page=p)
chans.add("#wikipedia-en-afc")
elif rc.flags == "delete" and r_delete.match(comment):
p = r_deleted_page.findall(rc.comment)[0]
#tasks.start("afc_copyvios", action="delete", page=p)
chans.add("#wikipedia-en-afc")
elif rc.flags == "restore" and r_restore.match(comment):
p = r_restored_page.findall(rc.comment)[0]
#tasks.start("afc_copyvios", action="restore", page=p)
tasks.start("afc_copyvios", page=p)
chans.add("#wikipedia-en-afc")
elif rc.flags == "protect" and r_protect.match(comment):
chans.add("#wikipedia-en-afc")



+ 74
- 2
earwigbot/tasks/afc_copyvios.py View File

@@ -20,6 +20,12 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from os.path import expanduser
from threading import Lock

import oursql

from earwigbot import wiki
from earwigbot.classes import BaseTask
from earwigbot.config import config

@@ -30,9 +36,75 @@ class Task(BaseTask):
number = 1

def __init__(self):
    """Load this task's settings, search credentials, and SQL parameters.

    Reads the task's section of the bot config (looked up by
    ``self.name``) and stores the template name, ignore list, edit
    summary, search engine name and credentials, and the keyword
    arguments that run() later passes to ``oursql.connect()``.
    """
    # Ask for the search credentials to be decrypted before the task's
    # config section is read below.
    for field in ("key", "secret"):
        config.decrypt(config.tasks, self.name, "search", "credentials",
                       field)

    # A single lookup serves both the public attribute and the local name.
    self.cfg = cfg = config.tasks.get(self.name, {})
    self.template = cfg.get("template", "AfC suspected copyvio")
    self.ignore_list = cfg.get("ignoreList", [])
    default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}"
    self.summary = self.make_summary(cfg.get("summary", default_summary))

    # Search API data:
    search = cfg.get("search", {})
    self.engine = search.get("engine")
    self.credentials = search.get("credentials", {})

    # Connection data for our SQL database. Copy the "sql" section so that
    # adding read_default_file does not write back into the shared config.
    kwargs = dict(cfg.get("sql", {}))
    kwargs["read_default_file"] = expanduser("~/.my.cnf")
    self.conn_data = kwargs
    self.db_access_lock = Lock()

def run(self, **kwargs):
    """Entry point for the bot task.

    Takes a page title in kwargs and checks it for copyvios, adding
    {{self.template}} at the top if a copyvio has been detected. A page is
    only checked once (processed pages are stored by page_id in an SQL
    database).

    Raises KeyError if kwargs has no "page" entry.
    """
    if self.shutoff_enabled():
        return
    title = kwargs["page"]
    page = wiki.get_site().get_page(title)
    # self.conn is shared instance state, so the connection is created and
    # used entirely while holding the lock.
    with self.db_access_lock:
        # NOTE(review): the connection is never explicitly closed here;
        # confirm whether that is intentional.
        self.conn = oursql.connect(**self.conn_data)
        self.process(page)

def process(self, page):
    """Detect copyvios in 'page' and add a note if any are found.

    Pages whose ID is already recorded as processed are skipped. Otherwise
    the page is checked with the configured search engine; if the check
    returns a truthy result, the template is prepended to the page text and
    the edit is saved. The page ID is then recorded as processed.
    """
    pageid = page.pageid()
    if self.has_been_processed(pageid):
        msg = "Skipping check on already processed page [[{0}]]"
        self.logger.info(msg.format(page.title()))
        return

    self.logger.info("Checking [[{0}]]".format(page.title()))
    result = page.copyvio_check(self.engine, self.credentials)
    if result:
        # Content is only needed when we actually tag the page.
        content = page.get()
        # Literal braces in str.format() are written by doubling them, so
        # "{{{{" and "}}}}" produce the wikitext "{{" and "}}".
        template = "{{{{{0}|url={1}}}}}".format(self.template, result)
        newtext = "\n".join((template, content))
        page.edit(newtext, self.summary.format(url=result))
        msg = "Found violation: [[{0}]] -> {1}"
        self.logger.info(msg.format(page.title(), result))
    else:
        self.logger.debug("No violations detected")

    self.log_processed(pageid)

def has_been_processed(self, pageid):
    """Return True if 'pageid' has a row in the processed table.

    Runs a parameterized SELECT on self.conn and reports whether any row
    came back.
    """
    sql = "SELECT 1 FROM processed WHERE page_id = ?"
    with self.conn.cursor() as cur:
        cur.execute(sql, (pageid,))
        rows = cur.fetchall()
    return bool(rows)

def log_processed(self, pageid):
    """Record 'pageid' in the processed table using self.conn."""
    sql = "INSERT INTO processed VALUES (?)"
    params = (pageid,)
    with self.conn.cursor() as cur:
        cur.execute(sql, params)

+ 1
- 1
earwigbot/tasks/afc_statistics.py View File

@@ -185,7 +185,7 @@ class Task(BaseTask):

This is used by the template as a hidden sortkey.
"""
return (dt - datetime(1970, 1, 1)).total_seconds()
return int((dt - datetime(1970, 1, 1)).total_seconds())

def sync(self, **kwargs):
"""Synchronize our local statistics database with the site.


+ 81
- 0
earwigbot/wiki/copyright.py View File

@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009, 2010, 2011 by Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from json import loads
from urllib import quote_plus, urlencode

try:
import oauth2 as oauth
except ImportError:
oauth = None

from earwigbot.wiki.exceptions import *

class CopyrightMixin(object):
    """
    EarwigBot's Wiki Toolset: Copyright Violation Mixin

    This is a mixin that provides one public method, copyvio_check(), which
    checks the page for copyright violations using a search engine API. The
    API keys must be provided to the method as arguments.

    The host class is expected to provide a get() method returning the page
    content; copyvio_check() calls self.get(force).
    """
    def _yahoo_boss_query(self, query, cred):
        """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials.

        'cred' must be a mapping with "key" and "secret" entries, which are
        used to build the OAuth consumer that signs the request.

        Returns a list of URLs, no more than fifty, ranked by relevance (as
        determined by Yahoo). Returns an empty list if the decoded response
        has no bossresponse/web/results entry. Raises SearchQueryError() on
        a non-200 response status or if the body cannot be decoded as JSON.
        """
        base_url = "http://yboss.yahooapis.com/ysearch/web"
        # NOTE(review): the query is passed through quote_plus() and then
        # through urlencode(), so it ends up percent-encoded twice. Left
        # as-is; confirm against the BOSS API whether this is required.
        params = {"q": quote_plus(query), "style": "raw", "format": "json"}
        url = "{0}?{1}".format(base_url, urlencode(params))

        consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"])
        client = oauth.Client(consumer)
        headers, body = client.request(url, "GET")

        # The status is compared as a string, matching how it is read here.
        if headers["status"] != "200":
            e = "Yahoo! BOSS Error: got response code '{0}':\n{1}"
            raise SearchQueryError(e.format(headers["status"], body))

        try:
            res = loads(body)
        except ValueError:
            e = "Yahoo! BOSS Error: JSON could not be decoded"
            raise SearchQueryError(e)

        try:
            results = res["bossresponse"]["web"]["results"]
        except KeyError:
            # A missing key along this path is treated as "no results".
            return []
        return [result["url"] for result in results]

    def copyvio_check(self, engine, credentials, force=False):
        """Check the page for copyright violations.

        'engine' names the search engine to use; only "Yahoo! BOSS" is
        recognized. 'credentials' is handed to the engine's query function
        unchanged. 'force' is passed through to self.get().

        Returns whatever the engine's query function returns for the page
        content (for Yahoo! BOSS, a list of result URLs).

        Raises UnsupportedSearchEngineError if "Yahoo! BOSS" is requested
        but the oauth2 package could not be imported, and
        UnknownSearchEngineError for any other engine name.
        """
        if engine == "Yahoo! BOSS":
            if not oauth:
                e = "The package 'oauth2' could not be imported"
                raise UnsupportedSearchEngineError(e)
            querier = self._yahoo_boss_query
        else:
            raise UnknownSearchEngineError(engine)
        content = self.get(force)
        return querier(content, credentials)

+ 36
- 1
earwigbot/wiki/exceptions.py View File

@@ -23,7 +23,29 @@
"""
EarwigBot's Wiki Toolset: Exceptions

This module contains all exceptions used by the wiki package. There are a lot.
This module contains all exceptions used by the wiki package. There are a lot:

-- WikiToolsetError
-- SiteNotFoundError
-- SiteAPIError
-- LoginError
-- NamespaceNotFoundError
-- PageNotFoundError
-- InvalidPageError
-- RedirectError
-- UserNotFoundError
-- EditError
-- PermissionsError
-- EditConflictError
-- NoContentError
-- ContentTooBigError
-- SpamDetectedError
-- FilteredError
-- SQLError
-- CopyvioCheckError
-- UnknownSearchEngineError
-- UnsupportedSearchEngineError
-- SearchQueryError
"""

class WikiToolsetError(Exception):
@@ -87,3 +109,16 @@ class FilteredError(EditError):

class SQLError(WikiToolsetError):
"""Some error involving SQL querying occurred."""

class CopyvioCheckError(WikiToolsetError):
    """An error occurred when checking a page for copyright violations."""

class UnknownSearchEngineError(CopyvioCheckError):
    """CopyrightMixin().copyvio_check() was called with an unknown engine."""

class UnsupportedSearchEngineError(CopyvioCheckError):
    """The engine requested is not available, e.g., because a required
    package is missing."""

class SearchQueryError(CopyvioCheckError):
    """Some error occurred while doing a search query."""

+ 2
- 1
earwigbot/wiki/functions.py View File

@@ -54,7 +54,8 @@ def _load_config():
is_encrypted = config.load()
if is_encrypted: # Passwords in the config file are encrypted
key = getpass("Enter key to unencrypt bot passwords: ")
config.decrypt(key)
config._decryption_key = key
config.decrypt(config.wiki, "password")

def _get_cookiejar():
"""Returns a LWPCookieJar object loaded from our .cookies file. The same


+ 4
- 2
earwigbot/wiki/page.py View File

@@ -25,9 +25,10 @@ import re
from time import gmtime, strftime
from urllib import quote

from earwigbot.wiki.copyright import CopyrightMixin
from earwigbot.wiki.exceptions import *

class Page(object):
class Page(CopyrightMixin):
"""
EarwigBot's Wiki Toolset: Page Class

@@ -49,7 +50,8 @@ class Page(object):
get -- returns page content
get_redirect_target -- if the page is a redirect, returns its destination
edit -- replaces the page's content or creates a new page
add_section -- add a new section at the bottom of the page
add_section -- adds a new section at the bottom of the page
copyvio_check -- checks the page for copyright violations
"""

re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"


Loading…
Cancel
Save