@@ -1,4 +1,4 @@ | |||
[EarwigBot](http://toolserver.org/~earwig/earwigbot/) is a | |||
[EarwigBot](http://en.wikipedia.org/wiki/User:EarwigBot) is a | |||
[Python](http://python.org/) robot that edits | |||
[Wikipedia](http://en.wikipedia.org/) and interacts with people over | |||
[IRC](http://en.wikipedia.org/wiki/Internet_Relay_Chat). | |||
@@ -31,3 +31,9 @@ Additionally, the afc_history task uses | |||
[matplotlib](http://matplotlib.sourceforge.net/) and | |||
[numpy](http://numpy.scipy.org/) for graphing AfC statistics. Neither of these | |||
modules is required for the main bot itself. | |||
`earwigbot.wiki.copyright` requires access to a search engine for detecting | |||
copyright violations. Currently, | |||
[Yahoo! BOSS](http://developer.yahoo.com/search/boss/) is the only engine | |||
supported, and this requires | |||
[oauth2](https://github.com/simplegeo/python-oauth2). |
@@ -51,36 +51,34 @@ def process(rc): | |||
chans = set() # channels to report this message to | |||
page_name = rc.page.lower() | |||
comment = rc.comment.lower() | |||
if "!earwigbot" in rc.msg.lower(): | |||
chans.update(("##earwigbot", "#wikipedia-en-afc")) | |||
if r_page.search(page_name): | |||
#tasks.start("afc_copyvios", action="edit", page=rc.page) | |||
tasks.start("afc_copyvios", page=rc.page) | |||
chans.add("#wikipedia-en-afc") | |||
elif r_ffu.match(page_name): | |||
chans.add("#wikipedia-en-afc") | |||
elif page_name.startswith("template:afc submission"): | |||
chans.add("#wikipedia-en-afc") | |||
elif rc.flags == "move" and (r_move1.match(comment) or | |||
r_move2.match(comment)): | |||
p = r_moved_pages.findall(rc.comment)[0] | |||
#tasks.start("afc_copyvios", action="move", page=p) | |||
chans.add("#wikipedia-en-afc") | |||
elif rc.flags == "delete" and r_delete.match(comment): | |||
p = r_deleted_page.findall(rc.comment)[0] | |||
#tasks.start("afc_copyvios", action="delete", page=p) | |||
chans.add("#wikipedia-en-afc") | |||
elif rc.flags == "restore" and r_restore.match(comment): | |||
p = r_restored_page.findall(rc.comment)[0] | |||
#tasks.start("afc_copyvios", action="restore", page=p) | |||
tasks.start("afc_copyvios", page=p) | |||
chans.add("#wikipedia-en-afc") | |||
elif rc.flags == "protect" and r_protect.match(comment): | |||
chans.add("#wikipedia-en-afc") | |||
@@ -20,6 +20,12 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from os.path import expanduser | |||
from threading import Lock | |||
import oursql | |||
from earwigbot import wiki | |||
from earwigbot.classes import BaseTask | |||
from earwigbot.config import config | |||
@@ -30,9 +36,75 @@ class Task(BaseTask): | |||
number = 1 | |||
def __init__(self):
    """Read this task's settings from the bot configuration.

    config.decrypt() is called on the search engine's "key" and "secret"
    credentials first; the task's config section is fetched afterwards.
    Every setting is optional and falls back to the default given here
    when it is missing from the config.
    """
    # Run decryption before reading the section so it is fetched only once.
    for field in ("key", "secret"):
        config.decrypt(config.tasks, self.name, "search", "credentials", field)
    self.cfg = cfg = config.tasks.get(self.name, {})

    self.template = cfg.get("template", "AfC suspected copyvio")
    self.ignore_list = cfg.get("ignoreList", [])
    default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}"
    self.summary = self.make_summary(cfg.get("summary", default_summary))

    # Search API data:
    search = cfg.get("search", {})
    self.engine = search.get("engine")
    self.credentials = search.get("credentials", {})

    # Connection data for our SQL database. Work on a copy so that adding
    # "read_default_file" does not modify the shared config dictionary.
    kwargs = dict(cfg.get("sql", {}))
    kwargs["read_default_file"] = expanduser("~/.my.cnf")
    self.conn_data = kwargs
    self.db_access_lock = Lock()
def run(self, **kwargs):
    """Entry point for the bot task.

    Takes a page title in kwargs and checks it for copyvios, adding
    {{self.template}} at the top if a copyvio has been detected. A page is
    only checked once (processed pages are stored by page_id in an SQL
    database).

    Keyword arguments:
    page -- title of the page to check (required)

    Raises KeyError if "page" is not given. Does nothing if
    self.shutoff_enabled() returns a true value.
    """
    if self.shutoff_enabled():
        return
    title = kwargs["page"]
    page = wiki.get_site().get_page(title)

    # self.conn is shared instance state, so the connection is opened, used
    # and closed while the lock is held.
    with self.db_access_lock:
        self.conn = oursql.connect(**self.conn_data)
        try:
            self.process(page)
        finally:
            # Release the connection even if processing raised.
            self.conn.close()
def process(self, page):
    """Detect copyvios in 'page' and add a note if any are found.

    A page whose ID is already recorded as processed is skipped. Otherwise
    page.copyvio_check() is run with the configured engine and credentials;
    if it returns a true value, {{self.template}} (with the result as its
    "url" parameter) is prepended to the page text and the page is saved.
    In both cases the page ID is then recorded as processed.
    """
    pageid = page.pageid()
    if self.has_been_processed(pageid):
        msg = "Skipping check on already processed page [[{0}]]"
        self.logger.info(msg.format(page.title()))
        return

    self.logger.info("Checking [[{0}]]".format(page.title()))
    content = page.get()
    result = page.copyvio_check(self.engine, self.credentials)
    if result:
        # Literal braces must be doubled for str.format(); a backslash does
        # not escape them. This renders as "{{<template>|url=<result>}}".
        # NOTE(review): 'result' is interpolated as-is; confirm that
        # copyvio_check() yields a single URL rather than a list of them.
        template = "{{{{{0}|url={1}}}}}".format(self.template, result)
        newtext = "\n".join((template, content))
        page.edit(newtext, self.summary.format(url=result))
        msg = "Found violation: [[{0}]] -> {1}"
        self.logger.info(msg.format(page.title(), result))
    else:
        self.logger.debug("No violations detected")

    self.log_processed(pageid)
def has_been_processed(self, pageid):
    """Return True if the 'processed' table has a row whose page_id equals
    'pageid', and False otherwise."""
    lookup = "SELECT 1 FROM processed WHERE page_id = ?"
    with self.conn.cursor() as cur:
        cur.execute(lookup, (pageid,))
        rows = cur.fetchall()
    # Any returned row means the page was recorded earlier.
    return bool(rows)
def log_processed(self, pageid):
    """Insert 'pageid' as a new row into the 'processed' table."""
    insertion = "INSERT INTO processed VALUES (?)"
    params = (pageid,)
    with self.conn.cursor() as cur:
        cur.execute(insertion, params)
@@ -185,7 +185,7 @@ class Task(BaseTask): | |||
This is used by the template as a hidden sortkey. | |||
""" | |||
return (dt - datetime(1970, 1, 1)).total_seconds() | |||
return int((dt - datetime(1970, 1, 1)).total_seconds()) | |||
def sync(self, **kwargs): | |||
"""Synchronize our local statistics database with the site. | |||
@@ -0,0 +1,81 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009, 2010, 2011 by Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from json import loads | |||
from urllib import quote_plus, urlencode | |||
try: | |||
import oauth2 as oauth | |||
except ImportError: | |||
oauth = None | |||
from earwigbot.wiki.exceptions import * | |||
class CopyrightMixin(object):
    """
    EarwigBot's Wiki Toolset: Copyright Violation Mixin

    A mixin offering a single public method, copyvio_check(), which runs the
    page's content through a search engine API. The caller supplies the
    engine name and that engine's API credentials as arguments.
    """

    def _yahoo_boss_query(self, query, cred):
        """Run a Yahoo! BOSS web search for 'query', authenticating with 'cred'.

        'cred' must provide the OAuth consumer "key" and "secret". Returns the
        list of result URLs in the order the API supplied them, or an empty
        list if the response has no web results. (The API is expected to
        return no more than fifty, ranked by relevance as determined by
        Yahoo; that is not enforced here.) Raises SearchQueryError() if the
        response status is not "200" or the body is not valid JSON.
        """
        # NOTE(review): the query goes through quote_plus() and is then
        # encoded again by urlencode(), so it ends up percent-encoded twice;
        # confirm that this is what the API expects.
        encoded = urlencode(
            {"q": quote_plus(query), "style": "raw", "format": "json"})
        url = "{0}?{1}".format("http://yboss.yahooapis.com/ysearch/web", encoded)

        client = oauth.Client(
            oauth.Consumer(key=cred["key"], secret=cred["secret"]))
        headers, body = client.request(url, "GET")

        status = headers["status"]
        if status != "200":
            template = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
            raise SearchQueryError(template.format(status, body))

        try:
            decoded = loads(body)
        except ValueError:
            raise SearchQueryError("Yahoo! BOSS Error: JSON could not be decoded")

        try:
            hits = decoded["bossresponse"]["web"]["results"]
        except KeyError:
            # No web results section in the response.
            return []
        return [hit["url"] for hit in hits]

    def copyvio_check(self, engine, credentials, force=False):
        """Check the page for copyright violations.

        'engine' names the search engine to use; only "Yahoo! BOSS" is
        recognized. 'credentials' is handed to the engine's query method, and
        'force' is passed on to self.get() when fetching the page content.
        Returns whatever the engine's query method returns for that content.

        Raises UnknownSearchEngineError if 'engine' is not recognized, and
        UnsupportedSearchEngineError if the 'oauth2' package is unavailable.
        """
        if engine != "Yahoo! BOSS":
            raise UnknownSearchEngineError(engine)
        if not oauth:
            raise UnsupportedSearchEngineError(
                "The package 'oauth2' could not be imported")
        return self._yahoo_boss_query(self.get(force), credentials)
@@ -23,7 +23,29 @@ | |||
""" | |||
EarwigBot's Wiki Toolset: Exceptions | |||
This module contains all exceptions used by the wiki package. There are a lot. | |||
This module contains all exceptions used by the wiki package. There are a lot: | |||
-- WikiToolsetError | |||
-- SiteNotFoundError | |||
-- SiteAPIError | |||
-- LoginError | |||
-- NamespaceNotFoundError | |||
-- PageNotFoundError | |||
-- InvalidPageError | |||
-- RedirectError | |||
-- UserNotFoundError | |||
-- EditError | |||
-- PermissionsError | |||
-- EditConflictError | |||
-- NoContentError | |||
-- ContentTooBigError | |||
-- SpamDetectedError | |||
-- FilteredError | |||
-- SQLError | |||
-- CopyvioCheckError | |||
-- UnknownSearchEngineError | |||
-- UnsupportedSearchEngineError | |||
-- SearchQueryError | |||
""" | |||
class WikiToolsetError(Exception): | |||
@@ -87,3 +109,16 @@ class FilteredError(EditError): | |||
class SQLError(WikiToolsetError):
    """An error involving SQL querying occurred."""
class CopyvioCheckError(WikiToolsetError):
    """An error occurred when checking a page for copyright violations.

    Base class for the more specific copyvio-checking exceptions."""
class UnknownSearchEngineError(CopyvioCheckError):
    """CopyrightMixin().copyvio_check() was called with an unknown engine."""
class UnsupportedSearchEngineError(CopyvioCheckError):
    """The requested engine is not available, e.g., because a required
    package is missing."""
class SearchQueryError(CopyvioCheckError):
    """Some error occurred while doing a search query."""
@@ -54,7 +54,8 @@ def _load_config(): | |||
is_encrypted = config.load() | |||
if is_encrypted: # Passwords in the config file are encrypted | |||
key = getpass("Enter key to unencrypt bot passwords: ") | |||
config.decrypt(key) | |||
config._decryption_key = key | |||
config.decrypt(config.wiki, "password") | |||
def _get_cookiejar(): | |||
"""Returns a LWPCookieJar object loaded from our .cookies file. The same | |||
@@ -25,9 +25,10 @@ import re | |||
from time import gmtime, strftime | |||
from urllib import quote | |||
from earwigbot.wiki.copyright import CopyrightMixin | |||
from earwigbot.wiki.exceptions import * | |||
class Page(object): | |||
class Page(CopyrightMixin): | |||
""" | |||
EarwigBot's Wiki Toolset: Page Class | |||
@@ -49,7 +50,8 @@ class Page(object): | |||
get -- returns page content | |||
get_redirect_target -- if the page is a redirect, returns its destination | |||
edit -- replaces the page's content or creates a new page | |||
add_section -- add a new section at the bottom of the page | |||
add_section -- adds a new section at the bottom of the page | |||
copyvio_check -- checks the page for copyright violations | |||
""" | |||
re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]" | |||