@@ -1,4 +1,4 @@ | |||||
[EarwigBot](http://toolserver.org/~earwig/earwigbot/) is a | |||||
[EarwigBot](http://en.wikipedia.org/wiki/User:EarwigBot) is a | |||||
[Python](http://python.org/) robot that edits | [Python](http://python.org/) robot that edits | ||||
[Wikipedia](http://en.wikipedia.org/) and interacts with people over | [Wikipedia](http://en.wikipedia.org/) and interacts with people over | ||||
[IRC](http://en.wikipedia.org/wiki/Internet_Relay_Chat). | [IRC](http://en.wikipedia.org/wiki/Internet_Relay_Chat). | ||||
@@ -31,3 +31,9 @@ Additionally, the afc_history task uses | |||||
[matplotlib](http://matplotlib.sourceforge.net/) and | [matplotlib](http://matplotlib.sourceforge.net/) and | ||||
[numpy](http://numpy.scipy.org/) for graphing AfC statistics. Neither of these | [numpy](http://numpy.scipy.org/) for graphing AfC statistics. Neither of these | ||||
modules are required for the main bot itself. | modules are required for the main bot itself. | ||||
`earwigbot.wiki.copyright` requires access to a search engine for detecting | |||||
copyright violations. Currently, | |||||
[Yahoo! BOSS](http://developer.yahoo.com/search/boss/) is the only engine | |||||
supported, and this requires | |||||
[oauth2](https://github.com/simplegeo/python-oauth2). |
@@ -51,36 +51,34 @@ def process(rc): | |||||
chans = set() # channels to report this message to | chans = set() # channels to report this message to | ||||
page_name = rc.page.lower() | page_name = rc.page.lower() | ||||
comment = rc.comment.lower() | comment = rc.comment.lower() | ||||
if "!earwigbot" in rc.msg.lower(): | if "!earwigbot" in rc.msg.lower(): | ||||
chans.update(("##earwigbot", "#wikipedia-en-afc")) | chans.update(("##earwigbot", "#wikipedia-en-afc")) | ||||
if r_page.search(page_name): | if r_page.search(page_name): | ||||
#tasks.start("afc_copyvios", action="edit", page=rc.page) | |||||
tasks.start("afc_copyvios", page=rc.page) | |||||
chans.add("#wikipedia-en-afc") | chans.add("#wikipedia-en-afc") | ||||
elif r_ffu.match(page_name): | elif r_ffu.match(page_name): | ||||
chans.add("#wikipedia-en-afc") | chans.add("#wikipedia-en-afc") | ||||
elif page_name.startswith("template:afc submission"): | elif page_name.startswith("template:afc submission"): | ||||
chans.add("#wikipedia-en-afc") | chans.add("#wikipedia-en-afc") | ||||
elif rc.flags == "move" and (r_move1.match(comment) or | elif rc.flags == "move" and (r_move1.match(comment) or | ||||
r_move2.match(comment)): | r_move2.match(comment)): | ||||
p = r_moved_pages.findall(rc.comment)[0] | p = r_moved_pages.findall(rc.comment)[0] | ||||
#tasks.start("afc_copyvios", action="move", page=p) | |||||
chans.add("#wikipedia-en-afc") | chans.add("#wikipedia-en-afc") | ||||
elif rc.flags == "delete" and r_delete.match(comment): | elif rc.flags == "delete" and r_delete.match(comment): | ||||
p = r_deleted_page.findall(rc.comment)[0] | p = r_deleted_page.findall(rc.comment)[0] | ||||
#tasks.start("afc_copyvios", action="delete", page=p) | |||||
chans.add("#wikipedia-en-afc") | chans.add("#wikipedia-en-afc") | ||||
elif rc.flags == "restore" and r_restore.match(comment): | elif rc.flags == "restore" and r_restore.match(comment): | ||||
p = r_restored_page.findall(rc.comment)[0] | p = r_restored_page.findall(rc.comment)[0] | ||||
#tasks.start("afc_copyvios", action="restore", page=p) | |||||
tasks.start("afc_copyvios", page=p) | |||||
chans.add("#wikipedia-en-afc") | chans.add("#wikipedia-en-afc") | ||||
elif rc.flags == "protect" and r_protect.match(comment): | elif rc.flags == "protect" and r_protect.match(comment): | ||||
chans.add("#wikipedia-en-afc") | chans.add("#wikipedia-en-afc") | ||||
@@ -20,6 +20,12 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
from os.path import expanduser | |||||
from threading import Lock | |||||
import oursql | |||||
from earwigbot import wiki | |||||
from earwigbot.classes import BaseTask | from earwigbot.classes import BaseTask | ||||
from earwigbot.config import config | from earwigbot.config import config | ||||
@@ -30,9 +36,75 @@ class Task(BaseTask): | |||||
number = 1 | number = 1 | ||||
def __init__(self): | def __init__(self): | ||||
"""Load this task's settings from the bot config onto the instance."""
# NOTE(review): this view seems to interleave removed and added diff lines;
# the two config.tasks.get() assignments below are probably the old and new
# form of the same statement -- confirm which one is live in the file.
self.cfg = cfg = config.tasks.get(self.name, {}) | |||||
# Run config.decrypt() on the "key" and "secret" entries found under
# tasks -> <task name> -> search -> credentials (presumably decrypted in
# place so the lookups below see plaintext -- verify against config.decrypt).
config.decrypt(config.tasks, self.name, "search", "credentials", "key") | config.decrypt(config.tasks, self.name, "search", "credentials", "key") | ||||
config.decrypt(config.tasks, self.name, "search", "credentials", "secret") | config.decrypt(config.tasks, self.name, "search", "credentials", "secret") | ||||
cfg = config.tasks.get(self.name, {}) | |||||
# Name of the template that process() puts at the top of a flagged page.
self.template = cfg.get("template", "AfC suspected copyvio") | |||||
self.ignore_list = cfg.get("ignoreList", []) | |||||
# Edit summary; process() later fills in the {url} placeholder.
default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}" | |||||
self.summary = self.make_summary(cfg.get("summary", default_summary)) | |||||
# Search API data: | |||||
search = cfg.get("search", {}) | |||||
self.engine = search.get("engine") | |||||
self.credentials = search.get("credentials", {}) | |||||
# Connection data for our SQL database: | |||||
# These become the keyword arguments of oursql.connect() in run(); note
# that "read_default_file" is written into the object returned by cfg.get().
kwargs = cfg.get("sql", {}) | |||||
kwargs["read_default_file"] = expanduser("~/.my.cnf") | |||||
self.conn_data = kwargs | |||||
# Held by run() around the connect-and-process step.
self.db_access_lock = Lock() | |||||
def run(self, **kwargs):
    """Entry point for the bot task.

    Takes a page title in kwargs and checks it for copyvios, adding
    {{self.template}} at the top if a copyvio has been detected. A page is
    only checked once (processed pages are stored by page_id in an SQL
    database).

    Returns immediately if self.shutoff_enabled() is true. Raises KeyError
    if kwargs has no "page" entry. The database connection opened for the
    check is always closed again, even if processing raises.
    """
    if self.shutoff_enabled():
        return
    title = kwargs["page"]
    page = wiki.get_site().get_page(title)

    # self.conn is shared instance state, so the lock is held for the whole
    # connect -> process -> close cycle.
    with self.db_access_lock:
        self.conn = oursql.connect(**self.conn_data)
        try:
            self.process(page)
        finally:
            # A fresh connection is opened on every run; release this one
            # instead of leaving it open until it is garbage-collected.
            self.conn.close()
def process(self, page):
    """Detect copyvios in 'page' and add a note if any are found.

    Pages whose id is already recorded (per self.has_been_processed()) are
    skipped. Otherwise page.copyvio_check() is run with the configured
    engine and credentials; if it returns a true value, the page is
    re-saved with {{<self.template>|url=<result>}} prepended on its own
    line. In both cases the page id is then recorded through
    self.log_processed().
    """
    pageid = page.pageid()
    if self.has_been_processed(pageid):
        msg = "Skipping check on already processed page [[{0}]]"
        self.logger.info(msg.format(page.title()))
        return

    self.logger.info("Checking [[{0}]]".format(page.title()))
    content = page.get()
    result = page.copyvio_check(self.engine, self.credentials)
    if result:
        # Literal braces in str.format() are written by doubling them, so
        # "{{{{" / "}}}}" produce the "{{" / "}}" of the wiki template call.
        template = "{{{{{0}|url={1}}}}}".format(self.template, result)
        newtext = "\n".join((template, content))
        page.edit(newtext, self.summary.format(url=result))
        msg = "Found violation: [[{0}]] -> {1}"
        self.logger.info(msg.format(page.title(), result))
    else:
        self.logger.debug("No violations detected")
    self.log_processed(pageid)
def has_been_processed(self, pageid):
    """Return True if 'pageid' has a row in the 'processed' table.

    Runs a parameterized SELECT on self.conn; any matching row counts.
    """
    sql = "SELECT 1 FROM processed WHERE page_id = ?"
    with self.conn.cursor() as cur:
        cur.execute(sql, (pageid,))
        rows = cur.fetchall()
    return bool(rows)
def log_processed(self, pageid):
    """Insert 'pageid' into the 'processed' table via self.conn."""
    with self.conn.cursor() as cur:
        cur.execute("INSERT INTO processed VALUES (?)", (pageid,))
@@ -185,7 +185,7 @@ class Task(BaseTask): | |||||
This is used by the template as a hidden sortkey. | This is used by the template as a hidden sortkey. | ||||
""" | """ | ||||
return (dt - datetime(1970, 1, 1)).total_seconds() | |||||
return int((dt - datetime(1970, 1, 1)).total_seconds()) | |||||
def sync(self, **kwargs): | def sync(self, **kwargs): | ||||
"""Synchronize our local statistics database with the site. | """Synchronize our local statistics database with the site. | ||||
@@ -0,0 +1,81 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009, 2010, 2011 by Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
from json import loads | |||||
from urllib import quote_plus, urlencode | |||||
try: | |||||
import oauth2 as oauth | |||||
except ImportError: | |||||
oauth = None | |||||
from earwigbot.wiki.exceptions import * | |||||
class CopyrightMixin(object):
    """
    EarwigBot's Wiki Toolset: Copyright Violation Mixin

    This is a mixin that provides one public method, copyvio_check(), which
    checks the page for copyright violations using a search engine API. The
    API keys must be provided to the method as arguments.

    The class it is mixed into must provide get(force), which copyvio_check()
    calls to obtain the text that is sent to the search engine.
    """

    def _yahoo_boss_query(self, query, cred):
        """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials.

        'cred' is indexed with "key" and "secret" to build the OAuth consumer.

        Returns a list of URLs, no more than fifty, ranked by relevance (as
        determined by Yahoo). An empty list is returned when the decoded
        response has no "bossresponse" -> "web" -> "results" entry. Raises
        SearchQueryError() on errors (a status other than "200", or a body
        that is not valid JSON).
        """
        base_url = "http://yboss.yahooapis.com/ysearch/web"
        # NOTE(review): the query is passed through quote_plus() and then
        # through urlencode(), i.e. it is percent-encoded twice -- confirm
        # that the BOSS endpoint really expects this before changing it.
        params = {"q": quote_plus(query), "style": "raw", "format": "json"}
        url = "{0}?{1}".format(base_url, urlencode(params))

        consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"])
        client = oauth.Client(consumer)
        headers, body = client.request(url, "GET")

        if headers["status"] != "200":
            e = "Yahoo! BOSS Error: got response code '{0}':\n{1}"
            raise SearchQueryError(e.format(headers["status"], body))

        try:
            res = loads(body)
        except ValueError:
            e = "Yahoo! BOSS Error: JSON could not be decoded"
            raise SearchQueryError(e)

        try:
            results = res["bossresponse"]["web"]["results"]
        except KeyError:
            # A response without a results section is treated as "no hits".
            return []
        return [result["url"] for result in results]

    def copyvio_check(self, engine, credentials, force=False):
        """Check the page for copyright violations.

        'engine' names the search engine to use; only "Yahoo! BOSS" is
        recognized. 'credentials' is handed to the engine-specific query
        method, and 'force' is passed through to self.get().

        Returns whatever the engine's query method returns. Raises
        UnknownSearchEngineError for any other engine name, and
        UnsupportedSearchEngineError if the oauth2 package needed for
        Yahoo! BOSS could not be imported.
        """
        if engine == "Yahoo! BOSS":
            if not oauth:
                e = "The package 'oauth2' could not be imported"
                raise UnsupportedSearchEngineError(e)
            querier = self._yahoo_boss_query
        else:
            raise UnknownSearchEngineError(engine)
        content = self.get(force)
        return querier(content, credentials)
@@ -23,7 +23,29 @@ | |||||
""" | """ | ||||
EarwigBot's Wiki Toolset: Exceptions | EarwigBot's Wiki Toolset: Exceptions | ||||
This module contains all exceptions used by the wiki package. There are a lot. | |||||
This module contains all exceptions used by the wiki package. There are a lot: | |||||
-- WikiToolsetError | |||||
-- SiteNotFoundError | |||||
-- SiteAPIError | |||||
-- LoginError | |||||
-- NamespaceNotFoundError | |||||
-- PageNotFoundError | |||||
-- InvalidPageError | |||||
-- RedirectError | |||||
-- UserNotFoundError | |||||
-- EditError | |||||
-- PermissionsError | |||||
-- EditConflictError | |||||
-- NoContentError | |||||
-- ContentTooBigError | |||||
-- SpamDetectedError | |||||
-- FilteredError | |||||
-- SQLError | |||||
-- CopyvioCheckError | |||||
-- UnknownSearchEngineError | |||||
-- UnsupportedSearchEngineError | |||||
-- SearchQueryError | |||||
""" | """ | ||||
class WikiToolsetError(Exception): | class WikiToolsetError(Exception): | ||||
@@ -87,3 +109,16 @@ class FilteredError(EditError): | |||||
class SQLError(WikiToolsetError): | class SQLError(WikiToolsetError): | ||||
"""Some error involving SQL querying occurred.""" | """Some error involving SQL querying occurred.""" | ||||
class CopyvioCheckError(WikiToolsetError): | |||||
"""An error occurred when checking a page for copyright violations.""" | |||||
class UnknownSearchEngineError(CopyvioCheckError): | |||||
"""CopyrightMixin().copyvio_check() called with an unknown engine.""" | |||||
class UnsupportedSearchEngineError(CopyvioCheckError): | |||||
"""The engine requested is not available, e.g., because a required package | |||||
is missing.""" | |||||
class SearchQueryError(CopyvioCheckError): | |||||
"""Some error occurred while doing a search query.""" |
@@ -54,7 +54,8 @@ def _load_config(): | |||||
is_encrypted = config.load() | is_encrypted = config.load() | ||||
if is_encrypted: # Passwords in the config file are encrypted | if is_encrypted: # Passwords in the config file are encrypted | ||||
key = getpass("Enter key to unencrypt bot passwords: ") | key = getpass("Enter key to unencrypt bot passwords: ") | ||||
config.decrypt(key) | |||||
config._decryption_key = key | |||||
config.decrypt(config.wiki, "password") | |||||
def _get_cookiejar(): | def _get_cookiejar(): | ||||
"""Returns a LWPCookieJar object loaded from our .cookies file. The same | """Returns a LWPCookieJar object loaded from our .cookies file. The same | ||||
@@ -25,9 +25,10 @@ import re | |||||
from time import gmtime, strftime | from time import gmtime, strftime | ||||
from urllib import quote | from urllib import quote | ||||
from earwigbot.wiki.copyright import CopyrightMixin | |||||
from earwigbot.wiki.exceptions import * | from earwigbot.wiki.exceptions import * | ||||
class Page(object): | |||||
class Page(CopyrightMixin): | |||||
""" | """ | ||||
EarwigBot's Wiki Toolset: Page Class | EarwigBot's Wiki Toolset: Page Class | ||||
@@ -49,7 +50,8 @@ class Page(object): | |||||
get -- returns page content | get -- returns page content | ||||
get_redirect_target -- if the page is a redirect, returns its destination | get_redirect_target -- if the page is a redirect, returns its destination | ||||
edit -- replaces the page's content or creates a new page | edit -- replaces the page's content or creates a new page | ||||
add_section -- add a new section at the bottom of the page | |||||
add_section -- adds a new section at the bottom of the page | |||||
copyvio_check -- checks the page for copyright violations | |||||
""" | """ | ||||
re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]" | re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]" | ||||