Переглянути джерело

creating backbone for wiki-editing tasks: they can be spawned on a cron-like schedule with config/schedule.py, in response to certain edits in config/watcher.py, or through IRC (not implemented yet - I'll need to figure out permissions); task files are in wiki/tasks/, and the wiki-editing tools (think very simplified pywikipedia) will be in wiki/tools/

tags/v0.1
Ben Kurtovic 13 роки тому
джерело
коміт
655fe78312
22 змінених файлів з 396 додано та 40 видалено
  1. +24
    -0
      config/main.py
  2. +28
    -0
      config/schedule.py
  3. +43
    -7
      config/watcher.py
  4. +83
    -11
      core/main.py
  5. +11
    -4
      earwigbot.py
  6. +2
    -6
      irc/command_handler.py
  7. +1
    -1
      irc/commands/git.py
  8. +5
    -1
      irc/connection.py
  9. +5
    -5
      irc/frontend.py
  10. +6
    -5
      irc/watcher.py
  11. +12
    -0
      wiki/base_task.py
  12. +80
    -0
      wiki/task_manager.py
  13. +0
    -0
     
  14. +12
    -0
      wiki/tasks/afc_catdelink.py
  15. +12
    -0
      wiki/tasks/afc_copyvios.py
  16. +12
    -0
      wiki/tasks/afc_dailycats.py
  17. +12
    -0
      wiki/tasks/afc_statistics.py
  18. +12
    -0
      wiki/tasks/afc_undated.py
  19. +12
    -0
      wiki/tasks/blptag.py
  20. +12
    -0
      wiki/tasks/feed_dailycats.py
  21. +12
    -0
      wiki/tasks/wrongmime.py
  22. +0
    -0
     

+ 24
- 0
config/main.py Переглянути файл

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-

# EarwigBot Configuration File
# This file tells the bot which of its components should be enabled.

# The IRC frontend (configured in config/irc.py) sits on a public IRC network,
# responds to commands given to it, and reports edits (if the IRC watcher
# component is enabled).
enable_irc_frontend = True

# The IRC watcher (connection details configured in config/irc.py as well) sits
# on an IRC network that gives a recent changes feed, usually irc.wikimedia.net.
# It looks for edits matching certain (often regex) patterns (rules configured
# in config/watcher.py), and either reports them to the IRC frontend (if
# enabled), or activates a task on the WikiBot (if configured to do so).
enable_irc_watcher = True

# EarwigBot doesn't have to edit a wiki, although this is its main purpose. If
# the wiki schedule is disabled, it will not be able to handle scheduled tasks
# that involve editing (such as creating a daily category every day at midnight
# UTC), but it can still edit through rules given in the watcher, and bot tasks
# can still be activated by the command line. The schedule is configured in
# config/schedule.py.
enable_wiki_schedule = True

+ 28
- 0
config/schedule.py Переглянути файл

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-

# EarwigBot Configuration File
# This file tells the bot when to run certain wiki-editing tasks.

def check(minute, hour, month_day, month, week_day):
    """Return the wiki-editing tasks that should be started at the given time.

    The arguments are the fields of a UTC ``time.struct_time`` as passed by
    wiki/task_manager.py: ``tm_min``, ``tm_hour``, ``tm_mday``, ``tm_mon`` and
    ``tm_wday``. Note that ``tm_wday`` counts from Monday (0) to Sunday (6),
    which is NOT the cron convention of Sunday being 0.

    Returns a list whose items are either a task name, or a tuple of
    (task_name, kwargs) when keyword arguments should be passed to the task.
    """
    # week_day values, following time.struct_time.tm_wday (Monday is 0)
    monday, tuesday, wednesday, sunday = 0, 1, 2, 6

    tasks = []  # tasks to run this turn, each as a tuple of (task_name, kwargs) or just task_name

    if minute == 0:  # run every hour on the hour
        tasks.append(("afc_statistics", {"action": "save"}))  # save statistics to [[Template:AFC_statistics]]

        if hour == 0:  # run every day at midnight
            tasks.append("afc_dailycats")  # create daily categories for WP:AFC
            tasks.append("feed_dailycats")  # create daily categories for WP:FEED

            if week_day == sunday:  # run every Sunday at midnight (that is, the start of Sunday, not the end)
                tasks.append("afc_undated")  # clear [[Category:Undated AfC submissions]]

            if week_day == monday:  # run every Monday at midnight
                tasks.append("afc_catdelink")  # delink mainspace categories in declined AfC submissions

            if week_day == tuesday:  # run every Tuesday at midnight
                tasks.append("wrongmime")  # tag files whose extensions do not agree with their MIME type

            if week_day == wednesday:  # run every Wednesday at midnight
                tasks.append("blptag")  # add |blp=yes to {{WPB}} or {{WPBS}} when it is used along with {{WP Biography}}

    return tasks

+ 43
- 7
config/watcher.py Переглянути файл

@@ -5,30 +5,66 @@

import re

from wiki import task_manager

# Define different report channels on our front-end server. They /must/ be in CHANS in config/irc.py or the bot will not be able to send messages to them (unless they have -n set).
AFC_CHANS = ["#wikipedia-en-afc"] # report recent AfC changes/give AfC status messages upon join
#AFC_CHANS = ["#wikipedia-en-afc"] # report recent AfC changes/give AfC status messages upon join
AFC_CHANS = ["##earwigbot"] # report recent AfC changes/give AfC status messages upon join
BOT_CHANS = ["##earwigbot", "#wikipedia-en-afc"] # report edits containing "!earwigbot"

# Define some commonly used strings.
# All patterns below are matched against lower-cased text unless they start
# with "^", in which case they are used to extract page titles from the
# original (case-preserved) log comment.
afc_prefix = "wikipedia( talk)?:(wikiproject )?articles for creation"

# Define our compiled regexps used when finding certain edits.
# Raw strings are used so that regex escapes such as \[ are not treated as
# (invalid) string escape sequences.
r_page = re.compile(afc_prefix)
r_ffu = re.compile("wikipedia( talk)?:files for upload")
r_move1 = re.compile(r"moved \[\[{}".format(afc_prefix))  # an AFC page was either moved locally or out
r_move2 = re.compile(r"moved \[\[(.*?)\]\] to \[\[{}".format(afc_prefix))  # an outside page was moved into AFC
r_moved_pages = re.compile(r"^moved \[\[(.*?)\]\] to \[\[(.*?)\]\]")
r_delete = re.compile(r'deleted "\[\[{}'.format(afc_prefix))
r_deleted_page = re.compile(r'^deleted "\[\[(.*?)\]\]')
r_restore = re.compile(r'restored "\[\[{}'.format(afc_prefix))
r_restored_page = re.compile(r'^restored "\[\[(.*?)\]\]')
r_protect = re.compile(r'protected "\[\[{}'.format(afc_prefix))

def _start_afc_tasks(action, **kwargs):
    """Start both AfC bookkeeping tasks with the same action and arguments."""
    task_manager.start_task("afc_statistics", action=action, **kwargs)
    task_manager.start_task("afc_copyvios", action=action, **kwargs)

def process(rc):
    """Decide what to do with a single recent-changes event.

    `rc` is read through its `page`, `comment`, `msg` and `flags` attributes.
    Depending on which rule matches, AfC wiki tasks are started through the
    task manager. Returns the set of front-end channels the event should be
    reported to (possibly empty).
    """
    chans = set()  # channels to report this message to
    page_name = rc.page.lower()
    comment = rc.comment.lower()

    if "!earwigbot" in rc.msg.lower():
        chans.update(BOT_CHANS)

    if r_page.search(page_name):
        _start_afc_tasks("process_edit", page=rc.page)
        chans.update(AFC_CHANS)

    elif r_ffu.match(page_name):
        chans.update(AFC_CHANS)

    elif page_name.startswith("template:afc submission"):
        chans.update(AFC_CHANS)

    elif rc.flags == "move" and (r_move1.match(comment) or r_move2.match(comment)):
        # Titles are extracted from the original comment to keep their case;
        # each findall() item here is an (old_title, new_title) tuple.
        moved = r_moved_pages.findall(rc.comment)
        if moved:
            _start_afc_tasks("process_move", pages=moved[0])
        chans.update(AFC_CHANS)

    elif rc.flags == "delete" and r_delete.match(comment):
        # The pattern has a single group, so findall() returns plain strings;
        # indexing the result twice would yield only the title's first character.
        deleted = r_deleted_page.findall(rc.comment)
        if deleted:
            _start_afc_tasks("process_delete", page=deleted[0])
        chans.update(AFC_CHANS)

    # NOTE(review): restores are looked for under the "delete" flag, presumably
    # because they are recorded in the deletion log - confirm against irc/rc.py.
    elif rc.flags == "delete" and r_restore.match(comment):
        restored = r_restored_page.findall(rc.comment)
        if restored:
            _start_afc_tasks("process_restore", page=restored[0])
        chans.update(AFC_CHANS)

    elif rc.flags == "protect" and r_protect.match(comment):
        chans.update(AFC_CHANS)

    return chans

+ 83
- 11
core/main.py Переглянути файл

@@ -1,11 +1,23 @@
# -*- coding: utf-8 -*-

## EarwigBot's Core
## Basically, this creates threads for our IRC watcher component and Wikipedia component, and then runs the main IRC bot on the main thread.

## The IRC bot component of EarwigBot has two parts: a front-end and a watcher.
## The front-end runs on a normal IRC server and expects users to interact with it/give it commands.
## The watcher runs on a wiki recent-changes server and listens for edits. Users cannot interact with this part of the bot.
## EarwigBot has three components that can run independently of each other: an
## IRC front-end, an IRC watcher, and a wiki scheduler.
## * The IRC front-end runs on a normal IRC server and expects users to
## interact with it/give it commands.
## * The IRC watcher runs on a wiki recent-changes server and listens for
## edits. Users cannot interact with this part of the bot.
## * The wiki scheduler runs wiki-editing bot tasks in separate threads at
## user-defined times through a cron-like interface.

## There is a "priority" system here:
## 1. If the IRC frontend is enabled, it will run on the main thread, and the
## IRC watcher and wiki scheduler (if enabled) will run on separate threads.
## 2. If the wiki scheduler is enabled, it will run on the main thread, and the
## IRC watcher (if enabled) will run on a separate thread.
## 3. If the IRC watcher is enabled, it will run on the main (and only) thread.
## Else, the bot will stop, as no components are enabled.

import threading
import time
@@ -16,35 +28,95 @@ import os
parent_dir = os.path.split(sys.path[0])[0]
sys.path.append(parent_dir) # make sure we look in the parent directory for modules

from config.main import *
from irc import frontend, watcher
from wiki import task_manager

f_conn = None
w_conn = None

def irc_watcher(f_conn=None):
    """Function to handle the IRC watcher as another thread (if frontend and/or
    scheduler is enabled), otherwise run as the main thread.

    `f_conn` is the front-end connection handed on to watcher.main(); it
    defaults to None so the watcher can be started when no front-end exists.
    This function never returns: whenever watcher.main() stops or fails, a
    fresh connection is created after a short pause.
    """
    global w_conn
    print("\nStarting IRC watcher...")
    while True:  # restart the watcher component if (just) it breaks
        w_conn = watcher.get_connection()
        w_conn.connect()
        print("")  # print a blank line here to signify that the bot has finished starting up
        try:
            watcher.main(w_conn, f_conn)
        except Exception:
            # Only ordinary errors are swallowed; KeyboardInterrupt and
            # SystemExit must still be able to stop the bot when the watcher
            # runs on the main thread.
            traceback.print_exc()
        time.sleep(5)  # sleep a bit before restarting watcher
        print("\nWatcher has stopped; restarting component...")

def run():
def wiki_scheduler():
    """Function to handle the wiki scheduler as another thread, or as the
    primary thread if the IRC frontend is not enabled.

    Once per loop, the current UTC time is handed to the task manager, which
    starts whatever tasks are scheduled for it; the loop then sleeps for the
    remainder of a 60-second period. This function never returns.
    """
    while True:
        time_start = time.time()
        now = time.gmtime(time_start)
        task_manager.start_tasks(now)
        time_diff = time.time() - time_start  # seconds spent starting tasks
        if time_diff < 60:  # sleep until the next minute
            time.sleep(60 - time_diff)

def irc_frontend():
    """If the IRC frontend is enabled, make it run on our primary thread, and
    enable the wiki scheduler and IRC watcher on new threads if they are
    enabled.

    Both helper threads are daemon threads. After frontend.main() returns, the
    watcher connection (if one exists) and the front-end connection are closed.
    """
    global f_conn
    print("\nStarting IRC frontend...")
    f_conn = frontend.get_connection()
    frontend.startup(f_conn)

    if enable_wiki_schedule:
        print("\nStarting wiki scheduler...")
        task_manager.load_tasks()
        t_scheduler = threading.Thread(target=wiki_scheduler)
        t_scheduler.name = "wiki-scheduler"
        t_scheduler.daemon = True
        t_scheduler.start()

    if enable_irc_watcher:
        t_watcher = threading.Thread(target=irc_watcher, args=(f_conn,))
        t_watcher.name = "irc-watcher"
        t_watcher.daemon = True
        t_watcher.start()

    frontend.main()

    # w_conn is only assigned once the watcher thread has created its
    # connection, so it may still be None if the front-end stopped very early.
    if enable_irc_watcher and w_conn is not None:
        w_conn.close()
    f_conn.close()
def run():
    """Start the enabled bot components, following the priority order of
    IRC front-end, then wiki scheduler, then IRC watcher.

    The highest-priority enabled component runs on the calling thread; exits
    with a message if no component is enabled.
    """
    if enable_irc_frontend:  # make the frontend run on our primary thread if enabled, and enable additional components through that function
        irc_frontend()

    elif enable_wiki_schedule:  # the scheduler is enabled - run it on the main thread, but also run the IRC watcher on another thread if it is enabled
        print("\nStarting wiki scheduler...")
        task_manager.load_tasks()
        if enable_irc_watcher:
            t_watcher = threading.Thread(target=irc_watcher, args=(f_conn,))
            t_watcher.name = "irc-watcher"
            t_watcher.daemon = True
            t_watcher.start()
        wiki_scheduler()

    elif enable_irc_watcher:  # the IRC watcher is our only enabled component, so run its function only and don't worry about anything else
        # f_conn is still None here because no front-end was started;
        # irc_watcher() takes the front-end connection as its argument.
        irc_watcher(f_conn)

    else:  # nothing is enabled!
        exit("\nNo bot parts are enabled; stopping...")

if __name__ == "__main__":
run()
try:
run()
except KeyboardInterrupt:
exit("\nKeyboardInterrupt: stopping main bot loop.")

+ 11
- 4
earwigbot.py Переглянути файл

@@ -4,12 +4,19 @@ import time
from subprocess import *

try:
from config import irc, secure, watcher
from config import irc, main, schedule, secure, watcher
except ImportError:
print """Missing a config file! Make sure you have configured the bot. All *.py.default files in config/
should have their .default extension removed, and the info inside should be corrected."""
exit()

while 1:
call(['python', 'core/main.py'])
time.sleep(5) # sleep for five seconds between bot runs
def main():
    """Keep the bot alive: launch core/main.py, wait for it to exit, pause,
    and launch it again, forever."""
    while True:
        call(['python', 'core/main.py'])
        time.sleep(5)  # sleep for five seconds between bot runs

if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
exit("\nKeyboardInterrupt: stopping bot wrapper.")

+ 2
- 6
irc/command_handler.py Переглянути файл

@@ -7,7 +7,7 @@ import traceback

commands = []

def init_commands(connection):
def load_commands(connection):
"""load all valid command classes from irc/commmands/ into the commands variable"""
files = os.listdir(os.path.join("irc", "commands")) # get all files in irc/commands/
files.sort() # alphabetically sort list of files
@@ -15,18 +15,14 @@ def init_commands(connection):
for f in files:
if f.startswith("_") or not f.endswith(".py"): # ignore non-python files or files beginning with "_"
continue

module = f[:-3] # strip .py from end

try:
exec "from irc.commands import %s" % module
except: # importing the file failed for some reason...
print "Couldn't load file %s:" % f
traceback.print_exc()
continue

m = eval(module) # 'module' is a string, so get the actual object for processing
process_module(connection, m)
process_module(connection, eval(module)) # 'module' is a string, so get the actual object for processing by eval-ing it

pretty_cmnds = map(lambda c: c.__class__.__name__, commands)
print "Found %s command classes: %s." % (len(commands), ', '.join(pretty_cmnds))


+ 1
- 1
irc/commands/git.py Переглянути файл

@@ -146,7 +146,7 @@ class Git(BaseCommand):
try:
remote = self.exec_shell("git config --get branch.%s.remote" % branch)
url = self.exec_shell("git config --get remote.%s.url" % remote)
self.connection.reply(self.data, "done; %s. [from %s]" % (changes, url))
self.connection.reply(self.data, "done; %s [from %s]." % (changes, url))
except subprocess.CalledProcessError: # something in .git/config is not specified correctly, so we cannot get the remote's url
self.connection.reply(self.data, "done; %s." % changes)



+ 5
- 1
irc/connection.py Переглянути файл

@@ -5,6 +5,10 @@
import socket
import threading

class BrokenSocketException(Exception):
    """Raised when a socket has broken, i.e. it is no longer sending data."""

class Connection(object):
def __init__(self, host=None, port=None, nick=None, ident=None, realname=None):
"""a class to interface with IRC"""
@@ -33,7 +37,7 @@ class Connection(object):
"""receive (get) data from the server"""
data = self.sock.recv(4096)
if not data: # socket giving us no data, so it is dead/broken
raise RuntimeError("socket is dead")
raise BrokenSocketException()
return data

def send(self, msg):


+ 5
- 5
irc/frontend.py Переглянути файл

@@ -7,7 +7,7 @@ from config.irc import *
from config.secure import *

from irc import command_handler
from irc.connection import Connection
from irc.connection import *
from irc.data import Data

connection = None
@@ -19,7 +19,7 @@ def get_connection():
def startup(conn):
global connection
connection = conn
command_handler.init_commands(connection)
command_handler.load_commands(connection)
connection.connect()

def main():
@@ -28,8 +28,8 @@ def main():
while 1:
try:
read_buffer = read_buffer + connection.get()
except RuntimeError: # socket broke
print "socket has broken on front-end; restarting bot..."
except BrokenSocketException:
print "Socket has broken on front-end; restarting bot..."
return

lines = read_buffer.split("\n")
@@ -61,7 +61,7 @@ def main():

if data.msg.startswith("!restart"): # hardcode the !restart command (we can't restart from within an ordinary command)
if data.host in OWNERS:
print "restarting bot per owner request..."
print "Restarting bot per owner request..."
return

if line[0] == "PING": # If we are pinged, pong back to the server


+ 6
- 5
irc/watcher.py Переглянути файл

@@ -2,9 +2,10 @@

## Imports
from config.irc import *
from config.main import *
from config.watcher import *

from irc.connection import Connection
from irc.connection import *
from irc.rc import RC

global frontend_conn
@@ -16,13 +17,12 @@ def get_connection():
def main(connection, f_conn):
global frontend_conn
frontend_conn = f_conn
connection.connect()
read_buffer = str()

while 1:
try:
read_buffer = read_buffer + connection.get()
except RuntimeError: # socket broke
except BrokenSocketException:
return

lines = read_buffer.split("\n")
@@ -53,5 +53,6 @@ def check(rc):
if not results:
return
pretty = rc.get_pretty()
for chan in results:
frontend_conn.say(chan, pretty)
if enable_irc_frontend:
for chan in results:
frontend_conn.say(chan, pretty)

+ 12
- 0
wiki/base_task.py Переглянути файл

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-

# A base class for bot tasks that edit Wikipedia.

class BaseTask(object):
    """A base class for bot tasks that edit Wikipedia."""

    def __init__(self):
        # Subclasses replace this with the name the task is known by.
        self.task_name = None

    def run(self, **kwargs):
        """Run this task; the base implementation does nothing."""
        return None

+ 80
- 0
wiki/task_manager.py Переглянути файл

@@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-

# A module to manage bot tasks.

import time
import traceback
import threading
import os

from config import schedule

task_list = dict() # the key is the task's name, the value is the task's class instance

def load_tasks():
    """Scan wiki/tasks/ (relative to the working directory) in alphabetical
    order and register the task found in every eligible Python file.

    Directories, non-.py files and files starting with "_" are ignored. A
    summary of the registered tasks is printed at the end.
    """
    task_dir = os.path.join("wiki", "tasks")
    for filename in sorted(os.listdir(task_dir)):
        if not os.path.isfile(os.path.join(task_dir, filename)):  # ignore non-files
            continue
        if filename.startswith("_") or not filename.endswith(".py"):  # ignore non-python files or files beginning with "_"
            continue
        load_class_from_file(filename)
    print("Found %s tasks: %s." % (len(task_list), ', '.join(task_list.keys())))

def load_class_from_file(f):
    """Import wiki/tasks/<f> and register an instance of its `Task` class.

    `f` is a file name ending in ".py". The instance is stored in task_list
    under its own `task_name`. Import or instantiation failures are reported
    with a traceback and the file is skipped; nothing is raised to the caller.
    """
    import importlib  # imported here so the block does not rely on new file-level imports

    module = f[:-3]  # strip .py from end
    try:
        m = importlib.import_module("wiki.tasks.%s" % module)
    except Exception:  # importing the file failed for some reason...
        print("Couldn't load task file %s:" % f)
        traceback.print_exc()
        return
    try:
        task = m.Task()
    except Exception:
        print("Couldn't find or get task class in file %s:" % f)
        traceback.print_exc()
        return
    task_name = task.task_name
    task_list[task_name] = task
    print("Added task %s from wiki/tasks/%s..." % (task_name, f))

def start_tasks(now=None):
    """Start all tasks that are supposed to be run at a given time.

    `now` is a time.struct_time; when omitted, the current UTC time is used.
    (The default is resolved on every call - a `time.gmtime()` default in the
    signature would be frozen at import time.) Its minute, hour, day of month,
    month and `tm_wday` (Monday is 0) fields are handed to schedule.check().
    """
    if now is None:
        now = time.gmtime()
    tasks = schedule.check(now.tm_min, now.tm_hour, now.tm_mday, now.tm_mon, now.tm_wday)  # get list of tasks to run this turn
    for task in tasks:
        if isinstance(task, tuple):  # they've specified kwargs, so pass those to start_task
            start_task(task[0], **task[1])
        else:  # otherwise, just pass task_name
            start_task(task)

def start_task(task_name, **kwargs):
    """Start a given task in a new thread. Pass args to the task's run function.

    If no task is registered under `task_name`, a message is printed and
    nothing is started. The thread is a daemon thread named after the task and
    the UTC time at which it was spawned.
    """
    print("Starting task '{}' in a new thread...".format(task_name))
    try:
        task = task_list[task_name]  # get the registered instance for this task, a subclass of BaseTask
    except KeyError:
        print("Couldn't find task '{}': wiki/tasks/{}.py does not exist.".format(task_name, task_name))
        return
    # Thread takes positional and keyword arguments for its target separately
    task_thread = threading.Thread(target=task_wrapper, args=(task,), kwargs=kwargs)
    # asctime() alone would format local time, which would contradict the "UTC" label
    task_thread.name = "task {} (spawned at {} UTC)".format(task_name, time.asctime(time.gmtime()))
    task_thread.daemon = True  # stop bot task threads automagically if the main bot stops
    task_thread.start()

def task_wrapper(task, **kwargs):
    """Run `task` with the given keyword arguments and report the outcome.

    Anything raised by the task is caught and printed with a traceback rather
    than propagated; a success message is printed otherwise.
    """
    try:
        task.run(**kwargs)
    except:
        print("Task '{}' raised an exception and had to stop:".format(task.task_name))
        traceback.print_exc()
        return
    print("Task '{}' finished without error.".format(task.task_name))


+ 12
- 0
wiki/tasks/afc_catdelink.py Переглянути файл

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-

# A task to delink mainspace categories in declined [[WP:AFC]] submissions.

from wiki.base_task import BaseTask

class Task(BaseTask):
    """Task stub: delink mainspace categories in declined [[WP:AFC]] submissions."""

    def __init__(self):
        # Name this task is known by.
        self.task_name = "afc_catdelink"

    def run(self, **kwargs):
        """Run the task; not implemented yet, so this does nothing."""
        return None

+ 12
- 0
wiki/tasks/afc_copyvios.py Переглянути файл

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-

# A task to check newly-edited [[WP:AFC]] submissions for copyright violations.

from wiki.base_task import BaseTask

class Task(BaseTask):
    """Task stub: check newly-edited [[WP:AFC]] submissions for copyright violations."""

    def __init__(self):
        # Name this task is known by.
        self.task_name = "afc_copyvios"

    def run(self, **kwargs):
        """Run the task; not implemented yet, so this does nothing."""
        return None

+ 12
- 0
wiki/tasks/afc_dailycats.py Переглянути файл

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-

# A task to create daily categories for [[WP:AFC]].

from wiki.base_task import BaseTask

class Task(BaseTask):
    """Task stub: create daily categories for [[WP:AFC]]."""

    def __init__(self):
        # Name this task is known by.
        self.task_name = "afc_dailycats"

    def run(self, **kwargs):
        """Run the task; not implemented yet, so this does nothing."""
        return None

+ 12
- 0
wiki/tasks/afc_statistics.py Переглянути файл

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-

# A task to generate statistics for [[WP:AFC]] and save them to [[Template:AFC_statistics]].

from wiki.base_task import BaseTask

class Task(BaseTask):
    """Task stub: generate statistics for [[WP:AFC]] and save them to
    [[Template:AFC_statistics]]."""

    def __init__(self):
        # Name this task is known by.
        self.task_name = "afc_statistics"

    def run(self, **kwargs):
        """Run the task; not implemented yet, so this does nothing."""
        return None

+ 12
- 0
wiki/tasks/afc_undated.py Переглянути файл

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-

# A task to clear [[Category:Undated AfC submissions]].

from wiki.base_task import BaseTask

class Task(BaseTask):
    """Task stub: clear [[Category:Undated AfC submissions]]."""

    def __init__(self):
        # Name this task is known by.
        self.task_name = "afc_undated"

    def run(self, **kwargs):
        """Run the task; not implemented yet, so this does nothing."""
        return None

+ 12
- 0
wiki/tasks/blptag.py Переглянути файл

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-

# A task to add |blp=yes to {{WPB}} or {{WPBS}} when it is used along with {{WP Biography}}.

from wiki.base_task import BaseTask

class Task(BaseTask):
    """Task stub: add |blp=yes to {{WPB}} or {{WPBS}} when it is used along
    with {{WP Biography}}."""

    def __init__(self):
        # Name this task is known by.
        self.task_name = "blptag"

    def run(self, **kwargs):
        """Run the task; not implemented yet, so this does nothing."""
        return None

+ 12
- 0
wiki/tasks/feed_dailycats.py Переглянути файл

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-

# A task to create daily categories for [[WP:FEED]].

from wiki.base_task import BaseTask

class Task(BaseTask):
    """Task stub: create daily categories for [[WP:FEED]]."""

    def __init__(self):
        # Name this task is known by.
        self.task_name = "feed_dailycats"

    def run(self, **kwargs):
        """Run the task; not implemented yet, so this does nothing."""
        return None

+ 12
- 0
wiki/tasks/wrongmime.py Переглянути файл

@@ -0,0 +1,12 @@
# -*- coding: utf-8 -*-

# A task to tag files whose extensions do not agree with their MIME type.

from wiki.base_task import BaseTask

class Task(BaseTask):
    """Task stub: tag files whose extensions do not agree with their MIME type."""

    def __init__(self):
        # Name this task is known by.
        self.task_name = "wrongmime"

    def run(self, **kwargs):
        """Run the task; not implemented yet, so this does nothing."""
        return None


Завантаження…
Відмінити
Зберегти