@@ -2,5 +2,7 @@ | |||
*.egg | |||
*.egg-info | |||
.DS_Store | |||
__pycache__ | |||
build | |||
dist | |||
docs/_build |
@@ -0,0 +1,31 @@ | |||
v0.2 (released November 8, 2015): | |||
- Added a new command syntax allowing the caller to redirect replies to another | |||
user. Example: "!dictionary >Fred earwig" | |||
- Added unload() hooks to commands and tasks, called when they are killed | |||
during a reload. | |||
- Added 'rc' hook type to allow IRC commands to respond to RC watcher events. | |||
- Added 'part' hook type as a counterpart to 'join'. | |||
- Added !stalk/!watch. | |||
- Added !watchers. | |||
- Added !epoch as a subcommand of !time. | |||
- Added !version as a subcommand of !help. | |||
- Expanded and improved !remind. | |||
- Improved general behavior of !access and !threads. | |||
- Fixed API behavior when blocked, when using AssertEdit, and under other | |||
circumstances. | |||
- Added copyvio detector functionality: specifying a max time for checks; | |||
improved exclusion support. URL loading and parsing is parallelized to speed | |||
up check times, with a multi-threaded worker model that avoids concurrent | |||
requests to the same domain. Improvements to the comparison algorithm. Fixed | |||
assorted bugs. | |||
- Added support for Wikimedia Labs when creating a config file. | |||
- Added and improved lazy importing for various dependencies. | |||
- Fixed a bug in job scheduling. | |||
- Improved client-side SQL buffering; made Category objects iterable. | |||
- Default to using HTTPS for new sites. | |||
- Updated documentation. | |||
v0.1 (released August 31, 2012): | |||
- Initial release. |
@@ -1,4 +1,4 @@ | |||
Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
@@ -36,15 +36,18 @@ setup.py test`` from the project's root directory. Note that some | |||
tests require an internet connection, and others may take a while to run. | |||
Coverage is currently rather incomplete. | |||
Latest release (v0.1) | |||
Latest release (v0.2) | |||
~~~~~~~~~~~~~~~~~~~~~ | |||
EarwigBot is available from the `Python Package Index`_, so you can install the | |||
latest release with ``pip install earwigbot`` (`get pip`_). | |||
If you get an error while pip is installing dependencies, you may be missing | |||
some header files. For example, on Ubuntu, see `this StackOverflow post`_. | |||
You can also install it from source [1]_ directly:: | |||
curl -Lo earwigbot.tgz https://github.com/earwig/earwigbot/tarball/v0.1 | |||
curl -Lo earwigbot.tgz https://github.com/earwig/earwigbot/tarball/v0.2 | |||
tar -xf earwigbot.tgz | |||
cd earwig-earwigbot-* | |||
python setup.py install | |||
@@ -55,10 +58,10 @@ Development version | |||
~~~~~~~~~~~~~~~~~~~ | |||
You can install the development version of the bot from ``git`` by using | |||
setuptools/distribute's ``develop`` command [1]_, probably on the ``develop`` | |||
branch which contains (usually) working code. ``master`` contains the latest | |||
release. EarwigBot uses `git flow`_, so you're free to | |||
browse by tags or by new features (``feature/*`` branches):: | |||
setuptools's ``develop`` command [1]_, probably on the ``develop`` branch which | |||
contains (usually) working code. ``master`` contains the latest release. | |||
EarwigBot uses `git flow`_, so you're free to browse by tags or by new features | |||
(``feature/*`` branches):: | |||
git clone git://github.com/earwig/earwigbot.git earwigbot | |||
cd earwigbot | |||
@@ -133,8 +136,8 @@ Custom IRC commands | |||
~~~~~~~~~~~~~~~~~~~ | |||
Custom commands are subclasses of `earwigbot.commands.Command`_ that override | |||
``Command``'s ``process()`` (and optionally ``check()`` or ``setup()``) | |||
methods. | |||
``Command``'s ``process()`` (and optionally ``check()``, ``setup()``, or | |||
``unload()``) methods. | |||
The bot has a wide selection of built-in commands and plugins to act as sample | |||
code and/or to give ideas. Start with test_, and then check out chanops_ and | |||
@@ -144,7 +147,7 @@ Custom bot tasks | |||
~~~~~~~~~~~~~~~~ | |||
Custom tasks are subclasses of `earwigbot.tasks.Task`_ that override ``Task``'s | |||
``run()`` (and optionally ``setup()``) methods. | |||
``run()`` (and optionally ``setup()`` or ``unload()``) methods. | |||
See the built-in wikiproject_tagger_ task for a relatively straightforward | |||
task, or the afc_statistics_ plugin for a more complicated one. | |||
@@ -188,8 +191,9 @@ Footnotes | |||
.. _several ongoing tasks: http://en.wikipedia.org/wiki/User:EarwigBot#Tasks | |||
.. _my instance of EarwigBot: http://en.wikipedia.org/wiki/User:EarwigBot | |||
.. _earwigbot-plugins: https://github.com/earwig/earwigbot-plugins | |||
.. _Python Package Index: http://pypi.python.org | |||
.. _Python Package Index: https://pypi.python.org/pypi/earwigbot | |||
.. _get pip: http://pypi.python.org/pypi/pip | |||
.. _this StackOverflow post: http://stackoverflow.com/questions/6504810/how-to-install-lxml-on-ubuntu/6504860#6504860 | |||
.. _git flow: http://nvie.com/posts/a-successful-git-branching-model/ | |||
.. _explanation of YAML: http://en.wikipedia.org/wiki/YAML | |||
.. _earwigbot.bot.Bot: https://github.com/earwig/earwigbot/blob/develop/earwigbot/bot.py | |||
@@ -202,4 +206,4 @@ Footnotes | |||
.. _wikiproject_tagger: https://github.com/earwig/earwigbot/blob/develop/earwigbot/tasks/wikiproject_tagger.py | |||
.. _afc_statistics: https://github.com/earwig/earwigbot-plugins/blob/develop/tasks/afc_statistics.py | |||
.. _its code and docstrings: https://github.com/earwig/earwigbot/tree/develop/earwigbot/wiki | |||
.. _Let me know: ben.kurtovic@verizon.net | |||
.. _Let me know: ben.kurtovic@gmail.com |
@@ -41,16 +41,16 @@ master_doc = 'index' | |||
# General information about the project. | |||
project = u'EarwigBot' | |||
copyright = u'2009, 2010, 2011, 2012 Ben Kurtovic' | |||
copyright = u'2009-2015 Ben Kurtovic' | |||
# The version info for the project you're documenting, acts as replacement for | |||
# |version| and |release|, also used in various other places throughout the | |||
# built documents. | |||
# | |||
# The short X.Y version. | |||
version = '0.1' | |||
version = '0.2' | |||
# The full version, including alpha/beta/rc tags. | |||
release = '0.1' | |||
release = '0.2' | |||
# The language for content autogenerated by Sphinx. Refer to documentation | |||
# for a list of supported languages. | |||
@@ -86,8 +86,9 @@ Custom IRC commands | |||
Custom commands are subclasses of :py:class:`earwigbot.commands.Command` that | |||
override :py:class:`~earwigbot.commands.Command`'s | |||
:py:meth:`~earwigbot.commands.Command.process` (and optionally | |||
:py:meth:`~earwigbot.commands.Command.check` or | |||
:py:meth:`~earwigbot.commands.Command.setup`) methods. | |||
:py:meth:`~earwigbot.commands.Command.check`, | |||
:py:meth:`~earwigbot.commands.Command.setup`, or | |||
:py:meth:`~earwigbot.commands.Command.unload`) methods. | |||
:py:class:`~earwigbot.commands.Command`'s docstrings should explain what each | |||
attribute and method is for and what they should be overridden with, but these | |||
@@ -108,9 +109,10 @@ are the basics: | |||
- Class attribute :py:attr:`~earwigbot.commands.Command.hooks` is a list of the | |||
"IRC events" that this command might respond to. It defaults to ``["msg"]``, | |||
but options include ``"msg_private"`` (for private messages only), | |||
``"msg_public"`` (for channel messages only), and ``"join"`` (for when a user | |||
joins a channel). See the afc_status_ plugin for a command that responds to | |||
other hook types. | |||
``"msg_public"`` (for channel messages only), ``"join"`` (for when a user | |||
joins a channel), ``"part"`` (for when a user parts a channel), and ``"rc"`` | |||
(for recent change messages from the IRC watcher). See the afc_status_ plugin | |||
for a command that responds to other hook types. | |||
- Method :py:meth:`~earwigbot.commands.Command.setup` is called *once* with no | |||
arguments immediately after the command is first loaded. Does nothing by | |||
@@ -153,6 +155,10 @@ are the basics: | |||
<earwigbot.irc.connection.IRCConnection.join>`, and | |||
:py:meth:`part(chan) <earwigbot.irc.connection.IRCConnection.part>`. | |||
- Method :py:meth:`~earwigbot.commands.Command.unload` is called *once* with no | |||
arguments immediately before the command is unloaded, such as when someone | |||
uses ``!reload``. Does nothing by default. | |||
Commands have access to :py:attr:`config.commands[command_name]` for config | |||
information, which is a node in :file:`config.yml` like every other attribute | |||
of :py:attr:`bot.config`. This can be used to store, for example, API keys or | |||
@@ -174,7 +180,8 @@ Custom bot tasks | |||
Custom tasks are subclasses of :py:class:`earwigbot.tasks.Task` that | |||
override :py:class:`~earwigbot.tasks.Task`'s | |||
:py:meth:`~earwigbot.tasks.Task.run` (and optionally | |||
:py:meth:`~earwigbot.tasks.Task.setup`) methods. | |||
:py:meth:`~earwigbot.tasks.Task.setup` or | |||
:py:meth:`~earwigbot.tasks.Task.unload`) methods. | |||
:py:class:`~earwigbot.tasks.Task`'s docstrings should explain what each | |||
attribute and method is for and what they should be overridden with, but these | |||
@@ -219,6 +226,10 @@ are the basics: | |||
the task's code goes. For interfacing with MediaWiki sites, read up on the | |||
:doc:`Wiki Toolset <toolset>`. | |||
- Method :py:meth:`~earwigbot.tasks.Task.unload` is called *once* with no | |||
arguments immediately before the task is unloaded. Does nothing by | |||
default. | |||
Tasks have access to :py:attr:`config.tasks[task_name]` for config information, | |||
which is a node in :file:`config.yml` like every other attribute of | |||
:py:attr:`bot.config`. This can be used to store, for example, edit summaries | |||
@@ -1,4 +1,4 @@ | |||
EarwigBot v0.1 Documentation | |||
EarwigBot v0.2 Documentation | |||
============================ | |||
EarwigBot_ is a Python_ robot that edits Wikipedia_ and interacts with people | |||
@@ -13,15 +13,18 @@ It's recommended to run the bot's unit tests before installing. Run | |||
some tests require an internet connection, and others may take a while to run. | |||
Coverage is currently rather incomplete. | |||
Latest release (v0.1) | |||
Latest release (v0.2) | |||
--------------------- | |||
EarwigBot is available from the `Python Package Index`_, so you can install the | |||
latest release with :command:`pip install earwigbot` (`get pip`_). | |||
If you get an error while pip is installing dependencies, you may be missing | |||
some header files. For example, on Ubuntu, see `this StackOverflow post`_. | |||
You can also install it from source [1]_ directly:: | |||
curl -Lo earwigbot.tgz https://github.com/earwig/earwigbot/tarball/v0.1 | |||
curl -Lo earwigbot.tgz https://github.com/earwig/earwigbot/tarball/v0.2 | |||
tar -xf earwigbot.tgz | |||
cd earwig-earwigbot-* | |||
python setup.py install | |||
@@ -32,10 +35,10 @@ Development version | |||
------------------- | |||
You can install the development version of the bot from :command:`git` by using | |||
setuptools/`distribute`_'s :command:`develop` command [1]_, probably on the | |||
``develop`` branch which contains (usually) working code. ``master`` contains | |||
the latest release. EarwigBot uses `git flow`_, so you're free to browse by | |||
tags or by new features (``feature/*`` branches):: | |||
setuptools's :command:`develop` command [1]_, probably on the ``develop`` | |||
branch which contains (usually) working code. ``master`` contains the latest | |||
release. EarwigBot uses `git flow`_, so you're free to browse by tags or by new | |||
features (``feature/*`` branches):: | |||
git clone git://github.com/earwig/earwigbot.git earwigbot | |||
cd earwigbot | |||
@@ -51,5 +54,5 @@ tags or by new features (``feature/*`` branches):: | |||
.. _earwigbot-plugins: https://github.com/earwig/earwigbot-plugins | |||
.. _Python Package Index: http://pypi.python.org | |||
.. _get pip: http://pypi.python.org/pypi/pip | |||
.. _distribute: http://pypi.python.org/pypi/distribute | |||
.. _this StackOverflow post: http://stackoverflow.com/questions/6504810/how-to-install-lxml-on-ubuntu/6504860#6504860 | |||
.. _git flow: http://nvie.com/posts/a-successful-git-branching-model/ |
@@ -42,5 +42,5 @@ Tips | |||
.. _logging: http://docs.python.org/library/logging.html | |||
.. _!git plugin: https://github.com/earwig/earwigbot-plugins/blob/develop/commands/git.py | |||
.. _Let me know: ben.kurtovic@verizon.net | |||
.. _Let me know: ben.kurtovic@gmail.com | |||
.. _create an issue: https://github.com/earwig/earwigbot/issues |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -30,11 +30,11 @@ details. This documentation is also available `online | |||
""" | |||
__author__ = "Ben Kurtovic" | |||
__copyright__ = "Copyright (C) 2009, 2010, 2011, 2012 Ben Kurtovic" | |||
__copyright__ = "Copyright (C) 2009-2015 Ben Kurtovic" | |||
__license__ = "MIT License" | |||
__version__ = "0.1" | |||
__email__ = "ben.kurtovic@verizon.net" | |||
__release__ = True | |||
__version__ = "0.2" | |||
__email__ = "ben.kurtovic@gmail.com" | |||
__release__ = False | |||
if not __release__: | |||
def _get_git_commit_id(): | |||
@@ -45,7 +45,7 @@ if not __release__: | |||
commit_id = Repo(path).head.object.hexsha | |||
return commit_id[:8] | |||
try: | |||
__version__ += ".git+" + _get_git_commit_id() | |||
__version__ += "+git-" + _get_git_commit_id() | |||
except Exception: | |||
pass | |||
finally: | |||
@@ -64,5 +64,3 @@ managers = importer.new("earwigbot.managers") | |||
tasks = importer.new("earwigbot.tasks") | |||
util = importer.new("earwigbot.util") | |||
wiki = importer.new("earwigbot.wiki") | |||
del importer |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -22,7 +22,7 @@ | |||
import logging | |||
from threading import Lock, Thread, enumerate as enumerate_threads | |||
from time import sleep, time | |||
from time import gmtime, sleep | |||
from earwigbot import __version__ | |||
from earwigbot.config import BotConfig | |||
@@ -101,13 +101,10 @@ class Bot(object): | |||
def _start_wiki_scheduler(self): | |||
"""Start the wiki scheduler in a separate thread if enabled.""" | |||
def wiki_scheduler(): | |||
run_at = 15 | |||
while self._keep_looping: | |||
time_start = time() | |||
self.tasks.schedule() | |||
time_end = time() | |||
time_diff = time_start - time_end | |||
if time_diff < 60: # Sleep until the next minute | |||
sleep(60 - time_diff) | |||
sleep(60 + run_at - gmtime().tm_sec) | |||
if self.config.components.get("wiki_scheduler"): | |||
self.logger.info("Starting wiki scheduler") | |||
@@ -157,7 +154,7 @@ class Bot(object): | |||
tasks.append(thread.name) | |||
if tasks: | |||
log = "The following commands or tasks will be killed: {0}" | |||
self.logger.warn(log.format(" ".join(tasks))) | |||
self.logger.warn(log.format(", ".join(tasks))) | |||
@property | |||
def is_running(self): | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -43,9 +43,9 @@ class Command(object): | |||
# be triggered by the command's name and its name only: | |||
commands = [] | |||
# Hooks are "msg", "msg_private", "msg_public", and "join". "msg" is the | |||
# default behavior; if you wish to override that, change the value in your | |||
# command subclass: | |||
# Hooks are "msg", "msg_private", "msg_public", "join", "part", and "rc". | |||
# "msg" is the default behavior; if you wish to override that, change the | |||
# value in your command subclass: | |||
hooks = ["msg"] | |||
def __init__(self, bot): | |||
@@ -120,3 +120,10 @@ class Command(object): | |||
command's body here. | |||
""" | |||
pass | |||
def unload(self): | |||
"""Hook called immediately before a command is unloaded. | |||
Does nothing by default; feel free to override. | |||
""" | |||
pass |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -30,11 +30,8 @@ class Access(Command): | |||
commands = ["access", "permission", "permissions", "perm", "perms"] | |||
def process(self, data): | |||
if not data.args: | |||
self.reply(data, "Subcommands are self, list, add, remove.") | |||
return | |||
permdb = self.config.irc["permissions"] | |||
if data.args[0] == "self": | |||
if not data.args or data.args[0] == "self": | |||
self.do_self(data, permdb) | |||
elif data.args[0] == "list": | |||
self.do_list(data, permdb) | |||
@@ -42,9 +39,11 @@ class Access(Command): | |||
self.do_add(data, permdb) | |||
elif data.args[0] == "remove": | |||
self.do_remove(data, permdb) | |||
elif data.args[0] == "help": | |||
self.reply(data, "Subcommands are self, list, add, and remove.") | |||
else: | |||
msg = "Unknown subcommand \x0303{0}\x0F.".format(data.args[0]) | |||
self.reply(data, msg) | |||
msg = "Unknown subcommand \x0303{0}\x0F. Subcommands are self, list, add, remove." | |||
self.reply(data, msg.format(data.args[0])) | |||
def do_self(self, data, permdb): | |||
if permdb.is_owner(data): | |||
@@ -59,9 +58,9 @@ class Access(Command): | |||
def do_list(self, data, permdb): | |||
if len(data.args) > 1: | |||
if data.args[1] in ["owner", "owners"]: | |||
name, rules = "owners", permdb.data.get(permdb.OWNER) | |||
name, rules = "owners", permdb.users.get(permdb.OWNER) | |||
elif data.args[1] in ["admin", "admins"]: | |||
name, rules = "admins", permdb.data.get(permdb.ADMIN) | |||
name, rules = "admins", permdb.users.get(permdb.ADMIN) | |||
else: | |||
msg = "Unknown access level \x0302{0}\x0F." | |||
self.reply(data, msg.format(data.args[1])) | |||
@@ -72,9 +71,9 @@ class Access(Command): | |||
msg = "No bot {0}.".format(name) | |||
self.reply(data, msg) | |||
else: | |||
owners = len(permdb.data.get(permdb.OWNER, [])) | |||
admins = len(permdb.data.get(permdb.ADMIN, [])) | |||
msg = "There are {0} bot owners and {1} bot admins. Use '!{2} list owners' or '!{2} list admins' for details." | |||
owners = len(permdb.users.get(permdb.OWNER, [])) | |||
admins = len(permdb.users.get(permdb.ADMIN, [])) | |||
msg = "There are \x02{0}\x0F bot owners and \x02{1}\x0F bot admins. Use '!{2} list owners' or '!{2} list admins' for details." | |||
self.reply(data, msg.format(owners, admins, data.command)) | |||
def do_add(self, data, permdb): | |||
@@ -113,7 +112,7 @@ class Access(Command): | |||
def get_user_from_args(self, data, permdb): | |||
if not permdb.is_owner(data): | |||
msg = "You must be a bot owner to add users to the access list." | |||
msg = "You must be a bot owner to add or remove users to the access list." | |||
self.reply(data, msg) | |||
return | |||
levels = ["owner", "owners", "admin", "admins"] | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -73,7 +73,7 @@ class Calc(Command): | |||
('\$', 'USD '), | |||
(r'\bKB\b', 'kilobytes'), | |||
(r'\bMB\b', 'megabytes'), | |||
(r'\bGB\b', 'kilobytes'), | |||
(r'\bGB\b', 'gigabytes'), | |||
('kbps', '(kilobits / second)'), | |||
('mbps', '(megabits / second)') | |||
] | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -22,10 +22,11 @@ | |||
import hashlib | |||
from Crypto.Cipher import Blowfish | |||
from earwigbot import importer | |||
from earwigbot.commands import Command | |||
Blowfish = importer.new("Crypto.Cipher.Blowfish") | |||
class Crypt(Command): | |||
"""Provides hash functions with !hash (!hash list for supported algorithms) | |||
and Blowfish encryption with !encrypt and !decrypt.""" | |||
@@ -66,7 +67,13 @@ class Crypt(Command): | |||
self.reply(data, msg.format(data.command)) | |||
return | |||
cipher = Blowfish.new(hashlib.sha256(key).digest()) | |||
try: | |||
cipher = Blowfish.new(hashlib.sha256(key).digest()) | |||
except ImportError: | |||
msg = "This command requires the 'pycrypto' package: https://www.dlitz.net/software/pycrypto/" | |||
self.reply(data, msg) | |||
return | |||
try: | |||
if data.command == "encrypt": | |||
if len(text) % 8: | |||
@@ -75,5 +82,5 @@ class Crypt(Command): | |||
self.reply(data, cipher.encrypt(text).encode("hex")) | |||
else: | |||
self.reply(data, cipher.decrypt(text.decode("hex"))) | |||
except ValueError as error: | |||
except (ValueError, TypeError) as error: | |||
self.reply(data, error.message) |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -28,7 +28,7 @@ from earwigbot.commands import Command | |||
class Dictionary(Command): | |||
"""Define words and stuff.""" | |||
name = "dictionary" | |||
commands = ["dict", "dictionary", "define"] | |||
commands = ["dict", "dictionary", "define", "def"] | |||
def process(self, data): | |||
if not data.args: | |||
@@ -65,6 +65,16 @@ class Dictionary(Command): | |||
if not languages: | |||
return u"Couldn't parse {0}!".format(page.url) | |||
if "#" in term: # Requesting a specific language | |||
lcase_langs = {lang.lower(): lang for lang in languages} | |||
request = term.rsplit("#", 1)[1] | |||
lang = lcase_langs.get(request.lower()) | |||
if not lang: | |||
resp = u"Language {0} not found in definition." | |||
return resp.format(request) | |||
definition = self.get_definition(languages[lang], level) | |||
return u"({0}) {1}".format(lang, definition) | |||
result = [] | |||
for lang, section in sorted(languages.items()): | |||
definition = self.get_definition(section, level) | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -47,7 +47,7 @@ class Editcount(Command): | |||
return | |||
safe = quote_plus(user.name.encode("utf8")) | |||
url = "http://toolserver.org/~tparis/pcount/index.php?name={0}&lang={1}&wiki={2}" | |||
url = "http://tools.wmflabs.org/xtools-ec/index.php?user={0}&lang={1}&wiki={2}" | |||
fullurl = url.format(safe, site.lang, site.project) | |||
msg = "\x0302{0}\x0F has {1} edits ({2})." | |||
self.reply(data, msg.format(name, count, fullurl)) |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -20,17 +20,20 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from platform import python_version | |||
import re | |||
from earwigbot import __version__ | |||
from earwigbot.commands import Command | |||
class Help(Command): | |||
"""Displays help information.""" | |||
"""Displays information about the bot.""" | |||
name = "help" | |||
commands = ["help", "version"] | |||
def check(self, data): | |||
if data.is_command: | |||
if data.command == "help": | |||
if data.command in self.commands: | |||
return True | |||
if not data.command and data.trigger == data.my_nick: | |||
return True | |||
@@ -39,6 +42,8 @@ class Help(Command): | |||
def process(self, data): | |||
if not data.command: | |||
self.do_hello(data) | |||
elif data.command == "version": | |||
self.do_version(data) | |||
elif data.args: | |||
self.do_command_help(data) | |||
else: | |||
@@ -69,3 +74,7 @@ class Help(Command): | |||
def do_hello(self, data): | |||
self.say(data.chan, "Yes, {0}?".format(data.nick)) | |||
def do_version(self, data): | |||
vers = "EarwigBot v{bot} on Python {python}: https://github.com/earwig/earwigbot" | |||
self.reply(data, vers.format(bot=__version__, python=python_version())) |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -24,7 +24,7 @@ from earwigbot import exceptions | |||
from earwigbot.commands import Command | |||
class Lag(Command): | |||
"""Return the replag for a specific database on the Toolserver.""" | |||
"""Return replag or maxlag information on specific databases.""" | |||
name = "lag" | |||
commands = ["lag", "replag", "maxlag"] | |||
@@ -45,7 +45,7 @@ class Lag(Command): | |||
self.reply(data, msg) | |||
def get_replag(self, site): | |||
return "Toolserver replag is {0}".format(self.time(site.get_replag())) | |||
return "replag is {0}".format(self.time(site.get_replag())) | |||
def get_maxlag(self, site): | |||
return "database maxlag is {0}".format(self.time(site.get_maxlag())) | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -23,14 +23,14 @@ | |||
from earwigbot.commands import Command | |||
class Langcode(Command): | |||
"""Convert a language code into its name and a list of WMF sites in that | |||
language, or a name into its code.""" | |||
"""Convert a language code into its name (or vice versa), and give a list | |||
of WMF sites in that language.""" | |||
name = "langcode" | |||
commands = ["langcode", "lang", "language"] | |||
def process(self, data): | |||
if not data.args: | |||
self.reply(data, "Please specify a language code.") | |||
self.reply(data, "Please specify a language code or name.") | |||
return | |||
code, lcase = data.args[0], data.args[0].lower() | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -50,7 +50,8 @@ class Notes(Command): | |||
} | |||
if not data.args: | |||
msg = "\x0302The Earwig Mini-Wiki\x0F: running v{0}. Subcommands are: {1}. You can get help on any with '!{2} help subcommand'." | |||
msg = ("\x0302The Earwig Mini-Wiki\x0F: running v{0}. Subcommands " | |||
"are: {1}. You can get help on any with '!{2} help subcommand'.") | |||
cmnds = ", ".join((commands)) | |||
self.reply(data, msg.format(self.version, cmnds, data.command)) | |||
return | |||
@@ -101,7 +102,7 @@ class Notes(Command): | |||
entries = [] | |||
if entries: | |||
entries = [entry[0] for entry in entries] | |||
entries = [entry[0].encode("utf8") for entry in entries] | |||
self.reply(data, "Entries: {0}".format(", ".join(entries))) | |||
else: | |||
self.reply(data, "No entries in the database.") | |||
@@ -123,8 +124,10 @@ class Notes(Command): | |||
except (sqlite.OperationalError, TypeError): | |||
title, content = slug, None | |||
title = title.encode("utf8") | |||
if content: | |||
self.reply(data, "\x0302{0}\x0F: {1}".format(title, content)) | |||
msg = "\x0302{0}\x0F: {1}" | |||
self.reply(data, msg.format(title, content.encode("utf8"))) | |||
else: | |||
self.reply(data, "Entry \x0302{0}\x0F not found.".format(title)) | |||
@@ -142,7 +145,7 @@ class Notes(Command): | |||
except IndexError: | |||
self.reply(data, "Please specify an entry to edit.") | |||
return | |||
content = " ".join(data.args[2:]).strip() | |||
content = " ".join(data.args[2:]).strip().decode("utf8") | |||
if not content: | |||
self.reply(data, "Please give some content to put in the entry.") | |||
return | |||
@@ -153,11 +156,11 @@ class Notes(Command): | |||
id_, title, author = conn.execute(query1, (slug,)).fetchone() | |||
create = False | |||
except sqlite.OperationalError: | |||
id_, title, author = 1, data.args[1], data.host | |||
id_, title, author = 1, data.args[1].decode("utf8"), data.host | |||
self.create_db(conn) | |||
except TypeError: | |||
id_ = self.get_next_entry(conn) | |||
title, author = data.args[1], data.host | |||
title, author = data.args[1].decode("utf8"), data.host | |||
permdb = self.config.irc["permissions"] | |||
if author != data.host and not permdb.is_admin(data): | |||
msg = "You must be an author or a bot admin to edit this entry." | |||
@@ -172,7 +175,8 @@ class Notes(Command): | |||
else: | |||
conn.execute(query4, (revid, id_)) | |||
self.reply(data, "Entry \x0302{0}\x0F updated.".format(title)) | |||
msg = "Entry \x0302{0}\x0F updated." | |||
self.reply(data, msg.format(title.encode("utf8"))) | |||
def do_info(self, data): | |||
"""Get info on an entry in the notes database.""" | |||
@@ -197,7 +201,7 @@ class Notes(Command): | |||
times = [datum[1] for datum in info] | |||
earliest = min(times) | |||
msg = "\x0302{0}\x0F: {1} edits since {2}" | |||
msg = msg.format(title, len(info), earliest) | |||
msg = msg.format(title.encode("utf8"), len(info), earliest) | |||
if len(times) > 1: | |||
latest = max(times) | |||
msg += "; last edit on {0}".format(latest) | |||
@@ -242,7 +246,8 @@ class Notes(Command): | |||
msg = "You must be an author or a bot admin to rename this entry." | |||
self.reply(data, msg) | |||
return | |||
conn.execute(query2, (self.slugify(newtitle), newtitle, id_)) | |||
args = (self.slugify(newtitle), newtitle.decode("utf8"), id_) | |||
conn.execute(query2, args) | |||
msg = "Entry \x0302{0}\x0F renamed to \x0302{1}\x0F." | |||
self.reply(data, msg.format(data.args[1], newtitle)) | |||
@@ -280,7 +285,7 @@ class Notes(Command): | |||
def slugify(self, name): | |||
"""Convert *name* into an identifier for storing in the database.""" | |||
return name.lower().replace("_", "").replace("-", "") | |||
return name.lower().replace("_", "").replace("-", "").decode("utf8") | |||
def create_db(self, conn): | |||
"""Initialize the notes database with its necessary tables.""" | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -20,7 +20,8 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
import time | |||
from datetime import datetime | |||
from time import mktime | |||
from earwigbot import exceptions | |||
from earwigbot.commands import Command | |||
@@ -46,8 +47,9 @@ class Registration(Command): | |||
self.reply(data, msg.format(name)) | |||
return | |||
date = time.strftime("%b %d, %Y at %H:%M:%S UTC", reg) | |||
age = self.get_diff(time.mktime(reg), time.mktime(time.gmtime())) | |||
dt = datetime.fromtimestamp(mktime(reg)) | |||
date = dt.strftime("%b %d, %Y at %H:%M:%S UTC") | |||
age = self.get_age(dt) | |||
if user.gender == "male": | |||
gender = "He's" | |||
@@ -59,14 +61,24 @@ class Registration(Command): | |||
msg = "\x0302{0}\x0F registered on {1}. {2} {3} old." | |||
self.reply(data, msg.format(name, date, gender, age)) | |||
def get_age(self, birth):
    """Return a human-readable account age for the given naive datetime.

    *birth* is a naive datetime produced from the registration timestamp
    (presumably UTC, matching the utcnow() comparison below — TODO confirm
    against the caller). The result looks like "2 years, 3 days, 4 hours";
    an account registered this instant yields "0 seconds".
    """
    msg = []

    def insert(unit, num):
        # Append "<num> <unit>(s)" to the output, skipping zero values.
        if not num:
            return
        msg.append("{0} {1}".format(num, unit if num == 1 else unit + "s"))

    now = datetime.utcnow()
    # True when the (month, day, time) "birthday" hasn't come up yet this
    # year, so the partial year must not be counted.
    bd_passed = now.timetuple()[1:-3] < birth.timetuple()[1:-3]
    years = now.year - birth.year - bd_passed
    # NOTE(review): replace() raises ValueError for a Feb 29 registration
    # mapped into a non-leap year — confirm whether that case matters here.
    delta = now - birth.replace(year=birth.year + years)
    insert("year", years)
    insert("day", delta.days)
    seconds = delta.seconds
    units = [("hour", 3600), ("minute", 60), ("second", 1)]
    for unit, size in units:
        # Floor division: identical to the original "/" on Python 2 ints,
        # but still correct (no float leakage) on Python 3.
        num = seconds // size
        seconds -= num * size
        insert(unit, num)
    return ", ".join(msg) if msg else "0 seconds"
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -20,43 +20,411 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from threading import Timer | |||
import ast | |||
from contextlib import contextmanager | |||
from itertools import chain | |||
import operator | |||
import random | |||
from threading import RLock, Thread | |||
import time | |||
from earwigbot.commands import Command | |||
from earwigbot.irc import Data | |||
# Synonym lists for the reminder subcommands: showing a reminder's details,
# canceling it, and snoozing/adjusting its timer. ("stop" was listed twice
# in CANCEL; the duplicate has been removed — membership is unchanged.)
DISPLAY = ["display", "show", "list", "info", "details"]
CANCEL = ["cancel", "stop", "delete", "del", "unremind", "forget",
          "disregard"]
SNOOZE = ["snooze", "delay", "reset", "adjust", "modify", "change"]
class Remind(Command):
    """Set a message to be repeated to you in a certain amount of time."""
    name = "remind"
    commands = ["remind", "reminder", "reminders", "snooze", "cancel",
                "unremind", "forget"]

    @staticmethod
    def _normalize(command):
        """Convert a command name into its canonical form.

        Returns None for names outside DISPLAY/CANCEL/SNOOZE.
        """
        if command in DISPLAY:
            return "display"
        if command in CANCEL:
            return "cancel"
        if command in SNOOZE:
            return "snooze"

    @staticmethod
    def _parse_time(arg):
        """Parse the wait time for a reminder.

        Accepts a small arithmetic expression (e.g. "60*5") with an optional
        trailing unit suffix ("2m", "1h", ...). Returns the wait in seconds
        as a positive int; raises ValueError on anything unparsable or
        non-positive.
        """
        ast_to_op = {
            ast.Add: operator.add, ast.Sub: operator.sub,
            ast.Mult: operator.mul, ast.Div: operator.truediv,
            ast.FloorDiv: operator.floordiv, ast.Mod: operator.mod,
            ast.Pow: operator.pow
        }
        time_units = {
            "s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800, "y": 31536000
        }

        def _evaluate(node):
            """Convert an AST node into a real number or raise an exception."""
            if isinstance(node, ast.Num):
                if not isinstance(node.n, (int, long, float)):
                    raise ValueError(node.n)
                return node.n
            elif isinstance(node, ast.BinOp):
                left, right = _evaluate(node.left), _evaluate(node.right)
                # KeyError from an unsupported operator is caught below.
                return ast_to_op[type(node.op)](left, right)
            else:
                raise ValueError(node)

        if arg and arg[-1] in time_units:
            factor, arg = time_units[arg[-1]], arg[:-1]
        else:
            factor = 1
        try:
            parsed = int(_evaluate(ast.parse(arg, mode="eval").body) * factor)
        except (SyntaxError, KeyError):
            raise ValueError(arg)
        if parsed <= 0:
            raise ValueError(parsed)
        return parsed

    @contextmanager
    def _db(self):
        """Return a threadsafe context manager for the permissions database."""
        with self._db_lock:
            yield self.config.irc["permissions"]

    def _really_get_reminder_by_id(self, user, rid):
        """Return the _Reminder object that corresponds to a particular ID.

        Raises IndexError on failure.
        """
        rid = rid.upper()
        if user not in self.reminders:
            raise IndexError(rid)
        return [robj for robj in self.reminders[user] if robj.id == rid][0]

    def _get_reminder_by_id(self, user, rid, data):
        """Return the _Reminder object that corresponds to a particular ID.

        Sends an error message to the user and returns None on failure.
        """
        try:
            return self._really_get_reminder_by_id(user, rid)
        except IndexError:
            msg = "Couldn't find a reminder for \x0302{0}\x0F with ID \x0303{1}\x0F."
            self.reply(data, msg.format(user, rid))

    def _get_new_id(self):
        """Get a free ID for a new reminder.

        Raises IndexError (via random.choice on an empty list) when all
        4096 IDs are in use.
        """
        # IDs are stored as "RXXX" hex strings; compare numerically so taken
        # numbers are actually excluded. (The previous version subtracted a
        # set of strings from a set of ints — a no-op — so in-use IDs could
        # be handed out again.)
        taken = set(int(robj.id[1:], 16)
                    for robj in chain(*self.reminders.values()))
        num = random.choice(list(set(range(4096)) - taken))
        return "R{0:03X}".format(num)

    def _start_reminder(self, reminder, user):
        """Start the given reminder object for the given user."""
        reminder.start()
        if user in self.reminders:
            self.reminders[user].append(reminder)
        else:
            self.reminders[user] = [reminder]

    def _create_reminder(self, data, user):
        """Create a new reminder for the given user."""
        try:
            wait = self._parse_time(data.args[0])
        except ValueError:
            msg = "Invalid time \x02{0}\x0F. Time must be a positive integer, in seconds."
            return self.reply(data, msg.format(data.args[0]))
        if wait > 1000 * 365 * 24 * 60 * 60:
            # Hard to think of a good upper limit, but 1000 years works.
            msg = "Given time \x02{0}\x0F is too large. Keep it reasonable."
            return self.reply(data, msg.format(data.args[0]))
        end = time.time() + wait
        message = " ".join(data.args[1:])
        try:
            rid = self._get_new_id()
        except IndexError:
            msg = "Couldn't set a new reminder: no free IDs available."
            return self.reply(data, msg)
        reminder = _Reminder(rid, user, wait, end, message, data, self)
        self._start_reminder(reminder, user)
        msg = "Set reminder \x0303{0}\x0F ({1})."
        self.reply(data, msg.format(rid, reminder.end_time))

    def _display_reminder(self, data, reminder):
        """Display a particular reminder's information."""
        msg = 'Reminder \x0303{0}\x0F: {1} seconds ({2}): "{3}".'
        msg = msg.format(reminder.id, reminder.wait, reminder.end_time,
                         reminder.message)
        self.reply(data, msg)

    def _cancel_reminder(self, data, user, reminder):
        """Cancel a pending reminder."""
        reminder.stop()
        self.reminders[user].remove(reminder)
        if not self.reminders[user]:
            del self.reminders[user]
        msg = "Reminder \x0303{0}\x0F canceled."
        self.reply(data, msg.format(reminder.id))

    def _snooze_reminder(self, data, reminder, arg=None):
        """Snooze a reminder to be re-triggered after a period of time.

        *arg*, when given, is the index into data.args holding an optional
        new duration; bad or missing values silently keep the old wait.
        """
        verb = "snoozed" if reminder.end < time.time() else "adjusted"
        if arg:
            try:
                duration = self._parse_time(data.args[arg])
                reminder.wait = duration
            except (IndexError, ValueError):
                pass
        reminder.end = time.time() + reminder.wait
        reminder.start()
        end = time.strftime("%b %d %H:%M:%S %Z", time.localtime(reminder.end))
        msg = "Reminder \x0303{0}\x0F {1} until {2}."
        self.reply(data, msg.format(reminder.id, verb, end))

    def _load_reminders(self):
        """Load previously made reminders from the database."""
        with self._db() as permdb:
            try:
                database = permdb.get_attr("command:remind", "data")
            except KeyError:
                return
            permdb.set_attr("command:remind", "data", "[]")

        for item in ast.literal_eval(database):
            rid, user, wait, end, message, data = item
            if end < time.time():
                # Expired while the bot was down; drop it.
                continue
            data = Data.unserialize(data)
            reminder = _Reminder(rid, user, wait, end, message, data, self)
            self._start_reminder(reminder, user)

    def _handle_command(self, command, data, user, reminder, arg=None):
        """Handle a reminder-processing subcommand."""
        if command in DISPLAY:
            self._display_reminder(data, reminder)
        elif command in CANCEL:
            self._cancel_reminder(data, user, reminder)
        elif command in SNOOZE:
            self._snooze_reminder(data, reminder, arg)
        else:
            msg = "Unknown action \x02{0}\x0F for reminder \x0303{1}\x0F."
            self.reply(data, msg.format(command, reminder.id))

    def _show_reminders(self, data, user):
        """Show all of a user's current reminders."""
        shorten = lambda s: (s[:37] + "..." if len(s) > 40 else s)
        tmpl = '\x0303{0}\x0F ("{1}", {2})'
        fmt = lambda robj: tmpl.format(robj.id, shorten(robj.message),
                                       robj.end_time)
        if user in self.reminders:
            rlist = ", ".join(fmt(robj) for robj in self.reminders[user])
            msg = "Your reminders: {0}.".format(rlist)
        else:
            msg = ("You have no reminders. Set one with \x0306!remind [time] "
                   "[message]\x0F. See also: \x0306!remind help\x0F.")
        self.reply(data, msg)

    def _process_snooze_command(self, data, user):
        """Process the !snooze command."""
        if not data.args:
            if user not in self.reminders:
                self.reply(data, "You have no reminders to snooze.")
            elif len(self.reminders[user]) == 1:
                self._snooze_reminder(data, self.reminders[user][0])
            else:
                msg = "You have {0} reminders. Snooze which one?"
                self.reply(data, msg.format(len(self.reminders[user])))
            return
        reminder = self._get_reminder_by_id(user, data.args[0], data)
        if reminder:
            self._snooze_reminder(data, reminder, 1)

    def _process_cancel_command(self, data, user):
        """Process the !cancel, !unremind, and !forget commands."""
        if not data.args:
            if user not in self.reminders:
                self.reply(data, "You have no reminders to cancel.")
            elif len(self.reminders[user]) == 1:
                self._cancel_reminder(data, user, self.reminders[user][0])
            else:
                msg = "You have {0} reminders. Cancel which one?"
                self.reply(data, msg.format(len(self.reminders[user])))
            return
        reminder = self._get_reminder_by_id(user, data.args[0], data)
        if reminder:
            self._cancel_reminder(data, user, reminder)

    def _show_help(self, data):
        """Reply to the user with help for all major subcommands."""
        parts = [
            ("Add new", "!remind [time] [message]"),
            ("List all", "!reminders"),
            ("Get info", "!remind [id]"),
            ("Cancel", "!remind cancel [id]"),
            ("Adjust", "!remind adjust [id] [time]"),
            ("Restart", "!snooze [id]")
        ]
        extra = "In most cases, \x0306[id]\x0F can be omitted if you have only one reminder."
        joined = " ".join("{0}: \x0306{1}\x0F.".format(k, v) for k, v in parts)
        self.reply(data, joined + " " + extra)

    def setup(self):
        """Initialize reminder state and restore saved reminders."""
        self.reminders = {}
        self._db_lock = RLock()
        self._load_reminders()

    def process(self, data):
        """Dispatch the invoked command/subcommand to the proper handler."""
        if data.command == "snooze":
            return self._process_snooze_command(data, data.host)
        if data.command in ["cancel", "unremind", "forget"]:
            return self._process_cancel_command(data, data.host)
        if not data.args:
            return self._show_reminders(data, data.host)

        user = data.host
        if len(data.args) == 1:
            command = data.args[0]
            if command == "help":
                return self._show_help(data)
            if command in DISPLAY + CANCEL + SNOOZE:
                if user not in self.reminders:
                    msg = "You have no reminders to {0}."
                    self.reply(data, msg.format(self._normalize(command)))
                elif len(self.reminders[user]) == 1:
                    reminder = self.reminders[user][0]
                    self._handle_command(command, data, user, reminder)
                else:
                    msg = "You have {0} reminders. {1} which one?"
                    num = len(self.reminders[user])
                    command = self._normalize(command).capitalize()
                    self.reply(data, msg.format(num, command))
                return
            # Single argument that isn't a subcommand: treat it as an ID.
            reminder = self._get_reminder_by_id(user, data.args[0], data)
            if reminder:
                self._display_reminder(data, reminder)
            return

        if data.args[0] in DISPLAY + CANCEL + SNOOZE:
            # "<subcommand> <id> [time]" form.
            reminder = self._get_reminder_by_id(user, data.args[1], data)
            if reminder:
                self._handle_command(data.args[0], data, user, reminder, 2)
            return

        # "<id> <subcommand> [time]" form, falling back to reminder creation
        # when the first argument isn't a known ID.
        try:
            reminder = self._really_get_reminder_by_id(user, data.args[0])
        except IndexError:
            return self._create_reminder(data, user)
        self._handle_command(data.args[1], data, user, reminder, 2)

    def unload(self):
        """Stop all reminder threads on reload, keeping DB entries intact."""
        for reminder in chain(*self.reminders.values()):
            reminder.stop(delete=False)

    def store_reminder(self, reminder):
        """Store a serialized reminder into the database."""
        with self._db() as permdb:
            try:
                dump = permdb.get_attr("command:remind", "data")
            except KeyError:
                dump = "[]"
            database = ast.literal_eval(dump)
            database.append(reminder)
            permdb.set_attr("command:remind", "data", str(database))

    def unstore_reminder(self, rid):
        """Remove a reminder from the database by ID."""
        with self._db() as permdb:
            try:
                dump = permdb.get_attr("command:remind", "data")
            except KeyError:
                dump = "[]"
            database = ast.literal_eval(dump)
            database = [item for item in database if item[0] != rid]
            permdb.set_attr("command:remind", "data", str(database))
class _Reminder(object):
    """Represents a single reminder."""

    def __init__(self, rid, user, wait, end, message, data, cmdobj):
        self.id = rid          # "RXXX"-style identifier
        self.wait = wait       # delay in seconds
        self.end = end         # UNIX timestamp at which the reminder fires
        self.message = message
        self._user = user      # host of the user who set the reminder
        self._data = data      # IRC Data object used to deliver the reply
        self._cmdobj = cmdobj  # owning Remind command instance
        self._thread = None

    def _callback(self):
        """Internal callback function to be executed by the reminder thread."""
        # Capture the thread object: stop() nulls self._thread, and we must
        # keep polling the abort flag of *our* thread, not a successor's.
        thread = self._thread
        while time.time() < thread.end:
            time.sleep(1)
            if thread.abort:
                return
        self._cmdobj.reply(self._data, self.message)
        self._delete()
        # Grace period: keep the reminder alive for a minute so the user can
        # still !snooze it after it fires.
        for i in xrange(60):
            time.sleep(1)
            if thread.abort:
                return
        try:
            self._cmdobj.reminders[self._user].remove(self)
            if not self._cmdobj.reminders[self._user]:
                del self._cmdobj.reminders[self._user]
        except (KeyError, ValueError):  # Already canceled by the user
            pass

    def _save(self):
        """Save this reminder to the database."""
        data = self._data.serialize()
        item = (self.id, self._user, self.wait, self.end, self.message, data)
        self._cmdobj.store_reminder(item)

    def _delete(self):
        """Remove this reminder from the database."""
        self._cmdobj.unstore_reminder(self.id)

    @property
    def end_time(self):
        """Return a string representing the end time of a reminder."""
        if self.end >= time.time():
            lctime = time.localtime(self.end)
            # Only include the year when it differs from the current one.
            if lctime.tm_year == time.localtime().tm_year:
                ends = time.strftime("%b %d %H:%M:%S %Z", lctime)
            else:
                ends = time.strftime("%b %d, %Y %H:%M:%S %Z", lctime)
            return "ends {0}".format(ends)
        return "expired"

    def start(self):
        """Start the reminder timer thread. Stops it if already running."""
        self.stop()
        self._thread = Thread(target=self._callback, name="remind-" + self.id)
        # end/abort are stashed on the thread object so _callback polls the
        # values belonging to this particular run.
        self._thread.end = self.end
        self._thread.daemon = True
        self._thread.start()
        self._save()

    def stop(self, delete=True):
        """Stop a currently running reminder.

        With delete=False (used on unload), the database entry is kept so
        the reminder can be restored after a reload.
        """
        if not self._thread:
            return
        if delete:
            self._delete()
        self._thread.abort = True
        self._thread = None
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -0,0 +1,319 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from ast import literal_eval | |||
from earwigbot.commands import Command | |||
from earwigbot.irc import RC | |||
class Stalk(Command):
    """Stalk a particular user (!stalk/!unstalk) or page (!watch/!unwatch) for
    edits. Applies to the current bot session only."""
    name = "stalk"
    commands = ["stalk", "watch", "unstalk", "unwatch", "stalks", "watches",
                "allstalks", "allwatches", "unstalkall", "unwatchall"]
    hooks = ["msg", "rc"]

    # Per-user cap for non-admins, checked in _add_stalk().
    MAX_STALKS_PER_USER = 10

    def setup(self):
        # Both tables map a target (username or page title, possibly ending
        # in "*" as a wildcard) to a list of (nick, channel-or-None) tuples.
        self._users = {}
        self._pages = {}
        self._load_stalks()

    def check(self, data):
        """Accept RC watcher events as well as our own IRC commands."""
        if isinstance(data, RC):
            return True
        if data.is_command and data.command in self.commands:
            return True
        return False

    def process(self, data):
        if isinstance(data, RC):
            return self._process_rc(data)

        data.is_admin = self.config.irc["permissions"].is_admin(data)

        # !allstalks / !allwatches: admin-only listing of everything.
        if data.command.startswith("all"):
            if data.is_admin:
                self.reply(data, self._all_stalks())
            else:
                self.reply(data, "You must be a bot admin to view all stalked "
                                 "users or watched pages. View your own with "
                                 "\x0306!stalks\x0F.")
            return

        # !unstalkall / !unwatchall: admin-only, and require a target.
        if data.command.endswith("all"):
            if not data.is_admin:
                self.reply(data, "You must be a bot admin to unstalk a user "
                                 "or unwatch a page for all users.")
                return
            if not data.args:
                self.reply(data, "You must give a user to unstalk or a page "
                                 "to unwatch. View all active with "
                                 "\x0306!allstalks\x0F.")
                return

        if not data.args or data.command in ["stalks", "watches"]:
            self.reply(data, self._current_stalks(data.nick))
            return

        # Normalize the target: spaces for underscores, strip [[...]] and a
        # leading User: prefix (for stalks), and capitalize the first letter.
        target = " ".join(data.args).replace("_", " ")
        if target.startswith("[[") and target.endswith("]]"):
            target = target[2:-2]
        if target.startswith("User:") and "stalk" in data.command:
            target = target[5:]
        target = target[0].upper() + target[1:]

        if data.command in ["stalk", "watch"]:
            if data.is_private:
                # Private stalks report back via PM (channel is None).
                stalkinfo = (data.nick, None)
            elif not data.is_admin:
                self.reply(data, "You must be a bot admin to stalk users or "
                                 "watch pages publicly. Retry this command in "
                                 "a private message.")
                return
            else:
                stalkinfo = (data.nick, data.chan)

        if data.command == "stalk":
            self._add_stalk("user", data, target, stalkinfo)
        elif data.command == "watch":
            self._add_stalk("page", data, target, stalkinfo)
        elif data.command == "unstalk":
            self._remove_stalk("user", data, target)
        elif data.command == "unwatch":
            self._remove_stalk("page", data, target)
        elif data.command == "unstalkall":
            self._remove_all_stalks("user", data, target)
        elif data.command == "unwatchall":
            self._remove_all_stalks("page", data, target)

    def _process_rc(self, rc):
        """Process a watcher event."""
        def _update_chans(items):
            # Collect notification destinations: channel -> set of nicks to
            # highlight, or nick -> None for private notifications.
            for item in items:
                if item[1]:
                    if item[1] in chans:
                        chans[item[1]].add(item[0])
                    else:
                        chans[item[1]] = {item[0]}
                else:
                    chans[item[0]] = None

        def _wildcard_match(target, tag):
            return target[-1] == "*" and tag.startswith(target[:-1])

        def _process(table, tag):
            for target, stalks in table.iteritems():
                if target == tag or _wildcard_match(target, tag):
                    _update_chans(stalks)

        chans = {}
        _process(self._users, rc.user)
        if rc.is_edit:
            _process(self._pages, rc.page)
        if not chans:
            return

        with self.bot.component_lock:
            frontend = self.bot.frontend
            if frontend and not frontend.is_stopped():
                pretty = rc.prettify()
                for chan in chans:
                    if chans[chan]:
                        nicks = ", ".join(sorted(chans[chan]))
                        msg = "\x02{0}\x0F: {1}".format(nicks, pretty)
                    else:
                        msg = pretty
                    # Truncate to stay within IRC message length limits.
                    if len(msg) > 400:
                        msg = msg[:397] + "..."
                    frontend.say(chan, msg)

    @staticmethod
    def _get_stalks_by_nick(nick, table):
        """Return a dictionary of stalklist entries by the given nick."""
        entries = {}
        for target, stalks in table.iteritems():
            for info in stalks:
                if info[0] == nick:
                    if target in entries:
                        entries[target].append(info[1])
                    else:
                        entries[target] = [info[1]]
        return entries

    def _add_stalk(self, stalktype, data, target, stalkinfo):
        """Add a stalk entry to the given table."""
        if stalktype == "user":
            table = self._users
            verb = "stalk"
        else:
            table = self._pages
            verb = "watch"

        if not data.is_admin:
            nstalks = len(self._get_stalks_by_nick(data.nick, table))
            if nstalks >= self.MAX_STALKS_PER_USER:
                msg = ("Already {0}ing {1} {2}s for you, which is the limit "
                       "for non-bot admins.")
                self.reply(data, msg.format(verb, nstalks, stalktype))
                return

        if target in table:
            if stalkinfo in table[target]:
                msg = "Already {0}ing that {1} in here for you."
                self.reply(data, msg.format(verb, stalktype))
                return
            else:
                table[target].append(stalkinfo)
        else:
            table[target] = [stalkinfo]

        msg = "Now {0}ing {1} \x0302{2}\x0F. Remove with \x0306!un{0} {2}\x0F."
        self.reply(data, msg.format(verb, stalktype, target))
        self._save_stalks()

    def _remove_stalk(self, stalktype, data, target):
        """Remove a stalk entry from the given table."""
        if stalktype == "user":
            table = self._users
            verb = "stalk"
            plural = "stalks"
        else:
            table = self._pages
            verb = "watch"
            plural = "watches"

        # Remove every entry on this target that belongs to the caller.
        to_remove = []
        if target in table:
            for info in table[target]:
                if info[0] == data.nick:
                    to_remove.append(info)

        if not to_remove:
            msg = ("I haven't been {0}ing that {1} for you in the first "
                   "place. View your active {2} with \x0306!{2}\x0F.")
            if data.is_admin:
                msg += (" As a bot admin, you can clear all active {2} on "
                        "that {1} with \x0306!un{0}all {3}\x0F.")
            self.reply(data, msg.format(verb, stalktype, plural, target))
            return

        for info in to_remove:
            table[target].remove(info)
        if not table[target]:
            del table[target]
        msg = "No longer {0}ing {1} \x0302{2}\x0F for you."
        self.reply(data, msg.format(verb, stalktype, target))
        self._save_stalks()

    def _remove_all_stalks(self, stalktype, data, target):
        """Remove all entries for a particular target from the given table."""
        if stalktype == "user":
            table = self._users
            verb = "stalk"
            plural = "stalks"
        else:
            table = self._pages
            verb = "watch"
            plural = "watches"

        try:
            del table[target]
        except KeyError:
            msg = ("I haven't been {0}ing that {1} for anyone in the first "
                   "place. View all active {2} with \x0306!all{2}\x0F.")
            self.reply(data, msg.format(verb, stalktype, plural))
        else:
            msg = "No longer {0}ing {1} \x0302{2}\x0F for anyone."
            self.reply(data, msg.format(verb, stalktype, target))
            self._save_stalks()

    def _current_stalks(self, nick):
        """Return the given user's current stalks."""
        def _format_chans(chans):
            # chans is a list of channel names and/or None (None = private);
            # note this mutates the list, which is fine since the caller
            # builds it fresh in _get_stalks_by_nick().
            if None in chans:
                chans.remove(None)
                if not chans:
                    return "privately"
                if len(chans) == 1:
                    return "in {0} and privately".format(chans[0])
                return "in " + ", ".join(chans) + ", and privately"
            return "in " + ", ".join(chans)

        def _format_stalks(stalks):
            return ", ".join(
                "\x0302{0}\x0F ({1})".format(target, _format_chans(chans))
                for target, chans in stalks.iteritems())

        users = self._get_stalks_by_nick(nick, self._users)
        pages = self._get_stalks_by_nick(nick, self._pages)
        if users:
            uinfo = " Users: {0}.".format(_format_stalks(users))
        if pages:
            pinfo = " Pages: {0}.".format(_format_stalks(pages))

        msg = "Currently stalking {0} user{1} and watching {2} page{3} for you.{4}{5}"
        return msg.format(len(users), "s" if len(users) != 1 else "",
                          len(pages), "s" if len(pages) != 1 else "",
                          uinfo if users else "", pinfo if pages else "")

    def _all_stalks(self):
        """Return all existing stalks, for bot admins."""
        def _format_info(info):
            if info[1]:
                return "for {0} in {1}".format(info[0], info[1])
            return "for {0} privately".format(info[0])

        def _format_data(data):
            return ", ".join(_format_info(info) for info in data)

        def _format_stalks(stalks):
            return ", ".join(
                "\x0302{0}\x0F ({1})".format(target, _format_data(data))
                for target, data in stalks.iteritems())

        users, pages = self._users, self._pages
        if users:
            uinfo = " Users: {0}.".format(_format_stalks(users))
        if pages:
            pinfo = " Pages: {0}.".format(_format_stalks(pages))

        msg = "Currently stalking {0} user{1} and watching {2} page{3}.{4}{5}"
        return msg.format(len(users), "s" if len(users) != 1 else "",
                          len(pages), "s" if len(pages) != 1 else "",
                          uinfo if users else "", pinfo if pages else "")

    def _load_stalks(self):
        """Load saved stalks from the database."""
        permdb = self.config.irc["permissions"]
        try:
            data = permdb.get_attr("command:stalk", "data")
        except KeyError:
            return
        self._users, self._pages = literal_eval(data)

    def _save_stalks(self):
        """Save stalks to the database."""
        permdb = self.config.irc["permissions"]
        # repr() of the two tables round-trips through literal_eval above.
        data = str((self._users, self._pages))
        permdb.set_attr("command:stalk", "data", data)
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -69,21 +69,28 @@ class Threads(Command): | |||
for thread in threads: | |||
tname = thread.name | |||
ident = thread.ident % 10000 | |||
if tname == "MainThread": | |||
t = "\x0302MainThread\x0F (id {0})" | |||
normal_threads.append(t.format(thread.ident)) | |||
normal_threads.append(t.format(ident)) | |||
elif tname in self.config.components: | |||
t = "\x0302{0}\x0F (id {1})" | |||
normal_threads.append(t.format(tname, thread.ident)) | |||
elif tname.startswith("reminder"): | |||
tname = tname.replace("reminder ", "") | |||
t = "\x0302reminder\x0F (until {0})" | |||
normal_threads.append(t.format(tname)) | |||
normal_threads.append(t.format(tname, ident)) | |||
elif tname.startswith("remind-"): | |||
t = "\x0302reminder\x0F (id {0})" | |||
daemon_threads.append(t.format(tname[len("remind-"):])) | |||
elif tname.startswith("cvworker-"): | |||
t = "\x0302copyvio worker\x0F (site {0})" | |||
daemon_threads.append(t.format(tname[len("cvworker-"):])) | |||
else: | |||
tname, start_time = re.findall("^(.*?) \((.*?)\)$", tname)[0] | |||
t = "\x0302{0}\x0F (id {1}, since {2})" | |||
daemon_threads.append(t.format(tname, thread.ident, | |||
start_time)) | |||
match = re.findall("^(.*?) \((.*?)\)$", tname) | |||
if match: | |||
t = "\x0302{0}\x0F (id {1}, since {2})" | |||
thread_info = t.format(match[0][0], ident, match[0][1]) | |||
daemon_threads.append(thread_info) | |||
else: | |||
t = "\x0302{0}\x0F (id {1})" | |||
daemon_threads.append(t.format(tname, ident)) | |||
if daemon_threads: | |||
if len(daemon_threads) > 1: | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -24,19 +24,24 @@ from datetime import datetime | |||
from math import floor | |||
from time import time | |||
import pytz | |||
from earwigbot import importer | |||
from earwigbot.commands import Command | |||
pytz = importer.new("pytz") | |||
class Time(Command): | |||
"""Report the current time in any timezone (UTC default), or in beats.""" | |||
"""Report the current time in any timezone (UTC default), UNIX epoch time, | |||
or beat time.""" | |||
name = "time" | |||
commands = ["time", "beats", "swatch"] | |||
commands = ["time", "beats", "swatch", "epoch", "date"] | |||
def process(self, data): | |||
if data.command in ["beats", "swatch"]: | |||
self.do_beats(data) | |||
return | |||
if data.command == "epoch": | |||
self.reply(data, time()) | |||
return | |||
if data.args: | |||
timezone = data.args[0] | |||
else: | |||
@@ -52,12 +57,12 @@ class Time(Command): | |||
self.reply(data, "@{0:0>3}".format(beats)) | |||
def do_time(self, data, timezone): | |||
if not pytz: | |||
msg = "This command requires the 'pytz' module: http://pytz.sourceforge.net/" | |||
self.reply(data, msg) | |||
return | |||
try: | |||
tzinfo = pytz.timezone(timezone) | |||
except ImportError: | |||
msg = "This command requires the 'pytz' package: http://pytz.sourceforge.net/" | |||
self.reply(data, msg) | |||
return | |||
except pytz.exceptions.UnknownTimeZoneError: | |||
self.reply(data, "Unknown timezone: {0}.".format(timezone)) | |||
return | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -0,0 +1,52 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from earwigbot.commands import Command | |||
class Watchers(Command):
    """Get the number of users watching a given page."""
    name = "watchers"

    def process(self, data):
        """Handle a !watchers invocation by reporting a page's watcher count."""
        if not data.args:
            self.reply(data, "Which page do you want me to count the watchers of?")
            return
        # Join all arguments so multi-word titles work without quoting.
        target = " ".join(data.args)
        site = self.bot.wiki.get_site()
        result = site.api_query(action="query", prop="info", inprop="watchers",
                                titles=target)
        page = result["query"]["pages"].values()[0]
        title = page["title"].encode("utf8")
        if "invalid" in page:
            self.reply(data, "\x0302{0}\x0F is an invalid page title.".format(title))
            return
        # The API omits the count when it is below the reporting threshold.
        watchers = page["watchers"] if "watchers" in page else "<30"
        plural = "" if watchers == 1 else "s"
        msg = "\x0302{0}\x0F has \x02{1}\x0F watcher{2}."
        self.reply(data, msg.format(title, watchers, plural))
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -28,10 +28,9 @@ import logging.handlers | |||
from os import mkdir, path | |||
import stat | |||
from Crypto.Cipher import Blowfish | |||
import bcrypt | |||
import yaml | |||
from earwigbot import importer | |||
from earwigbot.config.formatter import BotFormatter | |||
from earwigbot.config.node import ConfigNode | |||
from earwigbot.config.ordered_yaml import OrderedLoader | |||
@@ -39,6 +38,9 @@ from earwigbot.config.permissions import PermissionsDB | |||
from earwigbot.config.script import ConfigScript | |||
from earwigbot.exceptions import NoConfigError | |||
Blowfish = importer.new("Crypto.Cipher.Blowfish") | |||
bcrypt = importer.new("bcrypt") | |||
__all__ = ["BotConfig"] | |||
class BotConfig(object): | |||
@@ -280,10 +282,18 @@ class BotConfig(object): | |||
self._setup_logging() | |||
if self.is_encrypted(): | |||
if not self._decryption_cipher: | |||
try: | |||
blowfish_new = Blowfish.new | |||
hashpw = bcrypt.hashpw | |||
except ImportError: | |||
url1 = "http://www.mindrot.org/projects/py-bcrypt" | |||
url2 = "https://www.dlitz.net/software/pycrypto/" | |||
e = "Encryption requires the 'py-bcrypt' and 'pycrypto' packages: {0}, {1}" | |||
raise NoConfigError(e.format(url1, url2)) | |||
key = getpass("Enter key to decrypt bot passwords: ") | |||
self._decryption_cipher = Blowfish.new(sha256(key).digest()) | |||
self._decryption_cipher = blowfish_new(sha256(key).digest()) | |||
signature = self.metadata["signature"] | |||
if bcrypt.hashpw(key, signature) != signature: | |||
if hashpw(key, signature) != signature: | |||
raise RuntimeError("Incorrect password.") | |||
for node, nodes in self._decryptable_nodes: | |||
self._decrypt(node, nodes) | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -39,7 +39,8 @@ class PermissionsDB(object): | |||
def __init__(self, dbfile): | |||
self._dbfile = dbfile | |||
self._db_access_lock = Lock() | |||
self._data = {} | |||
self._users = {} | |||
self._attributes = {} | |||
def __repr__(self): | |||
"""Return the canonical string representation of the PermissionsDB.""" | |||
@@ -53,13 +54,14 @@ class PermissionsDB(object): | |||
def _create(self, conn): | |||
"""Initialize the permissions database with its necessary tables.""" | |||
query = """CREATE TABLE users (user_nick, user_ident, user_host, | |||
user_rank)""" | |||
conn.execute(query) | |||
user_rank); | |||
CREATE TABLE attributes (attr_uid, attr_key, attr_value);""" | |||
conn.executescript(query) | |||
def _is_rank(self, user, rank): | |||
"""Return True if the given user has the given rank, else False.""" | |||
try: | |||
for rule in self._data[rank]: | |||
for rule in self._users[rank]: | |||
if user in rule: | |||
return rule | |||
except KeyError: | |||
@@ -73,9 +75,9 @@ class PermissionsDB(object): | |||
with sqlite.connect(self._dbfile) as conn: | |||
conn.execute(query, (user.nick, user.ident, user.host, rank)) | |||
try: | |||
self._data[rank].append(user) | |||
self._users[rank].append(user) | |||
except KeyError: | |||
self._data[rank] = [user] | |||
self._users[rank] = [user] | |||
return user | |||
def _del_rank(self, user, rank): | |||
@@ -84,40 +86,51 @@ class PermissionsDB(object): | |||
user_host = ? AND user_rank = ?""" | |||
with self._db_access_lock: | |||
try: | |||
for rule in self._data[rank]: | |||
for rule in self._users[rank]: | |||
if user in rule: | |||
with sqlite.connect(self._dbfile) as conn: | |||
args = (user.nick, user.ident, user.host, rank) | |||
conn.execute(query, args) | |||
self._data[rank].remove(rule) | |||
self._users[rank].remove(rule) | |||
return rule | |||
except KeyError: | |||
pass | |||
return None | |||
@property | |||
def data(self): | |||
"""A dict of all entries in the permissions database.""" | |||
return self._data | |||
def users(self): | |||
"""A dict of all users in the permissions database.""" | |||
return self._users | |||
    @property
    def attributes(self):
        """A dict of all attributes in the permissions database.

        Maps user identifiers to ``{key: value}`` dicts, mirroring the
        ``attributes`` table loaded from SQLite.
        """
        return self._attributes
def load(self): | |||
"""Load permissions from an existing database, or create a new one.""" | |||
query = "SELECT user_nick, user_ident, user_host, user_rank FROM users" | |||
self._data = {} | |||
qry1 = "SELECT user_nick, user_ident, user_host, user_rank FROM users" | |||
qry2 = "SELECT attr_uid, attr_key, attr_value FROM attributes" | |||
self._users = {} | |||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||
try: | |||
for nick, ident, host, rank in conn.execute(query): | |||
for nick, ident, host, rank in conn.execute(qry1): | |||
try: | |||
self._users[rank].append(_User(nick, ident, host)) | |||
except KeyError: | |||
self._users[rank] = [_User(nick, ident, host)] | |||
for user, key, value in conn.execute(qry2): | |||
try: | |||
self._data[rank].append(_User(nick, ident, host)) | |||
self._attributes[user][key] = value | |||
except KeyError: | |||
self._data[rank] = [_User(nick, ident, host)] | |||
self._attributes[user] = {key: value} | |||
except sqlite.OperationalError: | |||
self._create(conn) | |||
def has_exact(self, rank, nick="*", ident="*", host="*"): | |||
"""Return ``True`` if there is an exact match for this rule.""" | |||
try: | |||
for usr in self._data[rank]: | |||
for usr in self._users[rank]: | |||
if nick != usr.nick or ident != usr.ident or host != usr.host: | |||
continue | |||
return usr | |||
@@ -151,6 +164,39 @@ class PermissionsDB(object): | |||
"""Remove a nick/ident/host combo to the bot owners list.""" | |||
return self._del_rank(_User(nick, ident, host), rank=self.OWNER) | |||
def has_attr(self, user, key): | |||
"""Return ``True`` if a given user has a certain attribute, *key*.""" | |||
return user in self._attributes and key in self._attributes[user] | |||
    def get_attr(self, user, key):
        """Get the value of the attribute *key* of a given *user*.

        Reads from the in-memory cache only; the database is not queried.
        Raises :py:exc:`KeyError` if the *key* or *user* is not found.
        """
        return self._attributes[user][key]
def set_attr(self, user, key, value): | |||
"""Set the *value* of the attribute *key* of a given *user*.""" | |||
query1 = """SELECT attr_value FROM attributes WHERE attr_uid = ? | |||
AND attr_key = ?""" | |||
query2 = "INSERT INTO attributes VALUES (?, ?, ?)" | |||
query3 = """UPDATE attributes SET attr_value = ? WHERE attr_uid = ? | |||
AND attr_key = ?""" | |||
with self._db_access_lock, sqlite.connect(self._dbfile) as conn: | |||
if conn.execute(query1, (user, key)).fetchone(): | |||
conn.execute(query3, (value, user, key)) | |||
else: | |||
conn.execute(query2, (user, key, value)) | |||
try: | |||
self._attributes[user][key] = value | |||
except KeyError: | |||
self.attributes[user] = {key: value} | |||
def remove_attr(self, user, key): | |||
"""Remove the attribute *key* of a given *user*.""" | |||
query = "DELETE FROM attributes WHERE attr_uid = ? AND attr_key = ?" | |||
with self._db_access_lock, sqlite.connect(self._dbfile) as conn: | |||
conn.execute(query, (user, key)) | |||
class _User(object): | |||
"""A class that represents an IRC user for the purpose of testing rules.""" | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -23,19 +23,20 @@ | |||
from collections import OrderedDict | |||
from getpass import getpass | |||
from hashlib import sha256 | |||
from os import chmod, mkdir, path | |||
from os import chmod, makedirs, mkdir, path | |||
import re | |||
import stat | |||
import sys | |||
from textwrap import fill, wrap | |||
from Crypto.Cipher import Blowfish | |||
import bcrypt | |||
import yaml | |||
from earwigbot import exceptions | |||
from earwigbot import exceptions, importer | |||
from earwigbot.config.ordered_yaml import OrderedDumper | |||
Blowfish = importer.new("Crypto.Cipher.Blowfish") | |||
bcrypt = importer.new("bcrypt") | |||
__all__ = ["ConfigScript"] | |||
RULES_TEMPLATE = """# -*- coding: utf-8 -*- | |||
@@ -145,17 +146,30 @@ class ConfigScript(object): | |||
is to run on a public computer like the Toolserver, but | |||
otherwise the need to enter a key everytime you start | |||
the bot may be annoying.""") | |||
self.data["metadata"]["encryptPasswords"] = False | |||
if self._ask_bool("Encrypt stored passwords?"): | |||
self.data["metadata"]["encryptPasswords"] = True | |||
key = getpass(self.PROMPT + "Enter an encryption key: ") | |||
msg = "Running {0} rounds of bcrypt...".format(self.BCRYPT_ROUNDS) | |||
self._print_no_nl(msg) | |||
signature = bcrypt.hashpw(key, bcrypt.gensalt(self.BCRYPT_ROUNDS)) | |||
self.data["metadata"]["signature"] = signature | |||
self._cipher = Blowfish.new(sha256(key).digest()) | |||
print " done." | |||
else: | |||
self.data["metadata"]["encryptPasswords"] = False | |||
try: | |||
salt = bcrypt.gensalt(self.BCRYPT_ROUNDS) | |||
signature = bcrypt.hashpw(key, salt) | |||
self._cipher = Blowfish.new(sha256(key).digest()) | |||
except ImportError: | |||
print " error!" | |||
self._print("""Encryption requires the 'py-bcrypt' and | |||
'pycrypto' packages:""") | |||
strt, end = " * \x1b[36m", "\x1b[0m" | |||
print strt + "http://www.mindrot.org/projects/py-bcrypt/" + end | |||
print strt + "https://www.dlitz.net/software/pycrypto/" + end | |||
self._print("""I will disable encryption for now; restart | |||
configuration after installing these packages if | |||
you want it.""") | |||
self._pause() | |||
else: | |||
self.data["metadata"]["encryptPasswords"] = True | |||
self.data["metadata"]["signature"] = signature | |||
print " done." | |||
self._print("""The bot can temporarily store its logs in the logs/ | |||
@@ -265,11 +279,19 @@ class ConfigScript(object): | |||
self.data["wiki"]["sql"] = {} | |||
if self._wmf: | |||
msg = "Will this bot run from the Wikimedia Toolserver?" | |||
toolserver = self._ask_bool(msg, default=False) | |||
if toolserver: | |||
args = [("host", "$1-p.rrdb.toolserver.org"), ("db", "$1_p")] | |||
msg = "Will this bot run from the Wikimedia Tool Labs?" | |||
labs = self._ask_bool(msg, default=False) | |||
if labs: | |||
args = [("host", "$1.labsdb"), ("db", "$1_p"), | |||
("read_default_file", "~/replica.my.cnf")] | |||
self.data["wiki"]["sql"] = OrderedDict(args) | |||
else: | |||
msg = "Will this bot run from the Wikimedia Toolserver?" | |||
toolserver = self._ask_bool(msg, default=False) | |||
if toolserver: | |||
args = [("host", "$1-p.rrdb.toolserver.org"), | |||
("db", "$1_p")] | |||
self.data["wiki"]["sql"] = OrderedDict(args) | |||
self.data["wiki"]["shutoff"] = {} | |||
msg = "Would you like to enable an automatic shutoff page for the bot?" | |||
@@ -419,6 +441,7 @@ class ConfigScript(object): | |||
def make_new(self): | |||
"""Make a new config file based on the user's input.""" | |||
try: | |||
makedirs(path.dirname(self.config.path)) | |||
open(self.config.path, "w").close() | |||
chmod(self.config.path, stat.S_IRUSR|stat.S_IWUSR) | |||
except IOError: | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -36,13 +36,13 @@ This module contains all exceptions used by EarwigBot:: | |||
| +-- SQLError | |||
+-- NoServiceError | |||
+-- LoginError | |||
+-- PermissionsError | |||
+-- NamespaceNotFoundError | |||
+-- PageNotFoundError | |||
+-- InvalidPageError | |||
+-- RedirectError | |||
+-- UserNotFoundError | |||
+-- EditError | |||
| +-- PermissionsError | |||
| +-- EditConflictError | |||
| +-- NoContentError | |||
| +-- ContentTooBigError | |||
@@ -52,6 +52,7 @@ This module contains all exceptions used by EarwigBot:: | |||
+-- UnknownSearchEngineError | |||
+-- UnsupportedSearchEngineError | |||
+-- SearchQueryError | |||
+-- ParserExclusionError | |||
""" | |||
class EarwigBotError(Exception): | |||
@@ -120,6 +121,19 @@ class LoginError(WikiToolsetError): | |||
Raised by :py:meth:`Site._login <earwigbot.wiki.site.Site._login>`. | |||
""" | |||
class PermissionsError(WikiToolsetError):
    """A permissions error occurred.

    We tried to do something we don't have permission to, like trying to delete
    a page as a non-admin, or trying to edit a page without login information
    and AssertEdit enabled. This will also be raised if we have been blocked
    from editing.

    Raised by :py:meth:`Page.edit <earwigbot.wiki.page.Page.edit>`,
    :py:meth:`Page.add_section <earwigbot.wiki.page.Page.add_section>`, and
    other API methods depending on settings.
    """
class NamespaceNotFoundError(WikiToolsetError): | |||
"""A requested namespace name or namespace ID does not exist. | |||
@@ -164,17 +178,6 @@ class EditError(WikiToolsetError): | |||
:py:meth:`Page.add_section <earwigbot.wiki.page.Page.add_section>`. | |||
""" | |||
class PermissionsError(EditError): | |||
"""A permissions error ocurred while editing. | |||
We tried to do something we don't have permission to, like trying to delete | |||
a page as a non-admin, or trying to edit a page without login information | |||
and AssertEdit enabled. | |||
Raised by :py:meth:`Page.edit <earwigbot.wiki.page.Page.edit>` and | |||
:py:meth:`Page.add_section <earwigbot.wiki.page.Page.add_section>`. | |||
""" | |||
class EditConflictError(EditError): | |||
"""We gotten an edit conflict or a (rarer) delete/recreate conflict. | |||
@@ -229,9 +232,7 @@ class UnknownSearchEngineError(CopyvioCheckError): | |||
:py:attr:`config.wiki["search"]["engine"]`. | |||
Raised by :py:meth:`Page.copyvio_check | |||
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and | |||
:py:meth:`Page.copyvio_compare | |||
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`. | |||
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`. | |||
""" | |||
class UnsupportedSearchEngineError(CopyvioCheckError): | |||
@@ -241,16 +242,20 @@ class UnsupportedSearchEngineError(CopyvioCheckError): | |||
couldn't be imported. | |||
Raised by :py:meth:`Page.copyvio_check | |||
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and | |||
:py:meth:`Page.copyvio_compare | |||
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`. | |||
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`. | |||
""" | |||
class SearchQueryError(CopyvioCheckError): | |||
"""Some error ocurred while doing a search query. | |||
Raised by :py:meth:`Page.copyvio_check | |||
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>` and | |||
:py:meth:`Page.copyvio_compare | |||
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_compare>`. | |||
<earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`. | |||
""" | |||
class ParserExclusionError(CopyvioCheckError):
    """A content parser detected that the given source should be excluded.

    Raised internally by :py:meth:`Page.copyvio_check
    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be
    exposed in client code.
    """
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -172,7 +172,7 @@ class IRCConnection(object): | |||
if data.is_private: | |||
self.say(data.chan, msg, hidelog) | |||
else: | |||
msg = "\x02{0}\x0F: {1}".format(data.nick, msg) | |||
msg = "\x02{0}\x0F: {1}".format(data.reply_nick, msg) | |||
self.say(data.chan, msg, hidelog) | |||
def action(self, target, msg, hidelog=False): | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -27,34 +27,35 @@ __all__ = ["Data"] | |||
class Data(object): | |||
"""Store data from an individual line received on IRC.""" | |||
def __init__(self, bot, my_nick, line, msgtype): | |||
self._bot = bot | |||
def __init__(self, my_nick, line, msgtype): | |||
self._my_nick = my_nick.lower() | |||
self._line = line | |||
self._msgtype = msgtype | |||
self._is_private = self._is_command = False | |||
self._msg = self._command = self._trigger = None | |||
self._args = [] | |||
self._kwargs = {} | |||
self._parse(msgtype) | |||
self._parse() | |||
def __repr__(self): | |||
"""Return the canonical string representation of the Data.""" | |||
res = "Data(bot={0!r}, my_nick={1!r}, line={2!r})" | |||
return res.format(self._bot, self.my_nick, self.line) | |||
res = "Data(my_nick={0!r}, line={1!r})" | |||
return res.format(self.my_nick, self.line) | |||
def __str__(self): | |||
"""Return a nice string representation of the Data.""" | |||
return "<Data of {0!r}>".format(" ".join(self.line)) | |||
def _parse(self, msgtype): | |||
def _parse(self): | |||
"""Parse a line from IRC into its components as instance attributes.""" | |||
sender = re.findall(r":(.*?)!(.*?)@(.*?)\Z", self.line[0])[0] | |||
self._nick, self._ident, self._host = sender | |||
self._reply_nick = self._nick | |||
self._chan = self.line[2] | |||
if msgtype == "PRIVMSG": | |||
if self._msgtype == "PRIVMSG": | |||
if self.chan.lower() == self.my_nick: | |||
# This is a privmsg to us, so set 'chan' as the nick of the | |||
# sender instead of the 'channel', which is ourselves: | |||
@@ -75,10 +76,16 @@ class Data(object): | |||
self._args = self.msg.strip().split() | |||
try: | |||
self._command = self.args.pop(0).lower() | |||
command_uc = self.args.pop(0) | |||
self._command = command_uc.lower() | |||
except IndexError: | |||
return | |||
# e.g. "!command>user arg1 arg2" | |||
if ">" in self.command: | |||
command_uc, self._reply_nick = command_uc.split(">", 1) | |||
self._command = command_uc.lower() | |||
if self.command.startswith("!") or self.command.startswith("."): | |||
# e.g. "!command arg1 arg2" | |||
self._is_command = True | |||
@@ -103,6 +110,10 @@ class Data(object): | |||
except IndexError: | |||
pass | |||
# e.g. "!command >user arg1 arg2" | |||
if self.args and self.args[0].startswith(">"): | |||
self._reply_nick = self.args.pop(0)[1:] | |||
def _parse_kwargs(self): | |||
"""Parse keyword arguments embedded in self.args. | |||
@@ -152,6 +163,11 @@ class Data(object): | |||
return self._host | |||
@property | |||
def reply_nick(self): | |||
"""Nickname of the person to reply to. Sender by default.""" | |||
return self._reply_nick | |||
@property | |||
def msg(self): | |||
"""Text of the sent message, if it is a message, else ``None``.""" | |||
return self._msg | |||
@@ -210,3 +226,12 @@ class Data(object): | |||
arguments. | |||
""" | |||
return self._kwargs | |||
    def serialize(self):
        """Serialize this object into a tuple and return it.

        The tuple holds our bot's nick, the raw IRC line, and the message
        type — exactly the arguments :py:meth:`unserialize` needs to rebuild
        an equivalent Data object.
        """
        return (self._my_nick, self._line, self._msgtype)
    @classmethod
    def unserialize(cls, data):
        """Return a new Data object built from a serialized tuple.

        *data* should be a tuple produced by :py:meth:`serialize`; its items
        are passed straight to the constructor.
        """
        return cls(*data)
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -59,11 +59,15 @@ class Frontend(IRCConnection): | |||
def _process_message(self, line): | |||
"""Process a single message from IRC.""" | |||
if line[1] == "JOIN": | |||
data = Data(self.bot, self.nick, line, msgtype="JOIN") | |||
data = Data(self.nick, line, msgtype="JOIN") | |||
self.bot.commands.call("join", data) | |||
elif line[1] == "PART": | |||
data = Data(self.nick, line, msgtype="PART") | |||
self.bot.commands.call("part", data) | |||
elif line[1] == "PRIVMSG": | |||
data = Data(self.bot, self.nick, line, msgtype="PRIVMSG") | |||
data = Data(self.nick, line, msgtype="PRIVMSG") | |||
if data.is_private: | |||
self.bot.commands.call("msg_private", data) | |||
else: | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -27,7 +27,7 @@ __all__ = ["RC"] | |||
class RC(object): | |||
"""Store data from an event received from our IRC watcher.""" | |||
re_color = re.compile("\x03([0-9]{1,2}(,[0-9]{1,2})?)?") | |||
re_edit = re.compile("\A\[\[(.*?)\]\]\s(.*?)\s(http://.*?)\s\*\s(.*?)\s\*\s(.*?)\Z") | |||
re_edit = re.compile("\A\[\[(.*?)\]\]\s(.*?)\s(https?://.*?)\s\*\s(.*?)\s\*\s(.*?)\Z") | |||
re_log = re.compile("\A\[\[(.*?)\]\]\s(.*?)\s\s\*\s(.*?)\s\*\s(.*?)\Z") | |||
pretty_edit = "\x02New {0}\x0F: \x0314[[\x0307{1}\x0314]]\x0306 * \x0303{2}\x0306 * \x0302{3}\x0306 * \x0310{4}" | |||
@@ -60,7 +60,7 @@ class RC(object): | |||
# We're probably missing the http:// part, because it's a log | |||
# entry, which lacks a URL: | |||
page, flags, user, comment = self.re_log.findall(msg)[0] | |||
url = "http://{0}.org/wiki/{1}".format(self.chan[1:], page) | |||
url = "https://{0}.org/wiki/{1}".format(self.chan[1:], page) | |||
self.is_edit = False # This is a log entry, not edit | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -21,7 +21,6 @@ | |||
# SOFTWARE. | |||
import imp | |||
import os | |||
from earwigbot.irc import IRCConnection, RC | |||
@@ -72,6 +71,7 @@ class Watcher(IRCConnection): | |||
rc = RC(chan, msg) # New RC object to store this event's data | |||
rc.parse() # Parse a message into pagenames, usernames, etc. | |||
self._process_rc_event(rc) | |||
self.bot.commands.call("rc", rc) | |||
# When we've finished starting up, join all watcher channels: | |||
elif line[1] == "376": | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -21,28 +21,39 @@ | |||
# SOFTWARE. | |||
""" | |||
Implements a hierarchy of importing classes as defined in PEP 302 to load | |||
modules in a safe yet lazy manner. | |||
Implements a hierarchy of importing classes as defined in `PEP 302 | |||
<http://www.python.org/dev/peps/pep-0302/>`_ to load modules in a safe yet lazy | |||
manner, so that they can be referred to by name but are not actually loaded | |||
until they are used (i.e. their attributes are read or modified). | |||
""" | |||
from imp import acquire_lock, release_lock | |||
import sys | |||
from threading import RLock | |||
from types import ModuleType | |||
__all__ = ["LazyImporter"] | |||
def _getattribute(self, attr): | |||
_load(self) | |||
return self.__getattribute__(attr) | |||
_real_get = ModuleType.__getattribute__ | |||
def _setattr(self, attr, value): | |||
_load(self) | |||
self.__setattr__(attr, value) | |||
def _create_failing_get(exc): | |||
def _fail(self, attr): | |||
raise exc | |||
return _fail | |||
def _load(self): | |||
type(self).__getattribute__ = ModuleType.__getattribute__ | |||
type(self).__setattr__ = ModuleType.__setattr__ | |||
reload(self) | |||
def _mock_get(self, attr): | |||
with _real_get(self, "_lock"): | |||
if _real_get(self, "_unloaded"): | |||
type(self)._unloaded = False | |||
try: | |||
reload(self) | |||
except ImportError as exc: | |||
type(self).__getattribute__ = _create_failing_get(exc) | |||
del type(self)._lock | |||
raise | |||
type(self).__getattribute__ = _real_get | |||
del type(self)._lock | |||
return _real_get(self, attr) | |||
class _LazyModule(type): | |||
@@ -52,18 +63,26 @@ class _LazyModule(type): | |||
if name not in sys.modules: | |||
attributes = { | |||
"__name__": name, | |||
"__getattribute__": _getattribute, | |||
"__setattr__": _setattr | |||
"__getattribute__": _mock_get, | |||
"_unloaded": True, | |||
"_lock": RLock() | |||
} | |||
parents = (ModuleType,) | |||
klass = type.__new__(cls, "module", parents, attributes) | |||
sys.modules[name] = klass(name) | |||
if "." in name: # Also ensure the parent exists | |||
_LazyModule(name.rsplit(".", 1)[0]) | |||
return sys.modules[name] | |||
finally: | |||
release_lock() | |||
class LazyImporter(object): | |||
"""An importer for modules that are loaded lazily. | |||
This inserts itself into :py:data:`sys.meta_path`, storing a dictionary of | |||
:py:class:`_LazyModule`\ s (which is added to with :py:meth:`new`). | |||
""" | |||
def __init__(self): | |||
self._modules = {} | |||
sys.meta_path.append(self) | |||
@@ -1,7 +1,7 @@ | |||
#! /usr/bin/env python | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -131,11 +131,25 @@ class _ResourceManager(object): | |||
if modname in disabled: | |||
log = "Skipping disabled module {0}".format(modname) | |||
self.logger.debug(log) | |||
processed.append(modname) | |||
continue | |||
if modname not in processed: | |||
self._load_module(modname, dir) | |||
processed.append(modname) | |||
def _unload_resources(self): | |||
"""Unload all resources, calling their unload hooks in the process.""" | |||
res_type = self._resource_name[:-1] # e.g. "command" or "task" | |||
for resource in self: | |||
if not hasattr(resource, "unload"): | |||
continue | |||
try: | |||
resource.unload() | |||
except Exception: | |||
e = "Error unloading {0} '{1}'" | |||
self.logger.exception(e.format(res_type, resource.name)) | |||
self._resources.clear() | |||
@property | |||
def lock(self): | |||
"""The resource access/modify lock.""" | |||
@@ -145,7 +159,7 @@ class _ResourceManager(object): | |||
"""Load (or reload) all valid resources into :py:attr:`_resources`.""" | |||
name = self._resource_name # e.g. "commands" or "tasks" | |||
with self.lock: | |||
self._resources.clear() | |||
self._unload_resources() | |||
builtin_dir = path.join(path.dirname(__file__), name) | |||
plugins_dir = path.join(self.bot.config.root_dir, name) | |||
if getattr(self.bot.config, name).get("disable") is True: | |||
@@ -200,7 +214,11 @@ class CommandManager(_ResourceManager): | |||
self.logger.exception(e.format(command.name)) | |||
def call(self, hook, data): | |||
"""Respond to a hook type and a :py:class:`Data` object.""" | |||
"""Respond to a hook type and a :py:class:`~.Data` object. | |||
.. note:: | |||
The special ``rc`` hook actually passes a :class:`~.RC` object. | |||
""" | |||
for command in self: | |||
if hook in command.hooks and self._wrap_check(command, data): | |||
thread = Thread(target=self._wrap_process, | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -84,6 +84,13 @@ class Task(object): | |||
""" | |||
pass | |||
def unload(self): | |||
"""Hook called immediately before the task is unloaded. | |||
Does nothing by default; feel free to override. | |||
""" | |||
pass | |||
def make_summary(self, comment): | |||
"""Make an edit summary by filling in variables in a config value. | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,7 +1,7 @@ | |||
#! /usr/bin/env python | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -143,7 +143,7 @@ def main(): | |||
pass | |||
finally: | |||
if thread.is_alive(): | |||
bot.tasks.logger.warn("The task is will be killed") | |||
bot.tasks.logger.warn("The task will be killed") | |||
else: | |||
try: | |||
bot.run() | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -58,10 +58,14 @@ class Category(Page): | |||
"""Return a nice string representation of the Category.""" | |||
return '<Category "{0}" of {1}>'.format(self.title, str(self.site)) | |||
def __iter__(self): | |||
"""Iterate over all members of the category.""" | |||
return self.get_members() | |||
def _get_members_via_api(self, limit, follow): | |||
"""Iterate over Pages in the category using the API.""" | |||
params = {"action": "query", "list": "categorymembers", | |||
"cmtitle": self.title} | |||
"cmtitle": self.title, "continue": ""} | |||
while 1: | |||
params["cmlimit"] = limit if limit else "max" | |||
@@ -70,9 +74,8 @@ class Category(Page): | |||
title = member["title"] | |||
yield self.site.get_page(title, follow_redirects=follow) | |||
if "query-continue" in result: | |||
qcontinue = result["query-continue"]["categorymembers"] | |||
params["cmcontinue"] = qcontinue["cmcontinue"] | |||
if "continue" in result: | |||
params.update(result["continue"]) | |||
if limit: | |||
limit -= len(result["query"]["categorymembers"]) | |||
else: | |||
@@ -87,9 +90,9 @@ class Category(Page): | |||
if limit: | |||
query += " LIMIT ?" | |||
result = self.site.sql_query(query, (title, limit)) | |||
result = self.site.sql_query(query, (title, limit), buffsize=0) | |||
else: | |||
result = self.site.sql_query(query, (title,)) | |||
result = self.site.sql_query(query, (title,), buffsize=0) | |||
members = list(result) | |||
for row in members: | |||
@@ -100,7 +103,7 @@ class Category(Page): | |||
else: # Avoid doing a silly (albeit valid) ":Pagename" thing | |||
title = base | |||
yield self.site.get_page(title, follow_redirects=follow, | |||
pageid=row[2]) | |||
pageid=row[2]) | |||
def _get_size_via_api(self, member_type): | |||
"""Return the size of the category using the API.""" | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -20,21 +20,19 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from gzip import GzipFile | |||
from socket import timeout | |||
from StringIO import StringIO | |||
from time import sleep, time | |||
from urllib2 import build_opener, URLError | |||
from urllib2 import build_opener | |||
import oauth2 as oauth | |||
from earwigbot import exceptions | |||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult | |||
from earwigbot import exceptions, importer | |||
from earwigbot.wiki.copyvios.markov import MarkovChain | |||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser | |||
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine | |||
from earwigbot.wiki.copyvios.workers import ( | |||
globalize, localize, CopyvioWorkspace) | |||
oauth = importer.new("oauth2") | |||
__all__ = ["CopyvioMixIn"] | |||
__all__ = ["CopyvioMixIn", "globalize", "localize"] | |||
class CopyvioMixIn(object): | |||
""" | |||
@@ -50,34 +48,9 @@ class CopyvioMixIn(object): | |||
def __init__(self, site): | |||
self._search_config = site._search_config | |||
self._exclusions_db = self._search_config.get("exclusions_db") | |||
self._opener = build_opener() | |||
self._opener.addheaders = site._opener.addheaders | |||
def _open_url_ignoring_errors(self, url): | |||
"""Open a URL using self._opener and return its content, or None. | |||
Will decompress the content if the headers contain "gzip" as its | |||
content encoding, and will return None if URLError is raised while | |||
opening the URL. IOErrors while gunzipping a compressed response are | |||
ignored, and the original content is returned. | |||
""" | |||
try: | |||
response = self._opener.open(url.encode("utf8"), timeout=5) | |||
except (URLError, timeout): | |||
return None | |||
result = response.read() | |||
if response.headers.get("Content-Encoding") == "gzip": | |||
stream = StringIO(result) | |||
gzipper = GzipFile(fileobj=stream) | |||
try: | |||
result = gzipper.read() | |||
except IOError: | |||
pass | |||
return result | |||
self._addheaders = site._opener.addheaders | |||
def _select_search_engine(self): | |||
def _get_search_engine(self): | |||
"""Return a function that can be called to do web searches. | |||
The function takes one argument, a search query, and returns a list of | |||
@@ -93,137 +66,124 @@ class CopyvioMixIn(object): | |||
credentials = self._search_config["credentials"] | |||
if engine == "Yahoo! BOSS": | |||
if not oauth: | |||
e = "The package 'oauth2' could not be imported" | |||
try: | |||
oauth.__version__ # Force-load the lazy module | |||
except ImportError: | |||
e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2" | |||
raise exceptions.UnsupportedSearchEngineError(e) | |||
return YahooBOSSSearchEngine(credentials) | |||
opener = build_opener() | |||
opener.addheaders = self._addheaders | |||
return YahooBOSSSearchEngine(credentials, opener) | |||
raise exceptions.UnknownSearchEngineError(engine) | |||
def _copyvio_compare_content(self, article, url): | |||
"""Return a number comparing an article and a URL. | |||
The *article* is a Markov chain, whereas the *url* is just a string | |||
that we'll try to open and read ourselves. | |||
""" | |||
html = self._open_url_ignoring_errors(url) | |||
if not html: | |||
return 0 | |||
source = MarkovChain(HTMLTextParser(html).strip()) | |||
delta = MarkovChainIntersection(article, source) | |||
return float(delta.size()) / article.size(), (source, delta) | |||
def copyvio_check(self, min_confidence=0.5, max_queries=-1, | |||
interquery_sleep=1): | |||
def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1, | |||
no_searches=False, no_links=False, short_circuit=True): | |||
"""Check the page for copyright violations. | |||
Returns a | |||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object | |||
with information on the results of the check. | |||
Returns a :class:`.CopyvioCheckResult` object with information on the | |||
results of the check. | |||
*max_queries* is self-explanatory; we will never make more than this | |||
number of queries in a given check. If it's lower than 0, we will not | |||
limit the number of queries. | |||
*min_confidence* is the minimum amount of confidence we must have in | |||
the similarity between a source text and the article in order for us to | |||
consider it a suspected violation. This is a number between 0 and 1. | |||
*interquery_sleep* is the minimum amount of time we will sleep between | |||
search engine queries, in seconds. | |||
Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses | |||
(:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`, | |||
:py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. | |||
*max_queries* is self-explanatory; we will never make more than this | |||
number of queries in a given check. | |||
*max_time* can be set to prevent copyvio checks from taking longer than | |||
a set amount of time (generally around a minute), which can be useful | |||
if checks are called through a web server with timeouts. We will stop | |||
checking new URLs as soon as this limit is reached. | |||
Setting *no_searches* to ``True`` will cause only URLs in the wikitext | |||
of the page to be checked; no search engine queries will be made. | |||
Setting *no_links* to ``True`` will cause the opposite to happen: URLs | |||
in the wikitext will be ignored; search engine queries will be made | |||
only. Setting both of these to ``True`` is pointless. | |||
Normally, the checker will short-circuit if it finds a URL that meets | |||
*min_confidence*. This behavior normally causes it to skip any | |||
remaining URLs and web queries, but setting *short_circuit* to | |||
``False`` will prevent this. | |||
Raises :exc:`.CopyvioCheckError` or subclasses | |||
(:exc:`.UnknownSearchEngineError`, :exc:`.SearchQueryError`, ...) on | |||
errors. | |||
""" | |||
searcher = self._select_search_engine() | |||
log = u"Starting copyvio check for [[{0}]]" | |||
self._logger.info(log.format(self.title)) | |||
searcher = self._get_search_engine() | |||
parser = ArticleTextParser(self.get()) | |||
article = MarkovChain(parser.strip()) | |||
parser_args = {} | |||
if self._exclusions_db: | |||
self._exclusions_db.sync(self.site.name) | |||
handled_urls = [] | |||
best_confidence = 0 | |||
best_match = None | |||
num_queries = 0 | |||
empty = MarkovChain("") | |||
best_chains = (empty, MarkovChainIntersection(empty, empty)) | |||
parser = ArticleTextParser(self.get()) | |||
clean = parser.strip() | |||
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) | |||
article_chain = MarkovChain(clean) | |||
last_query = time() | |||
if article_chain.size() < 20: # Auto-fail very small articles | |||
return CopyvioCheckResult(False, best_confidence, best_match, | |||
num_queries, article_chain, best_chains) | |||
while (chunks and best_confidence < min_confidence and | |||
(max_queries < 0 or num_queries < max_queries)): | |||
chunk = chunks.pop(0) | |||
log = u"[[{0}]] -> querying {1} for {2!r}" | |||
self._logger.debug(log.format(self.title, searcher.name, chunk)) | |||
urls = searcher.search(chunk) | |||
urls = [url for url in urls if url not in handled_urls] | |||
for url in urls: | |||
handled_urls.append(url) | |||
if self._exclusions_db: | |||
if self._exclusions_db.check(self.site.name, url): | |||
continue | |||
conf, chains = self._copyvio_compare_content(article_chain, url) | |||
if conf > best_confidence: | |||
best_confidence = conf | |||
best_match = url | |||
best_chains = chains | |||
num_queries += 1 | |||
diff = time() - last_query | |||
if diff < interquery_sleep: | |||
sleep(interquery_sleep - diff) | |||
last_query = time() | |||
if best_confidence >= min_confidence: | |||
is_violation = True | |||
log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries)" | |||
self._logger.debug(log.format(self.title, best_confidence, | |||
best_match, num_queries)) | |||
exclude = lambda u: self._exclusions_db.check(self.site.name, u) | |||
parser_args["mirror_hints"] = self._exclusions_db.get_mirror_hints( | |||
self.site.name) | |||
else: | |||
is_violation = False | |||
log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries)" | |||
self._logger.debug(log.format(self.title, best_confidence, | |||
num_queries)) | |||
exclude = None | |||
workspace = CopyvioWorkspace( | |||
article, min_confidence, max_time, self._logger, self._addheaders, | |||
short_circuit=short_circuit, parser_args=parser_args) | |||
return CopyvioCheckResult(is_violation, best_confidence, best_match, | |||
num_queries, article_chain, best_chains) | |||
if article.size < 20: # Auto-fail very small articles | |||
result = workspace.get_result() | |||
self._logger.info(result.get_log_message(self.title)) | |||
return result | |||
def copyvio_compare(self, url, min_confidence=0.5): | |||
if not no_links: | |||
workspace.enqueue(parser.get_links(), exclude) | |||
num_queries = 0 | |||
if not no_searches: | |||
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) | |||
for chunk in chunks: | |||
if short_circuit and workspace.finished: | |||
workspace.possible_miss = True | |||
break | |||
log = u"[[{0}]] -> querying {1} for {2!r}" | |||
self._logger.debug(log.format(self.title, searcher.name, chunk)) | |||
workspace.enqueue(searcher.search(chunk), exclude) | |||
num_queries += 1 | |||
sleep(1) | |||
workspace.wait() | |||
result = workspace.get_result(num_queries) | |||
self._logger.info(result.get_log_message(self.title)) | |||
return result | |||
def copyvio_compare(self, url, min_confidence=0.75, max_time=30): | |||
"""Check the page like :py:meth:`copyvio_check` against a specific URL. | |||
This is essentially a reduced version of the above - a copyivo | |||
comparison is made using Markov chains and the result is returned in a | |||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object - | |||
but without using a search engine, since the suspected "violated" URL | |||
is supplied from the start. | |||
This is essentially a reduced version of :meth:`copyvio_check` - a | |||
copyivo comparison is made using Markov chains and the result is | |||
returned in a :class:`.CopyvioCheckResult` object - but without using a | |||
search engine, since the suspected "violated" URL is supplied from the | |||
start. | |||
Its primary use is to generate a result when the URL is retrieved from | |||
a cache, like the one used in EarwigBot's Toolserver site. After a | |||
search is done, the resulting URL is stored in a cache for 24 hours so | |||
a cache, like the one used in EarwigBot's Tool Labs site. After a | |||
search is done, the resulting URL is stored in a cache for 72 hours so | |||
future checks against that page will not require another set of | |||
time-and-money-consuming search engine queries. However, the comparison | |||
itself (which includes the article's and the source's content) cannot | |||
be stored for data retention reasons, so a fresh comparison is made | |||
using this function. | |||
Since no searching is done, neither | |||
:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor | |||
:py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised. | |||
Since no searching is done, neither :exc:`.UnknownSearchEngineError` | |||
nor :exc:`.SearchQueryError` will be raised. | |||
""" | |||
content = self.get() | |||
clean = ArticleTextParser(content).strip() | |||
article_chain = MarkovChain(clean) | |||
confidence, chains = self._copyvio_compare_content(article_chain, url) | |||
if confidence >= min_confidence: | |||
is_violation = True | |||
log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2})" | |||
self._logger.debug(log.format(self.title, confidence, url)) | |||
else: | |||
is_violation = False | |||
log = u"No violation for [[{0}]] (confidence: {1}; URL: {2})" | |||
self._logger.debug(log.format(self.title, confidence, url)) | |||
return CopyvioCheckResult(is_violation, confidence, url, 0, | |||
article_chain, chains) | |||
log = u"Starting copyvio compare for [[{0}]] against {1}" | |||
self._logger.info(log.format(self.title, url)) | |||
article = MarkovChain(ArticleTextParser(self.get()).strip()) | |||
workspace = CopyvioWorkspace( | |||
article, min_confidence, max_time, self._logger, self._addheaders, | |||
max_time, num_workers=1) | |||
workspace.enqueue([url]) | |||
workspace.wait() | |||
result = workspace.get_result() | |||
self._logger.info(result.get_log_message(self.title)) | |||
return result |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -30,13 +30,16 @@ from earwigbot import exceptions | |||
__all__ = ["ExclusionsDB"] | |||
default_sources = { | |||
DEFAULT_SOURCES = { | |||
"all": [ # Applies to all, but located on enwiki | |||
"User:EarwigBot/Copyvios/Exclusions", | |||
"User:EranBot/Copyright/Blacklist" | |||
], | |||
"enwiki": [ | |||
"Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def", | |||
"Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl", | |||
"Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr", | |||
"Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz", | |||
"User:EarwigBot/Copyvios/Exclusions" | |||
"Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz" | |||
] | |||
} | |||
@@ -72,8 +75,9 @@ class ExclusionsDB(object): | |||
""" | |||
query = "INSERT INTO sources VALUES (?, ?);" | |||
sources = [] | |||
for sitename, pages in default_sources.iteritems(): | |||
[sources.append((sitename, page)) for page in pages] | |||
for sitename, pages in DEFAULT_SOURCES.iteritems(): | |||
for page in pages: | |||
sources.append((sitename, page)) | |||
with sqlite.connect(self._dbfile) as conn: | |||
conn.executescript(script) | |||
@@ -87,25 +91,37 @@ class ExclusionsDB(object): | |||
except exceptions.PageNotFoundError: | |||
return urls | |||
if source == "User:EranBot/Copyright/Blacklist": | |||
for line in data.splitlines()[1:]: | |||
line = re.sub(r"(#|==).*$", "", line).strip() | |||
if line: | |||
urls.add("re:" + line) | |||
return urls | |||
regexes = [ | |||
"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>", | |||
"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?" | |||
r"url\s*=\s*(?:\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\</nowiki\>.*?)?\s*$", | |||
r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*?)(?:\].*?|\</nowiki\>.*?)?\s*$" | |||
] | |||
for regex in regexes: | |||
[urls.add(url.lower()) for (url,) in re.findall(regex, data, re.I)] | |||
for url in re.findall(regex, data, re.I|re.M): | |||
if url.strip(): | |||
urls.add(url.lower().strip()) | |||
return urls | |||
def _update(self, sitename): | |||
"""Update the database from listed sources in the index.""" | |||
query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;" | |||
query1 = "SELECT source_page FROM sources WHERE source_sitename = ?" | |||
query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | |||
query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?" | |||
query4 = "INSERT INTO exclusions VALUES (?, ?);" | |||
query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;" | |||
query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;" | |||
query7 = "INSERT INTO updates VALUES (?, ?);" | |||
query4 = "INSERT INTO exclusions VALUES (?, ?)" | |||
query5 = "SELECT 1 FROM updates WHERE update_sitename = ?" | |||
query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?" | |||
query7 = "INSERT INTO updates VALUES (?, ?)" | |||
site = self._sitesdb.get_site(sitename) | |||
if sitename == "all": | |||
site = self._sitesdb.get_site("enwiki") | |||
else: | |||
site = self._sitesdb.get_site(sitename) | |||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||
urls = set() | |||
for (source,) in conn.execute(query1, (sitename,)): | |||
@@ -123,7 +139,7 @@ class ExclusionsDB(object): | |||
def _get_last_update(self, sitename): | |||
"""Return the UNIX timestamp of the last time the db was updated.""" | |||
query = "SELECT update_time FROM updates WHERE update_sitename = ?;" | |||
query = "SELECT update_time FROM updates WHERE update_sitename = ?" | |||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||
try: | |||
result = conn.execute(query, (sitename,)).fetchone() | |||
@@ -132,35 +148,49 @@ class ExclusionsDB(object): | |||
return 0 | |||
return result[0] if result else 0 | |||
def sync(self, sitename): | |||
"""Update the database if it hasn't been updated in the past week. | |||
def sync(self, sitename, force=False): | |||
"""Update the database if it hasn't been updated recently. | |||
This updates the exclusions database for the site *sitename* and "all". | |||
This only updates the exclusions database for the *sitename* site. | |||
Site-specific lists are considered stale after 48 hours; global lists | |||
after 12 hours. | |||
""" | |||
max_staleness = 60 * 60 * 24 * 7 | |||
max_staleness = 60 * 60 * (12 if sitename == "all" else 48) | |||
time_since_update = int(time() - self._get_last_update(sitename)) | |||
if time_since_update > max_staleness: | |||
if force or time_since_update > max_staleness: | |||
log = u"Updating stale database: {0} (last updated {1} seconds ago)" | |||
self._logger.info(log.format(sitename, time_since_update)) | |||
self._update(sitename) | |||
else: | |||
log = u"Database for {0} is still fresh (last updated {1} seconds ago)" | |||
self._logger.debug(log.format(sitename, time_since_update)) | |||
if sitename != "all": | |||
self.sync("all", force=force) | |||
def check(self, sitename, url): | |||
"""Check whether a given URL is in the exclusions database. | |||
Return ``True`` if the URL is in the database, or ``False`` otherwise. | |||
""" | |||
normalized = re.sub("https?://", "", url.lower()) | |||
query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?" | |||
normalized = re.sub(r"^https?://(www\.)?", "", url.lower()) | |||
query = """SELECT exclusion_url FROM exclusions | |||
WHERE exclusion_sitename = ? OR exclusion_sitename = ?""" | |||
with sqlite.connect(self._dbfile) as conn, self._db_access_lock: | |||
for (excl,) in conn.execute(query, (sitename,)): | |||
for (excl,) in conn.execute(query, (sitename, "all")): | |||
if excl.startswith("*."): | |||
netloc = urlparse(url.lower()).netloc | |||
matches = True if excl[2:] in netloc else False | |||
parsed = urlparse(url.lower()) | |||
matches = excl[2:] in parsed.netloc | |||
if matches and "/" in excl: | |||
excl_path = excl[excl.index("/") + 1] | |||
matches = excl_path.startswith(parsed.path) | |||
elif excl.startswith("re:"): | |||
try: | |||
matches = re.match(excl[3:], normalized) | |||
except re.error: | |||
continue | |||
else: | |||
matches = True if normalized.startswith(excl) else False | |||
matches = normalized.startswith(excl) | |||
if matches: | |||
log = u"Exclusion detected in {0} for {1}" | |||
self._logger.debug(log.format(sitename, url)) | |||
@@ -169,3 +199,22 @@ class ExclusionsDB(object): | |||
log = u"No exclusions in {0} for {1}".format(sitename, url) | |||
self._logger.debug(log) | |||
return False | |||
def get_mirror_hints(self, sitename, try_mobile=True): | |||
"""Return a list of strings that indicate the existence of a mirror. | |||
The source parser checks for the presence of these strings inside of | |||
certain HTML tag attributes (``"href"`` and ``"src"``). | |||
""" | |||
site = self._sitesdb.get_site(sitename) | |||
base = site.domain + site._script_path | |||
roots = [base] | |||
scripts = ["index.php", "load.php", "api.php"] | |||
if try_mobile: | |||
fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", site.domain) | |||
if fragments: | |||
mobile = "{0}.m.{1}.{2}".format(*fragments.groups()) | |||
roots.append(mobile + site._script_path) | |||
return [root + "/" + script for root in roots for script in scripts] |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -23,24 +23,34 @@ | |||
from collections import defaultdict | |||
from re import sub, UNICODE | |||
__all__ = ["MarkovChain", "MarkovChainIntersection"] | |||
__all__ = ["EMPTY", "EMPTY_INTERSECTION", "MarkovChain", | |||
"MarkovChainIntersection"] | |||
class MarkovChain(object): | |||
"""Implements a basic ngram Markov chain of words.""" | |||
START = -1 | |||
END = -2 | |||
degree = 3 # 2 for bigrams, 3 for trigrams, etc. | |||
degree = 5 # 2 for bigrams, 3 for trigrams, etc. | |||
def __init__(self, text): | |||
self.text = text | |||
self.chain = defaultdict(lambda: defaultdict(lambda: 0)) | |||
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() | |||
words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split() | |||
padding = self.degree - 1 | |||
words = ([self.START] * padding) + words + ([self.END] * padding) | |||
for i in range(len(words) - self.degree + 1): | |||
last = i + self.degree - 1 | |||
self.chain[tuple(words[i:last])][words[last]] += 1 | |||
self.size = self._get_size() | |||
def _get_size(self): | |||
"""Return the size of the Markov chain: the total number of nodes.""" | |||
size = 0 | |||
for node in self.chain.itervalues(): | |||
for hits in node.itervalues(): | |||
size += hits | |||
return size | |||
def __repr__(self): | |||
"""Return the canonical string representation of the MarkovChain.""" | |||
@@ -48,15 +58,7 @@ class MarkovChain(object): | |||
def __str__(self): | |||
"""Return a nice string representation of the MarkovChain.""" | |||
return "<MarkovChain of size {0}>".format(self.size()) | |||
def size(self): | |||
"""Return the size of the Markov chain: the total number of nodes.""" | |||
count = 0 | |||
for node in self.chain.itervalues(): | |||
for hits in node.itervalues(): | |||
count += hits | |||
return count | |||
return "<MarkovChain of size {0}>".format(self.size) | |||
class MarkovChainIntersection(MarkovChain): | |||
@@ -75,6 +77,7 @@ class MarkovChainIntersection(MarkovChain): | |||
if node in nodes2: | |||
count2 = nodes2[node] | |||
self.chain[word][node] = min(count1, count2) | |||
self.size = self._get_size() | |||
def __repr__(self): | |||
"""Return the canonical string representation of the intersection.""" | |||
@@ -84,4 +87,8 @@ class MarkovChainIntersection(MarkovChain): | |||
def __str__(self): | |||
"""Return a nice string representation of the intersection.""" | |||
res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>" | |||
return res.format(self.size(), self.mc1, self.mc2) | |||
return res.format(self.size, self.mc1, self.mc2) | |||
EMPTY = MarkovChain("") | |||
EMPTY_INTERSECTION = MarkovChainIntersection(EMPTY, EMPTY) |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -21,18 +21,31 @@ | |||
# SOFTWARE. | |||
from os import path | |||
import re | |||
from StringIO import StringIO | |||
import bs4 | |||
import mwparserfromhell | |||
import nltk | |||
__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] | |||
from earwigbot import importer | |||
from earwigbot.exceptions import ParserExclusionError | |||
class BaseTextParser(object): | |||
bs4 = importer.new("bs4") | |||
nltk = importer.new("nltk") | |||
converter = importer.new("pdfminer.converter") | |||
pdfinterp = importer.new("pdfminer.pdfinterp") | |||
pdfpage = importer.new("pdfminer.pdfpage") | |||
pdftypes = importer.new("pdfminer.pdftypes") | |||
psparser = importer.new("pdfminer.psparser") | |||
__all__ = ["ArticleTextParser", "get_parser"] | |||
class _BaseTextParser(object): | |||
"""Base class for a parser that handles text.""" | |||
TYPE = None | |||
def __init__(self, text): | |||
def __init__(self, text, args=None): | |||
self.text = text | |||
self._args = args or {} | |||
def __repr__(self): | |||
"""Return the canonical string representation of the text parser.""" | |||
@@ -44,8 +57,24 @@ class BaseTextParser(object): | |||
return "<{0} of text with size {1}>".format(name, len(self.text)) | |||
class ArticleTextParser(BaseTextParser): | |||
class ArticleTextParser(_BaseTextParser): | |||
"""A parser that can strip and chunk wikicode article text.""" | |||
TYPE = "Article" | |||
TEMPLATE_MERGE_THRESHOLD = 35 | |||
    def _merge_templates(self, code):
        """Merge template contents in to wikicode when the values are long.

        *code* is a mwparserfromhell wikicode object, modified in place.
        Templates whose parameter values are at least
        TEMPLATE_MERGE_THRESHOLD characters long have those values spliced
        into the surrounding text; templates with only short parameters are
        removed entirely (they are unlikely to contribute prose).
        """
        for template in code.filter_templates(recursive=code.RECURSE_OTHERS):
            chunks = []
            for param in template.params:
                if len(param.value) >= self.TEMPLATE_MERGE_THRESHOLD:
                    # Recurse first so nested templates inside this value are
                    # themselves merged/removed before we splice it in.
                    self._merge_templates(param.value)
                    chunks.append(param.value)
            if chunks:
                subst = u" ".join(map(unicode, chunks))
                code.replace(template, u" " + subst + u" ")
            else:
                code.remove(template)
def strip(self): | |||
"""Clean the page's raw text by removing templates and formatting. | |||
@@ -58,12 +87,38 @@ class ArticleTextParser(BaseTextParser): | |||
The actual stripping is handled by :py:mod:`mwparserfromhell`. | |||
""" | |||
def remove(code, node): | |||
"""Remove a node from a code object, ignoring ValueError. | |||
Sometimes we will remove a node that contains another node we wish | |||
to remove, and we fail when we try to remove the inner one. Easiest | |||
solution is to just ignore the exception. | |||
""" | |||
try: | |||
code.remove(node) | |||
except ValueError: | |||
pass | |||
wikicode = mwparserfromhell.parse(self.text) | |||
# Preemtively strip some links mwparser doesn't know about: | |||
bad_prefixes = ("file:", "image:", "category:") | |||
for link in wikicode.filter_wikilinks(): | |||
if link.title.strip().lower().startswith(bad_prefixes): | |||
remove(wikicode, link) | |||
# Also strip references: | |||
for tag in wikicode.filter_tags(matches=lambda tag: tag.tag == "ref"): | |||
remove(wikicode, tag) | |||
# Merge in template contents when the values are long: | |||
self._merge_templates(wikicode) | |||
clean = wikicode.strip_code(normalize=True, collapse=True) | |||
self.clean = clean.replace("\n\n", "\n") # Collapse extra newlines | |||
self.clean = re.sub("\n\n+", "\n", clean).strip() | |||
return self.clean | |||
def chunk(self, nltk_dir, max_chunks, max_query=256): | |||
def chunk(self, nltk_dir, max_chunks, min_query=8, max_query=128): | |||
"""Convert the clean article text into a list of web-searchable chunks. | |||
No greater than *max_chunks* will be returned. Each chunk will only be | |||
@@ -91,6 +146,8 @@ class ArticleTextParser(BaseTextParser): | |||
while len(" ".join(words)) > max_query: | |||
words.pop() | |||
sentence = " ".join(words) | |||
if len(sentence) < min_query: | |||
continue | |||
sentences.append(sentence) | |||
if max_chunks >= len(sentences): | |||
@@ -109,30 +166,109 @@ class ArticleTextParser(BaseTextParser): | |||
else: | |||
chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3 | |||
chunks.append(chunk) | |||
return chunks | |||
def get_links(self): | |||
"""Return a list of all external links in the article. | |||
class HTMLTextParser(BaseTextParser): | |||
The list is restricted to things that we suspect we can parse: i.e., | |||
those with schemes of ``http`` and ``https``. | |||
""" | |||
schemes = ("http://", "https://") | |||
links = mwparserfromhell.parse(self.text).ifilter_external_links() | |||
return [unicode(link.url) for link in links | |||
if link.url.startswith(schemes)] | |||
class _HTMLParser(_BaseTextParser): | |||
"""A parser that can extract the text from an HTML document.""" | |||
TYPE = "HTML" | |||
hidden_tags = [ | |||
"script", "style" | |||
] | |||
def strip(self): | |||
def parse(self): | |||
"""Return the actual text contained within an HTML document. | |||
Implemented using :py:mod:`BeautifulSoup <bs4>` | |||
(http://www.crummy.com/software/BeautifulSoup/). | |||
""" | |||
try: | |||
soup = bs4.BeautifulSoup(self.text, "lxml").body | |||
soup = bs4.BeautifulSoup(self.text, "lxml") | |||
except ValueError: | |||
soup = bs4.BeautifulSoup(self.text).body | |||
soup = bs4.BeautifulSoup(self.text) | |||
if not soup.body: | |||
# No <body> tag present in HTML -> | |||
# no scrapable content (possibly JS or <frame> magic): | |||
return "" | |||
if "mirror_hints" in self._args: | |||
# Look for obvious signs that this is a mirror: | |||
func = lambda attr: attr and any( | |||
hint in attr for hint in self._args["mirror_hints"]) | |||
if soup.find_all(href=func) or soup.find_all(src=func): | |||
raise ParserExclusionError() | |||
soup = soup.body | |||
is_comment = lambda text: isinstance(text, bs4.element.Comment) | |||
[comment.extract() for comment in soup.find_all(text=is_comment)] | |||
for comment in soup.find_all(text=is_comment): | |||
comment.extract() | |||
for tag in self.hidden_tags: | |||
[element.extract() for element in soup.find_all(tag)] | |||
for element in soup.find_all(tag): | |||
element.extract() | |||
return "\n".join(soup.stripped_strings) | |||
class _PDFParser(_BaseTextParser):
    """A parser that can extract text from a PDF file."""
    TYPE = "PDF"
    # Post-extraction cleanups: form feeds (page breaks) become newlines,
    # bullet characters become spaces.
    substitutions = [
        (u"\x0c", u"\n"),
        (u"\u2022", u" "),
    ]

    def parse(self):
        """Return extracted text from the PDF.

        Uses pdfminer (loaded lazily via the module-level importer proxies).
        If pdfminer raises partway through, whatever text was extracted so
        far is returned as-is, without the substitutions applied.
        """
        output = StringIO()
        manager = pdfinterp.PDFResourceManager()
        conv = converter.TextConverter(manager, output)
        interp = pdfinterp.PDFPageInterpreter(manager, conv)

        try:
            pages = pdfpage.PDFPage.get_pages(StringIO(self.text))
            for page in pages:
                interp.process_page(page)
        except (pdftypes.PDFException, psparser.PSException, AssertionError):
            # Malformed PDF: return the partial extraction.
            return output.getvalue().decode("utf8")
        finally:
            conv.close()

        value = output.getvalue().decode("utf8")
        for orig, new in self.substitutions:
            value = value.replace(orig, new)
        # Collapse runs of blank lines into single newlines.
        return re.sub("\n\n+", "\n", value).strip()
class _PlainTextParser(_BaseTextParser):
    """A parser that can unicode-ify and strip text from a plain text page."""
    TYPE = "Text"

    def parse(self):
        """Unicode-ify and strip whitespace from the plain text document."""
        # UnicodeDammit guesses the document's encoding; unicode_markup is
        # None when the bytes cannot be decoded at all.
        markup = bs4.UnicodeDammit(self.text).unicode_markup
        if not markup:
            return ""
        return markup.strip()
#: Maps a lowercase MIME media type to the parser class that handles it.
_CONTENT_TYPES = {
    "text/html": _HTMLParser,
    "application/xhtml+xml": _HTMLParser,
    "application/pdf": _PDFParser,
    "application/x-pdf": _PDFParser,
    "text/plain": _PlainTextParser
}

def get_parser(content_type):
    """Return the parser most able to handle a given content type, or None.

    *content_type* may be a full Content-Type header value, including
    parameters (e.g. ``"text/html; charset=utf-8"``); only the media type
    itself is considered. Matching is case-insensitive and tolerant of
    surrounding whitespace, since MIME types are case-insensitive per
    RFC 2045 and real servers emit variants like ``"Text/HTML"``.
    """
    return _CONTENT_TYPES.get(content_type.split(";", 1)[0].strip().lower())
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -20,7 +20,94 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
__all__ = ["CopyvioCheckResult"] | |||
from threading import Event | |||
from time import time | |||
from earwigbot.wiki.copyvios.markov import EMPTY, EMPTY_INTERSECTION | |||
__all__ = ["CopyvioSource", "CopyvioCheckResult"] | |||
class CopyvioSource(object):
    """
    **EarwigBot: Wiki Toolset: Copyvio Source**

    A class that represents a single possible source of a copyright violation,
    i.e., a URL.

    *Attributes:*

    - :py:attr:`url`: the URL of the source
    - :py:attr:`confidence`: the confidence of a violation, between 0 and 1
    - :py:attr:`chains`: a 2-tuple of the source chain and the delta chain
    - :py:attr:`skipped`: whether this URL was skipped during the check
    - :py:attr:`excluded`: whether this URL was in the exclusions list
    """

    def __init__(self, workspace, url, headers=None, timeout=5,
                 parser_args=None):
        self.workspace = workspace
        self.url = url
        self.headers = headers
        self.timeout = timeout
        self.parser_args = parser_args
        self.confidence = 0.0
        self.chains = (EMPTY, EMPTY_INTERSECTION)
        self.skipped = False
        self.excluded = False
        # _event1 fires once work on this source has begun (or it was
        # skipped); _event2 is cleared only while a worker is busy with it.
        self._event1 = Event()
        self._event2 = Event()
        self._event2.set()

    def __repr__(self):
        """Return the canonical string representation of the source."""
        return ("CopyvioSource(url=%r, confidence=%r, skipped=%r, "
                "excluded=%r)") % (
            self.url, self.confidence, self.skipped, self.excluded)

    def __str__(self):
        """Return a nice string representation of the source."""
        if self.excluded:
            return "<CopyvioSource ({0}, excluded)>".format(self.url)
        if self.skipped:
            return "<CopyvioSource ({0}, skipped)>".format(self.url)
        return "<CopyvioSource ({0} with {1} conf)>".format(
            self.url, self.confidence)

    def start_work(self):
        """Mark this source as being worked on right now."""
        self._event2.clear()
        self._event1.set()

    def update(self, confidence, source_chain, delta_chain):
        """Fill out the confidence and chain information inside this source."""
        self.chains = (source_chain, delta_chain)
        self.confidence = confidence

    def finish_work(self):
        """Mark this source as finished."""
        self._event2.set()

    def skip(self):
        """Deactivate this source without filling in the relevant data."""
        # A no-op once work has already started on this source.
        if not self._event1.is_set():
            self.skipped = True
            self._event1.set()

    def join(self, until):
        """Block until this violation result is filled out."""
        for event in (self._event1, self._event2):
            if not until:
                event.wait()
                continue
            remaining = until - time()
            if remaining <= 0:
                return
            event.wait(remaining)
class CopyvioCheckResult(object): | |||
""" | |||
@@ -31,30 +118,57 @@ class CopyvioCheckResult(object): | |||
*Attributes:* | |||
- :py:attr:`violation`: ``True`` if this is a violation, else ``False`` | |||
- :py:attr:`confidence`: a float between 0 and 1 indicating accuracy | |||
- :py:attr:`url`: the URL of the violated page | |||
- :py:attr:`sources`: a list of CopyvioSources, sorted by confidence | |||
- :py:attr:`best`: the best matching CopyvioSource, or ``None`` | |||
- :py:attr:`confidence`: the best matching source's confidence, or 0 | |||
- :py:attr:`url`: the best matching source's URL, or ``None`` | |||
- :py:attr:`queries`: the number of queries used to reach a result | |||
- :py:attr:`time`: the amount of time the check took to complete | |||
- :py:attr:`article_chain`: the MarkovChain of the article text | |||
- :py:attr:`source_chain`: the MarkovChain of the violated page text | |||
- :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two | |||
- :py:attr:`possible_miss`: whether some URLs might have been missed | |||
""" | |||
def __init__(self, violation, confidence, url, queries, article, chains): | |||
def __init__(self, violation, sources, queries, check_time, article_chain, | |||
possible_miss): | |||
self.violation = violation | |||
self.confidence = confidence | |||
self.url = url | |||
self.sources = sources | |||
self.queries = queries | |||
self.article_chain = article | |||
self.source_chain = chains[0] | |||
self.delta_chain = chains[1] | |||
self.time = check_time | |||
self.article_chain = article_chain | |||
self.possible_miss = possible_miss | |||
def __repr__(self): | |||
"""Return the canonical string representation of the result.""" | |||
res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" | |||
return res.format(self.violation, self.confidence, self.url, | |||
self.queries) | |||
res = "CopyvioCheckResult(violation={0!r}, sources={1!r}, queries={2!r}, time={3!r})" | |||
return res.format(self.violation, self.sources, self.queries, | |||
self.time) | |||
def __str__(self): | |||
"""Return a nice string representation of the result.""" | |||
res = "<CopyvioCheckResult ({0} with {1} conf)>" | |||
return res.format(self.violation, self.confidence) | |||
res = "<CopyvioCheckResult ({0} with best {1})>" | |||
return res.format(self.violation, self.best) | |||
    @property
    def best(self):
        """The best known source, or None if no sources exist."""
        # ``sources`` is sorted best-first, so the head is the best match.
        return self.sources[0] if self.sources else None

    @property
    def confidence(self):
        """The confidence of the best source, or 0 if no sources exist."""
        return self.best.confidence if self.best else 0.0

    @property
    def url(self):
        """The URL of the best source, or None if no sources exist."""
        return self.best.url if self.best else None
    def get_log_message(self, title):
        """Build a relevant log message for this copyvio check result.

        *title* is the title of the checked page, interpolated into the
        message as a wiki link ([[title]]).
        """
        if not self.sources:
            log = u"No violation for [[{0}]] (no sources; {1} queries; {2} seconds)"
            return log.format(title, self.queries, self.time)
        log = u"{0} for [[{1}]] (best: {2} ({3} confidence); {4} sources; {5} queries; {6} seconds)"
        is_vio = "Violation detected" if self.violation else "No violation"
        return log.format(is_vio, title, self.url, self.confidence,
                          len(self.sources), self.queries, self.time)
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -20,22 +20,28 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from gzip import GzipFile | |||
from json import loads | |||
from urllib import quote_plus, urlencode | |||
import oauth2 as oauth | |||
from socket import error | |||
from StringIO import StringIO | |||
from urllib import quote | |||
from urllib2 import URLError | |||
from earwigbot import importer | |||
from earwigbot.exceptions import SearchQueryError | |||
oauth = importer.new("oauth2") | |||
__all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] | |||
class BaseSearchEngine(object): | |||
"""Base class for a simple search engine interface.""" | |||
name = "Base" | |||
def __init__(self, cred): | |||
"""Store credentials *cred* for searching later on.""" | |||
def __init__(self, cred, opener): | |||
"""Store credentials (*cred*) and *opener* for searching later on.""" | |||
self.cred = cred | |||
self.opener = opener | |||
def __repr__(self): | |||
"""Return the canonical string representation of the search engine.""" | |||
@@ -57,29 +63,51 @@ class YahooBOSSSearchEngine(BaseSearchEngine): | |||
"""A search engine interface with Yahoo! BOSS.""" | |||
name = "Yahoo! BOSS" | |||
@staticmethod | |||
def _build_url(base, params): | |||
"""Works like urllib.urlencode(), but uses %20 for spaces over +.""" | |||
enc = lambda s: quote(s.encode("utf8"), safe="") | |||
args = ["=".join((enc(k), enc(v))) for k, v in params.iteritems()] | |||
return base + "?" + "&".join(args) | |||
def search(self, query): | |||
"""Do a Yahoo! BOSS web search for *query*. | |||
Returns a list of URLs, no more than fifty, ranked by relevance (as | |||
determined by Yahoo). Raises | |||
:py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | |||
Returns a list of URLs, no more than five, ranked by relevance | |||
(as determined by Yahoo). | |||
Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. | |||
""" | |||
base_url = "http://yboss.yahooapis.com/ysearch/web" | |||
query = quote_plus(query.join('"', '"')) | |||
params = {"q": query, "type": "html,text", "format": "json"} | |||
url = "{0}?{1}".format(base_url, urlencode(params)) | |||
key, secret = self.cred["key"], self.cred["secret"] | |||
consumer = oauth.Consumer(key=key, secret=secret) | |||
url = "http://yboss.yahooapis.com/ysearch/web" | |||
params = { | |||
"oauth_version": oauth.OAUTH_VERSION, | |||
"oauth_nonce": oauth.generate_nonce(), | |||
"oauth_timestamp": oauth.Request.make_timestamp(), | |||
"oauth_consumer_key": consumer.key, | |||
"q": '"' + query.encode("utf8") + '"', "count": "5", | |||
"type": "html,text,pdf", "format": "json", | |||
} | |||
req = oauth.Request(method="GET", url=url, parameters=params) | |||
req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, None) | |||
try: | |||
response = self.opener.open(self._build_url(url, req)) | |||
result = response.read() | |||
except (URLError, error) as exc: | |||
raise SearchQueryError("Yahoo! BOSS Error: " + str(exc)) | |||
consumer = oauth.Consumer(key=self.cred["key"], | |||
secret=self.cred["secret"]) | |||
client = oauth.Client(consumer) | |||
headers, body = client.request(url, "GET") | |||
if response.headers.get("Content-Encoding") == "gzip": | |||
stream = StringIO(result) | |||
gzipper = GzipFile(fileobj=stream) | |||
result = gzipper.read() | |||
if headers["status"] != "200": | |||
if response.getcode() != 200: | |||
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" | |||
raise SearchQueryError(e.format(headers["status"], body)) | |||
raise SearchQueryError(e.format(response.getcode(), result)) | |||
try: | |||
res = loads(body) | |||
res = loads(result) | |||
except ValueError: | |||
e = "Yahoo! BOSS Error: JSON could not be decoded" | |||
raise SearchQueryError(e) | |||
@@ -0,0 +1,395 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from collections import deque | |||
from gzip import GzipFile | |||
from httplib import HTTPException | |||
from logging import getLogger | |||
from math import log | |||
from Queue import Empty, Queue | |||
from socket import error as socket_error | |||
from StringIO import StringIO | |||
from struct import error as struct_error | |||
from threading import Lock, Thread | |||
from time import time | |||
from urllib2 import build_opener, URLError | |||
from earwigbot import importer | |||
from earwigbot.exceptions import ParserExclusionError | |||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||
from earwigbot.wiki.copyvios.parsers import get_parser | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | |||
tldextract = importer.new("tldextract") | |||
__all__ = ["globalize", "localize", "CopyvioWorkspace"] | |||
_is_globalized = False | |||
_global_queues = None | |||
_global_workers = [] | |||
def globalize(num_workers=8):
    """Cause all copyvio checks to be done by one global set of workers.

    Useful when checks arrive through a web interface, where large numbers
    of simultaneous requests could otherwise each spawn their own worker
    threads. The global workers are spawned when this function is called,
    run continuously, and handle multiple checks cooperatively.

    Not thread-safe: call only while no checks are in progress. Calling it
    again after it has taken effect is a no-op.
    """
    global _is_globalized, _global_queues
    if _is_globalized:
        return

    _global_queues = _CopyvioQueues()
    for worker_num in xrange(num_workers):
        worker = _CopyvioWorker("global-{0}".format(worker_num),
                                _global_queues)
        worker.start()
        _global_workers.append(worker)
    _is_globalized = True
def localize():
    """Return to using page-specific workers for copyvio checks.

    Undoes :func:`globalize`, telling the global worker threads to stop.

    Not thread-safe: call only while no checks are in progress. A no-op if
    :func:`globalize` was never called.
    """
    global _is_globalized, _global_queues, _global_workers
    if not _is_globalized:
        return

    # Enqueue one stop sentinel per worker; each worker exits after
    # consuming exactly one.
    for _ in _global_workers:
        _global_queues.unassigned.put((StopIteration, None))
    _global_queues = None
    _global_workers = []
    _is_globalized = False
class _CopyvioQueues(object): | |||
"""Stores data necessary to maintain the various queues during a check.""" | |||
def __init__(self): | |||
self.lock = Lock() | |||
self.sites = {} | |||
self.unassigned = Queue() | |||
class _CopyvioWorker(object):
    """A multithreaded URL opener/parser instance."""

    def __init__(self, name, queues, until=None):
        # *queues* is a shared _CopyvioQueues; *until* is an optional
        # absolute deadline (epoch seconds) after which the worker stops
        # waiting for new work.
        self._name = name
        self._queues = queues
        self._until = until

        # The site queue currently assigned to this worker, if any.
        self._site = None
        self._queue = None
        self._opener = build_opener()
        self._logger = getLogger("earwigbot.wiki.cvworker." + name)

    def _open_url(self, source):
        """Open a URL and return its parsed content, or None.

        First, we will decompress the content if the headers contain "gzip" as
        its content encoding. Then, we will return the content stripped using
        an HTML parser if the headers indicate it is HTML, or return the
        content directly if it is plain text. If we don't understand the
        content type, we'll return None.

        If a URLError was raised while opening the URL or an IOError was raised
        while decompressing, None will be returned.
        """
        if source.headers:
            self._opener.addheaders = source.headers
        url = source.url.encode("utf8")
        try:
            response = self._opener.open(url, timeout=source.timeout)
        except (URLError, HTTPException, socket_error):
            return None
        try:
            size = int(response.headers.get("Content-Length", 0))
        except ValueError:
            return None

        content_type = response.headers.get("Content-Type", "text/plain")
        handler = get_parser(content_type)
        if not handler:
            return None
        # Size cap from Content-Length: 15 MiB for PDFs, 2 MiB otherwise.
        if size > (15 if handler.TYPE == "PDF" else 2) * 1024 ** 2:
            return None

        try:
            content = response.read()
        except (URLError, socket_error):
            return None

        if response.headers.get("Content-Encoding") == "gzip":
            stream = StringIO(content)
            gzipper = GzipFile(fileobj=stream)
            try:
                content = gzipper.read()
            except (IOError, struct_error):
                return None

        return handler(content, source.parser_args).parse()

    def _acquire_new_site(self):
        """Block for a new unassigned site queue.

        Raises Queue.Empty on deadline expiry and StopIteration when a stop
        sentinel is consumed.
        """
        if self._until:
            timeout = self._until - time()
            if timeout <= 0:
                raise Empty
        else:
            timeout = None

        self._logger.debug("Waiting for new site queue")
        site, queue = self._queues.unassigned.get(timeout=timeout)
        if site is StopIteration:
            raise StopIteration
        self._logger.debug(u"Acquired new site queue: {0}".format(site))
        self._site = site
        self._queue = queue

    def _dequeue(self):
        """Remove a source from one of the queues.

        Recurses (after releasing the lock) when the current site queue is
        exhausted or the next source was skipped, until a workable source
        is found.
        """
        if not self._site:
            self._acquire_new_site()

        logmsg = u"Fetching source URL from queue {0}"
        self._logger.debug(logmsg.format(self._site))
        self._queues.lock.acquire()
        try:
            source = self._queue.popleft()
        except IndexError:
            # Site queue exhausted: unregister it and acquire a new one.
            self._logger.debug("Queue is empty")
            del self._queues.sites[self._site]
            self._site = None
            self._queue = None
            self._queues.lock.release()
            return self._dequeue()

        self._logger.debug(u"Got source URL: {0}".format(source.url))
        if source.skipped:
            self._logger.debug("Source has been skipped")
            self._queues.lock.release()
            return self._dequeue()

        # Mark the source busy while we still hold the lock, so it cannot
        # be skipped out from under us.
        source.start_work()
        self._queues.lock.release()
        return source

    def _run(self):
        """Main entry point for the worker thread.

        We will keep fetching URLs from the queues and handling them until
        either we run out of time, or we get an exit signal that the queue is
        now empty.
        """
        while True:
            try:
                source = self._dequeue()
            except Empty:
                self._logger.debug("Exiting: queue timed out")
                return
            except StopIteration:
                self._logger.debug("Exiting: got stop signal")
                return
            try:
                text = self._open_url(source)
            except ParserExclusionError:
                # The content parser flagged this page as a mirror.
                self._logger.debug("Source excluded by content parser")
                source.skipped = source.excluded = True
                source.finish_work()
            else:
                chain = MarkovChain(text) if text else None
                source.workspace.compare(source, chain)

    def start(self):
        """Start the copyvio worker in a new thread."""
        thread = Thread(target=self._run, name="cvworker-" + self._name)
        # Daemonized so a hung worker never blocks interpreter shutdown.
        thread.daemon = True
        thread.start()
class CopyvioWorkspace(object): | |||
"""Manages a single copyvio check distributed across threads.""" | |||
    def __init__(self, article, min_confidence, max_time, logger, headers,
                 url_timeout=5, num_workers=8, short_circuit=True,
                 parser_args=None):
        # *article* is the MarkovChain of the article text being checked;
        # *max_time* <= 0 means no deadline.
        self.sources = []
        self.finished = False
        self.possible_miss = False

        self._article = article
        self._logger = logger.getChild("copyvios")
        self._min_confidence = min_confidence
        self._start_time = time()
        self._until = (self._start_time + max_time) if max_time > 0 else None
        self._handled_urls = set()
        self._finish_lock = Lock()
        self._short_circuit = short_circuit
        self._source_args = {
            "workspace": self, "headers": headers, "timeout": url_timeout,
            "parser_args": parser_args}

        # Use the shared global workers if globalize() was called; otherwise
        # spawn a private set of workers for this check alone.
        if _is_globalized:
            self._queues = _global_queues
        else:
            self._queues = _CopyvioQueues()
            self._num_workers = num_workers
            for i in xrange(num_workers):
                name = "local-{0:04}.{1}".format(id(self) % 10000, i)
                _CopyvioWorker(name, self._queues, self._until).start()
    def _calculate_confidence(self, delta):
        """Return the confidence of a violation as a float between 0 and 1.

        *delta* is the MarkovChainIntersection of the article and a source.
        The result is the larger of two empirical heuristics: one using the
        ratio of delta size to article size, and one using the absolute
        delta size alone.
        """
        def conf_with_article_and_delta(article, delta):
            """Calculate confidence using the article and delta chain sizes."""
            # This piecewise function exhibits exponential growth until it
            # reaches the default "suspect" confidence threshold, at which
            # point it transitions to polynomial growth with a limit of 1 as
            # (delta / article) approaches 1.
            # A graph can be viewed here: http://goo.gl/mKPhvr
            ratio = delta / article
            if ratio <= 0.52763:
                return -log(1 - ratio)
            else:
                return (-0.8939 * (ratio ** 2)) + (1.8948 * ratio) - 0.0009

        def conf_with_delta(delta):
            """Calculate confidence using just the delta chain size."""
            # This piecewise function was derived from experimental data using
            # reference points at (0, 0), (100, 0.5), (250, 0.75), (500, 0.9),
            # and (1000, 0.95), with a limit of 1 as delta approaches infinity.
            # A graph can be viewed here: http://goo.gl/lVl7or
            if delta <= 100:
                return delta / (delta + 100)
            elif delta <= 250:
                return (delta - 25) / (delta + 50)
            elif delta <= 500:
                return (10.5 * delta - 750) / (10 * delta)
            else:
                return (delta - 50) / delta

        d_size = float(delta.size)
        return abs(max(conf_with_article_and_delta(self._article.size, d_size),
                       conf_with_delta(d_size)))
    def _finish_early(self):
        """Finish handling links prematurely (if we've hit min_confidence)."""
        self._logger.debug("Confidence threshold met; skipping remaining sources")
        # Holding the queue lock ensures no worker claims a source while we
        # are skipping it (CopyvioSource.skip is a no-op on busy sources).
        with self._queues.lock:
            for source in self.sources:
                source.skip()
            self.finished = True
    def enqueue(self, urls, exclude_check=None):
        """Put a list of URLs into the various worker queues.

        *exclude_check* is an optional exclusion function that takes a URL and
        returns ``True`` if we should skip it and ``False`` otherwise.
        """
        for url in urls:
            with self._queues.lock:
                # Deduplicate across multiple enqueue() calls.
                if url in self._handled_urls:
                    continue
                self._handled_urls.add(url)

                source = CopyvioSource(url=url, **self._source_args)
                self.sources.append(source)

                if exclude_check and exclude_check(url):
                    self._logger.debug(u"enqueue(): exclude {0}".format(url))
                    source.excluded = True
                    source.skip()
                    continue
                if self._short_circuit and self.finished:
                    self._logger.debug(u"enqueue(): auto-skip {0}".format(url))
                    source.skip()
                    continue

                # Key each URL by its registered domain so that one worker
                # handles each site serially, avoiding concurrent requests
                # to the same host. tldextract is a lazy import and may
                # raise ImportError on first use.
                try:
                    key = tldextract.extract(url).registered_domain
                except ImportError:  # Fall back on very naive method
                    from urlparse import urlparse
                    key = u".".join(urlparse(url).netloc.split(".")[-2:])

                logmsg = u"enqueue(): {0} {1} -> {2}"
                if key in self._queues.sites:
                    self._logger.debug(logmsg.format("append", key, url))
                    self._queues.sites[key].append(source)
                else:
                    self._logger.debug(logmsg.format("new", key, url))
                    self._queues.sites[key] = queue = deque()
                    queue.append(source)
                    self._queues.unassigned.put((key, queue))
def compare(self, source, source_chain): | |||
"""Compare a source to the article; call _finish_early if necessary.""" | |||
if source_chain: | |||
delta = MarkovChainIntersection(self._article, source_chain) | |||
conf = self._calculate_confidence(delta) | |||
else: | |||
conf = 0.0 | |||
self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf)) | |||
with self._finish_lock: | |||
if source_chain: | |||
source.update(conf, source_chain, delta) | |||
source.finish_work() | |||
if not self.finished and conf >= self._min_confidence: | |||
if self._short_circuit: | |||
self._finish_early() | |||
else: | |||
self.finished = True | |||
def wait(self): | |||
"""Wait for the workers to finish handling the sources.""" | |||
self._logger.debug("Waiting on {0} sources".format(len(self.sources))) | |||
for source in self.sources: | |||
source.join(self._until) | |||
with self._finish_lock: | |||
pass # Wait for any remaining comparisons to be finished | |||
if not _is_globalized: | |||
for i in xrange(self._num_workers): | |||
self._queues.unassigned.put((StopIteration, None)) | |||
def get_result(self, num_queries=0): | |||
"""Return a CopyvioCheckResult containing the results of this check.""" | |||
def cmpfunc(s1, s2): | |||
if s2.confidence != s1.confidence: | |||
return 1 if s2.confidence > s1.confidence else -1 | |||
if s2.excluded != s1.excluded: | |||
return 1 if s1.excluded else -1 | |||
return int(s1.skipped) - int(s2.skipped) | |||
self.sources.sort(cmpfunc) | |||
return CopyvioCheckResult(self.finished, self.sources, num_queries, | |||
time() - self._start_time, self._article, | |||
self.possible_miss) |
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -50,6 +50,7 @@ class Page(CopyvioMixIn): | |||
- :py:attr:`pageid`: an integer ID representing the page | |||
- :py:attr:`url`: the page's URL | |||
- :py:attr:`namespace`: the page's namespace as an integer | |||
- :py:attr:`lastrevid`: the ID of the page's most recent revision | |||
- :py:attr:`protection`: the page's current protection status | |||
- :py:attr:`is_talkpage`: ``True`` if this is a talkpage, else ``False`` | |||
- :py:attr:`is_redirect`: ``True`` if this is a redirect, else ``False`` | |||
@@ -116,7 +117,6 @@ class Page(CopyvioMixIn): | |||
self._creator = None | |||
# Attributes used for editing/deleting/protecting/etc: | |||
self._token = None | |||
self._basetimestamp = None | |||
self._starttimestamp = None | |||
@@ -199,21 +199,25 @@ class Page(CopyvioMixIn): | |||
"""Load various data from the API in a single query. | |||
Loads self._title, ._exists, ._is_redirect, ._pageid, ._fullurl, | |||
._protection, ._namespace, ._is_talkpage, ._creator, ._lastrevid, | |||
._token, and ._starttimestamp using the API. It will do a query of | |||
its own unless *result* is provided, in which case we'll pretend | |||
*result* is what the query returned. | |||
._protection, ._namespace, ._is_talkpage, ._creator, ._lastrevid, and | |||
._starttimestamp using the API. It will do a query of its own unless | |||
*result* is provided, in which case we'll pretend *result* is what the | |||
query returned. | |||
Assuming the API is sound, this should not raise any exceptions. | |||
""" | |||
if not result: | |||
query = self.site.api_query | |||
result = query(action="query", rvprop="user", intoken="edit", | |||
prop="info|revisions", rvlimit=1, rvdir="newer", | |||
titles=self._title, inprop="protection|url") | |||
result = query(action="query", prop="info|revisions", | |||
inprop="protection|url", rvprop="user", rvlimit=1, | |||
rvdir="newer", titles=self._title) | |||
res = result["query"]["pages"].values()[0] | |||
if "interwiki" in result["query"]: | |||
self._title = result["query"]["interwiki"][0]["title"] | |||
self._exists = self.PAGE_INVALID | |||
return | |||
res = result["query"]["pages"].values()[0] | |||
self._title = res["title"] # Normalize our pagename/title | |||
self._is_redirect = "redirect" in res | |||
@@ -233,13 +237,7 @@ class Page(CopyvioMixIn): | |||
self._fullurl = res["fullurl"] | |||
self._protection = res["protection"] | |||
try: | |||
self._token = res["edittoken"] | |||
except KeyError: | |||
pass | |||
else: | |||
self._starttimestamp = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime()) | |||
self._starttimestamp = strftime("%Y-%m-%dT%H:%M:%SZ", gmtime()) | |||
# We've determined the namespace and talkpage status in __init__() | |||
# based on the title, but now we can be sure: | |||
@@ -280,8 +278,7 @@ class Page(CopyvioMixIn): | |||
self._assert_existence() | |||
def _edit(self, params=None, text=None, summary=None, minor=None, bot=None, | |||
force=None, section=None, captcha_id=None, captcha_word=None, | |||
tries=0): | |||
force=None, section=None, captcha_id=None, captcha_word=None): | |||
"""Edit the page! | |||
If *params* is given, we'll use it as our API query parameters. | |||
@@ -292,13 +289,6 @@ class Page(CopyvioMixIn): | |||
in _handle_edit_errors(). We'll then throw these back as subclasses of | |||
EditError. | |||
""" | |||
# Try to get our edit token, and die if we can't: | |||
if not self._token: | |||
self._load_attributes() | |||
if not self._token: | |||
e = "You don't have permission to edit this page." | |||
raise exceptions.PermissionsError(e) | |||
# Weed out invalid pages before we get too far: | |||
self._assert_validity() | |||
@@ -307,7 +297,7 @@ class Page(CopyvioMixIn): | |||
params = self._build_edit_params(text, summary, minor, bot, force, | |||
section, captcha_id, captcha_word) | |||
else: # Make sure we have the right token: | |||
params["token"] = self._token | |||
params["token"] = self.site.get_token() | |||
# Try the API query, catching most errors with our handler: | |||
try: | |||
@@ -315,7 +305,7 @@ class Page(CopyvioMixIn): | |||
except exceptions.APIError as error: | |||
if not hasattr(error, "code"): | |||
raise # We can only handle errors with a code attribute | |||
result = self._handle_edit_errors(error, params, tries) | |||
result = self._handle_edit_errors(error, params) | |||
# If everything was successful, reset invalidated attributes: | |||
if result["edit"]["result"] == "Success": | |||
@@ -324,21 +314,17 @@ class Page(CopyvioMixIn): | |||
self._exists = self.PAGE_UNKNOWN | |||
return | |||
# If we're here, then the edit failed. If it's because of AssertEdit, | |||
# handle that. Otherwise, die - something odd is going on: | |||
try: | |||
assertion = result["edit"]["assert"] | |||
except KeyError: | |||
raise exceptions.EditError(result["edit"]) | |||
self._handle_assert_edit(assertion, params, tries) | |||
# Otherwise, there was some kind of problem. Throw an exception: | |||
raise exceptions.EditError(result["edit"]) | |||
def _build_edit_params(self, text, summary, minor, bot, force, section, | |||
captcha_id, captcha_word): | |||
"""Given some keyword arguments, build an API edit query string.""" | |||
unitxt = text.encode("utf8") if isinstance(text, unicode) else text | |||
hashed = md5(unitxt).hexdigest() # Checksum to ensure text is correct | |||
params = {"action": "edit", "title": self._title, "text": text, | |||
"token": self._token, "summary": summary, "md5": hashed} | |||
params = { | |||
"action": "edit", "title": self._title, "text": text, | |||
"token": self.site.get_token(), "summary": summary, "md5": hashed} | |||
if section: | |||
params["section"] = section | |||
@@ -353,7 +339,8 @@ class Page(CopyvioMixIn): | |||
params["bot"] = "true" | |||
if not force: | |||
params["starttimestamp"] = self._starttimestamp | |||
if self._starttimestamp: | |||
params["starttimestamp"] = self._starttimestamp | |||
if self._basetimestamp: | |||
params["basetimestamp"] = self._basetimestamp | |||
if self._exists == self.PAGE_MISSING: | |||
@@ -364,93 +351,42 @@ class Page(CopyvioMixIn): | |||
return params | |||
def _handle_edit_errors(self, error, params, tries): | |||
def _handle_edit_errors(self, error, params, retry=True): | |||
"""If our edit fails due to some error, try to handle it. | |||
We'll either raise an appropriate exception (for example, if the page | |||
is protected), or we'll try to fix it (for example, if we can't edit | |||
due to being logged out, we'll try to log in). | |||
is protected), or we'll try to fix it (for example, if the token is | |||
invalid, we'll try to get a new one). | |||
""" | |||
if error.code in ["noedit", "cantcreate", "protectedtitle", | |||
"noimageredirect"]: | |||
perms = ["noedit", "noedit-anon", "cantcreate", "cantcreate-anon", | |||
"protectedtitle", "noimageredirect", "noimageredirect-anon", | |||
"blocked"] | |||
if error.code in perms: | |||
raise exceptions.PermissionsError(error.info) | |||
elif error.code in ["noedit-anon", "cantcreate-anon", | |||
"noimageredirect-anon"]: | |||
if not all(self.site._login_info): | |||
# Insufficient login info: | |||
raise exceptions.PermissionsError(error.info) | |||
if tries == 0: | |||
# We have login info; try to login: | |||
self.site._login(self.site._login_info) | |||
self._token = None # Need a new token; old one is invalid now | |||
return self._edit(params=params, tries=1) | |||
else: | |||
# We already tried to log in and failed! | |||
e = "Although we should be logged in, we are not. This may be a cookie problem or an odd bug." | |||
raise exceptions.LoginError(e) | |||
elif error.code in ["editconflict", "pagedeleted", "articleexists"]: | |||
# These attributes are now invalidated: | |||
self._content = None | |||
self._basetimestamp = None | |||
self._exists = self.PAGE_UNKNOWN | |||
raise exceptions.EditConflictError(error.info) | |||
elif error.code == "badtoken" and retry: | |||
params["token"] = self.site.get_token(force=True) | |||
try: | |||
return self.site.api_query(**params) | |||
except exceptions.APIError as err: | |||
if not hasattr(err, "code"): | |||
raise # We can only handle errors with a code attribute | |||
return self._handle_edit_errors(err, params, retry=False) | |||
elif error.code in ["emptypage", "emptynewsection"]: | |||
raise exceptions.NoContentError(error.info) | |||
elif error.code == "contenttoobig": | |||
raise exceptions.ContentTooBigError(error.info) | |||
elif error.code == "spamdetected": | |||
raise exceptions.SpamDetectedError(error.info) | |||
elif error.code == "filtered": | |||
raise exceptions.FilteredError(error.info) | |||
raise exceptions.EditError(": ".join((error.code, error.info))) | |||
def _handle_assert_edit(self, assertion, params, tries): | |||
"""If we can't edit due to a failed AssertEdit assertion, handle that. | |||
If the assertion was 'user' and we have valid login information, try to | |||
log in. Otherwise, raise PermissionsError with details. | |||
""" | |||
if assertion == "user": | |||
if not all(self.site._login_info): | |||
# Insufficient login info: | |||
e = "AssertEdit: user assertion failed, and no login info was provided." | |||
raise exceptions.PermissionsError(e) | |||
if tries == 0: | |||
# We have login info; try to login: | |||
self.site._login(self.site._login_info) | |||
self._token = None # Need a new token; old one is invalid now | |||
return self._edit(params=params, tries=1) | |||
else: | |||
# We already tried to log in and failed! | |||
e = "Although we should be logged in, we are not. This may be a cookie problem or an odd bug." | |||
raise exceptions.LoginError(e) | |||
elif assertion == "bot": | |||
if not all(self.site._login_info): | |||
# Insufficient login info: | |||
e = "AssertEdit: bot assertion failed, and no login info was provided." | |||
raise exceptions.PermissionsError(e) | |||
if tries == 0: | |||
# Try to log in if we got logged out: | |||
self.site._login(self.site._login_info) | |||
self._token = None # Need a new token; old one is invalid now | |||
return self._edit(params=params, tries=1) | |||
else: | |||
# We already tried to log in, so we don't have a bot flag: | |||
e = "AssertEdit: bot assertion failed: we don't have a bot flag!" | |||
raise exceptions.PermissionsError(e) | |||
# Unknown assertion, maybe "true", "false", or "exists": | |||
e = "AssertEdit: assertion '{0}' failed.".format(assertion) | |||
raise exceptions.PermissionsError(e) | |||
@property | |||
def site(self): | |||
"""The page's corresponding Site object.""" | |||
@@ -530,6 +466,19 @@ class Page(CopyvioMixIn): | |||
return self._namespace | |||
@property | |||
def lastrevid(self): | |||
"""The ID of the page's most recent revision. | |||
Raises :py:exc:`~earwigbot.exceptions.InvalidPageError` or | |||
:py:exc:`~earwigbot.exceptions.PageNotFoundError` if the page name is | |||
invalid or the page does not exist, respectively. | |||
""" | |||
if self._exists == self.PAGE_UNKNOWN: | |||
self._load() | |||
self._assert_existence() # Missing pages don't have revisions | |||
return self._lastrevid | |||
@property | |||
def protection(self): | |||
"""The page's current protection status. | |||
@@ -633,7 +582,7 @@ class Page(CopyvioMixIn): | |||
query = self.site.api_query | |||
result = query(action="query", rvlimit=1, titles=self._title, | |||
prop="info|revisions", inprop="protection|url", | |||
intoken="edit", rvprop="content|timestamp") | |||
rvprop="content|timestamp") | |||
self._load_attributes(result=result) | |||
self._assert_existence() | |||
self._load_content(result=result) | |||
@@ -666,7 +615,7 @@ class Page(CopyvioMixIn): | |||
:py:exc:`~earwigbot.exceptions.RedirectError` if the page is not a | |||
redirect. | |||
""" | |||
re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]" | |||
re_redirect = r"^\s*\#\s*redirect\s*\[\[(.*?)\]\]" | |||
content = self.get() | |||
try: | |||
return re.findall(re_redirect, content, flags=re.I)[0] | |||
@@ -765,7 +714,7 @@ class Page(CopyvioMixIn): | |||
username = username.lower() | |||
optouts = [optout.lower() for optout in optouts] if optouts else [] | |||
r_bots = "\{\{\s*(no)?bots\s*(\||\}\})" | |||
r_bots = r"\{\{\s*(no)?bots\s*(\||\}\})" | |||
filter = self.parse().ifilter_templates(recursive=True, matches=r_bots) | |||
for template in filter: | |||
if template.has_param("deny"): | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -26,20 +26,20 @@ from json import loads | |||
from logging import getLogger, NullHandler | |||
from os.path import expanduser | |||
from StringIO import StringIO | |||
from threading import Lock | |||
from threading import RLock | |||
from time import sleep, time | |||
from urllib import quote_plus, unquote_plus | |||
from urllib2 import build_opener, HTTPCookieProcessor, URLError | |||
from urlparse import urlparse | |||
import oursql | |||
from earwigbot import exceptions | |||
from earwigbot import exceptions, importer | |||
from earwigbot.wiki import constants | |||
from earwigbot.wiki.category import Category | |||
from earwigbot.wiki.page import Page | |||
from earwigbot.wiki.user import User | |||
oursql = importer.new("oursql") | |||
__all__ = ["Site"] | |||
class Site(object): | |||
@@ -73,6 +73,7 @@ class Site(object): | |||
- :py:meth:`sql_query`: does an SQL query and yields its results | |||
- :py:meth:`get_maxlag`: returns the internal database lag | |||
- :py:meth:`get_replag`: estimates the external database lag | |||
- :py:meth:`get_token`: gets a token for a specific API action | |||
- :py:meth:`namespace_id_to_name`: returns names associated with an NS id | |||
- :py:meth:`namespace_name_to_id`: returns the ID associated with a NS name | |||
- :py:meth:`get_page`: returns a Page for the given title | |||
@@ -82,11 +83,13 @@ class Site(object): | |||
""" | |||
SERVICE_API = 1 | |||
SERVICE_SQL = 2 | |||
SPECIAL_TOKENS = ["deleteglobalaccount", "patrol", "rollback", | |||
"setglobalaccountstatus", "userrights", "watch"] | |||
def __init__(self, name=None, project=None, lang=None, base_url=None, | |||
article_path=None, script_path=None, sql=None, | |||
namespaces=None, login=(None, None), cookiejar=None, | |||
user_agent=None, use_https=False, assert_edit=None, | |||
user_agent=None, use_https=True, assert_edit=None, | |||
maxlag=None, wait_between_queries=2, logger=None, | |||
search_config=None): | |||
"""Constructor for new Site instances. | |||
@@ -123,7 +126,8 @@ class Site(object): | |||
self._wait_between_queries = wait_between_queries | |||
self._max_retries = 6 | |||
self._last_query_time = 0 | |||
self._api_lock = Lock() | |||
self._tokens = {} | |||
self._api_lock = RLock() | |||
self._api_info_cache = {"maxlag": 0, "lastcheck": 0} | |||
# Attributes used for SQL queries: | |||
@@ -132,7 +136,7 @@ class Site(object): | |||
else: | |||
self._sql_data = {} | |||
self._sql_conn = None | |||
self._sql_lock = Lock() | |||
self._sql_lock = RLock() | |||
self._sql_info_cache = {"replag": 0, "lastcheck": 0, "usable": None} | |||
# Attribute used in copyright violation checks (see CopyrightMixIn): | |||
@@ -209,11 +213,13 @@ class Site(object): | |||
args.append(key + "=" + val) | |||
return "&".join(args) | |||
def _api_query(self, params, tries=0, wait=5, ignore_maxlag=False): | |||
def _api_query(self, params, tries=0, wait=5, ignore_maxlag=False, | |||
no_assert=False, ae_retry=True): | |||
"""Do an API query with *params* as a dict of parameters. | |||
See the documentation for :py:meth:`api_query` for full implementation | |||
details. | |||
details. *tries*, *wait*, and *ignore_maxlag* are for maxlag; | |||
*no_assert* and *ae_retry* are for AssertEdit. | |||
""" | |||
since_last_query = time() - self._last_query_time # Throttling support | |||
if since_last_query < self._wait_between_queries: | |||
@@ -223,7 +229,7 @@ class Site(object): | |||
sleep(wait_time) | |||
self._last_query_time = time() | |||
url, data = self._build_api_query(params, ignore_maxlag) | |||
url, data = self._build_api_query(params, ignore_maxlag, no_assert) | |||
if "lgpassword" in params: | |||
self._logger.debug("{0} -> <hidden>".format(url)) | |||
else: | |||
@@ -247,26 +253,42 @@ class Site(object): | |||
gzipper = GzipFile(fileobj=stream) | |||
result = gzipper.read() | |||
return self._handle_api_query_result(result, params, tries, wait) | |||
return self._handle_api_result(result, params, tries, wait, ae_retry) | |||
def _request_csrf_token(self, params): | |||
"""If possible, add a request for a CSRF token to an API query.""" | |||
if params.get("action") == "query": | |||
if params.get("meta"): | |||
if "tokens" not in params["meta"].split("|"): | |||
params["meta"] += "|tokens" | |||
else: | |||
params["meta"] = "tokens" | |||
if params.get("type"): | |||
if "csrf" not in params["type"].split("|"): | |||
params["type"] += "|csrf" | |||
def _build_api_query(self, params, ignore_maxlag): | |||
def _build_api_query(self, params, ignore_maxlag, no_assert): | |||
"""Given API query params, return the URL to query and POST data.""" | |||
if not self._base_url or self._script_path is None: | |||
e = "Tried to do an API query, but no API URL is known." | |||
raise exceptions.APIError(e) | |||
url = ''.join((self.url, self._script_path, "/api.php")) | |||
url = self.url + self._script_path + "/api.php" | |||
params["format"] = "json" # This is the only format we understand | |||
if self._assert_edit: # If requested, ensure that we're logged in | |||
if self._assert_edit and not no_assert: | |||
# If requested, ensure that we're logged in | |||
params["assert"] = self._assert_edit | |||
if self._maxlag and not ignore_maxlag: | |||
# If requested, don't overload the servers: | |||
params["maxlag"] = self._maxlag | |||
if "csrf" not in self._tokens: | |||
# If we don't have a CSRF token, try to fetch one: | |||
self._request_csrf_token(params) | |||
data = self._urlencode_utf8(params) | |||
return url, data | |||
def _handle_api_query_result(self, result, params, tries, wait): | |||
def _handle_api_result(self, result, params, tries, wait, ae_retry): | |||
"""Given the result of an API query, attempt to return useful data.""" | |||
try: | |||
res = loads(result) # Try to parse as a JSON object | |||
@@ -277,8 +299,11 @@ class Site(object): | |||
try: | |||
code = res["error"]["code"] | |||
info = res["error"]["info"] | |||
except (TypeError, KeyError): # Having these keys indicates a problem | |||
return res # All is well; return the decoded JSON | |||
except (TypeError, KeyError): # If there's no error code/info, return | |||
if "query" in res and "tokens" in res["query"]: | |||
for name, token in res["query"]["tokens"].iteritems(): | |||
self._tokens[name.split("token")[0]] = token | |||
return res | |||
if code == "maxlag": # We've been throttled by the server | |||
if tries >= self._max_retries: | |||
@@ -288,7 +313,21 @@ class Site(object): | |||
msg = 'Server says "{0}"; retrying in {1} seconds ({2}/{3})' | |||
self._logger.info(msg.format(info, wait, tries, self._max_retries)) | |||
sleep(wait) | |||
return self._api_query(params, tries=tries, wait=wait*2) | |||
return self._api_query(params, tries, wait * 2, ae_retry=ae_retry) | |||
elif code in ["assertuserfailed", "assertbotfailed"]: # AssertEdit | |||
if ae_retry and all(self._login_info): | |||
# Try to log in if we got logged out: | |||
self._login(self._login_info) | |||
if "token" in params: # Fetch a new one; this is invalid now | |||
params["token"] = self.get_token(params["action"]) | |||
return self._api_query(params, tries, wait, ae_retry=False) | |||
if not all(self._login_info): | |||
e = "Assertion failed, and no login info was provided." | |||
elif code == "assertbotfailed": | |||
e = "Bot assertion failed: we don't have a bot flag!" | |||
else: | |||
e = "User assertion failed due to an unknown issue. Cookie problem?" | |||
raise exceptions.PermissionsError("AssertEdit: " + e) | |||
else: # Some unknown error occurred | |||
e = 'API query failed: got error "{0}"; server says: "{1}".' | |||
error = exceptions.APIError(e.format(code, info)) | |||
@@ -308,18 +347,20 @@ class Site(object): | |||
# All attributes to be loaded, except _namespaces, which is a special | |||
# case because it requires additional params in the API query: | |||
attrs = [self._name, self._project, self._lang, self._base_url, | |||
self._article_path, self._script_path] | |||
self._article_path, self._script_path] | |||
params = {"action": "query", "meta": "siteinfo", "siprop": "general"} | |||
if not self._namespaces or force: | |||
params["siprop"] += "|namespaces|namespacealiases" | |||
result = self.api_query(**params) | |||
with self._api_lock: | |||
result = self._api_query(params, no_assert=True) | |||
self._load_namespaces(result) | |||
elif all(attrs): # Everything is already specified and we're not told | |||
return # to force a reload, so do nothing | |||
else: # We're only loading attributes other than _namespaces | |||
result = self.api_query(**params) | |||
with self._api_lock: | |||
result = self._api_query(params, no_assert=True) | |||
res = result["query"]["general"] | |||
self._name = res["wikiid"] | |||
@@ -465,13 +506,14 @@ class Site(object): | |||
from our first request, and *attempt* is to prevent getting stuck in a | |||
loop if MediaWiki isn't acting right. | |||
""" | |||
self._tokens.clear() | |||
name, password = login | |||
params = {"action": "login", "lgname": name, "lgpassword": password} | |||
if token: | |||
result = self.api_query(action="login", lgname=name, | |||
lgpassword=password, lgtoken=token) | |||
else: | |||
result = self.api_query(action="login", lgname=name, | |||
lgpassword=password) | |||
params["lgtoken"] = token | |||
with self._api_lock: | |||
result = self._api_query(params, no_assert=True) | |||
res = result["login"]["result"] | |||
if res == "Success": | |||
@@ -514,24 +556,23 @@ class Site(object): | |||
may raise its own exceptions (e.g. oursql.InterfaceError) if it cannot | |||
establish a connection. | |||
""" | |||
if not oursql: | |||
e = "Module 'oursql' is required for SQL queries." | |||
raise exceptions.SQLError(e) | |||
args = self._sql_data | |||
for key, value in kwargs.iteritems(): | |||
args[key] = value | |||
if "read_default_file" not in args and "user" not in args and "passwd" not in args: | |||
args["read_default_file"] = expanduser("~/.my.cnf") | |||
elif "read_default_file" in args: | |||
args["read_default_file"] = expanduser(args["read_default_file"]) | |||
if "autoping" not in args: | |||
args["autoping"] = True | |||
if "autoreconnect" not in args: | |||
args["autoreconnect"] = True | |||
self._sql_conn = oursql.connect(**args) | |||
try: | |||
self._sql_conn = oursql.connect(**args) | |||
except ImportError: | |||
e = "SQL querying requires the 'oursql' package: http://packages.python.org/oursql/" | |||
raise exceptions.SQLError(e) | |||
def _get_service_order(self): | |||
"""Return a preferred order for using services (e.g. the API and SQL). | |||
@@ -639,13 +680,13 @@ class Site(object): | |||
query until we exceed :py:attr:`self._max_retries`. | |||
There is helpful MediaWiki API documentation at `MediaWiki.org | |||
<http://www.mediawiki.org/wiki/API>`_. | |||
<https://www.mediawiki.org/wiki/API>`_. | |||
""" | |||
with self._api_lock: | |||
return self._api_query(kwargs) | |||
def sql_query(self, query, params=(), plain_query=False, dict_cursor=False, | |||
cursor_class=None, show_table=False): | |||
cursor_class=None, show_table=False, buffsize=1024): | |||
"""Do an SQL query and yield its results. | |||
If *plain_query* is ``True``, we will force an unparameterized query. | |||
@@ -656,6 +697,13 @@ class Site(object): | |||
is True, the name of the table will be prepended to the name of the | |||
column. This will mainly affect an :py:class:`~oursql.DictCursor`. | |||
*buffsize* is the size of each memory-buffered group of results, to | |||
reduce the number of conversations with the database; it is passed to | |||
:py:meth:`cursor.fetchmany() <oursql.Cursor.fetchmany>`. If set to | |||
``0``, all results will be buffered in memory at once (this uses
:py:meth:`fetchall() <oursql.Cursor.fetchall>`). If set to ``1``, it is | |||
equivalent to using :py:meth:`fetchone() <oursql.Cursor.fetchone>`. | |||
Example usage:: | |||
>>> query = "SELECT user_id, user_registration FROM user WHERE user_name = ?" | |||
@@ -688,7 +736,14 @@ class Site(object): | |||
self._sql_connect() | |||
with self._sql_conn.cursor(klass, show_table=show_table) as cur: | |||
cur.execute(query, params, plain_query) | |||
for result in cur: | |||
if buffsize: | |||
while True: | |||
group = cur.fetchmany(buffsize) | |||
if not group: | |||
return | |||
for result in group: | |||
yield result | |||
for result in cur.fetchall(): | |||
yield result | |||
def get_maxlag(self, showall=False): | |||
@@ -729,7 +784,28 @@ class Site(object): | |||
query = """SELECT UNIX_TIMESTAMP() - UNIX_TIMESTAMP(rc_timestamp) FROM | |||
recentchanges ORDER BY rc_timestamp DESC LIMIT 1""" | |||
result = list(self.sql_query(query)) | |||
return result[0][0] | |||
return int(result[0][0]) | |||
def get_token(self, action=None, force=False): | |||
"""Return a token for a data-modifying API action. | |||
In general, this will be a CSRF token, unless *action* is in a special | |||
list of non-CSRF tokens. Tokens are cached for the session (until | |||
:meth:`_login` is called again); set *force* to ``True`` to force a new | |||
token to be fetched. | |||
Raises :exc:`.APIError` if there was an API issue. | |||
""" | |||
if action not in self.SPECIAL_TOKENS: | |||
action = "csrf" | |||
if action in self._tokens and not force: | |||
return self._tokens[action] | |||
res = self.api_query(action="query", meta="tokens", type=action) | |||
if action not in self._tokens: | |||
err = "Tried to fetch a {0} token, but API returned: {1}" | |||
raise exceptions.APIError(err.format(action, res)) | |||
return self._tokens[action] | |||
def namespace_id_to_name(self, ns_id, all=False): | |||
"""Given a namespace ID, returns associated namespace names. | |||
@@ -768,7 +844,7 @@ class Site(object): | |||
if lname in lnames: | |||
return ns_id | |||
e = "There is no namespace with name '{0}'.".format(name) | |||
e = u"There is no namespace with name '{0}'.".format(name) | |||
raise exceptions.NamespaceNotFoundError(e) | |||
def get_page(self, title, follow_redirects=False, pageid=None): | |||
@@ -826,7 +902,7 @@ class Site(object): | |||
(:py:attr:`self.SERVICE_API <SERVICE_API>` or | |||
:py:attr:`self.SERVICE_SQL <SERVICE_SQL>`), and the value is the | |||
function to call for this service. All functions will be passed the | |||
same arguments the tuple *args* and the dict **kwargs**, which are both | |||
same arguments the tuple *args* and the dict *kwargs*, which are both | |||
empty by default. The service order is determined by | |||
:py:meth:`_get_service_order`. | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -188,7 +188,7 @@ class SitesDB(object): | |||
config = self.config | |||
login = (config.wiki.get("username"), config.wiki.get("password")) | |||
user_agent = config.wiki.get("userAgent") | |||
use_https = config.wiki.get("useHTTPS", False) | |||
use_https = config.wiki.get("useHTTPS", True) | |||
assert_edit = config.wiki.get("assert") | |||
maxlag = config.wiki.get("maxlag") | |||
wait_between_queries = config.wiki.get("waitTime", 2) | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,7 +1,7 @@ | |||
#! /usr/bin/env python | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -25,24 +25,35 @@ from setuptools import setup, find_packages | |||
from earwigbot import __version__ | |||
# Not all of these dependencies are required, particularly the copyvio-specific | |||
# ones (bs4, lxml, nltk, and oauth2) and the command-specific one (pytz). The | |||
# bot should run fine without them, but will raise an exception if you try to | |||
# detect copyvios or run a command that requires one. | |||
dependencies = [ | |||
"PyYAML >= 3.10", # Parsing config files | |||
"beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML for copyvios | |||
"lxml >= 2.3.5", # Faster parser for BeautifulSoup | |||
"mwparserfromhell >= 0.1", # Parsing wikicode for manipulation | |||
"nltk >= 2.0.2", # Parsing sentences to split article content for copyvios | |||
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search for copyvios | |||
"oursql >= 0.9.3.1", # Interfacing with MediaWiki databases | |||
"py-bcrypt >= 0.2", # Hashing the bot key in the config file | |||
"pycrypto >= 2.6", # Storing bot passwords and keys in the config file | |||
"pytz >= 2012d", # Handling timezones for the !time IRC command | |||
required_deps = [ | |||
"PyYAML >= 3.11", # Parsing config files | |||
"mwparserfromhell >= 0.4.3", # Parsing wikicode for manipulation | |||
] | |||
extra_deps = { | |||
"crypto": [ | |||
"py-bcrypt >= 0.4", # Hashing the bot key in the config file | |||
"pycrypto >= 2.6.1", # Storing bot passwords + keys in the config file | |||
], | |||
"sql": [ | |||
"oursql >= 0.9.3.1", # Interfacing with MediaWiki databases | |||
], | |||
"copyvios": [ | |||
"beautifulsoup4 >= 4.4.1", # Parsing/scraping HTML | |||
"cchardet >= 1.0.0", # Encoding detection for BeautifulSoup | |||
"lxml >= 3.4.4", # Faster parser for BeautifulSoup | |||
"nltk >= 3.1", # Parsing sentences to split article content | |||
"oauth2 >= 1.9.0", # Interfacing with Yahoo! BOSS Search | |||
"pdfminer >= 20140328", # Extracting text from PDF files | |||
"tldextract >= 1.7.1", # Getting domains for the multithreaded workers | |||
], | |||
"time": [ | |||
"pytz >= 2015.7", # Handling timezones for the !time IRC command | |||
], | |||
} | |||
dependencies = required_deps + sum(extra_deps.values(), []) | |||
with open("README.rst") as fp: | |||
long_docs = fp.read() | |||
@@ -54,7 +65,7 @@ setup( | |||
test_suite = "tests", | |||
version = __version__, | |||
author = "Ben Kurtovic", | |||
author_email = "ben.kurtovic@verizon.net", | |||
author_email = "ben.kurtovic@gmail.com", | |||
url = "https://github.com/earwig/earwigbot", | |||
description = "EarwigBot is a Python robot that edits Wikipedia and interacts with people over IRC.", | |||
long_description = long_docs, | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# Copyright (C) 2009-2015 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||