From baf8b9683b5020c2e2b770743fe397746de431bf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Nov 2015 23:07:44 -0600 Subject: [PATCH 01/88] Version bump for 0.3 --- CHANGELOG | 4 ++++ docs/conf.py | 4 ++-- docs/index.rst | 2 +- earwigbot/__init__.py | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 47e568b..4807656 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,7 @@ +v0.3 (unreleased): + +- + v0.2 (released November 8, 2015): - Added a new command syntax allowing the caller to redirect replies to another diff --git a/docs/conf.py b/docs/conf.py index a8e5cc6..b843fe0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,9 +48,9 @@ copyright = u'2009-2015 Ben Kurtovic' # built documents. # # The short X.Y version. -version = '0.2' +version = '0.3' # The full version, including alpha/beta/rc tags. -release = '0.2' +release = '0.3.dev0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index d61bae7..3c4e3bc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,4 +1,4 @@ -EarwigBot v0.2 Documentation +EarwigBot v0.3 Documentation ============================ EarwigBot_ is a Python_ robot that edits Wikipedia_ and interacts with people diff --git a/earwigbot/__init__.py b/earwigbot/__init__.py index b9b7bc3..6f4e09a 100644 --- a/earwigbot/__init__.py +++ b/earwigbot/__init__.py @@ -32,7 +32,7 @@ details. This documentation is also available `online __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2009-2015 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.2" +__version__ = "0.3.dev0" __email__ = "ben.kurtovic@gmail.com" __release__ = False From 8f2b232568fe849475e005dbdf32dc826e5b1ecb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Nov 2015 20:39:29 -0600 Subject: [PATCH 02/88] Add !remind all. --- CHANGELOG | 2 +- earwigbot/commands/remind.py | 30 ++++++++++++++++++++++++++---- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 4807656..e09b882 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,6 @@ v0.3 (unreleased): -- +- Added !remind all. v0.2 (released November 8, 2015): diff --git a/earwigbot/commands/remind.py b/earwigbot/commands/remind.py index f61e3f4..5fc0c3d 100644 --- a/earwigbot/commands/remind.py +++ b/earwigbot/commands/remind.py @@ -221,9 +221,8 @@ class Remind(Command): def _show_reminders(self, data, user): """Show all of a user's current reminders.""" shorten = lambda s: (s[:37] + "..." if len(s) > 40 else s) - tmpl = '\x0303{0}\x0F ("{1}", {2})' - fmt = lambda robj: tmpl.format(robj.id, shorten(robj.message), - robj.end_time) + fmt = lambda robj: '\x0303{0}\x0F ("{1}", {2})'.format( + robj.id, shorten(robj.message), robj.end_time) if user in self.reminders: rlist = ", ".join(fmt(robj) for robj in self.reminders[user]) @@ -233,6 +232,26 @@ class Remind(Command): "[message]\x0F. See also: \x0306!remind help\x0F.") self.reply(data, msg) + def _show_all_reminders(self, data): + """Show all reminders to bot admins.""" + if not self.config.irc["permissions"].is_admin(data): + self.reply(data, "You must be a bot admin to view other users' " + "reminders. View your own with " + "\x0306!reminders\x0F.") + return + if not self.reminders: + self.reply(data, "There are no active reminders.") + return + + shorten = lambda s: (s[:37] + "..." if len(s) > 40 else s) + fmt = lambda robj, user: '\x0303{0}\x0F ("{1}" for {2}, {3})'.format( + robj.id, shorten(robj.message), user, robj.end_time) + + rlist = (fmt(rem, user) for user, rems in self.reminders.iteritems() + for rem in rems) + msg = "All reminders: {0}.".format(", ".join(rlist)) + self.reply(data, msg) + def _process_snooze_command(self, data, user): """Process the !snooze command.""" if not data.args: @@ -271,7 +290,8 @@ class Remind(Command): ("Get info", "!remind [id]"), ("Cancel", "!remind cancel [id]"), ("Adjust", "!remind adjust [id] [time]"), - ("Restart", "!snooze [id]") + ("Restart", "!snooze [id]"), + ("Admin", "!remind all") ] extra = "In most cases, \x0306[id]\x0F can be omitted if you have only one reminder." joined = " ".join("{0}: \x0306{1}\x0F.".format(k, v) for k, v in parts) @@ -295,6 +315,8 @@ class Remind(Command): command = data.args[0] if command == "help": return self._show_help(data) + if command == "all": + return self._show_all_reminders(data) if command in DISPLAY + CANCEL + SNOOZE: if user not in self.reminders: msg = "You have no reminders to {0}." From 941cc2c00a7ac74dbf814a262f1c2d7a2cebd0d6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Nov 2015 21:03:02 -0600 Subject: [PATCH 03/88] Tweak !remind all a bit. --- earwigbot/commands/remind.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/earwigbot/commands/remind.py b/earwigbot/commands/remind.py index 5fc0c3d..9a3e397 100644 --- a/earwigbot/commands/remind.py +++ b/earwigbot/commands/remind.py @@ -220,17 +220,20 @@ class Remind(Command): def _show_reminders(self, data, user): """Show all of a user's current reminders.""" + if user not in self.reminders: + self.reply(data, "You have no reminders. Set one with " + "\x0306!remind [time] [message]\x0F. See also: " + "\x0306!remind help\x0F.") + return + shorten = lambda s: (s[:37] + "..." if len(s) > 40 else s) - fmt = lambda robj: '\x0303{0}\x0F ("{1}", {2})'.format( - robj.id, shorten(robj.message), robj.end_time) + dest = lambda data: ( + "privately" if data.is_private else "in {0}".format(data.chan)) + fmt = lambda robj: '\x0303{0}\x0F ("{1}" {2}, {3})'.format( + robj.id, shorten(robj.message), dest(robj.data), robj.end_time) - if user in self.reminders: - rlist = ", ".join(fmt(robj) for robj in self.reminders[user]) - msg = "Your reminders: {0}.".format(rlist) - else: - msg = ("You have no reminders. Set one with \x0306!remind [time] " - "[message]\x0F. See also: \x0306!remind help\x0F.") - self.reply(data, msg) + rlist = ", ".join(fmt(robj) for robj in self.reminders[user]) + self.reply(data, "Your reminders: {0}.".format(rlist)) def _show_all_reminders(self, data): """Show all reminders to bot admins.""" @@ -243,14 +246,14 @@ class Remind(Command): self.reply(data, "There are no active reminders.") return - shorten = lambda s: (s[:37] + "..." if len(s) > 40 else s) - fmt = lambda robj, user: '\x0303{0}\x0F ("{1}" for {2}, {3})'.format( - robj.id, shorten(robj.message), user, robj.end_time) + dest = lambda data: ( + "privately" if data.is_private else "in {0}".format(data.chan)) + fmt = lambda robj, user: '\x0303{0}\x0F (for {1} {2}, {3})'.format( + robj.id, user, dest(robj.data), robj.end_time) rlist = (fmt(rem, user) for user, rems in self.reminders.iteritems() for rem in rems) - msg = "All reminders: {0}.".format(", ".join(rlist)) - self.reply(data, msg) + self.reply(data, "All reminders: {0}.".format(", ".join(rlist))) def _process_snooze_command(self, data, user): """Process the !snooze command.""" @@ -421,6 +424,11 @@ class _Reminder(object): self._cmdobj.unstore_reminder(self.id) @property + def data(self): + """Return the IRC data object associated with this reminder.""" + return self._data + + @property def end_time(self): """Return a string representing the end time of a reminder.""" if self.end >= time.time(): From 696d24432dc7ade1e6b312131265f02107480131 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 16 Nov 2015 05:05:48 -0600 Subject: [PATCH 04/88] Improve detection of maximum IRC message length. --- CHANGELOG | 1 + earwigbot/irc/connection.py | 33 +++++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index e09b882..53d36fa 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ v0.3 (unreleased): - Added !remind all. +- Improved detection of maximum IRC message length. v0.2 (released November 8, 2015): diff --git a/earwigbot/irc/connection.py b/earwigbot/irc/connection.py index 5fe15b0..2d10341 100644 --- a/earwigbot/irc/connection.py +++ b/earwigbot/irc/connection.py @@ -45,6 +45,7 @@ class IRCConnection(object): self._last_recv = time() self._last_send = 0 self._last_ping = 0 + self._myhost = "." * 63 # default: longest possible hostname def __repr__(self): """Return the canonical string representation of the IRCConnection.""" @@ -100,8 +101,19 @@ class IRCConnection(object): self.logger.debug(msg) self._last_send = time() - def _split(self, msgs, maxlen, maxsplits=3): - """Split a large message into multiple messages smaller than maxlen.""" + def _get_maxlen(self, extra): + """Return our best guess of the maximum length of a standard message. + + This applies mainly to PRIVMSGs and NOTICEs. + """ + base_max = 512 + userhost = len(self.nick) + len(self.ident) + len(self._myhost) + 2 + padding = 4 # "\r\n" at end, ":" at beginning, and " " after userhost + return base_max - userhost - padding - extra + + def _split(self, msgs, extralen, maxsplits=3): + """Split a large message into multiple messages.""" + maxlen = self._get_maxlen(extralen) words = msgs.split(" ") splits = 0 while words and splits < maxsplits: @@ -128,6 +140,19 @@ class IRCConnection(object): self._last_recv = time() if line[0] == "PING": # If we are pinged, pong back self.pong(line[1][1:]) + elif line[1] == "001": # Update nickname on startup + if line[2] != self.nick: + self.logger.warn("Nickname changed from {0} to {1}".format( + self.nick, line[2])) + self._nick = line[2] + elif line[1] == "376": # After sign-on, get our userhost + self._send("WHOIS {0}".format(self.nick)) + elif line[1] == "311": # Receiving WHOIS result + if line[2] == self.nick: + self._ident = line[4] + self._myhost = line[5] + elif line[1] == "396": # Hostname change + self._myhost = line[3] def _process_message(self, line): """To be overridden in subclasses.""" @@ -163,7 +188,7 @@ class IRCConnection(object): def say(self, target, msg, hidelog=False): """Send a private message to a target on the server.""" - for msg in self._split(msg, 400): + for msg in self._split(msg, len(target) + 10): msg = "PRIVMSG {0} :{1}".format(target, msg) self._send(msg, hidelog) @@ -182,7 +207,7 @@ class IRCConnection(object): def notice(self, target, msg, hidelog=False): """Send a notice to a target on the server.""" - for msg in self._split(msg, 400): + for msg in self._split(msg, len(target) + 9): msg = "NOTICE {0} :{1}".format(target, msg) self._send(msg, hidelog) From e9add9f27f1379d50c8a0938eb43df57beb93084 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 16 Nov 2015 05:08:29 -0600 Subject: [PATCH 05/88] Tweak version, script message. --- earwigbot/__init__.py | 2 +- earwigbot/config/script.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/earwigbot/__init__.py b/earwigbot/__init__.py index 6f4e09a..78b02b7 100644 --- a/earwigbot/__init__.py +++ b/earwigbot/__init__.py @@ -45,7 +45,7 @@ if not __release__: commit_id = Repo(path).head.object.hexsha return commit_id[:8] try: - __version__ += "+git-" + _get_git_commit_id() + __version__ += "+" + _get_git_commit_id() except Exception: pass finally: diff --git a/earwigbot/config/script.py b/earwigbot/config/script.py index e6e4ec8..d4669f7 100644 --- a/earwigbot/config/script.py +++ b/earwigbot/config/script.py @@ -143,9 +143,9 @@ class ConfigScript(object): self._print("""I can encrypt passwords stored in your config file in addition to preventing other users on your system from reading the file. Encryption is recommended if the bot - is to run on a public computer like the Toolserver, but - otherwise the need to enter a key everytime you start - the bot may be annoying.""") + is to run on a public server like Wikimedia Labs, but + otherwise the need to enter a key every time you start + the bot may be an inconvenience.""") self.data["metadata"]["encryptPasswords"] = False if self._ask_bool("Encrypt stored passwords?"): key = getpass(self.PROMPT + "Enter an encryption key: ") From 248e9a84acea2854249f9eb836b20c9626c81250 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 20 Nov 2015 00:28:49 -0600 Subject: [PATCH 06/88] Update !help remind. --- earwigbot/commands/remind.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/earwigbot/commands/remind.py b/earwigbot/commands/remind.py index 9a3e397..9422e86 100644 --- a/earwigbot/commands/remind.py +++ b/earwigbot/commands/remind.py @@ -37,7 +37,8 @@ CANCEL = ["cancel", "stop", "delete", "del", "stop", "unremind", "forget", SNOOZE = ["snooze", "delay", "reset", "adjust", "modify", "change"] class Remind(Command): - """Set a message to be repeated to you in a certain amount of time.""" + """Set a message to be repeated to you in a certain amount of time. See + usage with !remind help.""" name = "remind" commands = ["remind", "reminder", "reminders", "snooze", "cancel", "unremind", "forget"] From 75058997c2a33fec82e2df6ca652b5579a59482a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 20 Nov 2015 05:51:44 -0600 Subject: [PATCH 07/88] Split copyvio queries a bit differently; maybe better on other languages. --- earwigbot/wiki/copyvios/parsers.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index d843ad5..546a138 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -61,6 +61,7 @@ class ArticleTextParser(_BaseTextParser): """A parser that can strip and chunk wikicode article text.""" TYPE = "Article" TEMPLATE_MERGE_THRESHOLD = 35 + SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]") def _merge_templates(self, code): """Merge template contents in to wikicode when the values are long.""" @@ -132,6 +133,12 @@ class ArticleTextParser(_BaseTextParser): directory (*nltk_dir*) is required to store nltk's punctuation database. This is typically located in the bot's working directory. """ + def cut_string(fragment): + words = fragment.split() + while len(" ".join(words)) > max_query: + words.pop() + return " ".join(words) + datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") try: tokenizer = nltk.data.load("file:" + datafile) @@ -141,16 +148,14 @@ class ArticleTextParser(_BaseTextParser): sentences = [] for sentence in tokenizer.tokenize(self.clean): - if len(sentence) > max_query: - words = sentence.split() - while len(" ".join(words)) > max_query: - words.pop() - sentence = " ".join(words) - if len(sentence) < min_query: - continue - sentences.append(sentence) - - if max_chunks >= len(sentences): + if len(sentence) <= max_query: + sentences.append(sentence) + else: + sentences.extend(cut_string(fragment) for fragment in + self.SPLIT_REGEX.split(sentence)) + + sentences = [sen for sen in sentences if len(sen) >= min_query] + if len(sentences) <= max_chunks: return sentences chunks = [] From f92fb34d0ecf9c911dd9ae0d37d7c6754584887a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 21 Nov 2015 01:19:40 -0600 Subject: [PATCH 08/88] Improve sentence splitting, again. --- CHANGELOG | 6 ++-- earwigbot/wiki/copyvios/__init__.py | 7 ++-- earwigbot/wiki/copyvios/parsers.py | 70 +++++++++++++++++++++++++++---------- 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 53d36fa..8c1dac9 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,9 @@ v0.3 (unreleased): -- Added !remind all. -- Improved detection of maximum IRC message length. +- Copyvio detector: improved sentence splitting algorithm. +- IRC: Added !remind all. +- IRC: Improved detection of maximum IRC message length. +- IRC: Improved some help commands. v0.2 (released November 8, 2015): diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index dbe8efa..ef40cd5 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -114,7 +114,10 @@ class CopyvioMixIn(object): log = u"Starting copyvio check for [[{0}]]" self._logger.info(log.format(self.title)) searcher = self._get_search_engine() - parser = ArticleTextParser(self.get()) + parser = ArticleTextParser(self.get(), { + "nltk_dir": self._search_config["nltk_dir"], + "lang": self._site.lang + }) article = MarkovChain(parser.strip()) parser_args = {} @@ -139,7 +142,7 @@ class CopyvioMixIn(object): workspace.enqueue(parser.get_links(), exclude) num_queries = 0 if not no_searches: - chunks = parser.chunk(self._search_config["nltk_dir"], max_queries) + chunks = parser.chunk(max_queries) for chunk in chunks: if short_circuit and workspace.finished: workspace.possible_miss = True diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 546a138..ce2bb2b 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -61,7 +61,26 @@ class ArticleTextParser(_BaseTextParser): """A parser that can strip and chunk wikicode article text.""" TYPE = "Article" TEMPLATE_MERGE_THRESHOLD = 35 - SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]") + NLTK_DEFAULT = "english" + NLTK_LANGS = { + "cs": "czech", + "da": "danish", + "de": "german", + "el": "greek", + "en": "english", + "es": "spanish", + "et": "estonian", + "fi": "finnish", + "fr": "french", + "it": "italian", + "nl": "dutch", + "no": "norwegian", + "pl": "polish", + "pt": "portuguese", + "sl": "slovene", + "sv": "swedish", + "tr": "turkish" + } def _merge_templates(self, code): """Merge template contents in to wikicode when the values are long.""" @@ -77,6 +96,18 @@ class ArticleTextParser(_BaseTextParser): else: code.remove(template) + def _get_tokenizer(self): + """Return a NLTK punctuation tokenizer for the article's language.""" + datafile = lambda lang: "file:" + path.join( + self._args["nltk_dir"], "tokenizers", "punkt", lang + ".pickle") + + lang = self.NLTK_LANGS.get(self._args.get("lang"), self.NLTK_DEFAULT) + try: + nltk.data.load(datafile(self.NLTK_DEFAULT)) + except LookupError: + nltk.download("punkt", self._args["nltk_dir"]) + return nltk.data.load(datafile(lang)) + def strip(self): """Clean the page's raw text by removing templates and formatting. @@ -119,7 +150,7 @@ class ArticleTextParser(_BaseTextParser): self.clean = re.sub("\n\n+", "\n", clean).strip() return self.clean - def chunk(self, nltk_dir, max_chunks, min_query=8, max_query=128): + def chunk(self, max_chunks, min_query=8, max_query=128, split_thresh=32): """Convert the clean article text into a list of web-searchable chunks. No greater than *max_chunks* will be returned. Each chunk will only be @@ -131,28 +162,31 @@ class ArticleTextParser(_BaseTextParser): This is implemented using :py:mod:`nltk` (http://nltk.org/). A base directory (*nltk_dir*) is required to store nltk's punctuation - database. This is typically located in the bot's working directory. + database, and should be passed as an argument to the constructor. It is + typically located in the bot's working directory. """ - def cut_string(fragment): - words = fragment.split() - while len(" ".join(words)) > max_query: - words.pop() - return " ".join(words) - - datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") - try: - tokenizer = nltk.data.load("file:" + datafile) - except LookupError: - nltk.download("punkt", nltk_dir) - tokenizer = nltk.data.load("file:" + datafile) - + def cut_sentence(words): + div = len(words) + if div == 0: + return [] + + length = len(" ".join(words)) + while length > max_query: + div -= 1 + length -= len(words[div]) + 1 + + result = [] + if length >= split_thresh: + result.append(" ".join(words[:div])) + return result + cut_sentence(words[div + 1:]) + + tokenizer = self._get_tokenizer() sentences = [] for sentence in tokenizer.tokenize(self.clean): if len(sentence) <= max_query: sentences.append(sentence) else: - sentences.extend(cut_string(fragment) for fragment in - self.SPLIT_REGEX.split(sentence)) + sentences.extend(cut_sentence(sentence.split())) sentences = [sen for sen in sentences if len(sen) >= min_query] if len(sentences) <= max_chunks: From eceb4d139acda3f793a6163ccefcbe3e6aec4fd5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 21 Nov 2015 01:35:28 -0600 Subject: [PATCH 09/88] Minor refactor. --- earwigbot/wiki/copyvios/parsers.py | 54 +++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index ce2bb2b..2f88356 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -108,6 +108,35 @@ class ArticleTextParser(_BaseTextParser): nltk.download("punkt", self._args["nltk_dir"]) return nltk.data.load(datafile(lang)) + def _get_sentences(self, min_query, max_query, split_thresh): + """Split the article text into sentences of a certain length.""" + def cut_sentence(words): + div = len(words) + if div == 0: + return [] + + length = len(" ".join(words)) + while length > max_query: + div -= 1 + length -= len(words[div]) + 1 + + result = [] + if length >= split_thresh: + result.append(" ".join(words[:div])) + return result + cut_sentence(words[div + 1:]) + + tokenizer = self._get_tokenizer() + sentences = [] + if not hasattr(self, "clean"): + self.strip() + + for sentence in tokenizer.tokenize(self.clean): + if len(sentence) <= max_query: + sentences.append(sentence) + else: + sentences.extend(cut_sentence(sentence.split())) + return [sen for sen in sentences if len(sen) >= min_query] + def strip(self): """Clean the page's raw text by removing templates and formatting. @@ -165,30 +194,7 @@ class ArticleTextParser(_BaseTextParser): database, and should be passed as an argument to the constructor. It is typically located in the bot's working directory. """ - def cut_sentence(words): - div = len(words) - if div == 0: - return [] - - length = len(" ".join(words)) - while length > max_query: - div -= 1 - length -= len(words[div]) + 1 - - result = [] - if length >= split_thresh: - result.append(" ".join(words[:div])) - return result + cut_sentence(words[div + 1:]) - - tokenizer = self._get_tokenizer() - sentences = [] - for sentence in tokenizer.tokenize(self.clean): - if len(sentence) <= max_query: - sentences.append(sentence) - else: - sentences.extend(cut_sentence(sentence.split())) - - sentences = [sen for sen in sentences if len(sen) >= min_query] + sentences = self._get_sentences(min_query, max_query, split_thresh) if len(sentences) <= max_chunks: return sentences From ac678c0f14976a6a734e356c1f1bc0366eded692 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 2 Dec 2015 18:04:54 -0600 Subject: [PATCH 10/88] Improve !remind time detection (allow e.g. "!remind 7d-2h") --- CHANGELOG | 2 +- earwigbot/commands/remind.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 8c1dac9..0e3787f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,7 @@ v0.3 (unreleased): - Copyvio detector: improved sentence splitting algorithm. -- IRC: Added !remind all. +- IRC > !remind: Added !remind all. Improved time detection. - IRC: Improved detection of maximum IRC message length. - IRC: Improved some help commands. diff --git a/earwigbot/commands/remind.py b/earwigbot/commands/remind.py index 9422e86..a8c568f 100644 --- a/earwigbot/commands/remind.py +++ b/earwigbot/commands/remind.py @@ -78,13 +78,11 @@ class Remind(Command): else: raise ValueError(node) - if arg and arg[-1] in time_units: - factor, arg = time_units[arg[-1]], arg[:-1] - else: - factor = 1 + for unit, factor in time_units.iteritems(): + arg = arg.replace(unit, "*" + str(factor)) try: - parsed = int(_evaluate(ast.parse(arg, mode="eval").body) * factor) + parsed = int(_evaluate(ast.parse(arg, mode="eval").body)) except (SyntaxError, KeyError): raise ValueError(arg) if parsed <= 0: From a28eac94261c0ef8ca30b1ad73a05b560e28debd Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 3 Dec 2015 00:02:04 -0600 Subject: [PATCH 11/88] Substantial rework to reminders; fixes multithreading issues. --- CHANGELOG | 3 +- earwigbot/bot.py | 3 +- earwigbot/commands/remind.py | 259 ++++++++++++++++++++++++++---------------- earwigbot/commands/threads.py | 5 +- 4 files changed, 168 insertions(+), 102 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 0e3787f..47528ff 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,8 @@ v0.3 (unreleased): - Copyvio detector: improved sentence splitting algorithm. -- IRC > !remind: Added !remind all. Improved time detection. +- IRC > !remind: Added !remind all. Fixed multithreading efficiency issues. + Improved time detection. - IRC: Improved detection of maximum IRC message length. - IRC: Improved some help commands. diff --git a/earwigbot/bot.py b/earwigbot/bot.py index df59950..bd3cf24 100644 --- a/earwigbot/bot.py +++ b/earwigbot/bot.py @@ -150,7 +150,8 @@ class Bot(object): component_names = self.config.components.keys() skips = component_names + ["MainThread", "reminder", "irc:quit"] for thread in enumerate_threads(): - if thread.name not in skips and thread.is_alive(): + if thread.is_alive() and not any( + thread.name.startswith(skip) for skip in skips): tasks.append(thread.name) if tasks: log = "The following commands or tasks will be killed: {0}" diff --git a/earwigbot/commands/remind.py b/earwigbot/commands/remind.py index a8c568f..151a477 100644 --- a/earwigbot/commands/remind.py +++ b/earwigbot/commands/remind.py @@ -21,7 +21,6 @@ # SOFTWARE. import ast -from contextlib import contextmanager from itertools import chain import operator import random @@ -89,12 +88,6 @@ class Remind(Command): raise ValueError(parsed) return parsed - @contextmanager - def _db(self): - """Return a threadsafe context manager for the permissions database.""" - with self._db_lock: - yield self.config.irc["permissions"] - def _really_get_reminder_by_id(self, user, rid): """Return the _Reminder object that corresponds to a particular ID. @@ -124,11 +117,11 @@ class Remind(Command): def _start_reminder(self, reminder, user): """Start the given reminder object for the given user.""" - reminder.start() if user in self.reminders: self.reminders[user].append(reminder) else: self.reminders[user] = [reminder] + self._thread.add(reminder) def _create_reminder(self, data, user): """Create a new reminder for the given user.""" @@ -143,7 +136,6 @@ class Remind(Command): msg = "Given time \x02{0}\x0F is too large. Keep it reasonable." return self.reply(data, msg.format(data.args[0])) - end = time.time() + wait message = " ".join(data.args[1:]) try: rid = self._get_new_id() @@ -151,7 +143,7 @@ class Remind(Command): msg = "Couldn't set a new reminder: no free IDs available." return self.reply(data, msg) - reminder = _Reminder(rid, user, wait, end, message, data, self) + reminder = _Reminder(rid, user, wait, message, data, self) self._start_reminder(reminder, user) msg = "Set reminder \x0303{0}\x0F ({1})." self.reply(data, msg.format(rid, reminder.end_time)) @@ -165,7 +157,8 @@ class Remind(Command): def _cancel_reminder(self, data, user, reminder): """Cancel a pending reminder.""" - reminder.stop() + self._thread.remove(reminder) + self.unstore_reminder(reminder.id) self.reminders[user].remove(reminder) if not self.reminders[user]: del self.reminders[user] @@ -174,35 +167,34 @@ class Remind(Command): def _snooze_reminder(self, data, reminder, arg=None): """Snooze a reminder to be re-triggered after a period of time.""" - verb = "snoozed" if reminder.end < time.time() else "adjusted" + verb = "snoozed" if reminder.expired else "adjusted" + duration = None if arg: try: duration = self._parse_time(data.args[arg]) - reminder.wait = duration except (IndexError, ValueError): pass - reminder.end = time.time() + reminder.wait - reminder.start() + reminder.reset(duration) end = time.strftime("%b %d %H:%M:%S %Z", time.localtime(reminder.end)) msg = "Reminder \x0303{0}\x0F {1} until {2}." self.reply(data, msg.format(reminder.id, verb, end)) def _load_reminders(self): """Load previously made reminders from the database.""" - with self._db() as permdb: - try: - database = permdb.get_attr("command:remind", "data") - except KeyError: - return - permdb.set_attr("command:remind", "data", "[]") + permdb = self.config.irc["permissions"] + try: + database = permdb.get_attr("command:remind", "data") + except KeyError: + return + permdb.set_attr("command:remind", "data", "[]") for item in ast.literal_eval(database): rid, user, wait, end, message, data = item if end < time.time(): continue data = Data.unserialize(data) - reminder = _Reminder(rid, user, wait, end, message, data, self) + reminder = _Reminder(rid, user, wait, message, data, self, end) self._start_reminder(reminder, user) def _handle_command(self, command, data, user, reminder, arg=None): @@ -299,12 +291,8 @@ class Remind(Command): joined = " ".join("{0}: \x0306{1}\x0F.".format(k, v) for k, v in parts) self.reply(data, joined + " " + extra) - def setup(self): - self.reminders = {} - self._db_lock = RLock() - self._load_reminders() - - def process(self, data): + def _process(self, data): + """Main entry point.""" if data.command == "snooze": return self._process_snooze_command(data, data.host) if data.command in ["cancel", "unremind", "forget"]: @@ -350,67 +338,129 @@ class Remind(Command): self._handle_command(data.args[1], data, user, reminder, 2) + @property + def lock(self): + """Return the reminder modification/access lock.""" + return self._lock + + def setup(self): + self.reminders = {} + self._lock = RLock() + self._thread = _ReminderThread(self._lock) + self._load_reminders() + + def process(self, data): + with self.lock: + self._process(data) + def unload(self): - for reminder in chain(*self.reminders.values()): - reminder.stop(delete=False) + self._thread.stop() def store_reminder(self, reminder): """Store a serialized reminder into the database.""" - with self._db() as permdb: - try: - dump = permdb.get_attr("command:remind", "data") - except KeyError: - dump = "[]" + permdb = self.config.irc["permissions"] + try: + dump = permdb.get_attr("command:remind", "data") + except KeyError: + dump = "[]" - database = ast.literal_eval(dump) - database.append(reminder) - permdb.set_attr("command:remind", "data", str(database)) + database = ast.literal_eval(dump) + database.append(reminder) + permdb.set_attr("command:remind", "data", str(database)) def unstore_reminder(self, rid): """Remove a reminder from the database by ID.""" - with self._db() as permdb: - try: - dump = permdb.get_attr("command:remind", "data") - except KeyError: - dump = "[]" + permdb = self.config.irc["permissions"] + try: + dump = permdb.get_attr("command:remind", "data") + except KeyError: + dump = "[]" + + database = ast.literal_eval(dump) + database = [item for item in database if item[0] != rid] + permdb.set_attr("command:remind", "data", str(database)) + + +class _ReminderThread(object): + """A single thread that handles reminders.""" + + def __init__(self, lock): + self._thread = None + self._abort = False + self._active = {} + self._lock = lock + + def _running(self): + """Return if the thread should still be running.""" + return self._active and not self._abort + + def _get_soonest(self): + """Get the soonest reminder to trigger.""" + return min(self._active.values(), key=lambda robj: robj.end) + + def _get_ready_reminder(self): + """Block until a reminder is ready to be triggered.""" + while self._running(): + if self._get_soonest().end <= time.time(): + return self._get_soonest() + self._lock.release() + time.sleep(0.25) + self._lock.acquire() + + def _callback(self): + """Internal callback function to be executed by the reminder thread.""" + with self._lock: + while True: + reminder = self._get_ready_reminder() + if not reminder: + break + + if reminder.trigger(): + del self._active[reminder.id] + self._thread = None + + def _start(self): + """Start the thread.""" + self._thread = Thread(target=self._callback, name="reminder") + self._thread.daemon = True + self._thread.start() + self._abort = False + + def add(self, reminder): + """Add a reminder to the table of active reminders.""" + self._active[reminder.id] = reminder + if not self._thread: + self._start() + + def remove(self, reminder): + """Remove a reminder from the table of active reminders.""" + if reminder.id in self._active: + del self._active[reminder.id] + if not self._active: + self.stop() + + def stop(self): + """Stop the thread.""" + if not self._thread: + return + self._abort = True + self._thread = None - database = ast.literal_eval(dump) - database = [item for item in database if item[0] != rid] - permdb.set_attr("command:remind", "data", str(database)) class _Reminder(object): """Represents a single reminder.""" - - def __init__(self, rid, user, wait, end, message, data, cmdobj): + def __init__(self, rid, user, wait, message, data, cmdobj, end=None): self.id = rid self.wait = wait - self.end = end + self.end = time.time() + wait if end is None else end self.message = message self._user = user self._data = data self._cmdobj = cmdobj - self._thread = None + self._expired = False - def _callback(self): - """Internal callback function to be executed by the reminder thread.""" - thread = self._thread - while time.time() < thread.end: - time.sleep(1) - if thread.abort: - return - self._cmdobj.reply(self._data, self.message) - self._delete() - for i in xrange(60): - time.sleep(1) - if thread.abort: - return - try: - self._cmdobj.reminders[self._user].remove(self) - if not self._cmdobj.reminders[self._user]: - del self._cmdobj.reminders[self._user] - except (KeyError, ValueError): # Already canceled by the user - pass + self._save() def _save(self): """Save this reminder to the database.""" @@ -418,9 +468,21 @@ class _Reminder(object): item = (self.id, self._user, self.wait, self.end, self.message, data) self._cmdobj.store_reminder(item) - def _delete(self): - """Remove this reminder from the database.""" + def _fire(self): + """Activate the reminder for the user.""" + self._cmdobj.reply(self._data, self.message) self._cmdobj.unstore_reminder(self.id) + self.end = time.time() + 60 + self._expired = True + + def _finalize(self): + """Clean up after a reminder has been expired for too long.""" + try: + self._cmdobj.reminders[self._user].remove(self) + if not self._cmdobj.reminders[self._user]: + del self._cmdobj.reminders[self._user] + except (KeyError, ValueError): # Already canceled by the user + pass @property def data(self): @@ -430,30 +492,35 @@ class _Reminder(object): @property def end_time(self): """Return a string representing the end time of a reminder.""" - if self.end >= time.time(): - lctime = time.localtime(self.end) - if lctime.tm_year == time.localtime().tm_year: - ends = time.strftime("%b %d %H:%M:%S %Z", lctime) - else: - ends = time.strftime("%b %d, %Y %H:%M:%S %Z", lctime) - return "ends {0}".format(ends) - return "expired" - - def start(self): - """Start the reminder timer thread. Stops it if already running.""" - self.stop() - self._thread = Thread(target=self._callback, name="remind-" + self.id) - self._thread.end = self.end - self._thread.daemon = True - self._thread.abort = False - self._thread.start() + if self._expired or self.end < time.time(): + return "expired" + lctime = time.localtime(self.end) + if lctime.tm_year == time.localtime().tm_year: + ends = time.strftime("%b %d %H:%M:%S %Z", lctime) + else: + ends = time.strftime("%b %d, %Y %H:%M:%S %Z", lctime) + return "ends {0}".format(ends) + + @property + def expired(self): + """Return whether the reminder is expired.""" + return self._expired + + def reset(self, wait=None): + """Reactivate a reminder.""" + if wait is not None: + self.wait = wait + self.end = self.wait + time.time() + self._expired = False + + self._cmdobj.unstore_reminder(self.id) self._save() - def stop(self, delete=True): - """Stop a currently running reminder.""" - if not self._thread: - return - if delete: - self._delete() - self._thread.abort = True - self._thread = None + def trigger(self): + """Hook run by the reminder thread.""" + if not self._expired: + self._fire() + return False + else: + self._finalize() + return True diff --git a/earwigbot/commands/threads.py b/earwigbot/commands/threads.py index 285ca58..30751fe 100644 --- a/earwigbot/commands/threads.py +++ b/earwigbot/commands/threads.py @@ -71,14 +71,11 @@ class Threads(Command): tname = thread.name ident = thread.ident % 10000 if tname == "MainThread": - t = "\x0302MainThread\x0F (id {0})" + t = "\x0302main\x0F (id {0})" normal_threads.append(t.format(ident)) elif tname in self.config.components: t = "\x0302{0}\x0F (id {1})" normal_threads.append(t.format(tname, ident)) - elif tname.startswith("remind-"): - t = "\x0302reminder\x0F (id {0})" - daemon_threads.append(t.format(tname[len("remind-"):])) elif tname.startswith("cvworker-"): t = "\x0302copyvio worker\x0F (site {0})" daemon_threads.append(t.format(tname[len("cvworker-"):])) From 48a14ee3edefe031c9a47af6f07cb9a7aaf3e851 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 5 Dec 2015 05:21:04 -0600 Subject: [PATCH 12/88] Don't log the full debug line when sending a lot of data. --- earwigbot/wiki/site.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 6a2d092..e9fb38a 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -232,6 +232,8 @@ class Site(object): url, data = self._build_api_query(params, ignore_maxlag, no_assert) if "lgpassword" in params: self._logger.debug("{0} -> ".format(url)) + elif len(data) > 1000: + self._logger.debug("{0} -> {1}...".format(url, data[:997])) else: self._logger.debug("{0} -> {1}".format(url, data)) From c40ba21a09c9d8eeee34c3cba0ec741ff836b41f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 6 Dec 2015 04:31:34 -0600 Subject: [PATCH 13/88] Suport regexes for !stalk. --- CHANGELOG | 1 + earwigbot/commands/stalk.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 47528ff..5f0d67c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,6 +3,7 @@ v0.3 (unreleased): - Copyvio detector: improved sentence splitting algorithm. - IRC > !remind: Added !remind all. Fixed multithreading efficiency issues. Improved time detection. +- IRC > !stalk: Allow regular expressions as page titles or usernames. - IRC: Improved detection of maximum IRC message length. - IRC: Improved some help commands. diff --git a/earwigbot/commands/stalk.py b/earwigbot/commands/stalk.py index 2b04782..29db53f 100644 --- a/earwigbot/commands/stalk.py +++ b/earwigbot/commands/stalk.py @@ -21,13 +21,14 @@ # SOFTWARE. from ast import literal_eval +import re from earwigbot.commands import Command from earwigbot.irc import RC class Stalk(Command): """Stalk a particular user (!stalk/!unstalk) or page (!watch/!unwatch) for - edits. Applies to the current bot session only.""" + edits. Prefix regular expressions with "re:" (uses re.match).""" name = "stalk" commands = ["stalk", "watch", "unstalk", "unwatch", "stalks", "watches", "allstalks", "allwatches", "unstalkall", "unwatchall"] @@ -79,9 +80,12 @@ class Stalk(Command): target = " ".join(data.args).replace("_", " ") if target.startswith("[[") and target.endswith("]]"): target = target[2:-2] - if target.startswith("User:") and "stalk" in data.command: - target = target[5:] - target = target[0].upper() + target[1:] + if target.startswith("re:"): + target = "re:" + target[3:].lstrip() + else: + if target.startswith("User:") and "stalk" in data.command: + target = target[5:] + target = target[0].upper() + target[1:] if data.command in ["stalk", "watch"]: if data.is_private: @@ -119,12 +123,12 @@ class Stalk(Command): else: chans[item[0]] = None - def _wildcard_match(target, tag): - return target[-1] == "*" and tag.startswith(target[:-1]) + def _regex_match(target, tag): + return target.startswith("re:") and re.match(target[3:], tag) def _process(table, tag): for target, stalks in table.iteritems(): - if target == tag or _wildcard_match(target, tag): + if target == tag or _regex_match(target, tag): _update_chans(stalks) chans = {} From be6c272d18b7c18dbd5231e80f9d306137597c59 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 13 Dec 2015 03:52:16 -0600 Subject: [PATCH 14/88] Make expired reminders last for a full day. --- earwigbot/commands/remind.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/commands/remind.py b/earwigbot/commands/remind.py index 151a477..d312248 100644 --- a/earwigbot/commands/remind.py +++ b/earwigbot/commands/remind.py @@ -472,7 +472,7 @@ class _Reminder(object): """Activate the reminder for the user.""" self._cmdobj.reply(self._data, self.message) self._cmdobj.unstore_reminder(self.id) - self.end = time.time() + 60 + self.end = time.time() + (60 * 60 * 24) self._expired = True def _finalize(self): From f8bb72f9f24b1de1960afc70387463335a578744 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 16 Dec 2015 03:35:01 -0600 Subject: [PATCH 15/88] Send a message when an IRC-initiated task completes. --- earwigbot/commands/threads.py | 3 +++ earwigbot/managers.py | 2 ++ 2 files changed, 5 insertions(+) diff --git a/earwigbot/commands/threads.py b/earwigbot/commands/threads.py index 30751fe..d877792 100644 --- a/earwigbot/commands/threads.py +++ b/earwigbot/commands/threads.py @@ -142,6 +142,9 @@ class Threads(Command): return data.kwargs["fromIRC"] = True + data.kwargs["_IRCCallback"] = lambda: self.reply( + data, "Task \x0302{0}\x0F finished.".format(task_name)) + self.bot.tasks.start(task_name, **data.kwargs) msg = "Task \x0302{0}\x0F started.".format(task_name) self.reply(data, msg) diff --git a/earwigbot/managers.py b/earwigbot/managers.py index 7605139..c980f7a 100644 --- a/earwigbot/managers.py +++ b/earwigbot/managers.py @@ -247,6 +247,8 @@ class TaskManager(_ResourceManager): else: msg = "Task '{0}' finished successfully" self.logger.info(msg.format(task.name)) + if kwargs.get("fromIRC"): + kwargs.get("_IRCCallback")() def start(self, task_name, **kwargs): """Start a given task in a new daemon thread, and return the thread. From 49f178760f1f5428ca441ccd67d652a2ed9d81ea Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Dec 2015 03:35:02 -0500 Subject: [PATCH 16/88] !notes: Improved help and added aliases. --- CHANGELOG | 1 + earwigbot/commands/notes.py | 58 +++++++++++++++++++++++++++++---------------- 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 5f0d67c..db1028a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ v0.3 (unreleased): - Copyvio detector: improved sentence splitting algorithm. +- IRC > !notes: Improved help and added aliases. - IRC > !remind: Added !remind all. Fixed multithreading efficiency issues. Improved time detection. - IRC > !stalk: Allow regular expressions as page titles or usernames. diff --git a/earwigbot/commands/notes.py b/earwigbot/commands/notes.py index b7f01f9..bf3a687 100644 --- a/earwigbot/commands/notes.py +++ b/earwigbot/commands/notes.py @@ -32,7 +32,19 @@ class Notes(Command): """A mini IRC-based wiki for storing notes, tips, and reminders.""" name = "notes" commands = ["notes", "note", "about"] - version = 2 + version = "2.1" + + aliases = { + "all": "list", + "show": "read", + "get": "read", + "add": "edit", + "write": "edit", + "change": "edit", + "modify": "edit", + "move": "rename", + "remove": "delete" + } def setup(self): self._dbfile = path.join(self.config.root_dir, "notes.db") @@ -50,14 +62,13 @@ class Notes(Command): } if not data.args: - msg = ("\x0302The Earwig Mini-Wiki\x0F: running v{0}. Subcommands " - "are: {1}. You can get help on any with '!{2} help subcommand'.") - cmnds = ", ".join((commands)) - self.reply(data, msg.format(self.version, cmnds, data.command)) + self.do_help(data) return command = data.args[0].lower() if command in commands: commands[command](data) + elif command in self.aliases: + commands[self.aliases[command]](data) else: msg = "Unknown subcommand: \x0303{0}\x0F.".format(command) self.reply(data, msg) @@ -83,8 +94,13 @@ class Notes(Command): try: command = data.args[1] except IndexError: - self.reply(data, "Please specify a subcommand to get help on.") + msg = ("\x0302The Earwig Mini-Wiki\x0F: running v{0}. Subcommands " + "are: {1}. You can get help on any with '!{2} help subcommand'.") + cmnds = ", ".join((info.keys())) + self.reply(data, msg.format(self.version, cmnds, data.command)) return + if command in self.aliases: + command = self.aliases[command] try: help_ = re.sub(r"\s\s+", " ", info[command].replace("\n", "")) self.reply(data, "\x0303{0}\x0F: ".format(command) + help_) @@ -113,7 +129,7 @@ class Notes(Command): INNER JOIN revisions ON entry_revision = rev_id WHERE entry_slug = ?""" try: - slug = self.slugify(data.args[1]) + slug = self._slugify(data.args[1]) except IndexError: self.reply(data, "Please specify an entry to read from.") return @@ -141,7 +157,7 @@ class Notes(Command): query3 = "INSERT INTO entries VALUES (?, ?, ?, ?)" query4 = "UPDATE entries SET entry_revision = ? WHERE entry_id = ?" try: - slug = self.slugify(data.args[1]) + slug = self._slugify(data.args[1]) except IndexError: self.reply(data, "Please specify an entry to edit.") return @@ -157,17 +173,17 @@ class Notes(Command): create = False except sqlite.OperationalError: id_, title, author = 1, data.args[1].decode("utf8"), data.host - self.create_db(conn) + self._create_db(conn) except TypeError: - id_ = self.get_next_entry(conn) + id_ = self._get_next_entry(conn) title, author = data.args[1].decode("utf8"), data.host permdb = self.config.irc["permissions"] if author != data.host and not permdb.is_admin(data): msg = "You must be an author or a bot admin to edit this entry." self.reply(data, msg) return - revid = self.get_next_revision(conn) - userid = self.get_user(conn, data.host) + revid = self._get_next_revision(conn) + userid = self._get_user(conn, data.host) now = datetime.utcnow().strftime("%b %d, %Y %H:%M:%S") conn.execute(query2, (revid, id_, userid, now, content)) if create: @@ -185,7 +201,7 @@ class Notes(Command): INNER JOIN users ON rev_user = user_id WHERE entry_slug = ?""" try: - slug = self.slugify(data.args[1]) + slug = self._slugify(data.args[1]) except IndexError: self.reply(data, "Please specify an entry to get info on.") return @@ -221,7 +237,7 @@ class Notes(Command): query2 = """UPDATE entries SET entry_slug = ?, entry_title = ? WHERE entry_id = ?""" try: - slug = self.slugify(data.args[1]) + slug = self._slugify(data.args[1]) except IndexError: self.reply(data, "Please specify an entry to rename.") return @@ -246,7 +262,7 @@ class Notes(Command): msg = "You must be an author or a bot admin to rename this entry." self.reply(data, msg) return - args = (self.slugify(newtitle), newtitle.decode("utf8"), id_) + args = (self._slugify(newtitle), newtitle.decode("utf8"), id_) conn.execute(query2, args) msg = "Entry \x0302{0}\x0F renamed to \x0302{1}\x0F." @@ -261,7 +277,7 @@ class Notes(Command): query2 = "DELETE FROM entries WHERE entry_id = ?" query3 = "DELETE FROM revisions WHERE rev_entry = ?" try: - slug = self.slugify(data.args[1]) + slug = self._slugify(data.args[1]) except IndexError: self.reply(data, "Please specify an entry to delete.") return @@ -283,11 +299,11 @@ class Notes(Command): self.reply(data, "Entry \x0302{0}\x0F deleted.".format(data.args[1])) - def slugify(self, name): + def _slugify(self, name): """Convert *name* into an identifier for storing in the database.""" return name.lower().replace("_", "").replace("-", "").decode("utf8") - def create_db(self, conn): + def _create_db(self, conn): """Initialize the notes database with its necessary tables.""" script = """ CREATE TABLE entries (entry_id, entry_slug, entry_title, @@ -298,19 +314,19 @@ class Notes(Command): """ conn.executescript(script) - def get_next_entry(self, conn): + def _get_next_entry(self, conn): """Get the next entry ID.""" query = "SELECT MAX(entry_id) FROM entries" later = conn.execute(query).fetchone()[0] return later + 1 if later else 1 - def get_next_revision(self, conn): + def _get_next_revision(self, conn): """Get the next revision ID.""" query = "SELECT MAX(rev_id) FROM revisions" later = conn.execute(query).fetchone()[0] return later + 1 if later else 1 - def get_user(self, conn, host): + def _get_user(self, conn, host): """Get the user ID corresponding to a hostname, or make one.""" query1 = "SELECT user_id FROM users WHERE user_host = ?" query2 = "SELECT MAX(user_id) FROM users" From 17a8a53bbb8abc8b7c39ceceb6d44d5e0e195987 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 2 Jan 2016 02:41:28 -0500 Subject: [PATCH 17/88] Fixed config script. --- earwigbot/config/script.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/earwigbot/config/script.py b/earwigbot/config/script.py index d4669f7..b9f5185 100644 --- a/earwigbot/config/script.py +++ b/earwigbot/config/script.py @@ -442,6 +442,10 @@ class ConfigScript(object): """Make a new config file based on the user's input.""" try: makedirs(path.dirname(self.config.path)) + except OSError as exc: + if exc.errno != 17: + raise + try: open(self.config.path, "w").close() chmod(self.config.path, stat.S_IRUSR|stat.S_IWUSR) except IOError: From 4eaf43d3055cb31df38e99e1e29383eb490d27bf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 2 Jan 2016 02:59:41 -0500 Subject: [PATCH 18/88] Improve argument parsing for !remind. --- CHANGELOG | 2 +- earwigbot/commands/remind.py | 208 +++++++++++++++++++------------------------ 2 files changed, 91 insertions(+), 119 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index db1028a..487237d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,7 +3,7 @@ v0.3 (unreleased): - Copyvio detector: improved sentence splitting algorithm. - IRC > !notes: Improved help and added aliases. - IRC > !remind: Added !remind all. Fixed multithreading efficiency issues. - Improved time detection. + Improved time detection and argument parsing. - IRC > !stalk: Allow regular expressions as page titles or usernames. - IRC: Improved detection of maximum IRC message length. - IRC: Improved some help commands. diff --git a/earwigbot/commands/remind.py b/earwigbot/commands/remind.py index d312248..42d0325 100644 --- a/earwigbot/commands/remind.py +++ b/earwigbot/commands/remind.py @@ -34,6 +34,16 @@ DISPLAY = ["display", "show", "list", "info", "details"] CANCEL = ["cancel", "stop", "delete", "del", "stop", "unremind", "forget", "disregard"] SNOOZE = ["snooze", "delay", "reset", "adjust", "modify", "change"] +SNOOZE_ONLY = ["snooze", "delay", "reset"] + +def _format_time(epoch): + """Format a UNIX timestamp nicely.""" + lctime = time.localtime(epoch) + if lctime.tm_year == time.localtime().tm_year: + return time.strftime("%b %d %H:%M:%S %Z", lctime) + else: + return time.strftime("%b %d, %Y %H:%M:%S %Z", lctime) + class Remind(Command): """Set a message to be repeated to you in a certain amount of time. See @@ -49,8 +59,10 @@ class Remind(Command): return "display" if command in CANCEL: return "cancel" - if command in SNOOZE: + if command in SNOOZE_ONLY: return "snooze" + if command in SNOOZE: # "adjust" == snoozing active reminders + return "adjust" @staticmethod def _parse_time(arg): @@ -88,7 +100,7 @@ class Remind(Command): raise ValueError(parsed) return parsed - def _really_get_reminder_by_id(self, user, rid): + def _get_reminder_by_id(self, user, rid): """Return the _Reminder object that corresponds to a particular ID. Raises IndexError on failure. @@ -98,17 +110,6 @@ class Remind(Command): raise IndexError(rid) return [robj for robj in self.reminders[user] if robj.id == rid][0] - def _get_reminder_by_id(self, user, rid, data): - """Return the _Reminder object that corresponds to a particular ID. - - Sends an error message to the user on failure. - """ - try: - return self._really_get_reminder_by_id(user, rid) - except IndexError: - msg = "Couldn't find a reminder for \x0302{0}\x0F with ID \x0303{1}\x0F." - self.reply(data, msg.format(user, rid)) - def _get_new_id(self): """Get a free ID for a new reminder.""" taken = set(robj.id for robj in chain(*self.reminders.values())) @@ -123,7 +124,7 @@ class Remind(Command): self.reminders[user] = [reminder] self._thread.add(reminder) - def _create_reminder(self, data, user): + def _create_reminder(self, data): """Create a new reminder for the given user.""" try: wait = self._parse_time(data.args[0]) @@ -143,8 +144,8 @@ class Remind(Command): msg = "Couldn't set a new reminder: no free IDs available." return self.reply(data, msg) - reminder = _Reminder(rid, user, wait, message, data, self) - self._start_reminder(reminder, user) + reminder = _Reminder(rid, data.host, wait, message, data, self) + self._start_reminder(reminder, data.host) msg = "Set reminder \x0303{0}\x0F ({1})." self.reply(data, msg.format(rid, reminder.end_time)) @@ -155,28 +156,26 @@ class Remind(Command): reminder.message) self.reply(data, msg) - def _cancel_reminder(self, data, user, reminder): + def _cancel_reminder(self, data, reminder): """Cancel a pending reminder.""" self._thread.remove(reminder) self.unstore_reminder(reminder.id) - self.reminders[user].remove(reminder) - if not self.reminders[user]: - del self.reminders[user] + self.reminders[data.host].remove(reminder) + if not self.reminders[data.host]: + del self.reminders[data.host] msg = "Reminder \x0303{0}\x0F canceled." self.reply(data, msg.format(reminder.id)) def _snooze_reminder(self, data, reminder, arg=None): """Snooze a reminder to be re-triggered after a period of time.""" verb = "snoozed" if reminder.expired else "adjusted" - duration = None - if arg: - try: - duration = self._parse_time(data.args[arg]) - except (IndexError, ValueError): - pass + try: + duration = self._parse_time(arg) if arg else None + except ValueError: + duration = None reminder.reset(duration) - end = time.strftime("%b %d %H:%M:%S %Z", time.localtime(reminder.end)) + end = _format_time(reminder.end) msg = "Reminder \x0303{0}\x0F {1} until {2}." self.reply(data, msg.format(reminder.id, verb, end)) @@ -197,21 +196,9 @@ class Remind(Command): reminder = _Reminder(rid, user, wait, message, data, self, end) self._start_reminder(reminder, user) - def _handle_command(self, command, data, user, reminder, arg=None): - """Handle a reminder-processing subcommand.""" - if command in DISPLAY: - self._display_reminder(data, reminder) - elif command in CANCEL: - self._cancel_reminder(data, user, reminder) - elif command in SNOOZE: - self._snooze_reminder(data, reminder, arg) - else: - msg = "Unknown action \x02{0}\x0F for reminder \x0303{1}\x0F." - self.reply(data, msg.format(command, reminder.id)) - - def _show_reminders(self, data, user): + def _show_reminders(self, data): """Show all of a user's current reminders.""" - if user not in self.reminders: + if data.host not in self.reminders: self.reply(data, "You have no reminders. Set one with " "\x0306!remind [time] [message]\x0F. See also: " "\x0306!remind help\x0F.") @@ -223,7 +210,7 @@ class Remind(Command): fmt = lambda robj: '\x0303{0}\x0F ("{1}" {2}, {3})'.format( robj.id, shorten(robj.message), dest(robj.data), robj.end_time) - rlist = ", ".join(fmt(robj) for robj in self.reminders[user]) + rlist = ", ".join(fmt(robj) for robj in self.reminders[data.host]) self.reply(data, "Your reminders: {0}.".format(rlist)) def _show_all_reminders(self, data): @@ -246,36 +233,6 @@ class Remind(Command): for rem in rems) self.reply(data, "All reminders: {0}.".format(", ".join(rlist))) - def _process_snooze_command(self, data, user): - """Process the !snooze command.""" - if not data.args: - if user not in self.reminders: - self.reply(data, "You have no reminders to snooze.") - elif len(self.reminders[user]) == 1: - self._snooze_reminder(data, self.reminders[user][0]) - else: - msg = "You have {0} reminders. Snooze which one?" - self.reply(data, msg.format(len(self.reminders[user]))) - return - reminder = self._get_reminder_by_id(user, data.args[0], data) - if reminder: - self._snooze_reminder(data, reminder, 1) - - def _process_cancel_command(self, data, user): - """Process the !cancel, !unremind, and !forget commands.""" - if not data.args: - if user not in self.reminders: - self.reply(data, "You have no reminders to cancel.") - elif len(self.reminders[user]) == 1: - self._cancel_reminder(data, user, self.reminders[user][0]) - else: - msg = "You have {0} reminders. Cancel which one?" - self.reply(data, msg.format(len(self.reminders[user]))) - return - reminder = self._get_reminder_by_id(user, data.args[0], data) - if reminder: - self._cancel_reminder(data, user, reminder) - def _show_help(self, data): """Reply to the user with help for all major subcommands.""" parts = [ @@ -284,59 +241,79 @@ class Remind(Command): ("Get info", "!remind [id]"), ("Cancel", "!remind cancel [id]"), ("Adjust", "!remind adjust [id] [time]"), - ("Restart", "!snooze [id]"), + ("Restart", "!snooze [id] [time]"), ("Admin", "!remind all") ] - extra = "In most cases, \x0306[id]\x0F can be omitted if you have only one reminder." + extra = "The \x0306[id]\x0F can be omitted if you have only one reminder." joined = " ".join("{0}: \x0306{1}\x0F.".format(k, v) for k, v in parts) self.reply(data, joined + " " + extra) - def _process(self, data): - """Main entry point.""" - if data.command == "snooze": - return self._process_snooze_command(data, data.host) - if data.command in ["cancel", "unremind", "forget"]: - return self._process_cancel_command(data, data.host) - if not data.args: - return self._show_reminders(data, data.host) - + def _dispatch_command(self, data, command, args): + """Handle a reminder-processing subcommand.""" user = data.host - if len(data.args) == 1: - command = data.args[0] - if command == "help": - return self._show_help(data) - if command == "all": - return self._show_all_reminders(data) - if command in DISPLAY + CANCEL + SNOOZE: - if user not in self.reminders: - msg = "You have no reminders to {0}." - self.reply(data, msg.format(self._normalize(command))) - elif len(self.reminders[user]) == 1: - reminder = self.reminders[user][0] - self._handle_command(command, data, user, reminder) - else: - msg = "You have {0} reminders. {1} which one?" - num = len(self.reminders[user]) - command = self._normalize(command).capitalize() - self.reply(data, msg.format(num, command)) + reminder = None + if args and args[0].startswith("R"): + try: + reminder = self._get_reminder_by_id(user, args[0]) + except IndexError: + msg = "Couldn't find a reminder for \x0302{0}\x0F with ID \x0303{1}\x0F." + self.reply(data, msg.format(user, args[0])) return - reminder = self._get_reminder_by_id(user, data.args[0], data) - if reminder: - self._display_reminder(data, reminder) + args.pop(0) + elif user not in self.reminders: + msg = "You have no reminders to {0}." + self.reply(data, msg.format(self._normalize(command))) + return + elif len(self.reminders[user]) == 1: + reminder = self.reminders[user][0] + elif command in SNOOZE_ONLY: # Select most recent expired reminder + rmds = [rmd for rmd in self.reminders[user] if rmd.expired] + rmds.sort(key=lambda rmd: rmd.end) + if len(rmds) > 0: + reminder = rmds[-1] + elif command in SNOOZE or command in CANCEL: # Select only active one + rmds = [rmd for rmd in self.reminders[user] if not rmd.expired] + if len(rmds) == 1: + reminder = rmds[0] + if not reminder: + msg = "You have {0} reminders. {1} which one?" + num = len(self.reminders[user]) + command = self._normalize(command).capitalize() + self.reply(data, msg.format(num, command)) return + if command in DISPLAY: + self._display_reminder(data, reminder) + elif command in CANCEL: + self._cancel_reminder(data, reminder) + elif command in SNOOZE: + self._snooze_reminder(data, reminder, args[0] if args else None) + else: + msg = "Unknown action \x02{0}\x0F for reminder \x0303{1}\x0F." + self.reply(data, msg.format(command, reminder.id)) + + def _process(self, data): + """Main entry point.""" + if data.command in SNOOZE + CANCEL: + return self._dispatch_command(data, data.command, data.args) + if not data.args: + return self._show_reminders(data) + + if data.args[0] == "help": + return self._show_help(data) + if data.args[0] == "all": + return self._show_all_reminders(data) if data.args[0] in DISPLAY + CANCEL + SNOOZE: - reminder = self._get_reminder_by_id(user, data.args[1], data) - if reminder: - self._handle_command(data.args[0], data, user, reminder, 2) - return + return self._dispatch_command(data, data.args[0], data.args[1:]) try: - reminder = self._really_get_reminder_by_id(user, data.args[0]) + self._get_reminder_by_id(data.host, data.args[0]) except IndexError: - return self._create_reminder(data, user) - - self._handle_command(data.args[1], data, user, reminder, 2) + return self._create_reminder(data) + if len(data.args) == 1: + return self._dispatch_command(data, "display", data.args) + self._dispatch_command( + data, data.args[1], [data.args[0]] + data.args[2:]) @property def lock(self): @@ -494,12 +471,7 @@ class _Reminder(object): """Return a string representing the end time of a reminder.""" if self._expired or self.end < time.time(): return "expired" - lctime = time.localtime(self.end) - if lctime.tm_year == time.localtime().tm_year: - ends = time.strftime("%b %d %H:%M:%S %Z", lctime) - else: - ends = time.strftime("%b %d, %Y %H:%M:%S %Z", lctime) - return "ends {0}".format(ends) + return "ends {0}".format(_format_time(self.end)) @property def expired(self): From 1fc63081eb7d1713304b78f9006978a006a5b2ed Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 2 Jan 2016 03:34:30 -0500 Subject: [PATCH 19/88] Update copyright year for 2016. --- LICENSE | 2 +- docs/conf.py | 2 +- earwigbot/__init__.py | 2 +- setup.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/LICENSE b/LICENSE index f1e78b1..7fb250d 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (C) 2009-2015 Ben Kurtovic +Copyright (C) 2009-2016 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/conf.py b/docs/conf.py index b843fe0..3e66cb9 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,7 +41,7 @@ master_doc = 'index' # General information about the project. project = u'EarwigBot' -copyright = u'2009-2015 Ben Kurtovic' +copyright = u'2009-2016 Ben Kurtovic' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/earwigbot/__init__.py b/earwigbot/__init__.py index 78b02b7..51ff5b8 100644 --- a/earwigbot/__init__.py +++ b/earwigbot/__init__.py @@ -30,7 +30,7 @@ details. This documentation is also available `online """ __author__ = "Ben Kurtovic" -__copyright__ = "Copyright (C) 2009-2015 Ben Kurtovic" +__copyright__ = "Copyright (C) 2009-2016 Ben Kurtovic" __license__ = "MIT License" __version__ = "0.3.dev0" __email__ = "ben.kurtovic@gmail.com" diff --git a/setup.py b/setup.py index b1c60b2..de035f1 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2016 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From 7dfc38bb5ca8fb5e0c2aee16b3c0cb5d4c5aa6c6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 2 Jan 2016 04:25:47 -0500 Subject: [PATCH 20/88] Reminders that expire while the bot is offline trigger after startup. --- CHANGELOG | 3 ++- earwigbot/commands/remind.py | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 487237d..5256493 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -3,7 +3,8 @@ v0.3 (unreleased): - Copyvio detector: improved sentence splitting algorithm. - IRC > !notes: Improved help and added aliases. - IRC > !remind: Added !remind all. Fixed multithreading efficiency issues. - Improved time detection and argument parsing. + Improved time detection and argument parsing. Newly expired reminders are now + triggered on bot startup. - IRC > !stalk: Allow regular expressions as page titles or usernames. - IRC: Improved detection of maximum IRC message length. - IRC: Improved some help commands. diff --git a/earwigbot/commands/remind.py b/earwigbot/commands/remind.py index 42d0325..65638fe 100644 --- a/earwigbot/commands/remind.py +++ b/earwigbot/commands/remind.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2016 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -188,10 +188,13 @@ class Remind(Command): return permdb.set_attr("command:remind", "data", "[]") + connect_wait = 30 for item in ast.literal_eval(database): rid, user, wait, end, message, data = item - if end < time.time(): - continue + if end < time.time() + connect_wait: + # Make reminders that have expired while the bot was offline + # trigger shortly after startup + end = time.time() + connect_wait data = Data.unserialize(data) reminder = _Reminder(rid, user, wait, message, data, self, end) self._start_reminder(reminder, user) From ea809a5395bedd43af8938e06bf8e6ab6772f900 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 2 Jan 2016 04:55:12 -0500 Subject: [PATCH 21/88] Remove extraneous colon in edit summary. --- earwigbot/config/script.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/earwigbot/config/script.py b/earwigbot/config/script.py index b9f5185..112829a 100644 --- a/earwigbot/config/script.py +++ b/earwigbot/config/script.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2016 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -270,7 +270,7 @@ class ConfigScript(object): password = self._ask_pass("Bot password:", encrypt=False) self.data["wiki"]["password"] = password self.data["wiki"]["userAgent"] = "EarwigBot/$1 (Python/$2; https://github.com/earwig/earwigbot)" - self.data["wiki"]["summary"] = "([[WP:BOT|Bot]]): $2" + self.data["wiki"]["summary"] = "([[WP:BOT|Bot]]) $2" self.data["wiki"]["useHTTPS"] = True self.data["wiki"]["assert"] = "user" self.data["wiki"]["maxlag"] = 10 From f75a40dcd6e4558ca9b4cf2b2934fc53054ea5bc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 2 Jan 2016 22:41:57 -0500 Subject: [PATCH 22/88] Add a !cidr command. --- earwigbot/commands/cidr.py | 137 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 earwigbot/commands/cidr.py diff --git a/earwigbot/commands/cidr.py b/earwigbot/commands/cidr.py new file mode 100644 index 0000000..2c30bfd --- /dev/null +++ b/earwigbot/commands/cidr.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2016 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from collections import namedtuple +import re +import socket +from socket import AF_INET, AF_INET6 + +from earwigbot.commands import Command + +_Range = namedtuple("_Range", ["family", "range", "size", "addresses"]) + +class CIDR(Command): + """Calculates the smallest CIDR range that encompasses a list of IP + addresses. Used to make range blocks.""" + name = "cidr" + commands = ["cidr", "range", "rangeblock", "rangecalc", "blockcalc", + "iprange", "cdir"] + + # https://www.mediawiki.org/wiki/Manual:$wgBlockCIDRLimit + LIMIT_IPv4 = 16 + LIMIT_IPv6 = 19 + + def process(self, data): + if not data.args: + msg = ("Specify a list of IP addresses to calculate a CIDR range " + "for. For example, \x0306!{0} 192.168.0.3 192.168.0.15 " + "192.168.1.4\x0F.") + self.reply(data, msg.format(data.command)) + return + + try: + ips = [self._parse_arg(arg) for arg in data.args] + except ValueError as exc: + msg = "Can't parse IP address \x0302{0}\x0F." + self.reply(data, msg.format(exc.message)) + return + + if any(ip[0] == AF_INET for ip in ips) and any(ip[0] == AF_INET6 for ip in ips): + msg = "Can't calculate a range for both IPv4 and IPv6 addresses." + self.reply(data, msg) + return + + cidr = self._calculate_range(ips[0][0], [ip[1] for ip in ips]) + descr = self._describe(cidr.family, cidr.size) + msg = "Smallest CIDR range is \x02{0}\x0F ({1}){2}" + self.reply(data, msg.format( + cidr.range, cidr.addresses, " – " + descr if descr else "")) + + def _parse_arg(self, arg): + """Converts an argument into an IP address.""" + if "[[" in arg and "]]" in arg: + regex = r"\[\[\s*(?:User(?:\stalk)?:)?(.*?)(?:\|.*?)?\s*\]\]" + match = re.search(regex, arg, re.I) + if not match: + raise ValueError(arg) + arg = match.group(1) + + if re.match(r"https?://", arg): + if "target=" in arg: + regex = r"target=(.*?)(?:&|$)" + elif "page=" in arg: + regex = r"page=(?:User(?:(?:\s|_)talk)?(?::|%3A))?(.*?)(?:&|$)" + elif re.search(r"Special(:|%3A)Contributions/", arg, re.I): + regex = r"Special(?:\:|%3A)Contributions/(.*?)(?:\&|\?|$)" + elif re.search(r"User((\s|_)talk)?(:|%3A)", arg, re.I): + regex = r"User(?:(?:\s|_)talk)?(?:\:|%3A)(.*?)(?:\&|\?|$)" + else: + raise ValueError(arg) + match = re.search(regex, arg, re.I) + if not match: + raise ValueError(arg) + arg = match.group(1) + + try: + return (AF_INET, socket.inet_pton(AF_INET, arg)) + except socket.error: + try: + return (AF_INET6, socket.inet_pton(AF_INET6, arg)) + except socket.error: + raise ValueError(arg) + + def _calculate_range(self, family, ips): + """Calculate the smallest CIDR range encompassing a list of IPs.""" + bin_ips = ["".join( + bin(ord(octet))[2:].zfill(8) for octet in ip) for ip in ips] + size = len(bin_ips[0]) + for i in xrange(len(bin_ips[0])): + if any(ip[i] == "0" for ip in bin_ips) and any( + ip[i] == "1" for ip in bin_ips): + size = i + break + + mask = bin_ips[0][:size].ljust(len(bin_ips[0]), "0") + return _Range( + family, + socket.inet_ntop(family, int(mask, 2)) + "/" + str(size), + size, + self._format_count(2 ** (len(bin_ips[0]) - size))) + + def _format_count(self, count): + """Nicely format a number of addresses affected by a range block.""" + if count > 2 ** 32: + base = "{0:.2E} addresses".format(count) + if count > 2 ** 96: + return base + ", {0:.2E} /64 subnets".format(count >> 64) + if count > 2 ** 63: + return base + ", {0} /64 subnets".format(count >> 64) + return base + if count == 1: + return "1 address" + return "{0:,} addresses".format(count) + + def _describe(self, family, size): + """Return an optional English description of a range.""" + if (family == AF_INET and size < self.LIMIT_IPv4) or ( + family == AF_INET6 and size < self.LIMIT_IPv6): + return "too large to block" From 623bf5a9a12bc950f1617fcfe844e7e062b16a3d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 2 Jan 2016 22:59:10 -0500 Subject: [PATCH 23/88] Fix construction of packed CIDR mask from binary representation. --- earwigbot/commands/cidr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/earwigbot/commands/cidr.py b/earwigbot/commands/cidr.py index 2c30bfd..6b832fa 100644 --- a/earwigbot/commands/cidr.py +++ b/earwigbot/commands/cidr.py @@ -111,9 +111,11 @@ class CIDR(Command): break mask = bin_ips[0][:size].ljust(len(bin_ips[0]), "0") + packed = "".join( + chr(int(mask[i:i + 8], 2)) for i in xrange(0, len(mask), 8)) return _Range( family, - socket.inet_ntop(family, int(mask, 2)) + "/" + str(size), + socket.inet_ntop(family, packed) + "/" + str(size), size, self._format_count(2 ** (len(bin_ips[0]) - size))) From 222b28f4f4d0a955adef80b8b9256fc4a85ca5d6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 3 Jan 2016 01:10:23 -0500 Subject: [PATCH 24/88] Show IP range; tweaks. --- earwigbot/commands/cidr.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/earwigbot/commands/cidr.py b/earwigbot/commands/cidr.py index 6b832fa..a1a2013 100644 --- a/earwigbot/commands/cidr.py +++ b/earwigbot/commands/cidr.py @@ -27,7 +27,8 @@ from socket import AF_INET, AF_INET6 from earwigbot.commands import Command -_Range = namedtuple("_Range", ["family", "range", "size", "addresses"]) +_Range = namedtuple("_Range", [ + "family", "range", "low", "high", "size", "addresses"]) class CIDR(Command): """Calculates the smallest CIDR range that encompasses a list of IP @@ -62,9 +63,10 @@ class CIDR(Command): cidr = self._calculate_range(ips[0][0], [ip[1] for ip in ips]) descr = self._describe(cidr.family, cidr.size) - msg = "Smallest CIDR range is \x02{0}\x0F ({1}){2}" + msg = "Smallest CIDR range is \x02{0}\x0F ({1}: {2} – {3}){4}." self.reply(data, msg.format( - cidr.range, cidr.addresses, " – " + descr if descr else "")) + cidr.range, cidr.addresses, cidr.low, cidr.high, + "; " + descr if descr else "")) def _parse_arg(self, arg): """Converts an argument into an IP address.""" @@ -110,23 +112,30 @@ class CIDR(Command): size = i break - mask = bin_ips[0][:size].ljust(len(bin_ips[0]), "0") - packed = "".join( - chr(int(mask[i:i + 8], 2)) for i in xrange(0, len(mask), 8)) + bin_low = bin_ips[0][:size].ljust(len(bin_ips[0]), "0") + bin_high = bin_ips[0][:size].ljust(len(bin_ips[0]), "1") + low = self._format_bin(family, bin_low) + high = self._format_bin(family, bin_high) + return _Range( - family, - socket.inet_ntop(family, packed) + "/" + str(size), - size, + family, low + "/" + str(size), low, high, size, self._format_count(2 ** (len(bin_ips[0]) - size))) - def _format_count(self, count): + @staticmethod + def _format_bin(family, binary): + """Convert an IP's binary representation to presentation format.""" + return socket.inet_ntop(family, "".join( + chr(int(binary[i:i + 8], 2)) for i in xrange(0, len(binary), 8))) + + @staticmethod + def _format_count(count): """Nicely format a number of addresses affected by a range block.""" if count > 2 ** 32: base = "{0:.2E} addresses".format(count) if count > 2 ** 96: return base + ", {0:.2E} /64 subnets".format(count >> 64) if count > 2 ** 63: - return base + ", {0} /64 subnets".format(count >> 64) + return base + ", {0:,} /64 subnets".format(count >> 64) return base if count == 1: return "1 address" From 0b4e6d98bce1068cf31268af9c750c886c81913a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 3 Jan 2016 01:32:32 -0500 Subject: [PATCH 25/88] Tweaks for !cidr. --- CHANGELOG | 1 + earwigbot/commands/cidr.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 5256493..0f5709d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ v0.3 (unreleased): - Copyvio detector: improved sentence splitting algorithm. +- IRC > !cidr: Added; new command for calculating range blocks. - IRC > !notes: Improved help and added aliases. - IRC > !remind: Added !remind all. Fixed multithreading efficiency issues. Improved time detection and argument parsing. Newly expired reminders are now diff --git a/earwigbot/commands/cidr.py b/earwigbot/commands/cidr.py index a1a2013..162e41c 100644 --- a/earwigbot/commands/cidr.py +++ b/earwigbot/commands/cidr.py @@ -45,7 +45,8 @@ class CIDR(Command): if not data.args: msg = ("Specify a list of IP addresses to calculate a CIDR range " "for. For example, \x0306!{0} 192.168.0.3 192.168.0.15 " - "192.168.1.4\x0F.") + "192.168.1.4\x0F or \x0306!{0} 2500:1:2:3:: " + "2500:1:2:3:dead:beef::\x0F.") self.reply(data, msg.format(data.command)) return @@ -63,10 +64,12 @@ class CIDR(Command): cidr = self._calculate_range(ips[0][0], [ip[1] for ip in ips]) descr = self._describe(cidr.family, cidr.size) - msg = "Smallest CIDR range is \x02{0}\x0F ({1}: {2} – {3}){4}." + + msg = ("Smallest CIDR range is \x02{0}\x0F, covering {1} from " + "\x03\x05{2}\x0F to \x03\x05{3}\x0F{4}.") self.reply(data, msg.format( cidr.range, cidr.addresses, cidr.low, cidr.high, - "; " + descr if descr else "")) + " (\x03\x04{0}\x0F)".format(descr) if descr else "")) def _parse_arg(self, arg): """Converts an argument into an IP address.""" @@ -130,15 +133,17 @@ class CIDR(Command): @staticmethod def _format_count(count): """Nicely format a number of addresses affected by a range block.""" + if count == 1: + return "1 address" if count > 2 ** 32: base = "{0:.2E} addresses".format(count) + if count == 2 ** 64: + return base + " (1 /64 subnet)" if count > 2 ** 96: - return base + ", {0:.2E} /64 subnets".format(count >> 64) + return base + " ({0:.2E} /64 subnets)".format(count >> 64) if count > 2 ** 63: - return base + ", {0:,} /64 subnets".format(count >> 64) + return base + " ({0:,} /64 subnets)".format(count >> 64) return base - if count == 1: - return "1 address" return "{0:,} addresses".format(count) def _describe(self, family, size): From a15149df821e5cb8d205469d6985bb2feda7d21d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 3 Jan 2016 01:34:15 -0500 Subject: [PATCH 26/88] Fix broken color codes. --- earwigbot/commands/cidr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/earwigbot/commands/cidr.py b/earwigbot/commands/cidr.py index 162e41c..9905171 100644 --- a/earwigbot/commands/cidr.py +++ b/earwigbot/commands/cidr.py @@ -66,10 +66,10 @@ class CIDR(Command): descr = self._describe(cidr.family, cidr.size) msg = ("Smallest CIDR range is \x02{0}\x0F, covering {1} from " - "\x03\x05{2}\x0F to \x03\x05{3}\x0F{4}.") + "\x0305{2}\x0F to \x0305{3}\x0F{4}.") self.reply(data, msg.format( cidr.range, cidr.addresses, cidr.low, cidr.high, - " (\x03\x04{0}\x0F)".format(descr) if descr else "")) + " (\x0304{0}\x0F)".format(descr) if descr else "")) def _parse_arg(self, arg): """Converts an argument into an IP address.""" From d30f515c706bf384ed8028edcf571b96f4308453 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 3 Jan 2016 03:38:29 -0500 Subject: [PATCH 27/88] Support ranges as arguments to !cidr. --- earwigbot/commands/cidr.py | 56 +++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/earwigbot/commands/cidr.py b/earwigbot/commands/cidr.py index 9905171..cd9d72b 100644 --- a/earwigbot/commands/cidr.py +++ b/earwigbot/commands/cidr.py @@ -27,6 +27,7 @@ from socket import AF_INET, AF_INET6 from earwigbot.commands import Command +_IP = namedtuple("_IP", ["family", "ip"]) _Range = namedtuple("_Range", [ "family", "range", "low", "high", "size", "addresses"]) @@ -51,18 +52,19 @@ class CIDR(Command): return try: - ips = [self._parse_arg(arg) for arg in data.args] + ips = [self._parse_ip(arg) for arg in data.args] except ValueError as exc: msg = "Can't parse IP address \x0302{0}\x0F." self.reply(data, msg.format(exc.message)) return - if any(ip[0] == AF_INET for ip in ips) and any(ip[0] == AF_INET6 for ip in ips): + if any(ip.family == AF_INET for ip in ips) and any( + ip.family == AF_INET6 for ip in ips): msg = "Can't calculate a range for both IPv4 and IPv6 addresses." self.reply(data, msg) return - cidr = self._calculate_range(ips[0][0], [ip[1] for ip in ips]) + cidr = self._calculate_range(ips[0].family, ips) descr = self._describe(cidr.family, cidr.size) msg = ("Smallest CIDR range is \x02{0}\x0F, covering {1} from " @@ -71,8 +73,33 @@ class CIDR(Command): cidr.range, cidr.addresses, cidr.low, cidr.high, " (\x0304{0}\x0F)".format(descr) if descr else "")) + def _parse_ip(self, arg): + """Converts an argument into an IP address object.""" + arg = self._parse_arg(arg) + oldarg = arg + size = None + if "/" in arg: + arg, size = arg.split("/", 1) + try: + size = int(size, 10) + except ValueError: + raise ValueError(oldarg) + if size < 0 or size > 128: + raise ValueError(oldarg) + + try: + ip = _IP(AF_INET, socket.inet_pton(AF_INET, arg), size) + except socket.error: + try: + return _IP(AF_INET6, socket.inet_pton(AF_INET6, arg), size) + except socket.error: + raise ValueError(oldarg) + if size > 32: + raise ValueError(oldarg) + return ip + def _parse_arg(self, arg): - """Converts an argument into an IP address.""" + """Converts an argument into an IP address string.""" if "[[" in arg and "]]" in arg: regex = r"\[\[\s*(?:User(?:\stalk)?:)?(.*?)(?:\|.*?)?\s*\]\]" match = re.search(regex, arg, re.I) @@ -95,23 +122,22 @@ class CIDR(Command): if not match: raise ValueError(arg) arg = match.group(1) - - try: - return (AF_INET, socket.inet_pton(AF_INET, arg)) - except socket.error: - try: - return (AF_INET6, socket.inet_pton(AF_INET6, arg)) - except socket.error: - raise ValueError(arg) + return arg def _calculate_range(self, family, ips): """Calculate the smallest CIDR range encompassing a list of IPs.""" bin_ips = ["".join( - bin(ord(octet))[2:].zfill(8) for octet in ip) for ip in ips] + bin(ord(octet))[2:].zfill(8) for octet in ip.ip) for ip in ips] + for i, ip in enumerate(ips): + if ip.size: + suffix = "X" * (len(bin_ips[i]) - ip.size) + bin_ips[i] = bin_ips[i][:ip.size] + suffix + size = len(bin_ips[0]) for i in xrange(len(bin_ips[0])): - if any(ip[i] == "0" for ip in bin_ips) and any( - ip[i] == "1" for ip in bin_ips): + if any(ip[i] == "X" for ip in bin_ips) or ( + any(ip[i] == "0" for ip in bin_ips) and + any(ip[i] == "1" for ip in bin_ips)): size = i break From 6516992ef9877ed7f52737c259887706df321435 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 3 Jan 2016 03:42:33 -0500 Subject: [PATCH 28/88] Add a missing argument to the _IP tuple. --- earwigbot/commands/cidr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/commands/cidr.py b/earwigbot/commands/cidr.py index cd9d72b..a4e285e 100644 --- a/earwigbot/commands/cidr.py +++ b/earwigbot/commands/cidr.py @@ -27,7 +27,7 @@ from socket import AF_INET, AF_INET6 from earwigbot.commands import Command -_IP = namedtuple("_IP", ["family", "ip"]) +_IP = namedtuple("_IP", ["family", "ip", "size"]) _Range = namedtuple("_Range", [ "family", "range", "low", "high", "size", "addresses"]) From 66ef969f82a881fe1c721a88439b67aaeda6bf93 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 3 Jan 2016 03:54:59 -0500 Subject: [PATCH 29/88] Fix ranges with size 0. --- earwigbot/commands/cidr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/commands/cidr.py b/earwigbot/commands/cidr.py index a4e285e..c424bc3 100644 --- a/earwigbot/commands/cidr.py +++ b/earwigbot/commands/cidr.py @@ -129,7 +129,7 @@ class CIDR(Command): bin_ips = ["".join( bin(ord(octet))[2:].zfill(8) for octet in ip.ip) for ip in ips] for i, ip in enumerate(ips): - if ip.size: + if ip.size is not None: suffix = "X" * (len(bin_ips[i]) - ip.size) bin_ips[i] = bin_ips[i][:ip.size] + suffix From f9b646b0b8d8d7ca3922ac2c6b8b6ddb6b6406eb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 4 Jan 2016 00:51:50 -0500 Subject: [PATCH 30/88] Improve config file command/task exclusion logic. --- CHANGELOG | 1 + docs/customizing.rst | 7 +++++++ earwigbot/managers.py | 20 +++++++++++++------- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 0f5709d..42d1a6e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ v0.3 (unreleased): - Copyvio detector: improved sentence splitting algorithm. +- Improved config file command/task exclusion logic. - IRC > !cidr: Added; new command for calculating range blocks. - IRC > !notes: Improved help and added aliases. - IRC > !remind: Added !remind all. Fixed multithreading efficiency issues. diff --git a/docs/customizing.rst b/docs/customizing.rst index ed5e6f3..56c48a0 100644 --- a/docs/customizing.rst +++ b/docs/customizing.rst @@ -174,6 +174,13 @@ The bot has a wide selection of built-in commands and plugins to act as sample code and/or to give ideas. Start with test_, and then check out chanops_ and afc_status_ for some more complicated scripts. +By default, the bot loads every built-in and custom command available. You can +disable *all* built-in commands with the config entry +:py:attr:`config.commands["disable"]` set to ``True``, or a subset of commands +by setting it to a list of command class names or module names. If using the +former method, you can specifically enable certain built-in commands with +:py:attr:`config.commands["enable"]` set to a list of command module names. + Custom bot tasks ---------------- diff --git a/earwigbot/managers.py b/earwigbot/managers.py index c980f7a..611645e 100644 --- a/earwigbot/managers.py +++ b/earwigbot/managers.py @@ -72,14 +72,21 @@ class _ResourceManager(object): for resource in self._resources.itervalues(): yield resource + def _is_disabled(self, name): + """Check whether a resource should be disabled.""" + conf = getattr(self.bot.config, self._resource_name) + disabled = conf.get("disable", []) + enabled = conf.get("enable", []) + return name not in enabled and (disabled is True or name in disabled) + def _load_resource(self, name, path, klass): """Instantiate a resource class and add it to the dictionary.""" res_type = self._resource_name[:-1] # e.g. "command" or "task" if hasattr(klass, "name"): - res_config = getattr(self.bot.config, self._resource_name) - if getattr(klass, "name") in res_config.get("disable", []): + classname = getattr(klass, "name") + if self._is_disabled(name) and self._is_disabled(classname): log = "Skipping disabled {0} {1}" - self.logger.debug(log.format(res_type, getattr(klass, "name"))) + self.logger.debug(log.format(res_type, classname)) return try: resource = klass(self.bot) # Create instance of resource @@ -119,8 +126,6 @@ class _ResourceManager(object): def _load_directory(self, dir): """Load all valid resources in a given directory.""" self.logger.debug("Loading directory {0}".format(dir)) - res_config = getattr(self.bot.config, self._resource_name) - disabled = res_config.get("disable", []) processed = [] for name in listdir(dir): if not name.endswith(".py") and not name.endswith(".pyc"): @@ -128,7 +133,7 @@ class _ResourceManager(object): if name.startswith("_") or name.startswith("."): continue modname = sub("\.pyc?$", "", name) # Remove extension - if modname in disabled: + if self._is_disabled(modname): log = "Skipping disabled module {0}".format(modname) self.logger.debug(log) processed.append(modname) @@ -162,7 +167,8 @@ class _ResourceManager(object): self._unload_resources() builtin_dir = path.join(path.dirname(__file__), name) plugins_dir = path.join(self.bot.config.root_dir, name) - if getattr(self.bot.config, name).get("disable") is True: + conf = getattr(self.bot.config, name) + if conf.get("disable") is True and not conf.get("enable"): log = "Skipping disabled builtins directory: {0}" self.logger.debug(log.format(builtin_dir)) else: From 7d34781013e4440c3ea7cac6230d8fdf58fe157f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 4 Jan 2016 01:01:23 -0500 Subject: [PATCH 31/88] Avoid redundant loading of disabled modules. --- earwigbot/managers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/earwigbot/managers.py b/earwigbot/managers.py index 611645e..dbd0bf2 100644 --- a/earwigbot/managers.py +++ b/earwigbot/managers.py @@ -133,14 +133,14 @@ class _ResourceManager(object): if name.startswith("_") or name.startswith("."): continue modname = sub("\.pyc?$", "", name) # Remove extension + if modname in processed: + continue + processed.append(modname) if self._is_disabled(modname): log = "Skipping disabled module {0}".format(modname) self.logger.debug(log) - processed.append(modname) continue - if modname not in processed: - self._load_module(modname, dir) - processed.append(modname) + self._load_module(modname, dir) def _unload_resources(self): """Unload all resources, calling their unload hooks in the process.""" From fc0bff62a59e8564afa489dad8ad39220873b989 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 4 Jan 2016 03:00:59 -0500 Subject: [PATCH 32/88] Try not to join channels before NickServ auth has completed. --- earwigbot/irc/data.py | 7 ++++--- earwigbot/irc/frontend.py | 30 +++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/earwigbot/irc/data.py b/earwigbot/irc/data.py index aaff822..06c8d5d 100644 --- a/earwigbot/irc/data.py +++ b/earwigbot/irc/data.py @@ -55,15 +55,16 @@ class Data(object): self._reply_nick = self._nick self._chan = self.line[2] - if self._msgtype == "PRIVMSG": + if self._msgtype in ["PRIVMSG", "NOTICE"]: if self.chan.lower() == self.my_nick: # This is a privmsg to us, so set 'chan' as the nick of the # sender instead of the 'channel', which is ourselves: self._chan = self._nick self._is_private = True self._msg = " ".join(self.line[3:])[1:] - self._parse_args() - self._parse_kwargs() + if self._msgtype == "PRIVMSG": + self._parse_args() + self._parse_kwargs() def _parse_args(self): """Parse command arguments from the message. diff --git a/earwigbot/irc/frontend.py b/earwigbot/irc/frontend.py index e92366f..a81ccc1 100644 --- a/earwigbot/irc/frontend.py +++ b/earwigbot/irc/frontend.py @@ -20,6 +20,8 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from time import sleep + from earwigbot.irc import IRCConnection, Data __all__ = ["Frontend"] @@ -36,6 +38,7 @@ class Frontend(IRCConnection): :py:mod:`earwigbot.commands` or the bot's custom command directory (explained in the :doc:`documentation `). """ + NICK_SERVICES = "NickServ" def __init__(self, bot): self.bot = bot @@ -43,6 +46,8 @@ class Frontend(IRCConnection): base = super(Frontend, self) base.__init__(cf["host"], cf["port"], cf["nick"], cf["ident"], cf["realname"], bot.logger.getChild("frontend")) + + self._auth_wait = False self._connect() def __repr__(self): @@ -56,6 +61,11 @@ class Frontend(IRCConnection): res = "" return res.format(self.nick, self.ident, self.host, self.port) + def _join_channels(self): + """Join all startup channels as specified by the config file.""" + for chan in self.bot.config.irc["frontend"]["channels"]: + self.join(chan) + def _process_message(self, line): """Process a single message from IRC.""" if line[1] == "JOIN": @@ -74,17 +84,27 @@ class Frontend(IRCConnection): self.bot.commands.call("msg_public", data) self.bot.commands.call("msg", data) + elif line[1] == "NOTICE": + data = Data(self.nick, line, msgtype="NOTICE") + if self._auth_wait and data.nick == self.NICK_SERVICES: + self._auth_wait = False + sleep(2) # Wait for hostname change to propagate + self._join_channels() + elif line[1] == "376": # On successful connection to the server # If we're supposed to auth to NickServ, do that: try: username = self.bot.config.irc["frontend"]["nickservUsername"] password = self.bot.config.irc["frontend"]["nickservPassword"] except KeyError: - pass + self._join_channels() else: msg = "IDENTIFY {0} {1}".format(username, password) - self.say("NickServ", msg, hidelog=True) + self.say(self.NICK_SERVICES, msg, hidelog=True) + self._auth_wait = True - # Join all of our startup channels: - for chan in self.bot.config.irc["frontend"]["channels"]: - self.join(chan) + elif line[1] == "401": # No such nickname + if self._auth_wait and line[3] == self.NICK_SERVICES: + # Services is down, or something...? + self._auth_wait = False + self._join_channels() From e73fb6dfa6c1deaebd1a052d15547cbbb7b7e88e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 4 Jan 2016 03:01:36 -0500 Subject: [PATCH 33/88] Add changelog entry for previous commit. --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index 42d1a6e..110536a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,7 @@ v0.3 (unreleased): Improved time detection and argument parsing. Newly expired reminders are now triggered on bot startup. - IRC > !stalk: Allow regular expressions as page titles or usernames. +- IRC: Try not to join channels before NickServ auth has completed. - IRC: Improved detection of maximum IRC message length. - IRC: Improved some help commands. From 9a7652cb9b61403088b540fab5778703e0b8fe7b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 4 Jan 2016 03:43:50 -0500 Subject: [PATCH 34/88] Bugfix for alternate PRIVMSG/NOTICE format. --- earwigbot/irc/data.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/earwigbot/irc/data.py b/earwigbot/irc/data.py index 06c8d5d..eaffed0 100644 --- a/earwigbot/irc/data.py +++ b/earwigbot/irc/data.py @@ -50,10 +50,15 @@ class Data(object): def _parse(self): """Parse a line from IRC into its components as instance attributes.""" - sender = re.findall(r":(.*?)!(.*?)@(.*?)\Z", self.line[0])[0] + self._chan = self.line[2] + try: + sender = re.findall(r":(.*?)!(.*?)@(.*?)\Z", self.line[0])[0] + except IndexError: + self._host = self.line[0][1:] + self._nick = self._ident = self._reply_nick = "*" + return self._nick, self._ident, self._host = sender self._reply_nick = self._nick - self._chan = self.line[2] if self._msgtype in ["PRIVMSG", "NOTICE"]: if self.chan.lower() == self.my_nick: From e4509b9ed52f054ee87871b4209440aa053d3bd0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 4 Jan 2016 04:50:37 -0500 Subject: [PATCH 35/88] Fix NickServ auth with protected nicks. --- earwigbot/irc/frontend.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/earwigbot/irc/frontend.py b/earwigbot/irc/frontend.py index a81ccc1..52a24b5 100644 --- a/earwigbot/irc/frontend.py +++ b/earwigbot/irc/frontend.py @@ -87,6 +87,8 @@ class Frontend(IRCConnection): elif line[1] == "NOTICE": data = Data(self.nick, line, msgtype="NOTICE") if self._auth_wait and data.nick == self.NICK_SERVICES: + if data.msg.startswith("This nickname is registered."): + continue self._auth_wait = False sleep(2) # Wait for hostname change to propagate self._join_channels() @@ -99,6 +101,7 @@ class Frontend(IRCConnection): except KeyError: self._join_channels() else: + self.logger.debug("Identifying with services") msg = "IDENTIFY {0} {1}".format(username, password) self.say(self.NICK_SERVICES, msg, hidelog=True) self._auth_wait = True From 805f27df31c2bca173d561f2a76c6787a0f422c2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 4 Jan 2016 04:59:34 -0500 Subject: [PATCH 36/88] Fix broken continue. --- earwigbot/irc/frontend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/irc/frontend.py b/earwigbot/irc/frontend.py index 52a24b5..5f78247 100644 --- a/earwigbot/irc/frontend.py +++ b/earwigbot/irc/frontend.py @@ -88,7 +88,7 @@ class Frontend(IRCConnection): data = Data(self.nick, line, msgtype="NOTICE") if self._auth_wait and data.nick == self.NICK_SERVICES: if data.msg.startswith("This nickname is registered."): - continue + return self._auth_wait = False sleep(2) # Wait for hostname change to propagate self._join_channels() From 4828cbad69e8a33ab22bd47672bf75cd8519fc3f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 13 Jan 2016 05:13:41 -0500 Subject: [PATCH 37/88] Catch possible ValueError when doing opener.open(). --- earwigbot/wiki/copyvios/workers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 691fa6f..c66e197 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -128,7 +128,7 @@ class _CopyvioWorker(object): url = source.url.encode("utf8") try: response = self._opener.open(url, timeout=source.timeout) - except (URLError, HTTPException, socket_error): + except (URLError, HTTPException, socket_error, ValueError): return None try: From b4b079ffd0ab20492213514ecfc12d2ab9349eb5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 13 Jan 2016 05:28:44 -0500 Subject: [PATCH 38/88] Update copyright year for 2016. --- earwigbot/irc/data.py | 2 +- earwigbot/irc/frontend.py | 2 +- earwigbot/wiki/copyvios/exclusions.py | 2 +- earwigbot/wiki/copyvios/workers.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/earwigbot/irc/data.py b/earwigbot/irc/data.py index eaffed0..43264f8 100644 --- a/earwigbot/irc/data.py +++ b/earwigbot/irc/data.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2016 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/irc/frontend.py b/earwigbot/irc/frontend.py index 5f78247..3a4154a 100644 --- a/earwigbot/irc/frontend.py +++ b/earwigbot/irc/frontend.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2016 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 9a4fb2d..1aa773e 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2016 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index c66e197..2872df0 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2016 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From 69cdb41d07469fd59fee37bf960cb310d5d76fdf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 15 Jan 2016 05:25:20 -0500 Subject: [PATCH 39/88] Adjust mirror hints to include direct links back to the article. --- earwigbot/wiki/copyvios/__init__.py | 6 +++--- earwigbot/wiki/copyvios/exclusions.py | 22 ++++++++++++---------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index ef40cd5..1d960a7 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2016 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -124,8 +124,8 @@ class CopyvioMixIn(object): if self._exclusions_db: self._exclusions_db.sync(self.site.name) exclude = lambda u: self._exclusions_db.check(self.site.name, u) - parser_args["mirror_hints"] = self._exclusions_db.get_mirror_hints( - self.site.name) + parser_args["mirror_hints"] = \ + self._exclusions_db.get_mirror_hints(self) else: exclude = None diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 1aa773e..3e86def 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -122,7 +122,7 @@ class ExclusionsDB(object): site = self._sitesdb.get_site("enwiki") else: site = self._sitesdb.get_site(sitename) - with sqlite.connect(self._dbfile) as conn, self._db_access_lock: + with self._db_access_lock, sqlite.connect(self._dbfile) as conn: urls = set() for (source,) in conn.execute(query1, (sitename,)): urls |= self._load_source(site, source) @@ -140,7 +140,7 @@ class ExclusionsDB(object): def _get_last_update(self, sitename): """Return the UNIX timestamp of the last time the db was updated.""" query = "SELECT update_time FROM updates WHERE update_sitename = ?" - with sqlite.connect(self._dbfile) as conn, self._db_access_lock: + with self._db_access_lock, sqlite.connect(self._dbfile) as conn: try: result = conn.execute(query, (sitename,)).fetchone() except sqlite.OperationalError: @@ -176,7 +176,7 @@ class ExclusionsDB(object): normalized = re.sub(r"^https?://(www\.)?", "", url.lower()) query = """SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ? OR exclusion_sitename = ?""" - with sqlite.connect(self._dbfile) as conn, self._db_access_lock: + with self._db_access_lock, sqlite.connect(self._dbfile) as conn: for (excl,) in conn.execute(query, (sitename, "all")): if excl.startswith("*."): parsed = urlparse(url.lower()) @@ -200,21 +200,23 @@ class ExclusionsDB(object): self._logger.debug(log) return False - def get_mirror_hints(self, sitename, try_mobile=True): + def get_mirror_hints(self, page, try_mobile=True): """Return a list of strings that indicate the existence of a mirror. The source parser checks for the presence of these strings inside of certain HTML tag attributes (``"href"`` and ``"src"``). """ - site = self._sitesdb.get_site(sitename) - base = site.domain + site._script_path - roots = [base] + site = page.site + path = urlparse(page.url).path + roots = [site.domain] scripts = ["index.php", "load.php", "api.php"] if try_mobile: fragments = re.search(r"^([\w]+)\.([\w]+).([\w]+)$", site.domain) if fragments: - mobile = "{0}.m.{1}.{2}".format(*fragments.groups()) - roots.append(mobile + site._script_path) + roots.append("{0}.m.{1}.{2}".format(*fragments.groups())) - return [root + "/" + script for root in roots for script in scripts] + general = [root + site._script_path + "/" + script + for root in roots for script in scripts] + specific = [root + path for root in roots] + return general + specific From cb86d6628789789942198cb4c734895103830fdb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 31 Jan 2016 00:54:54 -0600 Subject: [PATCH 40/88] Add a per-channel quiet config setting. --- CHANGELOG | 1 + earwigbot/managers.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 110536a..25f3b5b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,7 @@ v0.3 (unreleased): Improved time detection and argument parsing. Newly expired reminders are now triggered on bot startup. - IRC > !stalk: Allow regular expressions as page titles or usernames. +- IRC: Added a per-channel quiet config setting. - IRC: Try not to join channels before NickServ auth has completed. - IRC: Improved detection of maximum IRC message length. - IRC: Improved some help commands. diff --git a/earwigbot/managers.py b/earwigbot/managers.py index dbd0bf2..9c51adf 100644 --- a/earwigbot/managers.py +++ b/earwigbot/managers.py @@ -225,6 +225,8 @@ class CommandManager(_ResourceManager): .. note:: The special ``rc`` hook actually passes a :class:`~.RC` object. """ + if data.chan in self.bot.config.irc.get("quiet", []): + return for command in self: if hook in command.hooks and self._wrap_check(command, data): thread = Thread(target=self._wrap_process, From 425c51a9e9b108923d7d7068786fbd12991abf3d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 31 Jan 2016 15:50:02 -0600 Subject: [PATCH 41/88] Fix config. --- earwigbot/managers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/managers.py b/earwigbot/managers.py index 9c51adf..fcf3b74 100644 --- a/earwigbot/managers.py +++ b/earwigbot/managers.py @@ -225,7 +225,7 @@ class CommandManager(_ResourceManager): .. note:: The special ``rc`` hook actually passes a :class:`~.RC` object. """ - if data.chan in self.bot.config.irc.get("quiet", []): + if data.chan in self.bot.config.irc["frontend"].get("quiet", []): return for command in self: if hook in command.hooks and self._wrap_check(command, data): From 29d9c802747613f7449476c187b6c35337ce8767 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 31 Jan 2016 15:55:40 -0600 Subject: [PATCH 42/88] Fixup. --- earwigbot/managers.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/earwigbot/managers.py b/earwigbot/managers.py index fcf3b74..2ffb55c 100644 --- a/earwigbot/managers.py +++ b/earwigbot/managers.py @@ -225,8 +225,12 @@ class CommandManager(_ResourceManager): .. note:: The special ``rc`` hook actually passes a :class:`~.RC` object. """ - if data.chan in self.bot.config.irc["frontend"].get("quiet", []): - return + try: + if data.chan in self.bot.config.irc["frontend"]["quiet"]: + return + except KeyError: + pass + for command in self: if hook in command.hooks and self._wrap_check(command, data): thread = Thread(target=self._wrap_process, From 5479375ce222e68de84a71c3ff2e1b3a8dd5cbbf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 31 Jan 2016 22:05:04 -0600 Subject: [PATCH 43/88] Improve per-channel quieting. --- earwigbot/managers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/earwigbot/managers.py b/earwigbot/managers.py index 2ffb55c..1562bdd 100644 --- a/earwigbot/managers.py +++ b/earwigbot/managers.py @@ -226,7 +226,8 @@ class CommandManager(_ResourceManager): The special ``rc`` hook actually passes a :class:`~.RC` object. """ try: - if data.chan in self.bot.config.irc["frontend"]["quiet"]: + quiet = self.bot.config.irc["frontend"]["quiet"][data.chan] + if quiet is True or hook in quiet: return except KeyError: pass From 262490889b6795b1904b91503c162382da72292d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 13 Mar 2016 21:12:13 -0500 Subject: [PATCH 44/88] Allow lower-case R for reminder IDs. --- earwigbot/commands/remind.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/commands/remind.py b/earwigbot/commands/remind.py index 65638fe..1b3d524 100644 --- a/earwigbot/commands/remind.py +++ b/earwigbot/commands/remind.py @@ -255,7 +255,7 @@ class Remind(Command): """Handle a reminder-processing subcommand.""" user = data.host reminder = None - if args and args[0].startswith("R"): + if args and args[0].upper().startswith("R"): try: reminder = self._get_reminder_by_id(user, args[0]) except IndexError: From 977b587e5ec46374993b1090f36eed94d36cf6ef Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 2 Apr 2016 18:29:55 -0500 Subject: [PATCH 45/88] Add support for Bing Search --- earwigbot/wiki/copyvios/__init__.py | 30 ++++++++------ earwigbot/wiki/copyvios/search.py | 79 ++++++++++++++++++++++++++++++++++--- 2 files changed, 92 insertions(+), 17 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 1d960a7..b129941 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -23,15 +23,14 @@ from time import sleep, time from urllib2 import build_opener -from earwigbot import exceptions, importer +from earwigbot import exceptions from earwigbot.wiki.copyvios.markov import MarkovChain from earwigbot.wiki.copyvios.parsers import ArticleTextParser -from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine +from earwigbot.wiki.copyvios.search import ( + BingSearchEngine, YahooBOSSSearchEngine) from earwigbot.wiki.copyvios.workers import ( globalize, localize, CopyvioWorkspace) -oauth = importer.new("oauth2") - __all__ = ["CopyvioMixIn", "globalize", "localize"] class CopyvioMixIn(object): @@ -62,20 +61,29 @@ class CopyvioMixIn(object): unknown to us, and UnsupportedSearchEngineError if we are missing a required package or module, like oauth2 for "Yahoo! BOSS". """ + engines = { + "Bing": BingSearchEngine, + "Yahoo! BOSS": YahooBOSSSearchEngine + } + engine = self._search_config["engine"] + if engine not in engines: + raise exceptions.UnknownSearchEngineError(engine) + + klass = engines[engine] credentials = self._search_config["credentials"] + opener = build_opener() + opener.addheaders = self._addheaders - if engine == "Yahoo! BOSS": + for dep in klass.requirements(): try: - oauth.__version__ # Force-load the lazy module + __import__(dep).__package__ except ImportError: - e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2" + e = "Missing a required dependency ({}) for the {} engine" + e = e.format(dep, engine) raise exceptions.UnsupportedSearchEngineError(e) - opener = build_opener() - opener.addheaders = self._addheaders - return YahooBOSSSearchEngine(credentials, opener) - raise exceptions.UnknownSearchEngineError(engine) + return klass(credentials, opener) def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1, no_searches=False, no_links=False, short_circuit=True): diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index a049837..9df20f7 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -24,7 +24,7 @@ from gzip import GzipFile from json import loads from socket import error from StringIO import StringIO -from urllib import quote +from urllib import quote, urlencode from urllib2 import URLError from earwigbot import importer @@ -32,7 +32,7 @@ from earwigbot.exceptions import SearchQueryError oauth = importer.new("oauth2") -__all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"] +__all__ = ["BaseSearchEngine", "BingSearchEngine", "YahooBOSSSearchEngine"] class BaseSearchEngine(object): """Base class for a simple search engine interface.""" @@ -51,6 +51,11 @@ class BaseSearchEngine(object): """Return a nice string representation of the search engine.""" return "<{0}>".format(self.__class__.__name__) + @staticmethod + def requirements(): + """Return a list of packages required by this search engine.""" + return [] + def search(self, query): """Use this engine to search for *query*. @@ -59,6 +64,64 @@ class BaseSearchEngine(object): raise NotImplementedError() +class BingSearchEngine(BaseSearchEngine): + """A search engine interface with Bing Search (via Azure Marketplace).""" + name = "Bing" + + def __init__(self, cred, opener): + super(BingSearchEngine, self).__init__(cred, opener) + + key = self.cred["key"] + auth = (key + ":" + key).encode("base64").replace("\n", "") + self.opener.addheaders.append(("Authorization", "Basic " + auth)) + + def search(self, query): + """Do a Bing web search for *query*. + + Returns a list of URLs, no more than five, ranked by relevance + (as determined by Bing). + Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. + """ + service = "SearchWeb" if self.cred["type"] == "searchweb" else "Search" + url = "https://api.datamarket.azure.com/Bing/{0}/Web?".format(service) + params = { + "$format": "json", + "$top": "5", + "Query": "'\"" + query.replace('"', "").encode("utf8") + "\"'", + "Market": "'en-US'", + "Adult": "'Off'", + "Options": "'DisableLocationDetection'", + "WebFileType": "'HTM+HTML+PDF+TEXT+TXT'", + "WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'" + } + + try: + response = self.opener.open(url + urlencode(params)) + result = response.read() + except (URLError, error) as exc: + raise SearchQueryError("Bing Error: " + str(exc)) + + if response.headers.get("Content-Encoding") == "gzip": + stream = StringIO(result) + gzipper = GzipFile(fileobj=stream) + result = gzipper.read() + + if response.getcode() != 200: + err = "Bing Error: got response code '{0}':\n{1}'" + raise SearchQueryError(err.format(response.getcode(), result)) + try: + res = loads(result) + except ValueError: + err = "Bing Error: JSON could not be decoded" + raise SearchQueryError(err) + + try: + results = res["d"]["results"] + except KeyError: + return [] + return [result["Url"] for result in results] + + class YahooBOSSSearchEngine(BaseSearchEngine): """A search engine interface with Yahoo! BOSS.""" name = "Yahoo! BOSS" @@ -70,6 +133,10 @@ class YahooBOSSSearchEngine(BaseSearchEngine): args = ["=".join((enc(k), enc(v))) for k, v in params.iteritems()] return base + "?" + "&".join(args) + @staticmethod + def requirements(): + return ["oauth2"] + def search(self, query): """Do a Yahoo! BOSS web search for *query*. @@ -104,13 +171,13 @@ class YahooBOSSSearchEngine(BaseSearchEngine): result = gzipper.read() if response.getcode() != 200: - e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" - raise SearchQueryError(e.format(response.getcode(), result)) + err = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" + raise SearchQueryError(err.format(response.getcode(), result)) try: res = loads(result) except ValueError: - e = "Yahoo! BOSS Error: JSON could not be decoded" - raise SearchQueryError(e) + err = "Yahoo! BOSS Error: JSON could not be decoded" + raise SearchQueryError(err) try: results = res["bossresponse"]["web"]["results"] From 80890fb1916402d4f97d74f7e86bf506c2ada22e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 2 Apr 2016 19:04:22 -0500 Subject: [PATCH 46/88] WebFileType doesn't work --- earwigbot/wiki/copyvios/search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 9df20f7..8f16c66 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -91,7 +91,6 @@ class BingSearchEngine(BaseSearchEngine): "Market": "'en-US'", "Adult": "'Off'", "Options": "'DisableLocationDetection'", - "WebFileType": "'HTM+HTML+PDF+TEXT+TXT'", "WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'" } From 04ed5257c751c512d77a5725254065445daf1956 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 4 May 2016 12:00:55 -0500 Subject: [PATCH 47/88] Refactor search engines. --- earwigbot/wiki/copyvios/__init__.py | 12 +++--------- earwigbot/wiki/copyvios/search.py | 20 ++++++++++++-------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index b129941..d63f384 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -26,8 +26,7 @@ from urllib2 import build_opener from earwigbot import exceptions from earwigbot.wiki.copyvios.markov import MarkovChain from earwigbot.wiki.copyvios.parsers import ArticleTextParser -from earwigbot.wiki.copyvios.search import ( - BingSearchEngine, YahooBOSSSearchEngine) +from earwigbot.wiki.copyvios.search import SEARCH_ENGINES from earwigbot.wiki.copyvios.workers import ( globalize, localize, CopyvioWorkspace) @@ -61,16 +60,11 @@ class CopyvioMixIn(object): unknown to us, and UnsupportedSearchEngineError if we are missing a required package or module, like oauth2 for "Yahoo! BOSS". """ - engines = { - "Bing": BingSearchEngine, - "Yahoo! BOSS": YahooBOSSSearchEngine - } - engine = self._search_config["engine"] - if engine not in engines: + if engine not in SEARCH_ENGINES: raise exceptions.UnknownSearchEngineError(engine) - klass = engines[engine] + klass = SEARCH_ENGINES[engine] credentials = self._search_config["credentials"] opener = build_opener() opener.addheaders = self._addheaders diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 8f16c66..139a8eb 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -32,9 +32,9 @@ from earwigbot.exceptions import SearchQueryError oauth = importer.new("oauth2") -__all__ = ["BaseSearchEngine", "BingSearchEngine", "YahooBOSSSearchEngine"] +__all__ = ["BingSearchEngine", "YahooBOSSSearchEngine", "SEARCH_ENGINES"] -class BaseSearchEngine(object): +class _BaseSearchEngine(object): """Base class for a simple search engine interface.""" name = "Base" @@ -64,7 +64,7 @@ class BaseSearchEngine(object): raise NotImplementedError() -class BingSearchEngine(BaseSearchEngine): +class BingSearchEngine(_BaseSearchEngine): """A search engine interface with Bing Search (via Azure Marketplace).""" name = "Bing" @@ -78,8 +78,7 @@ class BingSearchEngine(BaseSearchEngine): def search(self, query): """Do a Bing web search for *query*. - Returns a list of URLs, no more than five, ranked by relevance - (as determined by Bing). + Returns a list of URLs ranked by relevance (as determined by Bing). Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. """ service = "SearchWeb" if self.cred["type"] == "searchweb" else "Search" @@ -121,7 +120,7 @@ class BingSearchEngine(BaseSearchEngine): return [result["Url"] for result in results] -class YahooBOSSSearchEngine(BaseSearchEngine): +class YahooBOSSSearchEngine(_BaseSearchEngine): """A search engine interface with Yahoo! BOSS.""" name = "Yahoo! BOSS" @@ -139,8 +138,7 @@ class YahooBOSSSearchEngine(BaseSearchEngine): def search(self, query): """Do a Yahoo! BOSS web search for *query*. - Returns a list of URLs, no more than five, ranked by relevance - (as determined by Yahoo). + Returns a list of URLs ranked by relevance (as determined by Yahoo). Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. """ key, secret = self.cred["key"], self.cred["secret"] @@ -183,3 +181,9 @@ class YahooBOSSSearchEngine(BaseSearchEngine): except KeyError: return [] return [result["url"] for result in results] + + +SEARCH_ENGINES = { + "Bing": BingSearchEngine, + "Yahoo! BOSS": YahooBOSSSearchEngine +} From a0d7eb62a2b1c8f7c1c269ecfdc1b3ee2a618b36 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 10 May 2016 02:05:32 -0500 Subject: [PATCH 48/88] Add Yandex search support. --- earwigbot/wiki/copyvios/search.py | 58 +++++++++++++++++++++++++++++++++++---- 1 file changed, 52 insertions(+), 6 deletions(-) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 139a8eb..ff97326 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2016 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -22,6 +22,7 @@ from gzip import GzipFile from json import loads +from re import sub as re_sub from socket import error from StringIO import StringIO from urllib import quote, urlencode @@ -30,9 +31,11 @@ from urllib2 import URLError from earwigbot import importer from earwigbot.exceptions import SearchQueryError +etree = importer.new("lxml.etree") oauth = importer.new("oauth2") -__all__ = ["BingSearchEngine", "YahooBOSSSearchEngine", "SEARCH_ENGINES"] +__all__ = ["BingSearchEngine", "YahooBOSSSearchEngine", "YandexSearchEngine", + "SEARCH_ENGINES"] class _BaseSearchEngine(object): """Base class for a simple search engine interface.""" @@ -42,6 +45,7 @@ class _BaseSearchEngine(object): """Store credentials (*cred*) and *opener* for searching later on.""" self.cred = cred self.opener = opener + self.count = 5 def __repr__(self): """Return the canonical string representation of the search engine.""" @@ -85,7 +89,7 @@ class BingSearchEngine(_BaseSearchEngine): url = "https://api.datamarket.azure.com/Bing/{0}/Web?".format(service) params = { "$format": "json", - "$top": "5", + "$top": str(self.count), "Query": "'\"" + query.replace('"', "").encode("utf8") + "\"'", "Market": "'en-US'", "Adult": "'Off'", @@ -150,8 +154,10 @@ class YahooBOSSSearchEngine(_BaseSearchEngine): "oauth_nonce": oauth.generate_nonce(), "oauth_timestamp": oauth.Request.make_timestamp(), "oauth_consumer_key": consumer.key, - "q": '"' + query.encode("utf8") + '"', "count": "5", - "type": "html,text,pdf", "format": "json", + "q": '"' + query.encode("utf8") + '"', + "count": str(self.count), + "type": "html,text,pdf", + "format": "json", } req = oauth.Request(method="GET", url=url, parameters=params) @@ -183,7 +189,47 @@ class YahooBOSSSearchEngine(_BaseSearchEngine): return [result["url"] for result in results] +class YandexSearchEngine(_BaseSearchEngine): + """A search engine interface with Yandex Search.""" + name = "Yandex" + + @staticmethod + def requirements(): + return ["lxml"] + + def search(self, query): + """Do a Yandex web search for *query*. + + Returns a list of URLs ranked by relevance (as determined by Yandex). + Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. + """ + url = "https://yandex.com/search/xml" + query = re_sub(r"[^a-zA-Z0-9]", "", query).encode("utf8") + params = { + "user": self.cred["user"], + "key": self.cred["key"], + "query": '"' + query + '"', + "l10n": "en", + "filter": "none", + "maxpassages": "1", + "groupby": "mode=flat.groups-on-page={0}".format(self.count) + } + + try: + response = self.opener.open(url, urlencode(params)) + result = response.read() + except (URLError, error) as exc: + raise SearchQueryError("Yandex Error: " + str(exc)) + + try: + data = etree.fromstring(result) + return [elem.text for elem in data.xpath(".//url")] + except etree.Error as exc: + raise SearchQueryError("Yandex XML parse error: " + str(exc)) + + SEARCH_ENGINES = { "Bing": BingSearchEngine, - "Yahoo! BOSS": YahooBOSSSearchEngine + "Yahoo! BOSS": YahooBOSSSearchEngine, + "Yandex": YandexSearchEngine } From 76b068c4dfa59ba4ee3a1b415d6ed6d32602ebbe Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 10 May 2016 04:17:54 -0500 Subject: [PATCH 49/88] Add Yandex proxy support. --- earwigbot/wiki/copyvios/search.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index ff97326..82d133f 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -203,7 +203,8 @@ class YandexSearchEngine(_BaseSearchEngine): Returns a list of URLs ranked by relevance (as determined by Yandex). Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. """ - url = "https://yandex.com/search/xml" + domain = self.cred.get("proxy", "yandex.com") + url = "https://{0}/search/xml".format(domain) query = re_sub(r"[^a-zA-Z0-9]", "", query).encode("utf8") params = { "user": self.cred["user"], From 7853bcc0f3091c6457d6c792dd14c0cd3402a972 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 10 May 2016 04:27:17 -0500 Subject: [PATCH 50/88] Fix dependency checking for search engines. --- earwigbot/wiki/copyvios/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index d63f384..45385e7 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -71,8 +71,8 @@ class CopyvioMixIn(object): for dep in klass.requirements(): try: - __import__(dep).__package__ - except ImportError: + __import__(dep).__name__ + except (ImportError, AttributeError): e = "Missing a required dependency ({}) for the {} engine" e = e.format(dep, engine) raise exceptions.UnsupportedSearchEngineError(e) From 98d0977c19ffa3c3f16a9ef7281c74f77ba5297c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 10 May 2016 04:47:45 -0500 Subject: [PATCH 51/88] Refactor search; cleanup; fixup. --- earwigbot/wiki/copyvios/search.py | 57 +++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 82d133f..e46abef 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -55,6 +55,26 @@ class _BaseSearchEngine(object): """Return a nice string representation of the search engine.""" return "<{0}>".format(self.__class__.__name__) + def _open(self, *args): + """Open a URL (like urlopen) and try to return its contents.""" + try: + response = self.opener.open(*args) + result = response.read() + except (URLError, error) as exc: + raise SearchQueryError("{0} Error: {1}".format(self.name, exc)) + + if response.headers.get("Content-Encoding") == "gzip": + stream = StringIO(result) + gzipper = GzipFile(fileobj=stream) + result = gzipper.read() + + code = response.getcode() + if code != 200: + err = "{0} Error: got response code '{1}':\n{2}'" + raise SearchQueryError(err.format(self.name, code, result)) + + return result + @staticmethod def requirements(): """Return a list of packages required by this search engine.""" @@ -97,20 +117,8 @@ class BingSearchEngine(_BaseSearchEngine): "WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'" } - try: - response = self.opener.open(url + urlencode(params)) - result = response.read() - except (URLError, error) as exc: - raise SearchQueryError("Bing Error: " + str(exc)) + result = self._open(url + urlencode(params)) - if response.headers.get("Content-Encoding") == "gzip": - stream = StringIO(result) - gzipper = GzipFile(fileobj=stream) - result = gzipper.read() - - if response.getcode() != 200: - err = "Bing Error: got response code '{0}':\n{1}'" - raise SearchQueryError(err.format(response.getcode(), result)) try: res = loads(result) except ValueError: @@ -162,20 +170,9 @@ class YahooBOSSSearchEngine(_BaseSearchEngine): req = oauth.Request(method="GET", url=url, parameters=params) req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, None) - try: - response = self.opener.open(self._build_url(url, req)) - result = response.read() - except (URLError, error) as exc: - raise SearchQueryError("Yahoo! BOSS Error: " + str(exc)) - if response.headers.get("Content-Encoding") == "gzip": - stream = StringIO(result) - gzipper = GzipFile(fileobj=stream) - result = gzipper.read() + result = self._open(self._build_url(url, req)) - if response.getcode() != 200: - err = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" - raise SearchQueryError(err.format(response.getcode(), result)) try: res = loads(result) except ValueError: @@ -204,8 +201,8 @@ class YandexSearchEngine(_BaseSearchEngine): Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. """ domain = self.cred.get("proxy", "yandex.com") - url = "https://{0}/search/xml".format(domain) - query = re_sub(r"[^a-zA-Z0-9]", "", query).encode("utf8") + url = "https://{0}/search/xml?".format(domain) + query = re_sub(r"[^a-zA-Z0-9 ]", "", query).encode("utf8") params = { "user": self.cred["user"], "key": self.cred["key"], @@ -216,11 +213,7 @@ class YandexSearchEngine(_BaseSearchEngine): "groupby": "mode=flat.groups-on-page={0}".format(self.count) } - try: - response = self.opener.open(url, urlencode(params)) - result = response.read() - except (URLError, error) as exc: - raise SearchQueryError("Yandex Error: " + str(exc)) + result = self._open(url + urlencode(params)) try: data = etree.fromstring(result) From a95356676bfb5baf20c97d026a4b0ec67f894b5a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 5 Jun 2016 23:01:55 -0400 Subject: [PATCH 52/88] Add GoogleSearchEngine. --- earwigbot/wiki/copyvios/search.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index e46abef..f6e21e3 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -34,8 +34,8 @@ from earwigbot.exceptions import SearchQueryError etree = importer.new("lxml.etree") oauth = importer.new("oauth2") -__all__ = ["BingSearchEngine", "YahooBOSSSearchEngine", "YandexSearchEngine", - "SEARCH_ENGINES"] +__all__ = ["BingSearchEngine", "GoogleSearchEngine", "YahooBOSSSearchEngine", + "YandexSearchEngine", "SEARCH_ENGINES"] class _BaseSearchEngine(object): """Base class for a simple search engine interface.""" @@ -132,6 +132,38 @@ class BingSearchEngine(_BaseSearchEngine): return [result["Url"] for result in results] +class GoogleSearchEngine(_BaseSearchEngine): + """A search engine interface with Google Search.""" + name = "Google" + + def search(self, query): + """Do a Google web search for *query*. + + Returns a list of URLs ranked by relevance (as determined by Google). + Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. + """ + domain = self.cred.get("proxy", "www.googleapis.com") + url = "https://{0}/customsearch/v1?".format(domain) + params = { + "cx": self.cred["id"], + "key": self.cred["key"], + "q": '"' + query.replace('"', "").encode("utf8") + '"', + "alt": "json", + "num": str(self.count), + "safe": "off" + "fields": "items(link)" + } + + result = self._open(url + urlencode(params)) + + try: + res = loads(result) + except ValueError: + err = "Google Error: JSON could not be decoded" + raise SearchQueryError(err) + return [item["link"] for item in res["items"]] + + class YahooBOSSSearchEngine(_BaseSearchEngine): """A search engine interface with Yahoo! BOSS.""" name = "Yahoo! BOSS" @@ -224,6 +256,7 @@ class YandexSearchEngine(_BaseSearchEngine): SEARCH_ENGINES = { "Bing": BingSearchEngine, + "Google": GoogleSearchEngine, "Yahoo! BOSS": YahooBOSSSearchEngine, "Yandex": YandexSearchEngine } From aba91c0f1c8043ea5a074003991e67196fb2c3d4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 5 Jun 2016 23:15:47 -0400 Subject: [PATCH 53/88] Missing comma. --- earwigbot/wiki/copyvios/search.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index f6e21e3..aea9614 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -150,7 +150,7 @@ class GoogleSearchEngine(_BaseSearchEngine): "q": '"' + query.replace('"', "").encode("utf8") + '"', "alt": "json", "num": str(self.count), - "safe": "off" + "safe": "off", "fields": "items(link)" } From fbb9ea7b03f908b8ac00d89ae85c76601c8d784b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 5 Jun 2016 23:39:06 -0400 Subject: [PATCH 54/88] Catch empty Google results properly. --- earwigbot/wiki/copyvios/search.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index aea9614..0824a5f 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -161,7 +161,11 @@ class GoogleSearchEngine(_BaseSearchEngine): except ValueError: err = "Google Error: JSON could not be decoded" raise SearchQueryError(err) - return [item["link"] for item in res["items"]] + + try: + return [item["link"] for item in res["items"]] + except KeyError: + return [] class YahooBOSSSearchEngine(_BaseSearchEngine): From 454e6bdb8cca11aa7ea7a38e94f72a711ba1dc26 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 9 Jun 2016 22:08:58 -0400 Subject: [PATCH 55/88] Bump dependencies. --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index de035f1..7a302c1 100644 --- a/setup.py +++ b/setup.py @@ -36,19 +36,19 @@ extra_deps = { "pycrypto >= 2.6.1", # Storing bot passwords + keys in the config file ], "sql": [ - "oursql >= 0.9.3.1", # Interfacing with MediaWiki databases + "oursql >= 0.9.3.2", # Interfacing with MediaWiki databases ], "copyvios": [ "beautifulsoup4 >= 4.4.1", # Parsing/scraping HTML "cchardet >= 1.0.0", # Encoding detection for BeautifulSoup - "lxml >= 3.4.4", # Faster parser for BeautifulSoup - "nltk >= 3.1", # Parsing sentences to split article content + "lxml >= 3.6.0", # Faster parser for BeautifulSoup + "nltk >= 3.2.1", # Parsing sentences to split article content "oauth2 >= 1.9.0", # Interfacing with Yahoo! BOSS Search "pdfminer >= 20140328", # Extracting text from PDF files - "tldextract >= 1.7.1", # Getting domains for the multithreaded workers + "tldextract >= 2.0.1", # Getting domains for the multithreaded workers ], "time": [ - "pytz >= 2015.7", # Handling timezones for the !time IRC command + "pytz >= 2016.4", # Handling timezones for the !time IRC command ], } From f2099df5d5efbc6a5ac6fe0621e3129e4d91a4f1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 27 Jul 2016 18:48:38 -0400 Subject: [PATCH 56/88] Minor refactor in HTML parser. --- earwigbot/wiki/copyvios/parsers.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 2f88356..21ccfed 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -232,6 +232,20 @@ class _HTMLParser(_BaseTextParser): "script", "style" ] + def _fail_if_mirror(self, soup): + """Look for obvious signs that the given soup is a wiki mirror. + + If so, raise ParserExclusionError, which is caught in the workers and + causes this source to excluded. + """ + if "mirror_hints" not in self._args: + return + + func = lambda attr: attr and any( + hint in attr for hint in self._args["mirror_hints"]) + if soup.find_all(href=func) or soup.find_all(src=func): + raise ParserExclusionError() + def parse(self): """Return the actual text contained within an HTML document. @@ -248,12 +262,7 @@ class _HTMLParser(_BaseTextParser): # no scrapable content (possibly JS or magic): return "" - if "mirror_hints" in self._args: - # Look for obvious signs that this is a mirror: - func = lambda attr: attr and any( - hint in attr for hint in self._args["mirror_hints"]) - if soup.find_all(href=func) or soup.find_all(src=func): - raise ParserExclusionError() + self._fail_if_mirror(soup) soup = soup.body is_comment = lambda text: isinstance(text, bs4.element.Comment) From a463c6d052566cfd12442ed0eb98b5fda90dca61 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 4 Aug 2016 00:59:52 -0400 Subject: [PATCH 57/88] Fix lazy loading bug where lxml.etree wasn't accessible to bs4. --- earwigbot/wiki/copyvios/search.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index 0824a5f..1056496 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -31,7 +31,7 @@ from urllib2 import URLError from earwigbot import importer from earwigbot.exceptions import SearchQueryError -etree = importer.new("lxml.etree") +lxml = importer.new("lxml") oauth = importer.new("oauth2") __all__ = ["BingSearchEngine", "GoogleSearchEngine", "YahooBOSSSearchEngine", @@ -228,7 +228,7 @@ class YandexSearchEngine(_BaseSearchEngine): @staticmethod def requirements(): - return ["lxml"] + return ["lxml.etree"] def search(self, query): """Do a Yandex web search for *query*. @@ -252,9 +252,9 @@ class YandexSearchEngine(_BaseSearchEngine): result = self._open(url + urlencode(params)) try: - data = etree.fromstring(result) + data = lxml.etree.fromstring(result) return [elem.text for elem in data.xpath(".//url")] - except etree.Error as exc: + except lxml.etree.Error as exc: raise SearchQueryError("Yandex XML parse error: " + str(exc)) From 2294de395f7dbd34d149f6c3275795d36a9289c0 Mon Sep 17 00:00:00 2001 From: Justin Kim Date: Fri, 30 Sep 2016 20:56:27 -0400 Subject: [PATCH 58/88] Clarify !lag output --- earwigbot/commands/lag.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/earwigbot/commands/lag.py b/earwigbot/commands/lag.py index 5e902f1..d363b31 100644 --- a/earwigbot/commands/lag.py +++ b/earwigbot/commands/lag.py @@ -37,7 +37,7 @@ class Lag(Command): msg = base.format(site.name, self.get_replag(site)) elif data.command == "maxlag": base = "\x0302{0}\x0F: {1}." - msg = base.format(site.name, self.get_maxlag(site).capitalize()) + msg = base.format(site.name, self.get_maxlag(site)) else: base = "\x0302{0}\x0F: {1}; {2}." msg = base.format(site.name, self.get_replag(site), @@ -45,10 +45,10 @@ class Lag(Command): self.reply(data, msg) def get_replag(self, site): - return "replag is {0}".format(self.time(site.get_replag())) + return "SQL replag is {0}".format(self.time(site.get_replag())) def get_maxlag(self, site): - return "database maxlag is {0}".format(self.time(site.get_maxlag())) + return "API maxlag is {0}".format(self.time(site.get_maxlag())) def get_site(self, data): if data.kwargs and "project" in data.kwargs and "lang" in data.kwargs: From 802fa3227d6f33d01366cdd5b5e146d0cb4e8036 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 6 Oct 2016 00:01:48 -0500 Subject: [PATCH 59/88] Make "!remind list" behavior more predictable. --- earwigbot/commands/remind.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/earwigbot/commands/remind.py b/earwigbot/commands/remind.py index 1b3d524..e2061d4 100644 --- a/earwigbot/commands/remind.py +++ b/earwigbot/commands/remind.py @@ -30,7 +30,7 @@ import time from earwigbot.commands import Command from earwigbot.irc import Data -DISPLAY = ["display", "show", "list", "info", "details"] +DISPLAY = ["display", "show", "info", "details"] CANCEL = ["cancel", "stop", "delete", "del", "stop", "unremind", "forget", "disregard"] SNOOZE = ["snooze", "delay", "reset", "adjust", "modify", "change"] @@ -304,6 +304,8 @@ class Remind(Command): if data.args[0] == "help": return self._show_help(data) + if data.args[0] == "list": + return self._show_reminders(data) if data.args[0] == "all": return self._show_all_reminders(data) if data.args[0] in DISPLAY + CANCEL + SNOOZE: From aed5a5954d1dd0abf33f2e56afd9abfff3b9300f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 30 Dec 2016 13:49:02 -0500 Subject: [PATCH 60/88] Fix SitesDB lookup for sites with overlapping URLs. --- earwigbot/wiki/sitesdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py index 3a53531..d17c965 100644 --- a/earwigbot/wiki/sitesdb.py +++ b/earwigbot/wiki/sitesdb.py @@ -239,7 +239,7 @@ class SitesDB(object): if site: return site[0] else: - url = "%{0}.{1}%".format(lang, project) + url = "//{0}.{1}.%".format(lang, project) site = conn.execute(query2, (url,)).fetchone() return site[0] if site else None except sqlite.OperationalError: From 39b63f11c17b0a730d41829e9d2166c8182a4dc0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Jan 2017 02:18:22 -0600 Subject: [PATCH 61/88] Add a bunch of things to the WikiProjectTagger task. --- CHANGELOG | 1 + earwigbot/tasks/wikiproject_tagger.py | 246 +++++++++++++++++++++++----------- 2 files changed, 168 insertions(+), 79 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 25f3b5b..7126c4b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,6 @@ v0.3 (unreleased): +- Added various new features to the WikiProjectTagger task. - Copyvio detector: improved sentence splitting algorithm. - Improved config file command/task exclusion logic. - IRC > !cidr: Added; new command for calculating range blocks. diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 6884d76..58b7794 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -30,9 +30,9 @@ class WikiProjectTagger(Task): """A task to tag talk pages with WikiProject banners. Usage: :command:`earwigbot -t wikiproject_tagger PATH - --banner BANNER (--category CAT | --file FILE) [--summary SUM] - [--append TEXT] [--autoassess] [--nocreate] [--recursive NUM] - [--site SITE]` + --banner BANNER (--category CAT | --file FILE) [--summary SUM] [--update] + [--append PARAMS] [--autoassess [CLASSES]] [--only-with BANNER] + [--nocreate] [--recursive [NUM]] [--genfixes] [--site SITE] [--dry-run]` .. glossary:: @@ -47,21 +47,33 @@ class WikiProjectTagger(Task): current directory) ``--summary SUM`` an optional edit summary to use; defaults to - ``"Adding WikiProject banner {{BANNER}}."`` - ``--append TEXT`` - optional text to append to the banner (after an autoassessment, if - any), like ``|importance=low`` - ``--autoassess`` + ``"Tagging with WikiProject banner {{BANNER}}."`` + ``--update`` + updates existing banners with new fields; should include at least one + of ``--append`` or ``--autoassess`` to be useful + ``--append PARAMS`` + optional comma-separated parameters to append to the banner (after an + auto-assessment, if any); use syntax ``importance=low,taskforce=yes`` + to add ``|importance=low|taskforce=yes`` + ``--autoassess [CLASSES]`` try to assess each article's class automatically based on the class of - other banners on the same page + other banners on the same page; if CLASSES is given as a + comma-separated list, only those classes will be auto-assessed + ``--only-with BANNER`` + only tag pages that already have the given banner ``--nocreate`` don't create new talk pages with just a banner if the page doesn't already exist ``--recursive NUM`` recursively go through subcategories up to a maximum depth of ``NUM``, or if ``NUM`` isn't provided, go infinitely (this can be dangerous) + ``--genfixes`` + apply general fixes to the page if already making other changes ``--site SITE`` - the ID of the site to tag pages on, defaulting to the... default site + the ID of the site to tag pages on, defaulting to the default site + ``--dry-run`` + don't actually make any edits, just log the pages that would have been + edited """ name = "wikiproject_tagger" @@ -94,7 +106,8 @@ class WikiProjectTagger(Task): r"((wikiproject|wp) ?)?bio(graph(y|ies))?$", ] - def _upperfirst(self, text): + @staticmethod + def _upperfirst(text): """Try to uppercase the first letter of a string.""" try: return text[0].upper() + text[1:] @@ -114,15 +127,28 @@ class WikiProjectTagger(Task): site = self.bot.wiki.get_site(name=kwargs.get("site")) banner = kwargs["banner"] - summary = kwargs.get("summary", "Adding WikiProject banner $3.") + summary = kwargs.get("summary", "Tagging with WikiProject banner $3.") + update = kwargs.get("update", False) append = kwargs.get("append") autoassess = kwargs.get("autoassess", False) + ow_banner = kwargs.get("only-with") nocreate = kwargs.get("nocreate", False) recursive = kwargs.get("recursive", 0) + genfixes = kwargs.get("genfixes", False) + dry_run = kwargs.get("dry-run", False) banner, names = self.get_names(site, banner) if not names: return - job = _Job(banner, names, summary, append, autoassess, nocreate) + if ow_banner: + _, only_with = self.get_names(site, ow_banner) + if not only_with: + return + else: + only_with = None + + job = _Job(banner=banner, names=names, summary=summary, update=update, + append=append, autoassess=autoassess, only_with=only_with, + nocreate=nocreate, genfixes=genfixes, dry_run=dry_run) try: self.run_job(kwargs, site, job, recursive) @@ -172,32 +198,29 @@ class WikiProjectTagger(Task): banner = banner.split(":", 1)[1] page = site.get_page(title) if page.exists != page.PAGE_EXISTS: - self.logger.error(u"Banner [[{0}]] does not exist".format(title)) + self.logger.error(u"Banner [[%s]] does not exist", title) return banner, None - if banner == title: - names = [self._upperfirst(banner)] - else: - names = [self._upperfirst(banner), self._upperfirst(title)] + names = {banner, title} result = site.api_query(action="query", list="backlinks", bllimit=500, blfilterredir="redirects", bltitle=title) for backlink in result["query"]["backlinks"]: - names.append(backlink["title"]) + names.add(backlink["title"]) if backlink["ns"] == constants.NS_TEMPLATE: - names.append(backlink["title"].split(":", 1)[1]) + names.add(backlink["title"].split(":", 1)[1]) - log = u"Found {0} aliases for banner [[{1}]]".format(len(names), title) - self.logger.debug(log) + log = u"Found %s aliases for banner [[%s]]" + self.logger.debug(log, len(names), title) return banner, names def process_category(self, page, job, recursive): """Try to tag all pages in the given category.""" - self.logger.info(u"Processing category: [[{0]]".format(page.title)) + self.logger.info(u"Processing category: [[%s]]", page.title) for member in page.get_members(): if member.namespace == constants.NS_CATEGORY: if recursive is True: self.process_category(member, job, True) - elif recursive: + elif recursive > 0: self.process_category(member, job, recursive - 1) else: self.process_page(member, job) @@ -214,65 +237,125 @@ class WikiProjectTagger(Task): try: code = page.parse() except exceptions.PageNotFoundError: - if job.nocreate: - log = u"Skipping nonexistent page: [[{0}]]".format(page.title) - self.logger.info(log) - else: - log = u"Tagging new page: [[{0}]]".format(page.title) - self.logger.info(log) - banner = "{{" + job.banner + job.append + "}}" - summary = job.summary.replace("$3", banner) - page.edit(banner, self.make_summary(summary)) + self.process_new_page(page, job) return except exceptions.InvalidPageError: - log = u"Skipping invalid page: [[{0}]]".format(page.title) - self.logger.error(log) + self.logger.error(u"Skipping invalid page: [[%s]]", page.title) return + is_update = False for template in code.ifilter_templates(recursive=True): - name = self._upperfirst(template.name.strip()) - if name in job.names: - log = u"Skipping page: [[{0}]]; already tagged with '{1}'" - self.logger.info(log.format(page.title, name)) + if template.name.matches(job.names): + if job.update: + banner = template + is_update = True + break + else: + log = u"Skipping page: [[%s]]; already tagged with '%s'" + self.logger.info(log, page.title, template.name) + return + + if job.only_with: + if not any(template.name.matches(job.only_with) + for template in code.ifilter_templates(recursive=True)): + log = u"Skipping page: [[%s]]; fails only-with condition" + self.logger.info(log, page.title) return - banner = self.make_banner(job, code) - shell = self.get_banner_shell(code) - if shell: - if shell.has_param(1): - shell.get(1).value.insert(0, banner + "\n") - else: - shell.add(1, banner) + if is_update: + old_banner = unicode(banner) + self.update_banner(banner, job, code) + if banner == old_banner: + log = u"Skipping page: [[%s]]; already tagged and no updates" + self.logger.info(log, page.title) + return + self.logger.info(u"Updating banner on page: [[%s]]", page.title) else: - self.add_banner(code, banner) - self.apply_genfixes(code) + self.logger.info(u"Tagging page: [[%s]]", page.title) + banner = self.make_banner(job, code) + shell = self.get_banner_shell(code) + if shell: + if shell.has_param(1): + shell.get(1).value.insert(0, banner + "\n") + else: + shell.add(1, banner) + else: + self.add_banner(code, banner) - self.logger.info(u"Tagging page: [[{0}]]".format(page.title)) - summary = job.summary.replace("$3", banner) - page.edit(unicode(code), self.make_summary(summary)) + if job.genfixes: + self.apply_genfixes(code) - def make_banner(self, job, code): + if job.dry_run: + self.logger.debug(u"DRY RUN: Banner: %s", banner) + else: + summary = job.summary.replace("$3", banner) + page.edit(unicode(code), self.make_summary(summary)) + + def process_new_page(self, page, job): + """Try to tag a *page* that doesn't exist yet using the *job*.""" + if job.nocreate or job.only_with: + log = u"Skipping nonexistent page: [[%s]]" + self.logger.info(log, page.title) + else: + self.logger.info(u"Tagging new page: [[%s]]", page.title) + banner = self.make_banner(job) + if job.dry_run: + self.logger.debug(u"DRY RUN: Banner: %s", banner) + else: + summary = job.summary.replace("$3", banner) + page.edit(banner, self.make_summary(summary)) + + def make_banner(self, job, code=None): """Return banner text to add based on a *job* and a page's *code*.""" - banner = "{{" + job.banner - if job.autoassess: - classes = {"fa": 0, "fl": 0, "ga": 0, "a": 0, "b": 0, "start": 0, - "stub": 0, "list": 0, "dab": 0, "c": 0, "redirect": 0, - "book": 0, "template": 0, "category": 0} - for template in code.ifilter_templates(recursive=True): - if template.has_param("class"): - value = unicode(template.get("class").value).lower() - if value in classes: - classes[value] += 1 - values = tuple(classes.values()) + banner = job.banner + if code is not None and job.autoassess is not False: + assessment = self.get_autoassessment(code, job.autoassess) + if assessment: + banner += "|class=" + assessment + if job.append: + banner += "|" + "|".join(job.append.split(",")) + return "{{" + banner + "}}" + + def update_banner(self, banner, job, code): + """Update an existing *banner* based on a *job* and a page's *code*.""" + if job.autoassess is not False: + if not banner.has("class") or not banner.get("class").value: + assessment = self.get_autoassessment(code, job.autoassess) + if assessment: + banner.add("class", assessment) + if job.append: + for param in job.append.split(","): + key, value = param.split("=", 1) + if not banner.has(key) or not banner.get(key).value: + banner.add(key, value) + + def get_autoassessment(self, code, only_classes=None): + if only_classes is None: + classnames = ["a", "b", "book", "c", "category", "dab", "fa", + "fl", "ga", "list", "redirect", "start", "stub", + "template"] + else: + classnames = [klass.strip().lower() + for klass in only_classes.split(",")] + + classes = {klass: 0 for klass in classnames} + for template in code.ifilter_templates(recursive=True): + if template.has("class"): + value = unicode(template.get("class").value).lower() + if value in classes: + classes[value] += 1 + + values = tuple(classes.values()) + if values: best = max(values) confidence = float(best) / sum(values) if confidence > 0.75: rank = tuple(classes.keys())[values.index(best)] if rank in ("fa", "fl", "ga"): - banner += "|class=" + rank.upper() + return rank.upper() else: - banner += "|class=" + self._upperfirst(rank) - return banner + job.append + "}}" + return self._upperfirst(rank) + return None def get_banner_shell(self, code): """Return the banner shell template within *code*, else ``None``.""" @@ -281,8 +364,8 @@ class WikiProjectTagger(Task): if not shells: shells = code.filter_templates(matches=regex, recursive=True) if shells: - log = u"Inserting banner into shell: {0}" - self.logger.debug(log.format(shells[0].name)) + log = u"Inserting banner into shell: %s" + self.logger.debug(log, shells[0].name) return shells[0] def add_banner(self, code, banner): @@ -292,15 +375,16 @@ class WikiProjectTagger(Task): name = template.name.lower().replace("_", " ") for regex in self.TOP_TEMPS: if re.match(regex, name): - self.logger.info("Skipping top template: {0}".format(name)) + self.logger.debug(u"Skipping top template: %s", name) index = i + 1 - self.logger.debug(u"Inserting banner at index {0}".format(index)) + self.logger.debug(u"Inserting banner at index %s", index) code.insert(index, banner) def apply_genfixes(self, code): """Apply general fixes to *code*, such as template substitution.""" - regex = r"^\{\{\s*((un|no)?s(i((gn|ng)(ed3?)?|g))?|usu|tilde|forgot to sign|without signature)" + regex = (r"^\{\{\s*((un|no)?s(i((gn|ng)(ed3?)?|g))?|usu|tilde|" + r"forgot to sign|without signature)") for template in code.ifilter_templates(matches=regex): self.logger.debug("Applying genfix: substitute {{unsigned}}") template.name = "subst:unsigned" @@ -313,13 +397,17 @@ class _Job(object): or not to autoassess and create new pages from scratch, and a counter of the number of pages edited. """ - def __init__(self, banner, names, summary, append, autoassess, nocreate): - self.banner = banner - self.names = names - self.summary = summary - self.append = append - self.autoassess = autoassess - self.nocreate = nocreate + def __init__(self, **kwargs): + self.banner = kwargs["banner"] + self.names = kwargs["names"] + self.summary = kwargs["summary"] + self.update = kwargs["update"] + self.append = kwargs["append"] + self.autoassess = kwargs["autoassess"] + self.only_with = kwargs["only_with"] + self.nocreate = kwargs["nocreate"] + self.genfixes = kwargs["genfixes"] + self.dry_run = kwargs["dry_run"] self.counter = 0 From 07b241ff7ac765d9534043f580598b7016ae96d9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Jan 2017 02:36:11 -0600 Subject: [PATCH 62/88] Fix WikiProjectTagger page saving bug. --- earwigbot/tasks/wikiproject_tagger.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 58b7794..7a425d4 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -270,6 +270,7 @@ class WikiProjectTagger(Task): self.logger.info(log, page.title) return self.logger.info(u"Updating banner on page: [[%s]]", page.title) + banner = banner.encode("utf8") else: self.logger.info(u"Tagging page: [[%s]]", page.title) banner = self.make_banner(job, code) @@ -285,11 +286,7 @@ class WikiProjectTagger(Task): if job.genfixes: self.apply_genfixes(code) - if job.dry_run: - self.logger.debug(u"DRY RUN: Banner: %s", banner) - else: - summary = job.summary.replace("$3", banner) - page.edit(unicode(code), self.make_summary(summary)) + self.save_page(page, job, unicode(code), banner) def process_new_page(self, page, job): """Try to tag a *page* that doesn't exist yet using the *job*.""" @@ -299,11 +296,15 @@ class WikiProjectTagger(Task): else: self.logger.info(u"Tagging new page: [[%s]]", page.title) banner = self.make_banner(job) - if job.dry_run: - self.logger.debug(u"DRY RUN: Banner: %s", banner) - else: - summary = job.summary.replace("$3", banner) - page.edit(banner, self.make_summary(summary)) + self.save_page(page, job, banner, banner) + + def save_page(self, page, job, text, banner): + """Save a page with an updated banner.""" + if job.dry_run: + self.logger.debug(u"[DRY RUN] Banner: %s", banner) + else: + summary = job.summary.replace("$3", banner) + page.edit(text, self.make_summary(summary)) def make_banner(self, job, code=None): """Return banner text to add based on a *job* and a page's *code*.""" From 178428cb974a23340897000dbd05c7a54c128694 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Jan 2017 02:44:07 -0600 Subject: [PATCH 63/88] Allow specifying task number in config. --- earwigbot/tasks/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/earwigbot/tasks/__init__.py b/earwigbot/tasks/__init__.py index 48f2d16..b502f88 100644 --- a/earwigbot/tasks/__init__.py +++ b/earwigbot/tasks/__init__.py @@ -54,6 +54,10 @@ class Task(object): self.bot = bot self.config = bot.config self.logger = bot.tasks.logger.getChild(self.name) + + number = self.config.tasks.get(self.name, {}).get("number") + if number is not None: + self.number = number self.setup() def __repr__(self): From e9cf7882a99aea6be8fc2979e75c755ab2f574ab Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Jan 2017 03:11:42 -0600 Subject: [PATCH 64/88] Slightly better detection of existing parameters. --- earwigbot/tasks/wikiproject_tagger.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 7a425d4..b394ee7 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -319,15 +319,18 @@ class WikiProjectTagger(Task): def update_banner(self, banner, job, code): """Update an existing *banner* based on a *job* and a page's *code*.""" + has = lambda key: (banner.has(key) and + banner.get(key).value.strip() not in ("", "?")) + if job.autoassess is not False: - if not banner.has("class") or not banner.get("class").value: + if not has("class"): assessment = self.get_autoassessment(code, job.autoassess) if assessment: banner.add("class", assessment) if job.append: for param in job.append.split(","): key, value = param.split("=", 1) - if not banner.has(key) or not banner.get(key).value: + if not has(key): banner.add(key, value) def get_autoassessment(self, code, only_classes=None): From e41a178c05e1822caddc0b7eb6159422f5b5f61a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Jan 2017 03:14:07 -0600 Subject: [PATCH 65/88] Mark WikiProjectTagger edits as minor. --- earwigbot/tasks/wikiproject_tagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index b394ee7..9884d64 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -304,7 +304,7 @@ class WikiProjectTagger(Task): self.logger.debug(u"[DRY RUN] Banner: %s", banner) else: summary = job.summary.replace("$3", banner) - page.edit(text, self.make_summary(summary)) + page.edit(text, self.make_summary(summary), minor=True) def make_banner(self, job, code=None): """Return banner text to add based on a *job* and a page's *code*.""" From cc574cec6d532f48f94275c0931311b3ca1f6c16 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Jan 2017 03:57:58 -0600 Subject: [PATCH 66/88] WikiProjectTagger: Fix auto-assess when no current assessments. --- earwigbot/tasks/wikiproject_tagger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 9884d64..edff8aa 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -350,8 +350,8 @@ class WikiProjectTagger(Task): classes[value] += 1 values = tuple(classes.values()) - if values: - best = max(values) + best = max(values) + if best: confidence = float(best) / sum(values) if confidence > 0.75: rank = tuple(classes.keys())[values.index(best)] From f684821aef4630c322eb70087406a2253c062d63 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 4 May 2017 15:22:16 -0500 Subject: [PATCH 67/88] banner tagger: can tag categories, add |auto param if appropriate, fixes --- earwigbot/tasks/wikiproject_tagger.py | 55 ++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index edff8aa..6fdee44 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -39,7 +39,7 @@ class WikiProjectTagger(Task): ``--banner BANNER`` the page name of the banner to add, without a namespace (unless the namespace is something other than ``Template``) so - ``--banner WikiProject Biography`` for ``{{WikiProject Biography}}`` + ``--banner "WikiProject Biography"`` for ``{{WikiProject Biography}}`` ``--category CAT`` or ``--file FILE`` determines which pages to tag; either all pages in a category (to include subcategories as well, see ``--recursive``) or all @@ -67,6 +67,9 @@ class WikiProjectTagger(Task): ``--recursive NUM`` recursively go through subcategories up to a maximum depth of ``NUM``, or if ``NUM`` isn't provided, go infinitely (this can be dangerous) + ``--tag-categories`` + also tag category pages; will autoassess with ``|class=category`` if + ``--autoassess`` is given ``--genfixes`` apply general fixes to the page if already making other changes ``--site SITE`` @@ -134,6 +137,7 @@ class WikiProjectTagger(Task): ow_banner = kwargs.get("only-with") nocreate = kwargs.get("nocreate", False) recursive = kwargs.get("recursive", 0) + tag_categories = kwargs.get("tag-categories", False) genfixes = kwargs.get("genfixes", False) dry_run = kwargs.get("dry-run", False) banner, names = self.get_names(site, banner) @@ -148,7 +152,8 @@ class WikiProjectTagger(Task): job = _Job(banner=banner, names=names, summary=summary, update=update, append=append, autoassess=autoassess, only_with=only_with, - nocreate=nocreate, genfixes=genfixes, dry_run=dry_run) + nocreate=nocreate, tag_categories=tag_categories, + genfixes=genfixes, dry_run=dry_run) try: self.run_job(kwargs, site, job, recursive) @@ -218,6 +223,8 @@ class WikiProjectTagger(Task): self.logger.info(u"Processing category: [[%s]]", page.title) for member in page.get_members(): if member.namespace == constants.NS_CATEGORY: + if job.tag_categories: + self.process_page(member, job, is_category=True) if recursive is True: self.process_category(member, job, True) elif recursive > 0: @@ -225,7 +232,7 @@ class WikiProjectTagger(Task): else: self.process_page(member, job) - def process_page(self, page, job): + def process_page(self, page, job, is_category=False): """Try to tag a specific *page* using the *job* description.""" if job.counter % 10 == 0: # Do a shutoff check every ten pages if self.shutoff_enabled(page.site): @@ -264,7 +271,7 @@ class WikiProjectTagger(Task): if is_update: old_banner = unicode(banner) - self.update_banner(banner, job, code) + self.update_banner(banner, job, code, is_category=is_category) if banner == old_banner: log = u"Skipping page: [[%s]]; already tagged and no updates" self.logger.info(log, page.title) @@ -273,7 +280,7 @@ class WikiProjectTagger(Task): banner = banner.encode("utf8") else: self.logger.info(u"Tagging page: [[%s]]", page.title) - banner = self.make_banner(job, code) + banner = self.make_banner(job, code, is_category=is_category) shell = self.get_banner_shell(code) if shell: if shell.has_param(1): @@ -306,34 +313,44 @@ class WikiProjectTagger(Task): summary = job.summary.replace("$3", banner) page.edit(text, self.make_summary(summary), minor=True) - def make_banner(self, job, code=None): + def make_banner(self, job, code=None, is_category=False): """Return banner text to add based on a *job* and a page's *code*.""" banner = job.banner if code is not None and job.autoassess is not False: - assessment = self.get_autoassessment(code, job.autoassess) - if assessment: - banner += "|class=" + assessment + assess, reason = self.get_autoassessment( + code, job.autoassess, is_category=is_category) + if assess: + banner += "|class=" + assess + if reason: + banner += "|auto=" + reason if job.append: banner += "|" + "|".join(job.append.split(",")) return "{{" + banner + "}}" - def update_banner(self, banner, job, code): + def update_banner(self, banner, job, code, is_category=False): """Update an existing *banner* based on a *job* and a page's *code*.""" has = lambda key: (banner.has(key) and banner.get(key).value.strip() not in ("", "?")) if job.autoassess is not False: if not has("class"): - assessment = self.get_autoassessment(code, job.autoassess) - if assessment: - banner.add("class", assessment) + assess, reason = self.get_autoassessment( + code, job.autoassess, is_category=is_category) + if assess: + banner.add("class", assess) + if reason: + banner.add("auto", reason) if job.append: for param in job.append.split(","): key, value = param.split("=", 1) if not has(key): banner.add(key, value) - def get_autoassessment(self, code, only_classes=None): + def get_autoassessment(self, code, only_classes=None, is_category=False): + """Get an autoassessment for a page. + + Return (assessed class as a string or None, assessment reason or None). + """ if only_classes is None: classnames = ["a", "b", "book", "c", "category", "dab", "fa", "fl", "ga", "list", "redirect", "start", "stub", @@ -342,6 +359,9 @@ class WikiProjectTagger(Task): classnames = [klass.strip().lower() for klass in only_classes.split(",")] + if is_category: + return ("category" if "category" in classnames else None), None + classes = {klass: 0 for klass in classnames} for template in code.ifilter_templates(recursive=True): if template.has("class"): @@ -356,10 +376,10 @@ class WikiProjectTagger(Task): if confidence > 0.75: rank = tuple(classes.keys())[values.index(best)] if rank in ("fa", "fl", "ga"): - return rank.upper() + return rank.upper(), "inherit" else: - return self._upperfirst(rank) - return None + return self._upperfirst(rank), "inherit" + return None, None def get_banner_shell(self, code): """Return the banner shell template within *code*, else ``None``.""" @@ -410,6 +430,7 @@ class _Job(object): self.autoassess = kwargs["autoassess"] self.only_with = kwargs["only_with"] self.nocreate = kwargs["nocreate"] + self.tag_categories = kwargs["tag_categories"] self.genfixes = kwargs["genfixes"] self.dry_run = kwargs["dry_run"] self.counter = 0 From d910c6a96eabad14a3da1b33e0d5ddb3af655466 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 4 May 2017 15:38:33 -0500 Subject: [PATCH 68/88] --tag-categories should examine the root as wel --- earwigbot/tasks/wikiproject_tagger.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 6fdee44..c93d1b7 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -221,14 +221,16 @@ class WikiProjectTagger(Task): def process_category(self, page, job, recursive): """Try to tag all pages in the given category.""" self.logger.info(u"Processing category: [[%s]]", page.title) + if job.tag_categories: + self.process_page(member, job, is_category=True) for member in page.get_members(): if member.namespace == constants.NS_CATEGORY: - if job.tag_categories: - self.process_page(member, job, is_category=True) if recursive is True: self.process_category(member, job, True) elif recursive > 0: self.process_category(member, job, recursive - 1) + elif job.tag_categories: + self.process_page(member, job, is_category=True) else: self.process_page(member, job) From ac5d741322af0c10d43ea5bdd4f2cc4bb5b7fd8c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 4 May 2017 15:39:46 -0500 Subject: [PATCH 69/88] whoops --- earwigbot/tasks/wikiproject_tagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index c93d1b7..31a16f7 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -222,7 +222,7 @@ class WikiProjectTagger(Task): """Try to tag all pages in the given category.""" self.logger.info(u"Processing category: [[%s]]", page.title) if job.tag_categories: - self.process_page(member, job, is_category=True) + self.process_page(page, job, is_category=True) for member in page.get_members(): if member.namespace == constants.NS_CATEGORY: if recursive is True: From 8765f12dd477cf9c539df7ace9d88d186779f836 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 4 May 2017 15:45:36 -0500 Subject: [PATCH 70/88] Remove project tagger's genfixes; they don't really work anyway --- earwigbot/tasks/wikiproject_tagger.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 31a16f7..59c5d4d 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -32,7 +32,7 @@ class WikiProjectTagger(Task): Usage: :command:`earwigbot -t wikiproject_tagger PATH --banner BANNER (--category CAT | --file FILE) [--summary SUM] [--update] [--append PARAMS] [--autoassess [CLASSES]] [--only-with BANNER] - [--nocreate] [--recursive [NUM]] [--genfixes] [--site SITE] [--dry-run]` + [--nocreate] [--recursive [NUM]] [--site SITE] [--dry-run]` .. glossary:: @@ -70,8 +70,6 @@ class WikiProjectTagger(Task): ``--tag-categories`` also tag category pages; will autoassess with ``|class=category`` if ``--autoassess`` is given - ``--genfixes`` - apply general fixes to the page if already making other changes ``--site SITE`` the ID of the site to tag pages on, defaulting to the default site ``--dry-run`` @@ -138,7 +136,6 @@ class WikiProjectTagger(Task): nocreate = kwargs.get("nocreate", False) recursive = kwargs.get("recursive", 0) tag_categories = kwargs.get("tag-categories", False) - genfixes = kwargs.get("genfixes", False) dry_run = kwargs.get("dry-run", False) banner, names = self.get_names(site, banner) if not names: @@ -153,7 +150,7 @@ class WikiProjectTagger(Task): job = _Job(banner=banner, names=names, summary=summary, update=update, append=append, autoassess=autoassess, only_with=only_with, nocreate=nocreate, tag_categories=tag_categories, - genfixes=genfixes, dry_run=dry_run) + dry_run=dry_run) try: self.run_job(kwargs, site, job, recursive) @@ -292,9 +289,6 @@ class WikiProjectTagger(Task): else: self.add_banner(code, banner) - if job.genfixes: - self.apply_genfixes(code) - self.save_page(page, job, unicode(code), banner) def process_new_page(self, page, job): @@ -407,14 +401,6 @@ class WikiProjectTagger(Task): self.logger.debug(u"Inserting banner at index %s", index) code.insert(index, banner) - def apply_genfixes(self, code): - """Apply general fixes to *code*, such as template substitution.""" - regex = (r"^\{\{\s*((un|no)?s(i((gn|ng)(ed3?)?|g))?|usu|tilde|" - r"forgot to sign|without signature)") - for template in code.ifilter_templates(matches=regex): - self.logger.debug("Applying genfix: substitute {{unsigned}}") - template.name = "subst:unsigned" - class _Job(object): """Represents a single wikiproject-tagging task. @@ -433,7 +419,6 @@ class _Job(object): self.only_with = kwargs["only_with"] self.nocreate = kwargs["nocreate"] self.tag_categories = kwargs["tag_categories"] - self.genfixes = kwargs["genfixes"] self.dry_run = kwargs["dry_run"] self.counter = 0 From be2fcff8e6a4dd8f6e6a12f37d3e32cf7de762dc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 4 May 2017 16:07:12 -0500 Subject: [PATCH 71/88] Smarter banner placement. --- earwigbot/tasks/wikiproject_tagger.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 59c5d4d..9476959 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -282,10 +282,7 @@ class WikiProjectTagger(Task): banner = self.make_banner(job, code, is_category=is_category) shell = self.get_banner_shell(code) if shell: - if shell.has_param(1): - shell.get(1).value.insert(0, banner + "\n") - else: - shell.add(1, banner) + self.add_banner_to_shell(shell, banner) else: self.add_banner(code, banner) @@ -388,6 +385,17 @@ class WikiProjectTagger(Task): self.logger.debug(log, shells[0].name) return shells[0] + def add_banner_to_shell(self, shell, banner): + """Add *banner* to *shell*.""" + if shell.has_param(1): + if unicode(shell.get(1).value).endswith("\n"): + banner += "\n" + else: + banner = "\n" + banner + shell.get(1).value.append(banner) + else: + shell.add(1, banner) + def add_banner(self, code, banner): """Add *banner* to *code*, following template order conventions.""" index = 0 @@ -395,11 +403,16 @@ class WikiProjectTagger(Task): name = template.name.lower().replace("_", " ") for regex in self.TOP_TEMPS: if re.match(regex, name): - self.logger.debug(u"Skipping top template: %s", name) + self.logger.debug(u"Adding after top template: %s", name) index = i + 1 + if "wikiproject" in name or name.startswith("wp"): + self.logger.debug(u"Adding after banner template: %s", name) + index = i + 1 self.logger.debug(u"Inserting banner at index %s", index) - code.insert(index, banner) + if index > 0 and not unicode(code.get(index - 1)).endswith("\n"): + banner = "\n" + banner + code.insert(index, banner + "\n") class _Job(object): From ce591e76e70df06b76e3217ffff0abc9a0aa964d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 4 May 2017 16:23:10 -0500 Subject: [PATCH 72/88] Fix banner placement logic for shell-less banners. --- earwigbot/tasks/wikiproject_tagger.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 9476959..6cbcc4d 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -103,8 +103,6 @@ class WikiProjectTagger(Task): r"failed ?ga$", r"old ?prod( ?full)?$", r"(old|previous) ?afd$", - - r"((wikiproject|wp) ?)?bio(graph(y|ies))?$", ] @staticmethod @@ -398,22 +396,26 @@ class WikiProjectTagger(Task): def add_banner(self, code, banner): """Add *banner* to *code*, following template order conventions.""" - index = 0 - for i, template in enumerate(code.ifilter_templates()): + predecessor = None + for template in code.ifilter_templates(): name = template.name.lower().replace("_", " ") for regex in self.TOP_TEMPS: if re.match(regex, name): - self.logger.debug(u"Adding after top template: %s", name) - index = i + 1 + self.logger.debug(u"Skipping past top template: %s", name) + predecessor = template + break if "wikiproject" in name or name.startswith("wp"): - self.logger.debug(u"Adding after banner template: %s", name) - index = i + 1 - - self.logger.debug(u"Inserting banner at index %s", index) - if index > 0 and not unicode(code.get(index - 1)).endswith("\n"): - banner = "\n" + banner - code.insert(index, banner + "\n") + self.logger.debug(u"Skipping past banner template: %s", name) + predecessor = template + if predecessor: + self.logger.debug("Inserting banner after template") + if not unicode(predecessor).endswith("\n"): + banner = "\n" + banner + code.insert_after(predecessor, banner + "\n") + else: + self.logger.debug("Inserting banner at beginning") + code.insert(0, banner + "\n") class _Job(object): """Represents a single wikiproject-tagging task. From 8a339c721bc498c47165e56c725aa8426d818865 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 4 May 2017 16:39:08 -0500 Subject: [PATCH 73/88] Fix spacing after banner. --- earwigbot/tasks/wikiproject_tagger.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 6cbcc4d..c366aec 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -397,7 +397,7 @@ class WikiProjectTagger(Task): def add_banner(self, code, banner): """Add *banner* to *code*, following template order conventions.""" predecessor = None - for template in code.ifilter_templates(): + for template in code.ifilter_templates(recursive=False): name = template.name.lower().replace("_", " ") for regex in self.TOP_TEMPS: if re.match(regex, name): @@ -412,7 +412,10 @@ class WikiProjectTagger(Task): self.logger.debug("Inserting banner after template") if not unicode(predecessor).endswith("\n"): banner = "\n" + banner - code.insert_after(predecessor, banner + "\n") + post = code.index(predecessor) + 1 + if len(code.nodes) > post and not code.get(post).startswith("\n"): + banner += "\n" + code.insert_after(predecessor, banner) else: self.logger.debug("Inserting banner at beginning") code.insert(0, banner + "\n") From 4018e1a82eef46a052b36748efd43e381d241199 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 4 May 2017 20:38:49 -0500 Subject: [PATCH 74/88] Slightly more efficient when a page is encountered multiple times. --- earwigbot/tasks/wikiproject_tagger.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index c366aec..b53dcc5 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -215,7 +215,13 @@ class WikiProjectTagger(Task): def process_category(self, page, job, recursive): """Try to tag all pages in the given category.""" + if page.title in job.processed_cats: + self.logger.debug(u"Skipping category, already processed: [[%s]]", + page.title) + return self.logger.info(u"Processing category: [[%s]]", page.title) + job.processed_cats.add(page.title) + if job.tag_categories: self.process_page(page, job, is_category=True) for member in page.get_members(): @@ -231,13 +237,20 @@ class WikiProjectTagger(Task): def process_page(self, page, job, is_category=False): """Try to tag a specific *page* using the *job* description.""" + if not page.is_talkpage: + page = page.toggle_talk() + + if page.title in job.processed_pages: + self.logger.debug(u"Skipping page, already processed: [[%s]]", + page.title) + return + job.processed_pages.add(page.title) + if job.counter % 10 == 0: # Do a shutoff check every ten pages if self.shutoff_enabled(page.site): raise _ShutoffEnabled() job.counter += 1 - if not page.is_talkpage: - page = page.toggle_talk() try: code = page.parse() except exceptions.PageNotFoundError: @@ -438,7 +451,10 @@ class _Job(object): self.nocreate = kwargs["nocreate"] self.tag_categories = kwargs["tag_categories"] self.dry_run = kwargs["dry_run"] + self.counter = 0 + self.processed_cats = set() + self.processed_pages = set() class _ShutoffEnabled(Exception): From 17df270bdf4b30b6ed3234df401e1bb011288f98 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 6 May 2017 12:21:26 -0500 Subject: [PATCH 75/88] fix category class tagging --- earwigbot/tasks/wikiproject_tagger.py | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index b53dcc5..97cbda3 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -68,8 +68,7 @@ class WikiProjectTagger(Task): recursively go through subcategories up to a maximum depth of ``NUM``, or if ``NUM`` isn't provided, go infinitely (this can be dangerous) ``--tag-categories`` - also tag category pages; will autoassess with ``|class=category`` if - ``--autoassess`` is given + also tag category pages ``--site SITE`` the ID of the site to tag pages on, defaulting to the default site ``--dry-run`` @@ -223,7 +222,7 @@ class WikiProjectTagger(Task): job.processed_cats.add(page.title) if job.tag_categories: - self.process_page(page, job, is_category=True) + self.process_page(page, job) for member in page.get_members(): if member.namespace == constants.NS_CATEGORY: if recursive is True: @@ -231,11 +230,11 @@ class WikiProjectTagger(Task): elif recursive > 0: self.process_category(member, job, recursive - 1) elif job.tag_categories: - self.process_page(member, job, is_category=True) + self.process_page(member, job) else: self.process_page(member, job) - def process_page(self, page, job, is_category=False): + def process_page(self, page, job): """Try to tag a specific *page* using the *job* description.""" if not page.is_talkpage: page = page.toggle_talk() @@ -281,7 +280,7 @@ class WikiProjectTagger(Task): if is_update: old_banner = unicode(banner) - self.update_banner(banner, job, code, is_category=is_category) + self.update_banner(banner, job, code) if banner == old_banner: log = u"Skipping page: [[%s]]; already tagged and no updates" self.logger.info(log, page.title) @@ -290,7 +289,7 @@ class WikiProjectTagger(Task): banner = banner.encode("utf8") else: self.logger.info(u"Tagging page: [[%s]]", page.title) - banner = self.make_banner(job, code, is_category=is_category) + banner = self.make_banner(job, code) shell = self.get_banner_shell(code) if shell: self.add_banner_to_shell(shell, banner) @@ -317,12 +316,11 @@ class WikiProjectTagger(Task): summary = job.summary.replace("$3", banner) page.edit(text, self.make_summary(summary), minor=True) - def make_banner(self, job, code=None, is_category=False): + def make_banner(self, job, code=None): """Return banner text to add based on a *job* and a page's *code*.""" banner = job.banner if code is not None and job.autoassess is not False: - assess, reason = self.get_autoassessment( - code, job.autoassess, is_category=is_category) + assess, reason = self.get_autoassessment(code, job.autoassess) if assess: banner += "|class=" + assess if reason: @@ -331,15 +329,14 @@ class WikiProjectTagger(Task): banner += "|" + "|".join(job.append.split(",")) return "{{" + banner + "}}" - def update_banner(self, banner, job, code, is_category=False): + def update_banner(self, banner, job, code): """Update an existing *banner* based on a *job* and a page's *code*.""" has = lambda key: (banner.has(key) and banner.get(key).value.strip() not in ("", "?")) if job.autoassess is not False: if not has("class"): - assess, reason = self.get_autoassessment( - code, job.autoassess, is_category=is_category) + assess, reason = self.get_autoassessment(code, job.autoassess) if assess: banner.add("class", assess) if reason: @@ -350,22 +347,18 @@ class WikiProjectTagger(Task): if not has(key): banner.add(key, value) - def get_autoassessment(self, code, only_classes=None, is_category=False): + def get_autoassessment(self, code, only_classes=None): """Get an autoassessment for a page. Return (assessed class as a string or None, assessment reason or None). """ if only_classes is None: - classnames = ["a", "b", "book", "c", "category", "dab", "fa", - "fl", "ga", "list", "redirect", "start", "stub", - "template"] + classnames = ["a", "b", "book", "c", "dab", "fa", "fl", "ga", + "list", "redirect", "start", "stub"] else: classnames = [klass.strip().lower() for klass in only_classes.split(",")] - if is_category: - return ("category" if "category" in classnames else None), None - classes = {klass: 0 for klass in classnames} for template in code.ifilter_templates(recursive=True): if template.has("class"): From 2f5d7063b34f7a1e3c891f48c381d1514b703bc0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 7 May 2017 00:42:09 -0500 Subject: [PATCH 76/88] Exclude userspace from tagging. --- earwigbot/tasks/wikiproject_tagger.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 97cbda3..adec23d 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -224,13 +224,16 @@ class WikiProjectTagger(Task): if job.tag_categories: self.process_page(page, job) for member in page.get_members(): - if member.namespace == constants.NS_CATEGORY: + nspace = member.namespace + if nspace == constants.NS_CATEGORY: if recursive is True: self.process_category(member, job, True) elif recursive > 0: self.process_category(member, job, recursive - 1) elif job.tag_categories: self.process_page(member, job) + elif nspace in (constants.NS_USER, constants.NS_USER_TALK): + continue else: self.process_page(member, job) From 0bc195080d624d825dfba6c15e76f0f536b0fbd8 Mon Sep 17 00:00:00 2001 From: EarwigBot Date: Sat, 2 Sep 2017 08:07:32 +0000 Subject: [PATCH 77/88] Update copyright year. --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 7fb250d..b567c42 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (C) 2009-2016 Ben Kurtovic +Copyright (C) 2009-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 7d7d1aceea60b4beadb746dd87a6f004c85c3f6d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 2 Sep 2017 03:22:45 -0500 Subject: [PATCH 78/88] Update dependencies, copyright year. --- earwigbot/__init__.py | 4 ++-- earwigbot/wiki/site.py | 2 +- earwigbot/wiki/sitesdb.py | 2 +- setup.py | 18 +++++++++--------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/earwigbot/__init__.py b/earwigbot/__init__.py index 51ff5b8..a37f45c 100644 --- a/earwigbot/__init__.py +++ b/earwigbot/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -30,7 +30,7 @@ details. This documentation is also available `online """ __author__ = "Ben Kurtovic" -__copyright__ = "Copyright (C) 2009-2016 Ben Kurtovic" +__copyright__ = "Copyright (C) 2009-2017 Ben Kurtovic" __license__ = "MIT License" __version__ = "0.3.dev0" __email__ = "ben.kurtovic@gmail.com" diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index e9fb38a..63c4fee 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py index d17c965..cddc24c 100644 --- a/earwigbot/wiki/sitesdb.py +++ b/earwigbot/wiki/sitesdb.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/setup.py b/setup.py index 7a302c1..71cddd8 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2016 Ben Kurtovic +# Copyright (C) 2009-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -26,8 +26,8 @@ from setuptools import setup, find_packages from earwigbot import __version__ required_deps = [ - "PyYAML >= 3.11", # Parsing config files - "mwparserfromhell >= 0.4.3", # Parsing wikicode for manipulation + "PyYAML >= 3.12", # Parsing config files + "mwparserfromhell >= 0.5", # Parsing wikicode for manipulation ] extra_deps = { @@ -39,16 +39,16 @@ extra_deps = { "oursql >= 0.9.3.2", # Interfacing with MediaWiki databases ], "copyvios": [ - "beautifulsoup4 >= 4.4.1", # Parsing/scraping HTML - "cchardet >= 1.0.0", # Encoding detection for BeautifulSoup - "lxml >= 3.6.0", # Faster parser for BeautifulSoup - "nltk >= 3.2.1", # Parsing sentences to split article content + "beautifulsoup4 >= 4.6.0", # Parsing/scraping HTML + "cchardet >= 2.1.1", # Encoding detection for BeautifulSoup + "lxml >= 3.8.0", # Faster parser for BeautifulSoup + "nltk >= 3.2.4", # Parsing sentences to split article content "oauth2 >= 1.9.0", # Interfacing with Yahoo! BOSS Search "pdfminer >= 20140328", # Extracting text from PDF files - "tldextract >= 2.0.1", # Getting domains for the multithreaded workers + "tldextract >= 2.1.0", # Getting domains for the multithreaded workers ], "time": [ - "pytz >= 2016.4", # Handling timezones for the !time IRC command + "pytz >= 2017.2", # Handling timezones for the !time IRC command ], } From 7b294d1dadb6845c56c4581a6f45f2aaf216342e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 11 Sep 2017 01:47:39 -0500 Subject: [PATCH 79/88] Fix --autoassess with no argument. --- earwigbot/tasks/wikiproject_tagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index adec23d..250608c 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -355,7 +355,7 @@ class WikiProjectTagger(Task): Return (assessed class as a string or None, assessment reason or None). """ - if only_classes is None: + if only_classes is None or only_classes is True: classnames = ["a", "b", "book", "c", "dab", "fa", "fl", "ga", "list", "redirect", "start", "stub"] else: From b48af48de44eb675f25a93ea164e55c69e827f5f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 28 Sep 2017 10:07:56 -0500 Subject: [PATCH 80/88] wikiproject_tagger: Fix regex for wikiproject banner shells. --- earwigbot/tasks/wikiproject_tagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/earwigbot/tasks/wikiproject_tagger.py b/earwigbot/tasks/wikiproject_tagger.py index 250608c..a23eb3a 100644 --- a/earwigbot/tasks/wikiproject_tagger.py +++ b/earwigbot/tasks/wikiproject_tagger.py @@ -383,7 +383,7 @@ class WikiProjectTagger(Task): def get_banner_shell(self, code): """Return the banner shell template within *code*, else ``None``.""" - regex = r"^\{\{\s*((WikiProject|WP)[ _]?Banner[ _]?S(hell)?|W(BPS|PBS|PB)|Shell)" + regex = r"^\{\{\s*((WikiProject|WP)[ _]?Banner[ _]?S(hell)?|W(BPS|PBS|PB)|Shell)\s*(\||\}\})" shells = code.filter_templates(matches=regex) if not shells: shells = code.filter_templates(matches=regex, recursive=True) From c68a5e6dfb818cc28a877cd94c696768fa57a973 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 15 Jan 2019 22:07:17 -0500 Subject: [PATCH 81/88] Fix Page.toggle_talk() on mainspace titles with colons. --- CHANGELOG | 1 + earwigbot/wiki/page.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 7126c4b..5d3af6d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -13,6 +13,7 @@ v0.3 (unreleased): - IRC: Try not to join channels before NickServ auth has completed. - IRC: Improved detection of maximum IRC message length. - IRC: Improved some help commands. +- Wiki: Fixed Page.toggle_talk() behavior on mainspace titles with colons. v0.2 (released November 8, 2015): diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py index 00a32ca..b05f9ff 100644 --- a/earwigbot/wiki/page.py +++ b/earwigbot/wiki/page.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -552,9 +552,9 @@ class Page(CopyvioMixIn): else: new_ns = self._namespace + 1 - try: + if self._namespace != 0: body = self._title.split(":", 1)[1] - except IndexError: + else: body = self._title new_prefix = self.site.namespace_id_to_name(new_ns) From 42a224f365263bee58f6602eaf82528791c20c16 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 18 Feb 2019 00:53:04 -0500 Subject: [PATCH 82/88] copyvios: Catch PDF parser exceptions more aggressively. --- earwigbot/wiki/copyvios/parsers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 21ccfed..2a4022f 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -34,8 +34,6 @@ nltk = importer.new("nltk") converter = importer.new("pdfminer.converter") pdfinterp = importer.new("pdfminer.pdfinterp") pdfpage = importer.new("pdfminer.pdfpage") -pdftypes = importer.new("pdfminer.pdftypes") -psparser = importer.new("pdfminer.psparser") __all__ = ["ArticleTextParser", "get_parser"] @@ -294,7 +292,7 @@ class _PDFParser(_BaseTextParser): pages = pdfpage.PDFPage.get_pages(StringIO(self.text)) for page in pages: interp.process_page(page) - except (pdftypes.PDFException, psparser.PSException, AssertionError): + except Exception: # pylint: disable=broad-except return output.getvalue().decode("utf8") finally: conv.close() From 466d3a42f137db9421e887186641689307822205 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 18 Feb 2019 21:26:03 -0500 Subject: [PATCH 83/88] copyvios: Minor refactor for cleaner stack frames. --- earwigbot/wiki/copyvios/parsers.py | 2 +- earwigbot/wiki/copyvios/workers.py | 44 ++++++++++++++++++++++---------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 2a4022f..6cf03ef 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2015 Ben Kurtovic +# Copyright (C) 2009-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 2872df0..f23bb5f 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2016 Ben Kurtovic +# Copyright (C) 2009-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -203,6 +203,28 @@ class _CopyvioWorker(object): self._queues.lock.release() return source + def _handle_once(self): + """Handle a single source from one of the queues.""" + try: + source = self._dequeue() + except Empty: + self._logger.debug("Exiting: queue timed out") + return False + except StopIteration: + self._logger.debug("Exiting: got stop signal") + return False + + try: + text = self._open_url(source) + except ParserExclusionError: + self._logger.debug("Source excluded by content parser") + source.skipped = source.excluded = True + source.finish_work() + else: + chain = MarkovChain(text) if text else None + source.workspace.compare(source, chain) + return True + def _run(self): """Main entry point for the worker thread. @@ -211,24 +233,8 @@ class _CopyvioWorker(object): now empty. """ while True: - try: - source = self._dequeue() - except Empty: - self._logger.debug("Exiting: queue timed out") - return - except StopIteration: - self._logger.debug("Exiting: got stop signal") - return - - try: - text = self._open_url(source) - except ParserExclusionError: - self._logger.debug("Source excluded by content parser") - source.skipped = source.excluded = True - source.finish_work() - else: - chain = MarkovChain(text) if text else None - source.workspace.compare(source, chain) + if not self._handle_once(): + break def start(self): """Start the copyvio worker in a new thread.""" From 8a945b07829659bcb347bd666035c1b6a9f71221 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 24 Feb 2019 00:05:23 -0500 Subject: [PATCH 84/88] Greatly simplify MarkovChain implementation --- earwigbot/wiki/copyvios/markov.py | 50 +++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index cf26317..9a4717d 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -20,7 +20,6 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from collections import defaultdict from re import sub, UNICODE __all__ = ["EMPTY", "EMPTY_INTERSECTION", "MarkovChain", @@ -34,23 +33,27 @@ class MarkovChain(object): def __init__(self, text): self.text = text - self.chain = defaultdict(lambda: defaultdict(lambda: 0)) - words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split() + self.chain = self._build() + self.size = self._get_size() + def _build(self): + """Build and return the Markov chain from the input text.""" padding = self.degree - 1 + words = sub(r"[^\w\s-]", "", self.text.lower(), flags=UNICODE).split() words = ([self.START] * padding) + words + ([self.END] * padding) - for i in range(len(words) - self.degree + 1): - last = i + self.degree - 1 - self.chain[tuple(words[i:last])][words[last]] += 1 - self.size = self._get_size() + chain = {} + + for i in xrange(len(words) - self.degree + 1): + phrase = tuple(words[i:i+self.degree]) + if phrase in chain: + chain[phrase] += 1 + else: + chain[phrase] = 1 + return chain def _get_size(self): """Return the size of the Markov chain: the total number of nodes.""" - size = 0 - for node in self.chain.itervalues(): - for hits in node.itervalues(): - size += hits - return size + return sum(self.chain.itervalues()) def __repr__(self): """Return the canonical string representation of the MarkovChain.""" @@ -65,20 +68,21 @@ class MarkovChainIntersection(MarkovChain): """Implements the intersection of two chains (i.e., their shared nodes).""" def __init__(self, mc1, mc2): - self.chain = defaultdict(lambda: defaultdict(lambda: 0)) self.mc1, self.mc2 = mc1, mc2 - c1 = mc1.chain - c2 = mc2.chain - - for word, nodes1 in c1.iteritems(): - if word in c2: - nodes2 = c2[word] - for node, count1 in nodes1.iteritems(): - if node in nodes2: - count2 = nodes2[node] - self.chain[word][node] = min(count1, count2) + self.chain = self._build() self.size = self._get_size() + def _build(self): + """Build and return the Markov chain from the input chains.""" + c1 = self.mc1.chain + c2 = self.mc2.chain + chain = {} + + for phrase in c1: + if phrase in c2: + chain[phrase] = min(c1[phrase], c2[phrase]) + return chain + def __repr__(self): """Return the canonical string representation of the intersection.""" res = "MarkovChainIntersection(mc1={0!r}, mc2={1!r})" From 7af4e905911b800a5e939770d0277b1eb555c812 Mon Sep 17 00:00:00 2001 From: Bhuvan Venkatesh Date: Fri, 8 Mar 2019 13:39:47 -0600 Subject: [PATCH 85/88] Fixed spelling error, fist -> first --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 896c1a2..4b9a56b 100644 --- a/README.rst +++ b/README.rst @@ -10,7 +10,7 @@ History ------- Development began, based on the `Pywikipedia framework`_, in early 2009. -Approval for its fist task, a `copyright violation detector`_, was carried out +Approval for its first task, a `copyright violation detector`_, was carried out in May, and the bot has been running consistently ever since (with the exception of Jan/Feb 2011). It currently handles `several ongoing tasks`_ ranging from statistics generation to category cleanup, and on-demand tasks From f1b93a465aa02022a67c3e17aa47071ab2c43ced Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 23 Mar 2019 21:44:02 -0400 Subject: [PATCH 86/88] Log warnings; use rvslots when fetching revision content --- CHANGELOG | 5 ++++- earwigbot/wiki/page.py | 12 +++++++----- earwigbot/wiki/site.py | 13 ++++++++++++- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 5d3af6d..e2b82d2 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,8 @@ v0.3 (unreleased): - Added various new features to the WikiProjectTagger task. -- Copyvio detector: improved sentence splitting algorithm. +- Copyvio detector: improved sentence splitting algorithm; many performance + improvements. - Improved config file command/task exclusion logic. - IRC > !cidr: Added; new command for calculating range blocks. - IRC > !notes: Improved help and added aliases. @@ -13,6 +14,8 @@ v0.3 (unreleased): - IRC: Try not to join channels before NickServ auth has completed. - IRC: Improved detection of maximum IRC message length. - IRC: Improved some help commands. +- Wiki: Added logging for warnings. +- Wiki: Updated some deprecated API calls. - Wiki: Fixed Page.toggle_talk() behavior on mainspace titles with colons. v0.2 (released November 8, 2015): diff --git a/earwigbot/wiki/page.py b/earwigbot/wiki/page.py index b05f9ff..302188f 100644 --- a/earwigbot/wiki/page.py +++ b/earwigbot/wiki/page.py @@ -264,13 +264,15 @@ class Page(CopyvioMixIn): if not result: query = self.site.api_query result = query(action="query", prop="revisions", rvlimit=1, - rvprop="content|timestamp", titles=self._title) + rvprop="content|timestamp", rvslots="main", + titles=self._title) res = result["query"]["pages"].values()[0] try: - self._content = res["revisions"][0]["*"] - self._basetimestamp = res["revisions"][0]["timestamp"] - except KeyError: + revision = res["revisions"][0] + self._content = revision["slots"]["main"]["*"] + self._basetimestamp = revision["timestamp"] + except (KeyError, IndexError): # This can only happen if the page was deleted since we last called # self._load_attributes(). In that case, some of our attributes are # outdated, so force another self._load_attributes(): @@ -582,7 +584,7 @@ class Page(CopyvioMixIn): query = self.site.api_query result = query(action="query", rvlimit=1, titles=self._title, prop="info|revisions", inprop="protection|url", - rvprop="content|timestamp") + rvprop="content|timestamp", rvslots="main") self._load_attributes(result=result) self._assert_existence() self._load_content(result=result) diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index 63c4fee..c93d940 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2017 Ben Kurtovic +# Copyright (C) 2009-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -298,6 +298,17 @@ class Site(object): e = "API query failed: JSON could not be decoded." raise exceptions.APIError(e) + if "warnings" in res: + for name, value in res["warnings"].items(): + try: + warning = value["warnings"] + except KeyError: + try: + warning = value["*"] + except KeyError: + warning = value + self._logger.warning("API warning: %s: %s", name, warning) + try: code = res["error"]["code"] info = res["error"]["info"] From 774628b34eb79418880e4b9aea4dc22a5023e990 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 24 Mar 2019 04:21:08 -0400 Subject: [PATCH 87/88] OAuth support; switch to requests; update login flow --- CHANGELOG | 8 +- earwigbot/wiki/copyvios/__init__.py | 3 +- earwigbot/wiki/site.py | 179 ++++++++++++++++++++---------------- earwigbot/wiki/sitesdb.py | 11 ++- setup.py | 2 + 5 files changed, 114 insertions(+), 89 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index e2b82d2..433571e 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,11 @@ v0.3 (unreleased): - Copyvio detector: improved sentence splitting algorithm; many performance improvements. - Improved config file command/task exclusion logic. +- Wiki: Added logging for warnings. +- Wiki: Added OAuth support. +- Wiki: Switched to requests from urllib2. +- Wiki: Updated some deprecated API calls. +- Wiki: Fixed Page.toggle_talk() behavior on mainspace titles with colons. - IRC > !cidr: Added; new command for calculating range blocks. - IRC > !notes: Improved help and added aliases. - IRC > !remind: Added !remind all. Fixed multithreading efficiency issues. @@ -14,9 +19,6 @@ v0.3 (unreleased): - IRC: Try not to join channels before NickServ auth has completed. - IRC: Improved detection of maximum IRC message length. - IRC: Improved some help commands. -- Wiki: Added logging for warnings. -- Wiki: Updated some deprecated API calls. -- Wiki: Fixed Page.toggle_talk() behavior on mainspace titles with colons. v0.2 (released November 8, 2015): diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 45385e7..3d625fe 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -46,7 +46,8 @@ class CopyvioMixIn(object): def __init__(self, site): self._search_config = site._search_config self._exclusions_db = self._search_config.get("exclusions_db") - self._addheaders = site._opener.addheaders + self._addheaders = [("User-Agent", site.user_agent), + ("Accept-Encoding", "gzip")] def _get_search_engine(self): """Return a function that can be called to do web searches. diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py index c93d940..2f314e6 100644 --- a/earwigbot/wiki/site.py +++ b/earwigbot/wiki/site.py @@ -21,17 +21,16 @@ # SOFTWARE. from cookielib import CookieJar -from gzip import GzipFile -from json import loads from logging import getLogger, NullHandler from os.path import expanduser -from StringIO import StringIO from threading import RLock from time import sleep, time from urllib import quote_plus, unquote_plus -from urllib2 import build_opener, HTTPCookieProcessor, URLError from urlparse import urlparse +import requests +from requests_oauthlib import OAuth1 + from earwigbot import exceptions, importer from earwigbot.wiki import constants from earwigbot.wiki.category import Category @@ -83,15 +82,16 @@ class Site(object): """ SERVICE_API = 1 SERVICE_SQL = 2 - SPECIAL_TOKENS = ["deleteglobalaccount", "patrol", "rollback", - "setglobalaccountstatus", "userrights", "watch"] + SPECIAL_TOKENS = ["createaccount", "deleteglobalaccount", "login", + "patrol", "rollback", "setglobalaccountstatus", + "userrights", "watch"] def __init__(self, name=None, project=None, lang=None, base_url=None, article_path=None, script_path=None, sql=None, - namespaces=None, login=(None, None), cookiejar=None, - user_agent=None, use_https=True, assert_edit=None, - maxlag=None, wait_between_queries=2, logger=None, - search_config=None): + namespaces=None, login=(None, None), oauth=None, + cookiejar=None, user_agent=None, use_https=True, + assert_edit=None, maxlag=None, wait_between_queries=2, + logger=None, search_config=None): """Constructor for new Site instances. This probably isn't necessary to call yourself unless you're building a @@ -100,14 +100,15 @@ class Site(object): based on your config file and the sites database. We accept a bunch of kwargs, but the only ones you really "need" are *base_url* and *script_path*; this is enough to figure out an API url. *login*, a - tuple of (username, password), is highly recommended. *cookiejar* will - be used to store cookies, and we'll use a normal CookieJar if none is - given. + tuple of (username, password), can be used to log in using the legacy + BotPasswords system; otherwise, a dict of OAuth info should be provided + to *oauth*. *cookiejar* will be used to store cookies, and we'll use a + normal CookieJar if none is given. First, we'll store the given arguments as attributes, then set up our - URL opener. We'll load any of the attributes that weren't given from - the API, and then log in if a username/pass was given and we aren't - already logged in. + requests session. We'll load any of the attributes that weren't given + from the API, and then log in if a username/pass was given and we + aren't already logged in. """ # Attributes referring to site information, filled in by an API query # if they are missing (and an API url can be determined): @@ -145,16 +146,22 @@ class Site(object): else: self._search_config = {} - # Set up cookiejar and URL opener for making API queries: + # Set up cookiejar and requests session for making API queries: if cookiejar is not None: self._cookiejar = cookiejar else: self._cookiejar = CookieJar() + self._last_cookiejar_save = None if not user_agent: user_agent = constants.USER_AGENT # Set default UA - self._opener = build_opener(HTTPCookieProcessor(self._cookiejar)) - self._opener.addheaders = [("User-Agent", user_agent), - ("Accept-Encoding", "gzip")] + self._oauth = oauth + self._session = requests.Session() + self._session.cookies = self._cookiejar + self._session.headers["User-Agent"] = user_agent + if oauth: + self._session.auth = OAuth1( + oauth["consumer_token"], oauth["consumer_secret"], + oauth["access_token"], oauth["access_secret"]) # Set up our internal logger: if logger: @@ -168,7 +175,7 @@ class Site(object): # If we have a name/pass and the API says we're not logged in, log in: self._login_info = name, password = login - if name and password: + if not self._oauth and name and password: logged_in_as = self._get_username_from_cookies() if not logged_in_as or name.replace("_", " ") != logged_in_as: self._login(login) @@ -180,17 +187,18 @@ class Site(object): "base_url={_base_url!r}", "article_path={_article_path!r}", "script_path={_script_path!r}", "use_https={_use_https!r}", "assert_edit={_assert_edit!r}", "maxlag={_maxlag!r}", - "sql={_sql_data!r}", "login={0}", "user_agent={2!r}", - "cookiejar={1})")) + "sql={_sql_data!r}", "login={0}", "oauth={1}", "user_agent={3!r}", + "cookiejar={2})")) name, password = self._login_info login = "({0}, {1})".format(repr(name), "hidden" if password else None) + oauth = "hidden" if self._oauth else None cookies = self._cookiejar.__class__.__name__ if hasattr(self._cookiejar, "filename"): cookies += "({0!r})".format(getattr(self._cookiejar, "filename")) else: cookies += "()" - agent = self._opener.addheaders[0][1] - return res.format(login, cookies, agent, **self.__dict__) + agent = self.user_agent + return res.format(login, oauth, cookies, agent, **self.__dict__) def __str__(self): """Return a nice string representation of the Site.""" @@ -238,24 +246,12 @@ class Site(object): self._logger.debug("{0} -> {1}".format(url, data)) try: - response = self._opener.open(url, data) - except URLError as error: - if hasattr(error, "reason"): - e = "API query failed: {0}.".format(error.reason) - elif hasattr(error, "code"): - e = "API query failed: got an error code of {0}." - e = e.format(error.code) - else: - e = "API query failed." - raise exceptions.APIError(e) + response = self._session.post(url, data=data) + response.raise_for_status() + except requests.RequestException as exc: + raise exceptions.APIError("API query failed: {0}".format(exc)) - result = response.read() - if response.headers.get("Content-Encoding") == "gzip": - stream = StringIO(result) - gzipper = GzipFile(fileobj=stream) - result = gzipper.read() - - return self._handle_api_result(result, params, tries, wait, ae_retry) + return self._handle_api_result(response, params, tries, wait, ae_retry) def _request_csrf_token(self, params): """If possible, add a request for a CSRF token to an API query.""" @@ -290,10 +286,10 @@ class Site(object): data = self._urlencode_utf8(params) return url, data - def _handle_api_result(self, result, params, tries, wait, ae_retry): - """Given the result of an API query, attempt to return useful data.""" + def _handle_api_result(self, response, params, tries, wait, ae_retry): + """Given an API query response, attempt to return useful data.""" try: - res = loads(result) # Try to parse as a JSON object + res = response.json() except ValueError: e = "API query failed: JSON could not be decoded." raise exceptions.APIError(e) @@ -309,6 +305,9 @@ class Site(object): warning = value self._logger.warning("API warning: %s: %s", name, warning) + if self._should_save_cookiejar(): + self._save_cookiejar() + try: code = res["error"]["code"] info = res["error"]["info"] @@ -328,18 +327,18 @@ class Site(object): sleep(wait) return self._api_query(params, tries, wait * 2, ae_retry=ae_retry) elif code in ["assertuserfailed", "assertbotfailed"]: # AssertEdit - if ae_retry and all(self._login_info): + if ae_retry and all(self._login_info) and not self._oauth: # Try to log in if we got logged out: self._login(self._login_info) if "token" in params: # Fetch a new one; this is invalid now params["token"] = self.get_token(params["action"]) return self._api_query(params, tries, wait, ae_retry=False) - if not all(self._login_info): + if not all(self._login_info) and not self._oauth: e = "Assertion failed, and no login info was provided." elif code == "assertbotfailed": e = "Bot assertion failed: we don't have a bot flag!" else: - e = "User assertion failed due to an unknown issue. Cookie problem?" + e = "User assertion failed due to an unknown issue. Cookie or OAuth problem?" raise exceptions.PermissionsError("AssertEdit: " + e) else: # Some unknown error occurred e = 'API query failed: got error "{0}"; server says: "{1}".' @@ -476,15 +475,30 @@ class Site(object): unnecessary API query. For the cookie-detection method, see _get_username_from_cookies()'s docs. - If our username isn't in cookies, then we're probably not logged in, or - something fishy is going on (like forced logout). In this case, do a - single API query for our username (or IP address) and return that. + If our username isn't in cookies, then we're either using OAuth or + we're probably not logged in, or something fishy is going on (like + forced logout). If we're using OAuth and a username was configured, + assume it is accurate and use it. Otherwise, do a single API query for + our username (or IP address) and return that. """ name = self._get_username_from_cookies() if name: return name + if self._oauth and self._login_info[0]: + return self._login_info[0] return self._get_username_from_api() + def _should_save_cookiejar(self): + """Return a bool indicating whether we should save the cookiejar. + + This is True if we haven't saved the cookiejar yet this session, or if + our last save was over a day ago. + """ + max_staleness = 60 * 60 * 24 # 1 day + if not self._last_cookiejar_save: + return True + return time() - self._last_cookiejar_save > max_staleness + def _save_cookiejar(self): """Try to save our cookiejar after doing a (normal) login or logout. @@ -498,8 +512,9 @@ class Site(object): getattr(self._cookiejar, "save")() except (NotImplementedError, ValueError): pass + self._last_cookiejar_save = time() - def _login(self, login, token=None, attempt=0): + def _login(self, login): """Safely login through the API. Normally, this is called by __init__() if a username and password have @@ -507,45 +522,43 @@ class Site(object): time it needs to be called is when those cookies expire, which is done automatically by api_query() if a query fails. - Recent versions of MediaWiki's API have fixed a CSRF vulnerability, - requiring login to be done in two separate requests. If the response - from from our initial request is "NeedToken", we'll do another one with - the token. If login is successful, we'll try to save our cookiejar. + *login* is a (username, password) tuple. Raises LoginError on login errors (duh), like bad passwords and nonexistent usernames. - - *login* is a (username, password) tuple. *token* is the token returned - from our first request, and *attempt* is to prevent getting stuck in a - loop if MediaWiki isn't acting right. """ self._tokens.clear() name, password = login - params = {"action": "login", "lgname": name, "lgpassword": password} - if token: - params["lgtoken"] = token + params = {"action": "query", "meta": "tokens", "type": "login"} + with self._api_lock: + result = self._api_query(params, no_assert=True) + try: + token = result["query"]["tokens"]["logintoken"] + except KeyError: + raise exceptions.LoginError("Couldn't get login token") + + params = {"action": "login", "lgname": name, "lgpassword": password, + "lgtoken": token} with self._api_lock: result = self._api_query(params, no_assert=True) res = result["login"]["result"] if res == "Success": + self._tokens.clear() self._save_cookiejar() - elif res == "NeedToken" and attempt == 0: - token = result["login"]["token"] - return self._login(login, token, attempt=1) + return + if res == "Illegal": + e = "The provided username is illegal." + elif res == "NotExists": + e = "The provided username does not exist." + elif res == "EmptyPass": + e = "No password was given." + elif res == "WrongPass" or res == "WrongPluginPass": + e = "The given password is incorrect." else: - if res == "Illegal": - e = "The provided username is illegal." - elif res == "NotExists": - e = "The provided username does not exist." - elif res == "EmptyPass": - e = "No password was given." - elif res == "WrongPass" or res == "WrongPluginPass": - e = "The given password is incorrect." - else: - e = "Couldn't login; server says '{0}'.".format(res) - raise exceptions.LoginError(e) + e = "Couldn't login; server says '{0}'.".format(res) + raise exceptions.LoginError(e) def _logout(self): """Safely logout through the API. @@ -663,6 +676,11 @@ class Site(object): url = "http:" + url return url + @property + def user_agent(self): + """The User-Agent header sent to the API by the requests session.""" + return self._session.headers["User-Agent"] + def api_query(self, **kwargs): """Do an API query with `kwargs` as the parameters. @@ -679,10 +697,9 @@ class Site(object): :py:attr:`self._assert_edit` and :py:attr:`_maxlag` respectively. Additionally, we'll sleep a bit if the last query was made fewer than :py:attr:`self._wait_between_queries` seconds ago. The request is made - through :py:attr:`self._opener`, which has cookie support - (:py:attr:`self._cookiejar`), a ``User-Agent`` - (:py:const:`earwigbot.wiki.constants.USER_AGENT`), and - ``Accept-Encoding`` set to ``"gzip"``. + through :py:attr:`self._session`, which has cookie support + (:py:attr:`self._cookiejar`) and a ``User-Agent`` + (:py:const:`earwigbot.wiki.constants.USER_AGENT`). Assuming everything went well, we'll gunzip the data (if compressed), load it as a JSON object, and return it. diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py index cddc24c..98e9d50 100644 --- a/earwigbot/wiki/sitesdb.py +++ b/earwigbot/wiki/sitesdb.py @@ -187,6 +187,7 @@ class SitesDB(object): config = self.config login = (config.wiki.get("username"), config.wiki.get("password")) + oauth = config.wiki.get("oauth") user_agent = config.wiki.get("userAgent") use_https = config.wiki.get("useHTTPS", True) assert_edit = config.wiki.get("assert") @@ -212,7 +213,7 @@ class SitesDB(object): return Site(name=name, project=project, lang=lang, base_url=base_url, article_path=article_path, script_path=script_path, - sql=sql, namespaces=namespaces, login=login, + sql=sql, namespaces=namespaces, login=login, oauth=oauth, cookiejar=cookiejar, user_agent=user_agent, use_https=use_https, assert_edit=assert_edit, maxlag=maxlag, wait_between_queries=wait_between_queries, @@ -386,6 +387,7 @@ class SitesDB(object): config = self.config login = (config.wiki.get("username"), config.wiki.get("password")) + oauth = config.wiki.get("oauth") user_agent = config.wiki.get("userAgent") use_https = config.wiki.get("useHTTPS", True) assert_edit = config.wiki.get("assert") @@ -398,9 +400,10 @@ class SitesDB(object): # Create a Site object to log in and load the other attributes: site = Site(base_url=base_url, script_path=script_path, sql=sql, - login=login, cookiejar=cookiejar, user_agent=user_agent, - use_https=use_https, assert_edit=assert_edit, - maxlag=maxlag, wait_between_queries=wait_between_queries) + login=login, oauth=oauth, cookiejar=cookiejar, + user_agent=user_agent, use_https=use_https, + assert_edit=assert_edit, maxlag=maxlag, + wait_between_queries=wait_between_queries) self._logger.info("Added site '{0}'".format(site.name)) self._add_site_to_sitesdb(site) diff --git a/setup.py b/setup.py index 71cddd8..aaee9bc 100644 --- a/setup.py +++ b/setup.py @@ -28,6 +28,8 @@ from earwigbot import __version__ required_deps = [ "PyYAML >= 3.12", # Parsing config files "mwparserfromhell >= 0.5", # Parsing wikicode for manipulation + "requests >= 2.21.0", # Wiki API requests + "requests_oauthlib >= 1.2.0", # API authentication via OAuth ] extra_deps = { From ea4ee7669190ce1b58aa27f8ec9f075d78d485b0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 24 Mar 2019 17:20:59 -0400 Subject: [PATCH 88/88] release/0.3 --- CHANGELOG | 2 +- README.rst | 4 ++-- docs/conf.py | 2 +- docs/installation.rst | 4 ++-- earwigbot/__init__.py | 6 +++--- earwigbot/wiki/sitesdb.py | 2 +- setup.py | 2 +- 7 files changed, 11 insertions(+), 11 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 433571e..81aad0a 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -v0.3 (unreleased): +v0.3 (released March 24, 2019): - Added various new features to the WikiProjectTagger task. - Copyvio detector: improved sentence splitting algorithm; many performance diff --git a/README.rst b/README.rst index 4b9a56b..6553a75 100644 --- a/README.rst +++ b/README.rst @@ -36,7 +36,7 @@ setup.py test`` from the project's root directory. Note that some tests require an internet connection, and others may take a while to run. Coverage is currently rather incomplete. -Latest release (v0.2) +Latest release (v0.3) ~~~~~~~~~~~~~~~~~~~~~ EarwigBot is available from the `Python Package Index`_, so you can install the @@ -47,7 +47,7 @@ some header files. For example, on Ubuntu, see `this StackOverflow post`_. You can also install it from source [1]_ directly:: - curl -Lo earwigbot.tgz https://github.com/earwig/earwigbot/tarball/v0.2 + curl -Lo earwigbot.tgz https://github.com/earwig/earwigbot/tarball/v0.3 tar -xf earwigbot.tgz cd earwig-earwigbot-* python setup.py install diff --git a/docs/conf.py b/docs/conf.py index 3e66cb9..9b635ab 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -50,7 +50,7 @@ copyright = u'2009-2016 Ben Kurtovic' # The short X.Y version. version = '0.3' # The full version, including alpha/beta/rc tags. -release = '0.3.dev0' +release = '0.3' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/installation.rst b/docs/installation.rst index cc577ab..c4c1e01 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -13,7 +13,7 @@ It's recommended to run the bot's unit tests before installing. Run some tests require an internet connection, and others may take a while to run. Coverage is currently rather incomplete. -Latest release (v0.2) +Latest release (v0.3) --------------------- EarwigBot is available from the `Python Package Index`_, so you can install the @@ -24,7 +24,7 @@ some header files. For example, on Ubuntu, see `this StackOverflow post`_. You can also install it from source [1]_ directly:: - curl -Lo earwigbot.tgz https://github.com/earwig/earwigbot/tarball/v0.2 + curl -Lo earwigbot.tgz https://github.com/earwig/earwigbot/tarball/v0.3 tar -xf earwigbot.tgz cd earwig-earwigbot-* python setup.py install diff --git a/earwigbot/__init__.py b/earwigbot/__init__.py index a37f45c..9c28649 100644 --- a/earwigbot/__init__.py +++ b/earwigbot/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2017 Ben Kurtovic +# Copyright (C) 2009-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -30,9 +30,9 @@ details. This documentation is also available `online """ __author__ = "Ben Kurtovic" -__copyright__ = "Copyright (C) 2009-2017 Ben Kurtovic" +__copyright__ = "Copyright (C) 2009-2019 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.3.dev0" +__version__ = "0.3" __email__ = "ben.kurtovic@gmail.com" __release__ = False diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py index 98e9d50..020b938 100644 --- a/earwigbot/wiki/sitesdb.py +++ b/earwigbot/wiki/sitesdb.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2017 Ben Kurtovic +# Copyright (C) 2009-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/setup.py b/setup.py index aaee9bc..d4688d0 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright (C) 2009-2017 Ben Kurtovic +# Copyright (C) 2009-2019 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal