From ae6c799aee4029bce5eb62b66cb67f1310696783 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 6 Nov 2011 15:00:23 -0500
Subject: [PATCH] Get page content by revid via API, fix some other things -
 hopefully task works now

---
 bot/rules.py                |  8 ++++----
 bot/tasks/afc_statistics.py | 38 ++++++++++++++++++++++++++++----------
 bot/wiki/site.py            |  5 -----
 3 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/bot/rules.py b/bot/rules.py
index 480c34a..5c55f9d 100644
--- a/bot/rules.py
+++ b/bot/rules.py
@@ -36,7 +36,7 @@ def process(rc):
         chans.update(("##earwigbot", "#wikipedia-en-afc"))
 
     if r_page.search(page_name):
-        tasks.start("afc_copyvios", action="edit", page=rc.page)
+        #tasks.start("afc_copyvios", action="edit", page=rc.page)
         chans.add("#wikipedia-en-afc")
 
     elif r_ffu.match(page_name):
@@ -48,17 +48,17 @@ def process(rc):
     elif rc.flags == "move" and (r_move1.match(comment) or
                                  r_move2.match(comment)):
         p = r_moved_pages.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="move", page=p)
+        #tasks.start("afc_copyvios", action="move", page=p)
         chans.add("#wikipedia-en-afc")
 
     elif rc.flags == "delete" and r_delete.match(comment):
         p = r_deleted_page.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="delete", page=p)
+        #tasks.start("afc_copyvios", action="delete", page=p)
         chans.add("#wikipedia-en-afc")
 
     elif rc.flags == "restore" and r_restore.match(comment):
         p = r_restored_page.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="restore", page=p)
+        #tasks.start("afc_copyvios", action="restore", page=p)
         chans.add("#wikipedia-en-afc")
 
     elif rc.flags == "protect" and r_protect.match(comment):
diff --git a/bot/tasks/afc_statistics.py b/bot/tasks/afc_statistics.py
index 934d1b0..1ea3ea0 100644
--- a/bot/tasks/afc_statistics.py
+++ b/bot/tasks/afc_statistics.py
@@ -5,6 +5,7 @@ import logging
 import re
 from os.path import expanduser
 from threading import Lock
+from time import sleep
 
 import oursql
 
@@ -215,7 +216,7 @@ class Task(BaseTask):
             self.logger.debug(" {0} -> {1}".format(oldid, real_oldid))
             body = result[0][1].replace("_", " ")
             ns = self.site.namespace_id_to_name(result[0][2])
-            real_title = ":".join(ns, body)
+            real_title = ":".join((ns, body))
             self.update_page(cursor, pageid, real_title)
 
     def add_untracked(self, cursor):
@@ -268,6 +269,11 @@ class Task(BaseTask):
         which are then saved to our database.
         """
         content = self.get_content(title)
+        if not content:
+            msg = "Could not get page content for [[{0}]]".format(title)
+            self.logger.error(msg)
+            return
+
         status, chart = self.get_status_and_chart(content)
         if not status:
             msg = "Could not find a status for [[{0}]]".format(title)
@@ -304,6 +310,11 @@ class Task(BaseTask):
         happened, and we'll untrack the submission.
         """
         content = self.get_content(title)
+        if not content:
+            msg = "Could not get page content for [[{0}]]".format(title)
+            self.logger.error(msg)
+            return
+
         try:
             redirect_regex = wiki.Page.re_redirect
             target_title = re.findall(redirect_regex, content, flags=re.I)[0]
@@ -402,16 +413,11 @@ class Task(BaseTask):
             self.logger.debug(msg.format(pageid, result["page_notes"], notes))
 
     def get_content(self, title):
-        """Get the current content of a page by title from SQL.
+        """Get the current content of a page by title from the API.
 
         The page's current revision ID is retrieved from SQL, and then
-        site.get_revid_content() is called.
-
-        The reason a more conventional method (i.e. site.get_page.get()) is
-        avoided is that due to replication lag, a discrepancy between the live
-        database (which the API uses) and the replicated database (which SQL
-        uses) can lead to incorrect and very confusing data, such as missing
-        pages that are supposed to exist, if both are used interchangeably.
+        an API query is made to get its content. This is the only API query
+        used in the task's code.
         """
         query = "SELECT page_latest FROM page WHERE page_title = ? AND page_namespace = ?"
         namespace, base = title.split(":", 1)
@@ -423,7 +429,19 @@ class Task(BaseTask):
         result = self.site.sql_query(query, (base, ns))
 
         revid = list(result)[0]
-        return self.site.get_revid_content(revid)
+
+        res = self.site.api_query(action="query", prop="revisions",
+                                  revids=revid, rvprop="content")
+        try:
+            return res["query"]["pages"].values()[0]["revisions"][0]["*"]
+        except KeyError:
+            sleep(5)
+            res = self.site.api_query(action="query", prop="revisions",
+                                      revids=revid, rvprop="content")
+            try:
+                return res["query"]["pages"].values()[0]["revisions"][0]["*"]
+            except KeyError:
+                return None
 
     def get_status_and_chart(self, content):
         """Determine the status and chart number of an AFC submission.
diff --git a/bot/wiki/site.py b/bot/wiki/site.py
index bdc9426..3f76d05 100644
--- a/bot/wiki/site.py
+++ b/bot/wiki/site.py
@@ -41,7 +41,6 @@ class Site(object):
     api_query            -- does an API query with the given kwargs as params
     sql_query            -- does an SQL query and yields its results
     get_replag           -- returns the estimated database replication lag
-    get_revid_content    -- returns the content of a revision ID from SQL
     namespace_id_to_name -- given a namespace ID, returns associated name(s)
    namespace_name_to_id -- given a namespace name, returns associated id
     get_page             -- returns a Page object for the given title
@@ -536,10 +535,6 @@ class Site(object):
         result = list(self.sql_query(query))
         return result[0][0]
 
-    def get_revid_content(self, revid):
-        """Return the content of a revision ID from SQL."""
-        return None
-
     def namespace_id_to_name(self, ns_id, all=False):
         """Given a namespace ID, returns associated namespace names.
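
A note on the one-line ":".join fix in bot/tasks/afc_statistics.py:
str.join() takes a single iterable, not one argument per piece, so the
old call ":".join(ns, body) raised a TypeError at runtime. A quick
illustration (the namespace and page name below are made up):

    ns, body = "Wikipedia talk", "Articles for creation/Example"
    # ":".join(ns, body)  -> TypeError: join() takes exactly one argument (2 given)
    title = ":".join((ns, body))
    print(title)  # Wikipedia talk:Articles for creation/Example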
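
The rewritten get_content() also encodes a pattern worth calling out:
fetch a revision's text through the API and, if the revision is not
visible yet (its ID came from a lagged database replica), sleep briefly
and retry once before giving up. A minimal standalone sketch of that
pattern, assuming a site object whose api_query() behaves like the one
in bot/wiki/site.py (keyword arguments in, decoded API result out); the
helper name and signature are illustrative, not part of this patch:

    from time import sleep

    def fetch_revid_content(site, revid, delay=5):
        """Return the wikitext of a revision ID, or None if unavailable.

        Queries the API once; on a miss, waits `delay` seconds and tries
        a second time, then gives up.
        """
        for attempt in (1, 2):
            res = site.api_query(action="query", prop="revisions",
                                 revids=revid, rvprop="content")
            try:
                page = list(res["query"]["pages"].values())[0]
                return page["revisions"][0]["*"]
            except (KeyError, IndexError):
                # the patch itself catches only KeyError; IndexError is
                # added here to cover an empty result list as well
                if attempt == 1:
                    sleep(delay)
        return None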