From ae6c799aee4029bce5eb62b66cb67f1310696783 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 6 Nov 2011 15:00:23 -0500
Subject: [PATCH] Get page content by revid via API, fix some other things -
 hopefully task works now

---
 bot/rules.py                |  8 ++++----
 bot/tasks/afc_statistics.py | 38 ++++++++++++++++++++++++++++----------
 bot/wiki/site.py            |  5 -----
 3 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/bot/rules.py b/bot/rules.py
index 480c34a..5c55f9d 100644
--- a/bot/rules.py
+++ b/bot/rules.py
@@ -36,7 +36,7 @@ def process(rc):
         chans.update(("##earwigbot", "#wikipedia-en-afc"))
 
     if r_page.search(page_name):
-        tasks.start("afc_copyvios", action="edit", page=rc.page)
+        #tasks.start("afc_copyvios", action="edit", page=rc.page)
         chans.add("#wikipedia-en-afc")
 
     elif r_ffu.match(page_name):
@@ -48,17 +48,17 @@ def process(rc):
     elif rc.flags == "move" and (r_move1.match(comment) or
                                  r_move2.match(comment)):
         p = r_moved_pages.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="move", page=p)
+        #tasks.start("afc_copyvios", action="move", page=p)
         chans.add("#wikipedia-en-afc")
 
     elif rc.flags == "delete" and r_delete.match(comment):
         p = r_deleted_page.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="delete", page=p)
+        #tasks.start("afc_copyvios", action="delete", page=p)
         chans.add("#wikipedia-en-afc")
 
     elif rc.flags == "restore" and r_restore.match(comment):
         p = r_restored_page.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="restore", page=p)
+        #tasks.start("afc_copyvios", action="restore", page=p)
         chans.add("#wikipedia-en-afc")
 
     elif rc.flags == "protect" and r_protect.match(comment):
diff --git a/bot/tasks/afc_statistics.py b/bot/tasks/afc_statistics.py
index 934d1b0..1ea3ea0 100644
--- a/bot/tasks/afc_statistics.py
+++ b/bot/tasks/afc_statistics.py
@@ -5,6 +5,7 @@ import logging
 import re
 from os.path import expanduser
 from threading import Lock
+from time import sleep
 
 import oursql
 
@@ -215,7 +216,7 @@ class Task(BaseTask):
             self.logger.debug(" {0} -> {1}".format(oldid, real_oldid))
             body = result[0][1].replace("_", " ")
             ns = self.site.namespace_id_to_name(result[0][2])
-            real_title = ":".join(ns, body)
+            real_title = ":".join((ns, body))
             self.update_page(cursor, pageid, real_title)
 
     def add_untracked(self, cursor):
@@ -268,6 +269,11 @@ class Task(BaseTask):
         which are then saved to our database.
         """
         content = self.get_content(title)
+        if not content:
+            msg = "Could not get page content for [[{0}]]".format(title)
+            self.logger.error(msg)
+            return
+
         status, chart = self.get_status_and_chart(content)
         if not status:
             msg = "Could not find a status for [[{0}]]".format(title)
@@ -304,6 +310,11 @@ class Task(BaseTask):
         happened, and we'll untrack the submission.
         """
         content = self.get_content(title)
+        if not content:
+            msg = "Could not get page content for [[{0}]]".format(title)
+            self.logger.error(msg)
+            return
+
         try:
             redirect_regex = wiki.Page.re_redirect
             target_title = re.findall(redirect_regex, content, flags=re.I)[0]
@@ -402,16 +413,11 @@ class Task(BaseTask):
             self.logger.debug(msg.format(pageid, result["page_notes"], notes))
 
     def get_content(self, title):
-        """Get the current content of a page by title from SQL.
+        """Get the current content of a page by title from the API.
 
         The page's current revision ID is retrieved from SQL, and then
-        site.get_revid_content() is called.
-
-        The reason a more conventional method (i.e. site.get_page.get()) is
-        avoided is that due to replication lag, a discrepancy between the live
-        database (which the API uses) and the replicated database (which SQL
-        uses) can lead to incorrect and very confusing data, such as missing
-        pages that are supposed to exist, if both are used interchangeably.
+        an API query is made to get its content. This is the only API query
+        used in the task's code.
         """
         query = "SELECT page_latest FROM page WHERE page_title = ? AND page_namespace = ?"
         namespace, base = title.split(":", 1)
@@ -423,7 +429,19 @@ class Task(BaseTask):
         result = self.site.sql_query(query, (base, ns))
 
         revid = list(result)[0]
-        return self.site.get_revid_content(revid)
+
+        res = self.site.api_query(action="query", prop="revisions",
+                                  revids=revid, rvprop="content")
+        try:
+            return res["query"]["pages"].values()[0]["revisions"][0]["*"]
+        except KeyError:
+            sleep(5)
+            res = self.site.api_query(action="query", prop="revisions",
+                                      revids=revid, rvprop="content")
+            try:
+                return res["query"]["pages"].values()[0]["revisions"][0]["*"]
+            except KeyError:
+                return None
 
     def get_status_and_chart(self, content):
         """Determine the status and chart number of an AFC submission.
diff --git a/bot/wiki/site.py b/bot/wiki/site.py
index bdc9426..3f76d05 100644
--- a/bot/wiki/site.py
+++ b/bot/wiki/site.py
@@ -41,7 +41,6 @@ class Site(object):
     api_query            -- does an API query with the given kwargs as params
     sql_query            -- does an SQL query and yields its results
     get_replag           -- returns the estimated database replication lag
-    get_revid_content    -- returns the content of a revision ID from SQL
     namespace_id_to_name -- given a namespace ID, returns associated name(s)
    namespace_name_to_id -- given a namespace name, returns associated id
     get_page             -- returns a Page object for the given title
@@ -536,10 +535,6 @@ class Site(object):
         result = list(self.sql_query(query))
         return result[0][0]
 
-    def get_revid_content(self, revid):
-        """Return the content of a revision ID from SQL."""
-        return None
-
     def namespace_id_to_name(self, ns_id, all=False):
         """Given a namespace ID, returns associated namespace names.
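
A note on the one-line ":".join fix in bot/tasks/afc_statistics.py:
str.join() takes a single iterable, not one argument per piece, so the
old call ":".join(ns, body) raised a TypeError at runtime. A quick
illustration (the namespace and page name below are made up):

    ns, body = "Wikipedia talk", "Articles for creation/Example"
    # ":".join(ns, body)  -> TypeError: join() takes exactly one argument (2 given)
    title = ":".join((ns, body))
    print(title)  # Wikipedia talk:Articles for creation/Example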
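
The rewritten get_content() also encodes a pattern worth calling out:
fetch a revision's text through the API and, if the revision is not
visible yet (its ID came from a lagged database replica), sleep briefly
and retry once before giving up. A minimal standalone sketch of that
pattern, assuming a site object whose api_query() behaves like the one
in bot/wiki/site.py (keyword arguments in, decoded API result out); the
helper name and signature are illustrative, not part of this patch:

    from time import sleep

    def fetch_revid_content(site, revid, delay=5):
        """Return the wikitext of a revision ID, or None if unavailable.

        Queries the API once; on a miss, waits `delay` seconds and tries
        a second time, then gives up.
        """
        for attempt in (1, 2):
            res = site.api_query(action="query", prop="revisions",
                                 revids=revid, rvprop="content")
            try:
                page = list(res["query"]["pages"].values())[0]
                return page["revisions"][0]["*"]
            except (KeyError, IndexError):
                # the patch itself catches only KeyError; IndexError is
                # added here to cover an empty result list as well
                if attempt == 1:
                    sleep(delay)
        return None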