Browse Source

Get page content by revid via the API, and fix some other things — hopefully the task works now

tags/v0.1^2
Ben Kurtovic 13 years ago
parent
commit
ae6c799aee
3 changed files with 32 additions and 19 deletions
  1. +4
    -4
      bot/rules.py
  2. +28
    -10
      bot/tasks/afc_statistics.py
  3. +0
    -5
      bot/wiki/site.py

+ 4
- 4
bot/rules.py View File

@@ -36,7 +36,7 @@ def process(rc):
         chans.update(("##earwigbot", "#wikipedia-en-afc"))
     if r_page.search(page_name):
-        tasks.start("afc_copyvios", action="edit", page=rc.page)
+        #tasks.start("afc_copyvios", action="edit", page=rc.page)
         chans.add("#wikipedia-en-afc")
     elif r_ffu.match(page_name):
@@ -48,17 +48,17 @@ def process(rc):
     elif rc.flags == "move" and (r_move1.match(comment) or
                                  r_move2.match(comment)):
         p = r_moved_pages.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="move", page=p)
+        #tasks.start("afc_copyvios", action="move", page=p)
         chans.add("#wikipedia-en-afc")
     elif rc.flags == "delete" and r_delete.match(comment):
         p = r_deleted_page.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="delete", page=p)
+        #tasks.start("afc_copyvios", action="delete", page=p)
         chans.add("#wikipedia-en-afc")
     elif rc.flags == "restore" and r_restore.match(comment):
         p = r_restored_page.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="restore", page=p)
+        #tasks.start("afc_copyvios", action="restore", page=p)
         chans.add("#wikipedia-en-afc")
     elif rc.flags == "protect" and r_protect.match(comment):


+ 28
- 10
bot/tasks/afc_statistics.py View File

@@ -5,6 +5,7 @@ import logging
 import re
 from os.path import expanduser
 from threading import Lock
+from time import sleep

 import oursql

@@ -215,7 +216,7 @@ class Task(BaseTask):
            self.logger.debug("  {0} -> {1}".format(oldid, real_oldid))
            body = result[0][1].replace("_", " ")
            ns = self.site.namespace_id_to_name(result[0][2])
-            real_title = ":".join(ns, body)
+            real_title = ":".join((ns, body))
            self.update_page(cursor, pageid, real_title)

    def add_untracked(self, cursor):
@@ -268,6 +269,11 @@ class Task(BaseTask):
        which are then saved to our database.
        """
        content = self.get_content(title)
+        if not content:
+            msg = "Could not get page content for [[{0}]]".format(title)
+            self.logger.error(msg)
+            return
+
        status, chart = self.get_status_and_chart(content)
        if not status:
            msg = "Could not find a status for [[{0}]]".format(title)
@@ -304,6 +310,11 @@ class Task(BaseTask):
        happened, and we'll untrack the submission.
        """
        content = self.get_content(title)
+        if not content:
+            msg = "Could not get page content for [[{0}]]".format(title)
+            self.logger.error(msg)
+            return
+
        try:
            redirect_regex = wiki.Page.re_redirect
            target_title = re.findall(redirect_regex, content, flags=re.I)[0]
@@ -402,16 +413,11 @@ class Task(BaseTask):
        self.logger.debug(msg.format(pageid, result["page_notes"], notes))

    def get_content(self, title):
-        """Get the current content of a page by title from SQL.
+        """Get the current content of a page by title from the API.

        The page's current revision ID is retrieved from SQL, and then
-        site.get_revid_content() is called.
-
-        The reason a more conventional method (i.e. site.get_page.get()) is
-        avoided is that due to replication lag, a discrepancy between the live
-        database (which the API uses) and the replicated database (which SQL
-        uses) can lead to incorrect and very confusing data, such as missing
-        pages that are supposed to exist, if both are used interchangeably.
+        an API query is made to get its content. This is the only API query
+        used in the task's code.
        """
        query = "SELECT page_latest FROM page WHERE page_title = ? AND page_namespace = ?"
        namespace, base = title.split(":", 1)
@@ -423,7 +429,19 @@ class Task(BaseTask):


result = self.site.sql_query(query, (base, ns)) result = self.site.sql_query(query, (base, ns))
revid = list(result)[0] revid = list(result)[0]
return self.site.get_revid_content(revid)

res = self.site.api_query(action="query", prop="revisions",
revids=revid, rvprop="content")
try:
return res["query"]["pages"].values()[0]["revisions"][0]["*"]
except KeyError:
sleep(5)
res = self.site.api_query(action="query", prop="revisions",
revids=revid, rvprop="content")
try:
return res["query"]["pages"].values()[0]["revisions"][0]["*"]
except KeyError:
return None


def get_status_and_chart(self, content): def get_status_and_chart(self, content):
"""Determine the status and chart number of an AFC submission. """Determine the status and chart number of an AFC submission.


+ 0
- 5
bot/wiki/site.py View File

@@ -41,7 +41,6 @@ class Site(object):
    api_query -- does an API query with the given kwargs as params
    sql_query -- does an SQL query and yields its results
    get_replag -- returns the estimated database replication lag
-    get_revid_content -- returns the content of a revision ID from SQL
    namespace_id_to_name -- given a namespace ID, returns associated name(s)
    namespace_name_to_id -- given a namespace name, returns associated id
    get_page -- returns a Page object for the given title
@@ -536,10 +535,6 @@ class Site(object):
        result = list(self.sql_query(query))
        return result[0][0]

-    def get_revid_content(self, revid):
-        """Return the content of a revision ID from SQL."""
-        return None
-
    def namespace_id_to_name(self, ns_id, all=False):
        """Given a namespace ID, returns associated namespace names.


Loading…
Cancel
Save