Browse Source

Get page content by revid via the API, and fix some other things — hopefully the task works now

tags/v0.1^2
Ben Kurtovic 13 years ago
parent
commit
ae6c799aee
3 changed files with 32 additions and 19 deletions
  1. +4
    -4
      bot/rules.py
  2. +28
    -10
      bot/tasks/afc_statistics.py
  3. +0
    -5
      bot/wiki/site.py

+ 4
- 4
bot/rules.py View File

@@ -36,7 +36,7 @@ def process(rc):
         chans.update(("##earwigbot", "#wikipedia-en-afc"))
     if r_page.search(page_name):
-        tasks.start("afc_copyvios", action="edit", page=rc.page)
+        #tasks.start("afc_copyvios", action="edit", page=rc.page)
         chans.add("#wikipedia-en-afc")
     elif r_ffu.match(page_name):
@@ -48,17 +48,17 @@ def process(rc):
     elif rc.flags == "move" and (r_move1.match(comment) or
                                  r_move2.match(comment)):
         p = r_moved_pages.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="move", page=p)
+        #tasks.start("afc_copyvios", action="move", page=p)
         chans.add("#wikipedia-en-afc")
     elif rc.flags == "delete" and r_delete.match(comment):
         p = r_deleted_page.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="delete", page=p)
+        #tasks.start("afc_copyvios", action="delete", page=p)
         chans.add("#wikipedia-en-afc")
     elif rc.flags == "restore" and r_restore.match(comment):
         p = r_restored_page.findall(rc.comment)[0]
-        tasks.start("afc_copyvios", action="restore", page=p)
+        #tasks.start("afc_copyvios", action="restore", page=p)
         chans.add("#wikipedia-en-afc")
     elif rc.flags == "protect" and r_protect.match(comment):


+ 28
- 10
bot/tasks/afc_statistics.py View File

@@ -5,6 +5,7 @@ import logging
 import re
 from os.path import expanduser
 from threading import Lock
+from time import sleep

 import oursql

@@ -215,7 +216,7 @@ class Task(BaseTask):
            self.logger.debug("  {0} -> {1}".format(oldid, real_oldid))
            body = result[0][1].replace("_", " ")
            ns = self.site.namespace_id_to_name(result[0][2])
-            real_title = ":".join(ns, body)
+            real_title = ":".join((ns, body))
            self.update_page(cursor, pageid, real_title)

    def add_untracked(self, cursor):
@@ -268,6 +269,11 @@ class Task(BaseTask):
        which are then saved to our database.
        """
        content = self.get_content(title)
+        if not content:
+            msg = "Could not get page content for [[{0}]]".format(title)
+            self.logger.error(msg)
+            return
+
        status, chart = self.get_status_and_chart(content)
        if not status:
            msg = "Could not find a status for [[{0}]]".format(title)
@@ -304,6 +310,11 @@ class Task(BaseTask):
        happened, and we'll untrack the submission.
        """
        content = self.get_content(title)
+        if not content:
+            msg = "Could not get page content for [[{0}]]".format(title)
+            self.logger.error(msg)
+            return
+
        try:
            redirect_regex = wiki.Page.re_redirect
            target_title = re.findall(redirect_regex, content, flags=re.I)[0]
@@ -402,16 +413,11 @@ class Task(BaseTask):
        self.logger.debug(msg.format(pageid, result["page_notes"], notes))

    def get_content(self, title):
-        """Get the current content of a page by title from SQL.
+        """Get the current content of a page by title from the API.

        The page's current revision ID is retrieved from SQL, and then
-        site.get_revid_content() is called.
-
-        The reason a more conventional method (i.e. site.get_page.get()) is
-        avoided is that due to replication lag, a discrepancy between the live
-        database (which the API uses) and the replicated database (which SQL
-        uses) can lead to incorrect and very confusing data, such as missing
-        pages that are supposed to exist, if both are used interchangeably.
+        an API query is made to get its content. This is the only API query
+        used in the task's code.
        """
        query = "SELECT page_latest FROM page WHERE page_title = ? AND page_namespace = ?"
        namespace, base = title.split(":", 1)
@@ -423,7 +429,19 @@ class Task(BaseTask):


result = self.site.sql_query(query, (base, ns)) result = self.site.sql_query(query, (base, ns))
revid = list(result)[0] revid = list(result)[0]
return self.site.get_revid_content(revid)

res = self.site.api_query(action="query", prop="revisions",
revids=revid, rvprop="content")
try:
return res["query"]["pages"].values()[0]["revisions"][0]["*"]
except KeyError:
sleep(5)
res = self.site.api_query(action="query", prop="revisions",
revids=revid, rvprop="content")
try:
return res["query"]["pages"].values()[0]["revisions"][0]["*"]
except KeyError:
return None


def get_status_and_chart(self, content): def get_status_and_chart(self, content):
"""Determine the status and chart number of an AFC submission. """Determine the status and chart number of an AFC submission.


+ 0
- 5
bot/wiki/site.py View File

@@ -41,7 +41,6 @@ class Site(object):
    api_query -- does an API query with the given kwargs as params
    sql_query -- does an SQL query and yields its results
    get_replag -- returns the estimated database replication lag
-    get_revid_content -- returns the content of a revision ID from SQL
    namespace_id_to_name -- given a namespace ID, returns associated name(s)
    namespace_name_to_id -- given a namespace name, returns associated id
    get_page -- returns a Page object for the given title
@@ -536,10 +535,6 @@ class Site(object):
        result = list(self.sql_query(query))
        return result[0][0]

-    def get_revid_content(self, revid):
-        """Return the content of a revision ID from SQL."""
-        return None
-
    def namespace_id_to_name(self, ns_id, all=False):
        """Given a namespace ID, returns associated namespace names.


Loading…
Cancel
Save