瀏覽代碼

Get page content by revid via API, fix some other things - hopefully task works now

tags/v0.1^2
Ben Kurtovic 13 年之前
父節點
當前提交
ae6c799aee
共有 3 個檔案被更改,包括 32 行新增、19 行刪除
  1. +4
    -4
      bot/rules.py
  2. +28
    -10
      bot/tasks/afc_statistics.py
  3. +0
    -5
      bot/wiki/site.py

+ 4
- 4
bot/rules.py 查看文件

@@ -36,7 +36,7 @@ def process(rc):
chans.update(("##earwigbot", "#wikipedia-en-afc"))
if r_page.search(page_name):
tasks.start("afc_copyvios", action="edit", page=rc.page)
#tasks.start("afc_copyvios", action="edit", page=rc.page)
chans.add("#wikipedia-en-afc")
elif r_ffu.match(page_name):
@@ -48,17 +48,17 @@ def process(rc):
elif rc.flags == "move" and (r_move1.match(comment) or
r_move2.match(comment)):
p = r_moved_pages.findall(rc.comment)[0]
tasks.start("afc_copyvios", action="move", page=p)
#tasks.start("afc_copyvios", action="move", page=p)
chans.add("#wikipedia-en-afc")
elif rc.flags == "delete" and r_delete.match(comment):
p = r_deleted_page.findall(rc.comment)[0]
tasks.start("afc_copyvios", action="delete", page=p)
#tasks.start("afc_copyvios", action="delete", page=p)
chans.add("#wikipedia-en-afc")
elif rc.flags == "restore" and r_restore.match(comment):
p = r_restored_page.findall(rc.comment)[0]
tasks.start("afc_copyvios", action="restore", page=p)
#tasks.start("afc_copyvios", action="restore", page=p)
chans.add("#wikipedia-en-afc")
elif rc.flags == "protect" and r_protect.match(comment):


+ 28
- 10
bot/tasks/afc_statistics.py 查看文件

@@ -5,6 +5,7 @@ import logging
import re
from os.path import expanduser
from threading import Lock
from time import sleep

import oursql

@@ -215,7 +216,7 @@ class Task(BaseTask):
self.logger.debug(" {0} -> {1}".format(oldid, real_oldid))
body = result[0][1].replace("_", " ")
ns = self.site.namespace_id_to_name(result[0][2])
real_title = ":".join(ns, body)
real_title = ":".join((ns, body))
self.update_page(cursor, pageid, real_title)

def add_untracked(self, cursor):
@@ -268,6 +269,11 @@ class Task(BaseTask):
which are then saved to our database.
"""
content = self.get_content(title)
if not content:
msg = "Could not get page content for [[{0}]]".format(title)
self.logger.error(msg)
return

status, chart = self.get_status_and_chart(content)
if not status:
msg = "Could not find a status for [[{0}]]".format(title)
@@ -304,6 +310,11 @@ class Task(BaseTask):
happened, and we'll untrack the submission.
"""
content = self.get_content(title)
if not content:
msg = "Could not get page content for [[{0}]]".format(title)
self.logger.error(msg)
return

try:
redirect_regex = wiki.Page.re_redirect
target_title = re.findall(redirect_regex, content, flags=re.I)[0]
@@ -402,16 +413,11 @@ class Task(BaseTask):
self.logger.debug(msg.format(pageid, result["page_notes"], notes))

def get_content(self, title):
"""Get the current content of a page by title from SQL.
"""Get the current content of a page by title from the API.

The page's current revision ID is retrieved from SQL, and then
site.get_revid_content() is called.

The reason a more conventional method (i.e. site.get_page.get()) is
avoided is that due to replication lag, a discrepancy between the live
database (which the API uses) and the replicated database (which SQL
uses) can lead to incorrect and very confusing data, such as missing
pages that are supposed to exist, if both are used interchangeably.
an API query is made to get its content. This is the only API query
used in the task's code.
"""
query = "SELECT page_latest FROM page WHERE page_title = ? AND page_namespace = ?"
namespace, base = title.split(":", 1)
@@ -423,7 +429,19 @@ class Task(BaseTask):

result = self.site.sql_query(query, (base, ns))
revid = list(result)[0]
return self.site.get_revid_content(revid)

res = self.site.api_query(action="query", prop="revisions",
revids=revid, rvprop="content")
try:
return res["query"]["pages"].values()[0]["revisions"][0]["*"]
except KeyError:
sleep(5)
res = self.site.api_query(action="query", prop="revisions",
revids=revid, rvprop="content")
try:
return res["query"]["pages"].values()[0]["revisions"][0]["*"]
except KeyError:
return None

def get_status_and_chart(self, content):
"""Determine the status and chart number of an AFC submission.


+ 0
- 5
bot/wiki/site.py 查看文件

@@ -41,7 +41,6 @@ class Site(object):
api_query -- does an API query with the given kwargs as params
sql_query -- does an SQL query and yields its results
get_replag -- returns the estimated database replication lag
get_revid_content -- returns the content of a revision ID from SQL
namespace_id_to_name -- given a namespace ID, returns associated name(s)
namespace_name_to_id -- given a namespace name, returns associated id
get_page -- returns a Page object for the given title
@@ -536,10 +535,6 @@ class Site(object):
result = list(self.sql_query(query))
return result[0][0]

def get_revid_content(self, revid):
"""Return the content of a revision ID from SQL."""
return None

def namespace_id_to_name(self, ns_id, all=False):
"""Given a namespace ID, returns associated namespace names.



Loading…
取消
儲存