瀏覽代碼

A bunch of updates in config, wiki, and tasks.afc_statistics

tags/v0.1^2
Ben Kurtovic 13 年之前
父節點
當前提交
06118c0b4c
共有 5 個檔案被更改,包括 140 行新增72 行删除
  1. +5
    -5
      bot/config.py
  2. +100
    -57
      bot/tasks/afc_statistics.py
  3. +27
    -8
      bot/wiki/category.py
  4. +3
    -2
      bot/wiki/page.py
  5. +5
    -0
      bot/wiki/site.py

+ 5
- 5
bot/config.py 查看文件

@@ -216,13 +216,13 @@ class BotFormatter(logging.Formatter):
def format_color(self, record):
l = record.levelname.ljust(8)
if record.levelno == logging.DEBUG:
record.lvl = l.join(("\x1b[37m", "\x1b[0m"))
record.lvl = l.join(("\x1b[34m", "\x1b[0m")) # Blue
if record.levelno == logging.INFO:
record.lvl = l.join(("\x1b[32m", "\x1b[0m"))
record.lvl = l.join(("\x1b[32m", "\x1b[0m")) # Green
if record.levelno == logging.WARNING:
record.lvl = l.join(("\x1b[36m", "\x1b[0m"))
record.lvl = l.join(("\x1b[33m", "\x1b[0m")) # Yellow
if record.levelno == logging.ERROR:
record.lvl = l.join(("\x1b[33m", "\x1b[0m"))
record.lvl = l.join(("\x1b[31m", "\x1b[0m")) # Red
if record.levelno == logging.CRITICAL:
record.lvl = l.join(("\x1b[31m", "\x1b[0m"))
record.lvl = l.join(("\x1b[1m\x1b[31m", "\x1b[0m")) # Bold red
return record

+ 100
- 57
bot/tasks/afc_statistics.py 查看文件

@@ -12,6 +12,14 @@ from classes import BaseTask
import config
import wiki

# Chart status number constants:
CHART_NONE = 0
CHART_PEND = 1
CHART_DRAFT = 2
CHART_REVIEW = 3
CHART_ACCEPT = 4
CHART_DECLINE = 5

class Task(BaseTask):
"""A task to generate statistics for WikiProject Articles for Creation.

@@ -56,7 +64,7 @@ class Task(BaseTask):
elif action == "sync":
self.sync()
finally:
self.conn.close()
self.conn.close()

def save(self, **kwargs):
self.logger.info("Saving chart")
@@ -130,27 +138,25 @@ class Task(BaseTask):

def sync(self, **kwargs):
self.logger.info("Starting sync")
self.report_replag()

replag = self.site.get_replag()
self.logger.debug("Server replag is {0}".format(replag))
if replag > 600:
msg = "Sync canceled as replag ({0} secs) is greater than ten minutes."
self.logger.warn(msg.format(replag))

with self.conn.cursor() as cursor, self.db_access_lock:
self.update_tracked(cursor)
self.add_untracked(cursor)
self.delete_old(cursor)
self.logger.info("Sync completed")

def report_replag(self):
replag = self.site.get_replag()
if replag < 60:
lvl = logging.DEBUG
elif replag < 720:
lvl = logging.INFO
else:
lvl = logging.WARNING
self.logger.log(lvl, "Server replag is {0}".format(replag))
self.logger.info("Sync completed")

def update_tracked(self, cursor):
self.logger.debug("Updating tracked submissions")
query1 = "SELECT page_id, page_title, page_modify_oldid FROM page"
query2 = "SELECT page_latest, page_title FROM page WHERE page_id = ?"
query2 = """SELECT page_latest, page_title, page_namespace FROM page
WHERE page_id = ?"""
cursor.execute(query1)
for pageid, title, oldid in cursor:
msg = "Updating tracked page: [[{0}]] (id: {1}) @ {2}"
@@ -158,11 +164,13 @@ class Task(BaseTask):
result = list(self.site.sql_query(query2, (pageid,)))
try:
real_oldid = result[0][0]
real_title = result[0][1]
except IndexError: # Page doesn't exist!
self.untrack_page(cursor, pageid)
continue
if real_oldid != oldid:
body = result[0][1].replace("_", " ")
ns = self.site.namespace_id_to_name(result[0][2])
real_title = ":".join(ns, body)
self.update_page(cursor, pageid, real_title)

def add_untracked(self, cursor):
@@ -171,7 +179,7 @@ class Task(BaseTask):
tracked = [i[0] for i in cursor.fetchall()]

category = self.site.get_category(self.pending_cat)
pending = category.members(limit=500)
pending = category.members(use_sql=True)

for title, pageid in pending:
if title in self.ignore_list:
@@ -182,9 +190,9 @@ class Task(BaseTask):
def delete_old(self, cursor):
self.logger.debug("Removing old submissions from chart")
query = """DELETE FROM page, row USING page JOIN row
ON page_id = row_id WHERE row_chart IN (4, 5)
ON page_id = row_id WHERE row_chart IN ?
AND ADDTIME(page_special_time, '36:00:00') < NOW()"""
cursor.execute(query)
cursor.execute(query, ((CHART_ACCEPT, CHART_DECLINE),))

def untrack_page(self, cursor, pageid):
self.logger.debug("Untracking page (id: {0})".format(pageid))
@@ -197,20 +205,19 @@ class Task(BaseTask):
msg = "Tracking page [[{0}]] (id: {1})".format(title, pageid)
self.logger.debug(msg)

page = self.site.get_page(title)
status, chart = self.get_status_and_chart(page)
content = self.get_content(title)
status, chart = self.get_status_and_chart(content)
if not status:
msg = "Could not find a status for [[{0}]]".format(title)
self.logger.warn(msg)
self.logger.error(msg)
return

title = page.title()
short = self.get_short_title(title)
size = len(page.get())
notes = self.get_notes(page)
size = len(content)
notes = self.get_notes(pageid)
c_user, c_time, c_id = self.get_create(pageid)
m_user, m_time, m_id = self.get_modify(pageid)
s_user, s_time, s_id = self.get_special(page, status)
s_user, s_time, s_id = self.get_special(pageid, chart)

query1 = "INSERT INTO row VALUES ?"
query2 = "INSERT INTO page VALUES ?"
@@ -224,20 +231,31 @@ class Task(BaseTask):
msg = "Updating page [[{0}]] (id: {1})".format(title, pageid)
self.logger.debug(msg)

page = self.site.get_page(title)
status, chart = self.get_status_and_chart(page)
if not status:
self.untrack_page(cursor, pageid)
content = self.get_content(title)
try:
redirect_regex = wiki.Page.re_redirect
target_title = re.findall(redirect_regex, content, flags=re.I)[0]
except IndexError:
pass
else:
target_ns = self.site.get_page(target_title).namespace()
if target_ns == wiki.NS_MAIN:
status, chart = "accept", CHART_ACCEPT
elif target_ns in [wiki.NS_PROJECT, wiki.NS_PROJECT_TALK]:
title = target_title
content = self.get_content(title)
else:
msg = "Page has moved to namespace {0}".format(target_ns)
self.logger.debug(msg)
self.untrack_page(cursor, pageid)
return

if pageid != page.pageid():
msg = "Page [[{0}]] is not what it should be! (id: {0} != {1})"
self.logger.warn(msg.format(pageid, page.pageid()))
self.report_replag()
status, chart = self.get_status_and_chart(content)
if not status:
self.untrack_page(cursor, pageid)

title = page.title()
size = len(page.get())
notes = self.get_notes(page)
size = len(content)
notes = self.get_notes(pageid)
m_user, m_time, m_id = self.get_modify(pageid)

query = "SELECT * FROM page JOIN row ON page_id = row_id WHERE page_id = ?"
@@ -288,7 +306,7 @@ class Task(BaseTask):
self.logger.debug(msg.format(pageid, result["page_status"],
result["row_chart"], status, chart))

s_user, s_time, s_id = self.get_special(page, status)
s_user, s_time, s_id = self.get_special(pageid, chart)

if s_id != result["page_special_oldid"]:
cursor.execute(query2, (s_user, s_time, s_id, pageid))
@@ -304,30 +322,31 @@ class Task(BaseTask):
msg = "{0}: notes: {1} -> {2}"
self.logger.debug(msg.format(pageid, result["page_notes"], notes))

def get_status_and_chart(self, page):
def get_content(self, title):
query = "SELECT page_latest FROM page WHERE page_title = ? AND page_namespace = ?"
namespace, base = title.split(":", 1)
try:
content = page.get()
except wiki.PageNotFoundError:
msg = "Page [[{0}]] does not exist, but the server said it should!"
self.logger.warn(msg.format(page.title()))
return None, 0

if page.is_redirect():
target = page.get_redirect_target()
if self.site.get_page(target).namespace() == 0:
return "accept", 4
return None, 0
elif re.search("\{\{afc submission\|r\|(.*?)\}\}", content, re.I):
return "review", 3
ns = self.site.namespace_name_to_id(namespace)
except wiki.NamespaceNotFoundError:
base = title
ns = wiki.NS_MAIN

result = self.site.sql_query(query, (base, ns))
revid = list(result)[0]
return self.site.get_revid_content(revid)

def get_status_and_chart(self, content):
if re.search("\{\{afc submission\|r\|(.*?)\}\}", content, re.I):
return "review", CHART_REVIEW
elif re.search("\{\{afc submission\|h\|(.*?)\}\}", content, re.I):
return "pend", 2
return "pend", CHART_DRAFT
elif re.search("\{\{afc submission\|\|(.*?)\}\}", content, re.I):
return "pend", 1
return "pend", CHART_PEND
elif re.search("\{\{afc submission\|t\|(.*?)\}\}", content, re.I):
return None, 0
return None, CHART_NONE
elif re.search("\{\{afc submission\|d\|(.*?)\}\}", content, re.I):
return "decline", 5
return None, 0
return "decline", CHART_DECLINE
return None, CHART_NONE

def get_short_title(self, title):
short = re.sub("Wikipedia(\s*talk)?\:Articles\sfor\screation\/", "", title)
@@ -350,8 +369,32 @@ class Task(BaseTask):
m_user, m_time, m_id = list(result)[0]
return m_user, datetime.strptime(m_time, "%Y%m%d%H%M%S"), m_id

def get_special(self, page, status):
def get_special(self, pageid, chart):
if chart == CHART_PEND:
return None, None, None
elif chart == CHART_ACCEPT:
return self.get_create(pageid)
elif chart == CHART_DRAFT:
search = "(?!\{\{afc submission\|h\|(.*?)\}\})"
elif chart == CHART_REVIEW:
search = "(?!\{\{afc submission\|r\|(.*?)\}\})"
elif chart == CHART_DECLINE:
search = "(?!\{\{afc submission\|d\|(.*?)\}\})"

query = """SELECT rev_user_text, rev_timestamp, rev_id
FROM revision WHERE rev_page = ? ORDER BY rev_id DESC"""
result = self.site.sql_query(query, (pageid,))

counter = 0
for user, ts, revid in result:
counter += 1
if counter > 100:
break
content = self.site.get_revid_content(revid)
if re.search(search, content, re.I):
return user, datetime.strptime(ts, "%Y%m%d%H%M%S"), revid

return None, None, None

def get_notes(self, page):
def get_notes(self, pageid):
return None

+ 27
- 8
bot/wiki/category.py 查看文件

@@ -13,7 +13,7 @@ class Category(Page):
because it accepts category names without the namespace prefix.

Public methods:
members -- returns a list of pages in the category as (title, id) tuples
members -- returns a list of page titles in the category
"""

def __repr__(self):
@@ -25,15 +25,34 @@ class Category(Page):
"""Returns a nice string representation of the Category."""
return '<Category "{0}" of {1}>'.format(self.title(), str(self._site))

def members(self, limit=50):
"""Returns a list of pages in the category as (title, pageid) tuples.
def members(self, limit=50, use_sql=False):
"""Returns a list of page titles in the category.

If `limit` is provided, we will provide this many titles, or less if
the category is too small. `limit` defaults to 50; normal users can go
up to 500, and bots can go up to 5,000 on a single API query.

If `use_sql` is True, we will use a SQL query instead of the API. The
limit argument will be ignored, and pages will be returned as tuples
of (title, pageid) instead of just titles.
"""
params = {"action": "query", "list": "categorymembers",
"cmlimit": limit, "cmtitle": self._title}
result = self._site._api_query(params)
members = result['query']['categorymembers']
return [(member["title"], member["pageid"]) for member in members]
if use_sql:
query = """SELECT page_title, page_namespace, page_id FROM page
JOIN categorylinks ON page_id = cl_from
WHERE cl_to = ?"""
title = self.title().replace(" ", "_").split(":", 1)[1]
result = self.site.sql_query(query, (title,))
members = []
for row in result:
body = row[0].replace("_", " ")
namespace = self.site.namespace_id_to_name(row[1])
title = ":".join(namespace, body)
members.append((title, row[2]))
return members

else:
params = {"action": "query", "list": "categorymembers",
"cmlimit": limit, "cmtitle": self._title}
result = self._site._api_query(params)
members = result['query']['categorymembers']
return [member["title"] for member in members]

+ 3
- 2
bot/wiki/page.py 查看文件

@@ -32,6 +32,8 @@ class Page(object):
add_section -- add a new section at the bottom of the page
"""

re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"

def __init__(self, site, title, follow_redirects=False):
"""Constructor for new Page instances.

@@ -617,9 +619,8 @@ class Page(object):
if the page is not a redirect.
"""
content = self.get(force)
regexp = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
try:
return re.findall(regexp, content, flags=re.IGNORECASE)[0]
return re.findall(self.re_redirect, content, flags=re.I)[0]
except IndexError:
e = "The page does not appear to have a redirect target."
raise RedirectError(e)


+ 5
- 0
bot/wiki/site.py 查看文件

@@ -41,6 +41,7 @@ class Site(object):
api_query -- does an API query with the given kwargs as params
sql_query -- does an SQL query and yields its results
get_replag -- returns the estimated database replication lag
get_revid_content -- returns the content of a revision ID from SQL
namespace_id_to_name -- given a namespace ID, returns associated name(s)
namespace_name_to_id -- given a namespace name, returns associated id
get_page -- returns a Page object for the given title
@@ -535,6 +536,10 @@ class Site(object):
result = list(self.sql_query(query))
return result[0][0]

def get_revid_content(self, revid):
"""Return the content of a revision ID from SQL."""
return None

def namespace_id_to_name(self, ns_id, all=False):
"""Given a namespace ID, returns associated namespace names.



Loading…
取消
儲存