Browse Source

Unicode fixes and misc cleanup to toolset and tasks

- Category.members() -> Category.get_members()
- Category.get_members() now fully abides by the limit, which defaults
to None for SQL and 50 for the API.
- Category.get_members() always returns unicode objects instead of
strings for page titles.
- afc_statistics: Fixed a frustrating Unicode bug.
tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
4e2580e2c1
4 changed files with 97 additions and 79 deletions
  1. +1
    -1
      earwigbot/commands/afc_status.py
  2. +1
    -1
      earwigbot/tasks/afc_history.py
  3. +49
    -48
      earwigbot/tasks/afc_statistics.py
  4. +46
    -29
      earwigbot/wiki/category.py

+ 1
- 1
earwigbot/commands/afc_status.py View File

@@ -110,7 +110,7 @@ class Command(BaseCommand):
def count_submissions(self):
"""Returns the number of open AFC submissions (count of CAT:PEND)."""
cat = self.site.get_category("Pending AfC submissions")
subs = len(cat.members(limit=2500, use_sql=True))
subs = len(cat.get_members(use_sql=True))

# Remove [[Wikipedia:Articles for creation/Redirects]] and
# [[Wikipedia:Files for upload]], which aren't real submissions:


+ 1
- 1
earwigbot/tasks/afc_history.py View File

@@ -130,7 +130,7 @@ class Task(BaseTask):
q_delete = "DELETE FROM page WHERE page_id = ?"
q_update = "UPDATE page SET page_date = ?, page_status = ? WHERE page_id = ?"
q_insert = "INSERT INTO page VALUES (?, ?, ?)"
members = category.members(use_sql=True)
members = category.get_members(use_sql=True)

with self.conn.cursor() as cursor:
for title, pageid in members:


+ 49
- 48
earwigbot/tasks/afc_statistics.py View File

@@ -241,17 +241,18 @@ class Task(BaseTask):
self.untrack_page(cursor, pageid)
continue

title = title.decode("utf8") # SQL gives strings; we want Unicode
real_oldid = result[0][0]
if oldid != real_oldid:
msg = "Updating page [[{0}]] (id: {1}) @ {2}"
msg = u"Updating page [[{0}]] (id: {1}) @ {2}"
self.logger.debug(msg.format(title, pageid, oldid))
self.logger.debug(" {0} -> {1}".format(oldid, real_oldid))
body = result[0][1].replace("_", " ")
base = result[0][1].replace("_", " ")
ns = self.site.namespace_id_to_name(result[0][2])
if ns:
real_title = ":".join((str(ns), body))
real_title = u":".join(ns, base))
else:
real_title = body
real_title = base
self.update_page(cursor, pageid, real_title)

def add_untracked(self, cursor):
@@ -267,13 +268,13 @@ class Task(BaseTask):
tracked = [i[0] for i in cursor.fetchall()]

category = self.site.get_category(self.pending_cat)
pending = category.members(use_sql=True)
pending = category.get_members(use_sql=True)

for title, pageid in pending:
if title.decode("utf8") in self.ignore_list:
if title in self.ignore_list:
continue
if pageid not in tracked:
msg = "Tracking page [[{0}]] (id: {1})".format(title, pageid)
msg = u"Tracking page [[{0}]] (id: {1})".format(title, pageid)
self.logger.debug(msg)
self.track_page(cursor, pageid, title)

@@ -299,17 +300,17 @@ class Task(BaseTask):
if not title:
return

title = title.replace("_", " ")
title = title.replace("_", " ").decode("utf8")
query = "SELECT page_id, page_modify_oldid FROM page WHERE page_title = ?"
with self.conn.cursor() as cursor:
cursor.execute(query, (title,))
try:
pageid, oldid = cursor.fetchall()[0]
except IndexError:
msg = "Page [[{0}]] not found in database".format(title)
msg = u"Page [[{0}]] not found in database".format(title)
self.logger.error(msg)

msg = "Updating page [[{0}]] (id: {1}) @ {2}"
msg = u"Updating page [[{0}]] (id: {1}) @ {2}"
self.logger.info(msg.format(title, pageid, oldid))
self.update_page(cursor, pageid, title)

@@ -328,14 +329,14 @@ class Task(BaseTask):
"""
content = self.get_content(title)
if content is None:
msg = "Could not get page content for [[{0}]]".format(title)
msg = u"Could not get page content for [[{0}]]".format(title)
self.logger.error(msg)
return

namespace = self.site.get_page(title).namespace()
status, chart = self.get_status_and_chart(content, namespace)
if chart == self.CHART_NONE:
msg = "Could not find a status for [[{0}]]".format(title)
msg = u"Could not find a status for [[{0}]]".format(title)
self.logger.warn(msg)
return

@@ -348,10 +349,8 @@ class Task(BaseTask):
query1 = "INSERT INTO row VALUES (?, ?)"
query2 = "INSERT INTO page VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
cursor.execute(query1, (pageid, chart))
cursor.execute(query2, (pageid, status, title.decode("utf8"),
short.decode("utf8"), size, notes,
m_user.decode("utf8"), m_time, m_id,
s_user.decode("utf8"), s_time, s_id))
cursor.execute(query2, (pageid, status, title, short, size, notes,
m_user, m_time, m_id, s_user, s_time, s_id))

def update_page(self, cursor, pageid, title):
"""Update hook for when page is already in our database.
@@ -362,7 +361,7 @@ class Task(BaseTask):
"""
content = self.get_content(title)
if content is None:
msg = "Could not get page content for [[{0}]]".format(title)
msg = u"Could not get page content for [[{0}]]".format(title)
self.logger.error(msg)
return

@@ -379,9 +378,10 @@ class Task(BaseTask):

size = self.get_size(content)
m_user, m_time, m_id = self.get_modify(pageid)
notes = self.get_notes(chart, content, m_time, result["page_special_user"])
notes = self.get_notes(chart, content, m_time,
result["page_special_user"].decode("utf8"))

if title != result["page_title"]:
if title != result["page_title"].decode("utf8"):
self.update_page_title(cursor, result, pageid, title)

if m_id != result["page_modify_oldid"]:
@@ -397,21 +397,21 @@ class Task(BaseTask):
"""Update the title and short_title of a page in our database."""
query = "UPDATE page SET page_title = ?, page_short = ? WHERE page_id = ?"
short = self.get_short_title(title)
cursor.execute(query, (title.decode("utf8"), short.decode("utf8"),
pageid))
msg = " {0}: title: {1} -> {2}"
self.logger.debug(msg.format(pageid, result["page_title"], title))
cursor.execute(query, (title, short, pageid))

msg = u" {0}: title: {1} -> {2}"
old_title = result["page_title"].decode("utf8")
self.logger.debug(msg.format(pageid, old_title, title))

def update_page_modify(self, cursor, result, pageid, size, m_user, m_time, m_id):
"""Update the last modified information of a page in our database."""
query = """UPDATE page SET page_size = ?, page_modify_user = ?,
page_modify_time = ?, page_modify_oldid = ?
WHERE page_id = ?"""
cursor.execute(query, (size, m_user.decode("utf8"), m_time, m_id,
pageid))
cursor.execute(query, (size, m_user, m_time, m_id, pageid))

msg = " {0}: modify: {1} / {2} / {3} -> {4} / {5} / {6}"
msg = msg.format(pageid, result["page_modify_user"],
msg = u" {0}: modify: {1} / {2} / {3} -> {4} / {5} / {6}"
msg = msg.format(pageid, result["page_modify_user"].decode("utf8"),
result["page_modify_time"],
result["page_modify_oldid"], m_user, m_time, m_id)
self.logger.debug(msg)
@@ -432,10 +432,10 @@ class Task(BaseTask):
s_user, s_time, s_id = self.get_special(pageid, chart)

if s_id != result["page_special_oldid"]:
cursor.execute(query2, (s_user.decode("utf8"), s_time, s_id,
pageid))
msg = "{0}: special: {1} / {2} / {3} -> {4} / {5} / {6}"
msg = msg.format(pageid, result["page_special_user"],
cursor.execute(query2, (s_user, s_time, s_id, pageid))
msg = u"{0}: special: {1} / {2} / {3} -> {4} / {5} / {6}"
msg = msg.format(pageid,
result["page_special_user"].decode("utf8"),
result["page_special_time"],
result["page_special_oldid"], s_user, s_time, s_id)
self.logger.debug(msg)
@@ -456,36 +456,34 @@ class Task(BaseTask):
"""
query = "SELECT page_latest FROM page WHERE page_title = ? AND page_namespace = ?"
try:
namespace, base = title.decode("utf8").split(":", 1)
namespace, base = title.split(":", 1)
except ValueError:
base = title.decode("utf8")
base = title
ns = wiki.NS_MAIN
else:
try:
ns = self.site.namespace_name_to_id(namespace)
except wiki.NamespaceNotFoundError:
base = title.decode("utf8")
base = title
ns = wiki.NS_MAIN

result = self.site.sql_query(query, (base.replace(" ", "_"), ns))
revid = int(list(result)[0][0])

try:
revid = int(list(result)[0][0])
except IndexError:
return None
return self.get_revision_content(revid)

def get_revision_content(self, revid):
def get_revision_content(self, revid, tries=1):
"""Get the content of a revision by ID from the API."""
res = self.site.api_query(action="query", prop="revisions",
revids=revid, rvprop="content")
try:
return res["query"]["pages"].values()[0]["revisions"][0]["*"]
except KeyError:
sleep(5)
res = self.site.api_query(action="query", prop="revisions",
revids=revid, rvprop="content")
try:
return res["query"]["pages"].values()[0]["revisions"][0]["*"]
except KeyError:
return None
if tries > 0:
sleep(5)
return self.get_revision_content(revid, tries=tries-1)

def get_status_and_chart(self, content, namespace):
"""Determine the status and chart number of an AFC submission.
@@ -598,7 +596,8 @@ class Task(BaseTask):
JOIN page ON rev_id = page_latest WHERE page_id = ?"""
result = self.site.sql_query(query, (pageid,))
m_user, m_time, m_id = list(result)[0]
return m_user, datetime.strptime(m_time, "%Y%m%d%H%M%S"), m_id
timestamp = datetime.strptime(m_time, "%Y%m%d%H%M%S")
return m_user.decode("utf8"), timestamp, m_id

def get_special(self, pageid, chart):
"""Return information about a page's "special" edit.
@@ -613,7 +612,7 @@ class Task(BaseTask):
its revision ID. If the page's status is not something that involves
"special"-ing, we will return None for all three. The same will be
returned if we cannot determine when the page was "special"-ed, or if
it was "special"-ed more than 250 edits ago.
it was "special"-ed more than 100 edits ago.
"""
if chart ==self.CHART_NONE:
return None, None, None
@@ -656,7 +655,8 @@ class Task(BaseTask):
else:
if any(matches):
return last
last = (user, datetime.strptime(ts, "%Y%m%d%H%M%S"), revid)
timestamp = datetime.strptime(ts, "%Y%m%d%H%M%S")
last = (user.decode("utf8"), timestamp, revid)

return last

@@ -671,7 +671,8 @@ class Task(BaseTask):
(SELECT MIN(rev_id) FROM revision WHERE rev_page = ?)"""
result = self.site.sql_query(query, (pageid,))
c_user, c_time, c_id = list(result)[0]
return c_user, datetime.strptime(c_time, "%Y%m%d%H%M%S"), c_id
timestamp = datetime.strptime(c_time, "%Y%m%d%H%M%S")
return c_user.encode("utf8"), timestamp, c_id

def get_notes(self, chart, content, m_time, s_user):
"""Return any special notes or warnings about this page.


+ 46
- 29
earwigbot/wiki/category.py View File

@@ -35,7 +35,7 @@ class Category(Page):
because it accepts category names without the namespace prefix.

Public methods:
members -- returns a list of page titles in the category
get_members -- returns a list of page titles in the category
"""

def __repr__(self):
@@ -47,37 +47,54 @@ class Category(Page):
"""Returns a nice string representation of the Category."""
return '<Category "{0}" of {1}>'.format(self.title(), str(self._site))

def members(self, limit=50, use_sql=False):
def _get_members_via_sql(self, limit):
    """Return a list of tuples of (title, pageid) in the category.

    The category's members are looked up with a SQL query through
    `self._site.sql_query()` rather than through the API.

    `limit` is the maximum number of rows to request; a falsy value (None
    or 0) means no LIMIT clause is added and every row is returned.

    Each title is returned as a unicode object with underscores turned
    into spaces and the namespace name (if any) prefixed, e.g.
    u"Wikipedia:Foo bar".
    """
    query = """SELECT page_title, page_namespace, page_id FROM page
               JOIN categorylinks ON page_id = cl_from
               WHERE cl_to = ?"""
    # cl_to stores the category name without its namespace prefix and
    # with underscores instead of spaces.
    title = self.title().replace(" ", "_").split(":", 1)[1]

    if limit:
        # The LIMIT clause takes the row count directly; "LIMIT = ?" is
        # not valid SQL and would make every limited query fail.
        query += " LIMIT ?"
        result = self._site.sql_query(query, (title, limit))
    else:
        result = self._site.sql_query(query, (title,))

    members = []
    for row in result:
        # SQL gives byte strings; we want Unicode. Decoding before
        # replacing underscores yields the same text (an ASCII "_" never
        # occurs inside a multi-byte UTF-8 sequence).
        base = row[0].decode("utf8").replace(u"_", u" ")
        namespace = self._site.namespace_id_to_name(row[1])
        if namespace:
            full_title = u":".join((namespace, base))
        else:  # Avoid doing a silly (albeit valid) ":Pagename" thing
            full_title = base
        members.append((full_title, row[2]))
    return members

def _get_members_via_api(self, limit):
    """Return a list of page titles in the category using the API.

    A single `list=categorymembers` query is sent through
    `self._site._api_query()`. A falsy `limit` (None or 0) is replaced by
    the default of 50 before the request is made.
    """
    cmlimit = limit if limit else 50  # Default value
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmlimit": cmlimit,
        "cmtitle": self._title,
    }

    response = self._site._api_query(params)
    titles = []
    for entry in response['query']['categorymembers']:
        titles.append(entry["title"])
    return titles

def get_members(self, use_sql=False, limit=None):
"""Returns a list of page titles in the category.

If `limit` is provided, we will provide this many titles, or less if
the category is too small. `limit` defaults to 50; normal users can go
up to 500, and bots can go up to 5,000 on a single API query.
If `use_sql` is True, we will use a SQL query instead of the API. Pages
will be returned as tuples of (title, pageid) instead of just titles.

If `use_sql` is True, we will use a SQL query instead of the API. The
limit argument will be ignored, and pages will be returned as tuples
of (title, pageid) instead of just titles.
If `limit` is provided, we will provide this many titles, or less if
the category is smaller. `limit` defaults to 50 for API queries; normal
users can go up to 500, and bots can go up to 5,000 on a single API
query. If we're using SQL, the limit is None by default (returning all
pages in the category), but an arbitrary limit can still be chosen.
"""
if use_sql:
query = """SELECT page_title, page_namespace, page_id FROM page
JOIN categorylinks ON page_id = cl_from
WHERE cl_to = ?"""
title = self.title().replace(" ", "_").split(":", 1)[1]
result = self._site.sql_query(query, (title,))
members = []
for row in result:
body = row[0].replace("_", " ")
namespace = self._site.namespace_id_to_name(row[1])
if namespace:
title = ":".join((str(namespace), body))
else: # Avoid doing a silly (albeit valid) ":Pagename" thing
title = body
members.append((title, row[2]))
return members

return self._get_members_via_sql(limit)
else:
params = {"action": "query", "list": "categorymembers",
"cmlimit": limit, "cmtitle": self._title}
result = self._site._api_query(params)
members = result['query']['categorymembers']
return [member["title"] for member in members]
return self._get_members_via_api(limit)

Loading…
Cancel
Save