From 06118c0b4c6cf19155093340827b5e0abdb2ab6e Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 5 Nov 2011 23:37:36 -0400
Subject: [PATCH] A bunch of updates in config, wiki, and tasks.afc_statistics

---
 bot/config.py               |  10 +--
 bot/tasks/afc_statistics.py | 157 ++++++++++++++++++++++++++++----------------
 bot/wiki/category.py        |  35 +++++++---
 bot/wiki/page.py            |   5 +-
 bot/wiki/site.py            |   5 ++
 5 files changed, 140 insertions(+), 72 deletions(-)

diff --git a/bot/config.py b/bot/config.py
index 02626a9..bf9b872 100644
--- a/bot/config.py
+++ b/bot/config.py
@@ -216,13 +216,13 @@ class BotFormatter(logging.Formatter):
     def format_color(self, record):
         l = record.levelname.ljust(8)
         if record.levelno == logging.DEBUG:
-            record.lvl = l.join(("\x1b[37m", "\x1b[0m"))
+            record.lvl = l.join(("\x1b[34m", "\x1b[0m"))  # Blue
         if record.levelno == logging.INFO:
-            record.lvl = l.join(("\x1b[32m", "\x1b[0m"))
+            record.lvl = l.join(("\x1b[32m", "\x1b[0m"))  # Green
         if record.levelno == logging.WARNING:
-            record.lvl = l.join(("\x1b[36m", "\x1b[0m"))
+            record.lvl = l.join(("\x1b[33m", "\x1b[0m"))  # Yellow
         if record.levelno == logging.ERROR:
-            record.lvl = l.join(("\x1b[33m", "\x1b[0m"))
+            record.lvl = l.join(("\x1b[31m", "\x1b[0m"))  # Red
         if record.levelno == logging.CRITICAL:
-            record.lvl = l.join(("\x1b[31m", "\x1b[0m"))
+            record.lvl = l.join(("\x1b[1m\x1b[31m", "\x1b[0m"))  # Bold red
         return record
diff --git a/bot/tasks/afc_statistics.py b/bot/tasks/afc_statistics.py
index 59ff327..f9446cf 100644
--- a/bot/tasks/afc_statistics.py
+++ b/bot/tasks/afc_statistics.py
@@ -12,6 +12,14 @@ from classes import BaseTask
 import config
 import wiki
 
+# Chart status number constants:
+CHART_NONE = 0
+CHART_PEND = 1
+CHART_DRAFT = 2
+CHART_REVIEW = 3
+CHART_ACCEPT = 4
+CHART_DECLINE = 5
+
 class Task(BaseTask):
     """A task to generate statistics for WikiProject Articles for Creation.
 
@@ -56,7 +64,7 @@ class Task(BaseTask):
             elif action == "sync":
                 self.sync()
         finally:
-            self.conn.close()
+            self.conn.close()
 
     def save(self, **kwargs):
         self.logger.info("Saving chart")
@@ -130,27 +138,25 @@ class Task(BaseTask):
 
     def sync(self, **kwargs):
         self.logger.info("Starting sync")
-        self.report_replag()
+        replag = self.site.get_replag()
+        self.logger.debug("Server replag is {0}".format(replag))
+        if replag > 600:
+            msg = "Sync canceled as replag ({0} secs) is greater than ten minutes."
+            self.logger.warn(msg.format(replag))
+            return  # Actually cancel: the replica is too stale to sync from
+
         with self.conn.cursor() as cursor, self.db_access_lock:
             self.update_tracked(cursor)
             self.add_untracked(cursor)
             self.delete_old(cursor)
-        self.logger.info("Sync completed")
 
-    def report_replag(self):
-        replag = self.site.get_replag()
-        if replag < 60:
-            lvl = logging.DEBUG
-        elif replag < 720:
-            lvl = logging.INFO
-        else:
-            lvl = logging.WARNING
-        self.logger.log(lvl, "Server replag is {0}".format(replag))
+        self.logger.info("Sync completed")
 
     def update_tracked(self, cursor):
         self.logger.debug("Updating tracked submissions")
         query1 = "SELECT page_id, page_title, page_modify_oldid FROM page"
-        query2 = "SELECT page_latest, page_title FROM page WHERE page_id = ?"
+        query2 = """SELECT page_latest, page_title, page_namespace FROM page
+                    WHERE page_id = ?"""
         cursor.execute(query1)
         for pageid, title, oldid in cursor:
             msg = "Updating tracked page: [[{0}]] (id: {1}) @ {2}"
@@ -158,11 +164,13 @@ class Task(BaseTask):
             result = list(self.site.sql_query(query2, (pageid,)))
             try:
                 real_oldid = result[0][0]
-                real_title = result[0][1]
             except IndexError:  # Page doesn't exist!
                 self.untrack_page(cursor, pageid)
                 continue
             if real_oldid != oldid:
+                body = result[0][1].replace("_", " ")
+                ns = self.site.namespace_id_to_name(result[0][2])
+                real_title = ":".join((ns, body)) if ns else body
                 self.update_page(cursor, pageid, real_title)
 
     def add_untracked(self, cursor):
@@ -171,7 +179,7 @@ class Task(BaseTask):
         tracked = [i[0] for i in cursor.fetchall()]
 
         category = self.site.get_category(self.pending_cat)
-        pending = category.members(limit=500)
+        pending = category.members(use_sql=True)
 
         for title, pageid in pending:
             if title in self.ignore_list:
@@ -182,9 +190,9 @@ class Task(BaseTask):
     def delete_old(self, cursor):
         self.logger.debug("Removing old submissions from chart")
         query = """DELETE FROM page, row USING page JOIN row
-                   ON page_id = row_id WHERE row_chart IN (4, 5)
+                   ON page_id = row_id WHERE row_chart IN (?, ?)
                    AND ADDTIME(page_special_time, '36:00:00') < NOW()"""
-        cursor.execute(query)
+        cursor.execute(query, (CHART_ACCEPT, CHART_DECLINE))
 
     def untrack_page(self, cursor, pageid):
         self.logger.debug("Untracking page (id: {0})".format(pageid))
@@ -197,20 +205,19 @@ class Task(BaseTask):
         msg = "Tracking page [[{0}]] (id: {1})".format(title, pageid)
         self.logger.debug(msg)
 
-        page = self.site.get_page(title)
-        status, chart = self.get_status_and_chart(page)
+        content = self.get_content(title)
+        status, chart = self.get_status_and_chart(content)
         if not status:
             msg = "Could not find a status for [[{0}]]".format(title)
-            self.logger.warn(msg)
+            self.logger.error(msg)
             return
 
-        title = page.title()
         short = self.get_short_title(title)
-        size = len(page.get())
-        notes = self.get_notes(page)
+        size = len(content)
+        notes = self.get_notes(pageid)
         c_user, c_time, c_id = self.get_create(pageid)
         m_user, m_time, m_id = self.get_modify(pageid)
-        s_user, s_time, s_id = self.get_special(page, status)
+        s_user, s_time, s_id = self.get_special(pageid, chart)
 
         query1 = "INSERT INTO row VALUES ?"
         query2 = "INSERT INTO page VALUES ?"
@@ -224,20 +231,31 @@ class Task(BaseTask):
         msg = "Updating page [[{0}]] (id: {1})".format(title, pageid)
         self.logger.debug(msg)
 
-        page = self.site.get_page(title)
-        status, chart = self.get_status_and_chart(page)
-        if not status:
-            self.untrack_page(cursor, pageid)
+        content = self.get_content(title)
+        try:
+            redirect_regex = wiki.Page.re_redirect
+            target_title = re.findall(redirect_regex, content, flags=re.I)[0]
+        except IndexError:  # Not a redirect, so read the status directly
+            status, chart = self.get_status_and_chart(content)
+        else:
+            target_ns = self.site.get_page(target_title).namespace()
+            if target_ns == wiki.NS_MAIN:
+                status, chart = "accept", CHART_ACCEPT
+            elif target_ns in [wiki.NS_PROJECT, wiki.NS_PROJECT_TALK]:
+                title, content = target_title, self.get_content(target_title)
+                status, chart = self.get_status_and_chart(content)
+            else:
+                msg = "Page has moved to namespace {0}".format(target_ns)
+                self.logger.debug(msg)
+                self.untrack_page(cursor, pageid)
+                return
 
-        if pageid != page.pageid():
-            msg = "Page [[{0}]] is not what it should be! (id: {0} != {1})"
-            self.logger.warn(msg.format(pageid, page.pageid()))
-            self.report_replag()
+        if not status:
             self.untrack_page(cursor, pageid)
+            return
 
-        title = page.title()
-        size = len(page.get())
-        notes = self.get_notes(page)
+        size = len(content)
+        notes = self.get_notes(pageid)
         m_user, m_time, m_id = self.get_modify(pageid)
 
         query = "SELECT * FROM page JOIN row ON page_id = row_id WHERE page_id = ?"
@@ -288,7 +306,7 @@ class Task(BaseTask):
             self.logger.debug(msg.format(pageid, result["page_status"],
                                          result["row_chart"], status, chart))
 
-            s_user, s_time, s_id = self.get_special(page, status)
+            s_user, s_time, s_id = self.get_special(pageid, chart)
 
             if s_id != result["page_special_oldid"]:
                 cursor.execute(query2, (s_user, s_time, s_id, pageid))
@@ -304,30 +322,31 @@ class Task(BaseTask):
             msg = "{0}: notes: {1} -> {2}"
             self.logger.debug(msg.format(pageid, result["page_notes"], notes))
 
-    def get_status_and_chart(self, page):
+    def get_content(self, title):
+        query = "SELECT page_latest FROM page WHERE page_title = ? AND page_namespace = ?"
+        namespace, _, base = title.partition(":")  # Safe when there is no ":"
         try:
-            content = page.get()
-        except wiki.PageNotFoundError:
-            msg = "Page [[{0}]] does not exist, but the server said it should!"
-            self.logger.warn(msg.format(page.title()))
-            return None, 0
-
-        if page.is_redirect():
-            target = page.get_redirect_target()
-            if self.site.get_page(target).namespace() == 0:
-                return "accept", 4
-            return None, 0
-        elif re.search("\{\{afc submission\|r\|(.*?)\}\}", content, re.I):
-            return "review", 3
+            ns = self.site.namespace_name_to_id(namespace)
+        except wiki.NamespaceNotFoundError:
+            base = title
+            ns = wiki.NS_MAIN
+
+        result = self.site.sql_query(query, (base.replace(" ", "_"), ns))
+        revid = list(result)[0][0]  # Rows are tuples; take the lone column
+        return self.site.get_revid_content(revid)
+
+    def get_status_and_chart(self, content):
+        if re.search("\{\{afc submission\|r\|(.*?)\}\}", content, re.I):
+            return "review", CHART_REVIEW
         elif re.search("\{\{afc submission\|h\|(.*?)\}\}", content, re.I):
-            return "pend", 2
+            return "pend", CHART_DRAFT
         elif re.search("\{\{afc submission\|\|(.*?)\}\}", content, re.I):
-            return "pend", 1
+            return "pend", CHART_PEND
         elif re.search("\{\{afc submission\|t\|(.*?)\}\}", content, re.I):
-            return None, 0
+            return None, CHART_NONE
         elif re.search("\{\{afc submission\|d\|(.*?)\}\}", content, re.I):
-            return "decline", 5
-        return None, 0
+            return "decline", CHART_DECLINE
+        return None, CHART_NONE
 
     def get_short_title(self, title):
         short = re.sub("Wikipedia(\s*talk)?\:Articles\sfor\screation\/", "", title)
@@ -350,8 +369,32 @@ class Task(BaseTask):
         m_user, m_time, m_id = list(result)[0]
         return m_user, datetime.strptime(m_time, "%Y%m%d%H%M%S"), m_id
 
-    def get_special(self, page, status):
+    def get_special(self, pageid, chart):
+        if chart == CHART_PEND:
+            return None, None, None
+        elif chart == CHART_ACCEPT:
+            return self.get_create(pageid)
+        elif chart == CHART_DRAFT:
+            search = "\{\{afc submission\|h\|(.*?)\}\}"
+        elif chart == CHART_REVIEW:
+            search = "\{\{afc submission\|r\|(.*?)\}\}"
+        elif chart == CHART_DECLINE:
+            search = "\{\{afc submission\|d\|(.*?)\}\}"
+
+        query = """SELECT rev_user_text, rev_timestamp, rev_id
+                   FROM revision WHERE rev_page = ? ORDER BY rev_id DESC"""
+        result = self.site.sql_query(query, (pageid,))
+
+        counter = 0
+        for user, ts, revid in result:
+            counter += 1
+            if counter > 100:
+                break
+            content = self.site.get_revid_content(revid)
+            if not re.search(search, content, re.I):  # Template is absent
+                return user, datetime.strptime(ts, "%Y%m%d%H%M%S"), revid
+
         return None, None, None
 
-    def get_notes(self, page):
+    def get_notes(self, pageid):
         return None
diff --git a/bot/wiki/category.py b/bot/wiki/category.py
index 0fd0669..bb8886f 100644
--- a/bot/wiki/category.py
+++ b/bot/wiki/category.py
@@ -13,7 +13,7 @@ class Category(Page):
     because it accepts category names without the namespace prefix.
 
     Public methods:
-    members -- returns a list of pages in the category as (title, id) tuples
+    members -- returns a list of page titles in the category
     """
 
     def __repr__(self):
@@ -25,15 +25,34 @@ class Category(Page):
         """Returns a nice string representation of the Category."""
         return ''.format(self.title(), str(self._site))
 
-    def members(self, limit=50):
-        """Returns a list of pages in the category as (title, pageid) tuples.
+    def members(self, limit=50, use_sql=False):
+        """Returns a list of page titles in the category.
 
         If `limit` is provided, we will provide this many titles, or less if
         the category is too small. `limit` defaults to 50; normal users can go
         up to 500, and bots can go up to 5,000 on a single API query.
+
+        If `use_sql` is True, we will use a SQL query instead of the API. The
+        limit argument will be ignored, and pages will be returned as tuples
+        of (title, pageid) instead of just titles.
         """
-        params = {"action": "query", "list": "categorymembers",
-                  "cmlimit": limit, "cmtitle": self._title}
-        result = self._site._api_query(params)
-        members = result['query']['categorymembers']
-        return [(member["title"], member["pageid"]) for member in members]
+        if use_sql:
+            query = """SELECT page_title, page_namespace, page_id FROM page
+                       JOIN categorylinks ON page_id = cl_from
+                       WHERE cl_to = ?"""
+            title = self.title().replace(" ", "_").split(":", 1)[1]
+            result = self._site.sql_query(query, (title,))
+            members = []
+            for row in result:
+                body = row[0].replace("_", " ")
+                namespace = self._site.namespace_id_to_name(row[1])
+                title = ":".join((namespace, body)) if namespace else body
+                members.append((title, row[2]))
+            return members
+
+        else:
+            params = {"action": "query", "list": "categorymembers",
+                      "cmlimit": limit, "cmtitle": self._title}
+            result = self._site._api_query(params)
+            members = result['query']['categorymembers']
+            return [member["title"] for member in members]
diff --git a/bot/wiki/page.py b/bot/wiki/page.py
index 808cc18..970197b 100644
--- a/bot/wiki/page.py
+++ b/bot/wiki/page.py
@@ -32,6 +32,8 @@ class Page(object):
     add_section -- add a new section at the bottom of the page
     """
 
+    re_redirect = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
+
     def __init__(self, site, title, follow_redirects=False):
         """Constructor for new Page instances.
 
@@ -617,9 +619,8 @@ class Page(object):
         if the page is not a redirect.
         """
         content = self.get(force)
-        regexp = "^\s*\#\s*redirect\s*\[\[(.*?)\]\]"
         try:
-            return re.findall(regexp, content, flags=re.IGNORECASE)[0]
+            return re.findall(self.re_redirect, content, flags=re.I)[0]
         except IndexError:
             e = "The page does not appear to have a redirect target."
             raise RedirectError(e)
diff --git a/bot/wiki/site.py b/bot/wiki/site.py
index 624a86c..bbff658 100644
--- a/bot/wiki/site.py
+++ b/bot/wiki/site.py
@@ -41,6 +41,7 @@ class Site(object):
     api_query            -- does an API query with the given kwargs as params
     sql_query            -- does an SQL query and yields its results
     get_replag           -- returns the estimated database replication lag
+    get_revid_content    -- returns the content of a revision ID from SQL
     namespace_id_to_name -- given a namespace ID, returns associated name(s)
     namespace_name_to_id -- given a namespace name, returns associated id
     get_page             -- returns a Page object for the given title
@@ -535,6 +536,10 @@ class Site(object):
         result = list(self.sql_query(query))
         return result[0][0]
 
+    def get_revid_content(self, revid):
+        """Return the content of a revision ID from SQL."""
+        return None
+
     def namespace_id_to_name(self, ns_id, all=False):
         """Given a namespace ID, returns associated namespace names.
 