diff --git a/bot/classes/base_task.py b/bot/classes/base_task.py index 19aca0c..d568bb8 100644 --- a/bot/classes/base_task.py +++ b/bot/classes/base_task.py @@ -89,4 +89,6 @@ class BaseTask(object): return False if content == cfg.get("disabled", "run"): return False + + self.logger.warn("Emergency task shutoff has been enabled!") return True diff --git a/bot/rules.py b/bot/rules.py index 6388586..480c34a 100644 --- a/bot/rules.py +++ b/bot/rules.py @@ -36,7 +36,6 @@ def process(rc): chans.update(("##earwigbot", "#wikipedia-en-afc")) if r_page.search(page_name): - tasks.start("afc_statistics", action="edit", page=rc.page) tasks.start("afc_copyvios", action="edit", page=rc.page) chans.add("#wikipedia-en-afc") @@ -49,19 +48,16 @@ def process(rc): elif rc.flags == "move" and (r_move1.match(comment) or r_move2.match(comment)): p = r_moved_pages.findall(rc.comment)[0] - tasks.start("afc_statistics", action="move", page=p) tasks.start("afc_copyvios", action="move", page=p) chans.add("#wikipedia-en-afc") elif rc.flags == "delete" and r_delete.match(comment): p = r_deleted_page.findall(rc.comment)[0] - tasks.start("afc_statistics", action="delete", page=p) tasks.start("afc_copyvios", action="delete", page=p) chans.add("#wikipedia-en-afc") elif rc.flags == "restore" and r_restore.match(comment): p = r_restored_page.findall(rc.comment)[0] - tasks.start("afc_statistics", action="restore", page=p) tasks.start("afc_copyvios", action="restore", page=p) chans.add("#wikipedia-en-afc") diff --git a/bot/tasks/afc_statistics.py b/bot/tasks/afc_statistics.py index c2cba50..59ff327 100644 --- a/bot/tasks/afc_statistics.py +++ b/bot/tasks/afc_statistics.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from datetime import datetime +import logging import re from os.path import expanduser from threading import Lock @@ -27,7 +28,7 @@ class Task(BaseTask): # Set some wiki-related attributes: self.pagename = cfg.get("page", "Template:AFC statistics") - self.pending_cat = cfg.get("pending", "Pending_AfC_submissions") + self.pending_cat = cfg.get("pending", "Pending AfC submissions") self.ignore_list = cfg.get("ignore_list", []) default_summary = "Updating statistics for [[WP:WPAFC|WikiProject Articles for creation]]." self.summary = self.make_summary(cfg.get("summary", default_summary)) @@ -49,26 +50,16 @@ class Task(BaseTask): self.conn = oursql.connect(**self.conn_data) action = kwargs.get("action") - if not action: - return - - methods = { - "save": self.save, - "sync": self.sync, - "edit": self.process_edit, - "move": self.process_move, - "delete": self.process_delete, - "restore": self.process_edit, - } - - method = methods.get(action) - if method: - try: - method(**kwargs) - finally: - self.conn.close() + try: + if action == "save": + self.save() + elif action == "sync": + self.sync() + finally: + self.conn.close() def save(self, **kwargs): + self.logger.info("Saving chart") if kwargs.get("fromIRC"): summary = " ".join((self.summary, "(!earwigbot)")) else: @@ -84,11 +75,13 @@ class Task(BaseTask): statistics.join(("\\1\n", "\n\\3")), text, flags=re.DOTALL) if newtext == text: + self.logger.info("Chart unchanged; not saving") return # Don't edit the page if we're not adding anything newtext = re.sub("()(.*?)()", "\\1~~~ at ~~~~~\\3", newtext) page.edit(newtext, summary, minor=True, bot=True) + self.logger.info("Chart saved to [[{0}]]".format(page.title())) def compile_charts(self): stats = "" @@ -136,118 +129,81 @@ class Task(BaseTask): return timestamp.strftime("%H:%M, %d %B %Y") def sync(self, **kwargs): + self.logger.info("Starting sync") + self.report_replag() with self.conn.cursor() as cursor, self.db_access_lock: - self.sync_deleted(cursor) # Remove deleted subs - self.sync_oldids(cursor) # Make sure all subs are up to date - self.sync_pending(cursor) # Add missing pending subs - self.sync_old(cursor) # Remove old declined and accepted subs - - def sync_deleted(self, cursor): - query1 = "SELECT page_id FROM page" - query2 = "SELECT page_id FROM page WHERE page_id = ?" - cursor.execute(query1) - for page in cursor: - result = self.site.sql_query(query2, (page[0],)) - if not list(result): - self.untrack_page(cursor, pageid=page[0]) + self.update_tracked(cursor) + self.add_untracked(cursor) + self.delete_old(cursor) + self.logger.info("Sync completed") + + def report_replag(self): + replag = self.site.get_replag() + if replag < 60: + lvl = logging.DEBUG + elif replag < 720: + lvl = logging.INFO + else: + lvl = logging.WARNING + self.logger.log(lvl, "Server replag is {0}".format(replag)) - def sync_oldids(self, cursor): + def update_tracked(self, cursor): + self.logger.debug("Updating tracked submissions") query1 = "SELECT page_id, page_title, page_modify_oldid FROM page" query2 = "SELECT page_latest, page_title FROM page WHERE page_id = ?" cursor.execute(query1) - for page_id, title, oldid in cursor: - result = list(self.site.sql_query(query2, (page_id,))) + for pageid, title, oldid in cursor: + msg = "Updating tracked page: [[{0}]] (id: {1}) @ {2}" + self.logger.debug(msg.format(pageid, title, oldid)) + result = list(self.site.sql_query(query2, (pageid,))) try: real_oldid = result[0][0] real_title = result[0][1] except IndexError: # Page doesn't exist! - self.untrack_page(cursor, pageid=page_id) + self.untrack_page(cursor, pageid) continue if real_oldid != oldid: - self.update_page(cursor, real_title) - - def sync_pending(self, cursor): - query1 = """SELECT page_id FROM page JOIN row ON page_id = row_id - WHERE row_chart IN (1, 2, 3)""" - query2 = """SELECT cl_from, page_title, page_namespace - FROM categorylinks JOIN page ON cl_from = page_id - WHERE cl_to = ?""" - cursor.execute(query1) + self.update_page(cursor, pageid, real_title) + + def add_untracked(self, cursor): + self.logger.debug("Adding untracked pending submissions") + cursor.execute("SELECT page_id FROM page") tracked = [i[0] for i in cursor.fetchall()] - result = self.site.sql_query(query2, (self.pending_cat,)) - for pageid, title, ns in result: - title = ":".join((self.site.namespace_id_to_name(ns), title)) - if title.replace("_", " ") in self.ignore_list: + category = self.site.get_category(self.pending_cat) + pending = category.members(limit=500) + + for title, pageid in pending: + if title in self.ignore_list: continue if pageid not in tracked: - self.track_page(cursor, title) + self.track_page(cursor, pageid, title) - def sync_old(self, cursor): + def delete_old(self, cursor): + self.logger.debug("Removing old submissions from chart") query = """DELETE FROM page, row USING page JOIN row ON page_id = row_id WHERE row_chart IN (4, 5) AND ADDTIME(page_special_time, '36:00:00') < NOW()""" cursor.execute(query) - def process_edit(self, page, **kwargs): - if page in self.ignore_list: - return - with self.conn.cursor() as cursor, self.db_access_lock: - self.sync_page(cursor, page) + def untrack_page(self, cursor, pageid): + self.logger.debug("Untracking page (id: {0})".format(pageid)) + query = """DELETE FROM page, row USING page JOIN row + ON page_id = row_id WHERE page_id = ?""" + cursor.execute(query, (pageid,)) - def process_move(self, page, **kwargs): - query1 = "SELECT * FROM page WHERE page_title = ?" - query2 = "SELECT page_latest FROM page WHERE page_namespace = ? AND page_title = ?" - query3 = "UPDATE page SET page_title = ?, page_modify_oldid = ? WHERE page_title = ?" - source, dest = page - with self.conn.cursor() as cursor, self.db_access_lock: - cursor.execute(query1, (source,)) - result = cursor.fetchall() - if result: - res = self.site.sql_query(query2, self.split_title(dest)) - try: - new_oldid = list(res)[0][0] - except IndexError: - new_oldid = result[0][11] - cursor.execute(query3, (dest, new_oldid, source)) - else: - self.track_page(cursor, dest) - - def process_delete(self, page, **kwargs): - query = "SELECT page_id FROM page WHERE page_namespace = ? AND page_title = ?" - with self.conn.cursor() as cursor, self.db_access_lock: - result = self.site.sql_query(query, self.split_title(page)) - if list(result): - self.sync_page(cursor, page) - else: - self.untrack_page(cursor, title=page) - - def sync_page(self, cursor, page): - query = "SELECT * FROM page WHERE page_title = ?" - cursor.execute(query, (page,)) - result = cursor.fetchall() - if result: - self.update_page(cursor, page) - else: - self.track_page(cursor, page) - - def untrack_page(self, cursor, pageid=None, title=None): - query = "DELETE FROM page, row USING page JOIN row ON page_id = row_id WHERE " - if pageid: - query += "page_id = ?" - cursor.execute(query, (pageid,)) - elif title: - query += "page_title = ?" - cursor.execute(query, (title,)) - - def track_page(self, cursor, title): + def track_page(self, cursor, pageid, title): """Update hook for when page is not in our database.""" + msg = "Tracking page [[{0}]] (id: {1})".format(title, pageid) + self.logger.debug(msg) + page = self.site.get_page(title) status, chart = self.get_status_and_chart(page) - if not status or status in ("accept", "decline"): + if not status: + msg = "Could not find a status for [[{0}]]".format(title) + self.logger.warn(msg) return - pageid = page.pageid() title = page.title() short = self.get_short_title(title) size = len(page.get()) @@ -256,21 +212,29 @@ class Task(BaseTask): m_user, m_time, m_id = self.get_modify(pageid) s_user, s_time, s_id = self.get_special(page, status) - query1 = "INSERT INTO row VALUES (?, ?)" - query2 = "INSERT INTO page VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" - cursor.execute(query1, (pageid, chart)) - cursor.execute(query2, (pageid, status, title, short, size, notes, + query1 = "INSERT INTO row VALUES ?" + query2 = "INSERT INTO page VALUES ?" + cursor.execute(query1, ((pageid, chart),)) + cursor.execute(query2, ((pageid, status, title, short, size, notes, c_user, c_time, c_id, m_user, m_time, m_id, - s_user, s_time, s_id)) + s_user, s_time, s_id),)) - def update_page(self, cursor, title): + def update_page(self, cursor, pageid, title): """Update hook for when page is in our database.""" + msg = "Updating page [[{0}]] (id: {1})".format(title, pageid) + self.logger.debug(msg) + page = self.site.get_page(title) status, chart = self.get_status_and_chart(page) if not status: - self.untrack_page(cursor, title=title) + self.untrack_page(cursor, pageid) + + if pageid != page.pageid(): + msg = "Page [[{0}]] is not what it should be! (id: {0} != {1})" + self.logger.warn(msg.format(pageid, page.pageid())) + self.report_replag() + self.untrack_page(cursor, pageid) - pageid = page.pageid() title = page.title() size = len(page.get()) notes = self.get_notes(page) @@ -282,43 +246,72 @@ class Task(BaseTask): result = dict_cursor.fetchall()[0] if title != result["page_title"]: - query = "UPDATE page SET page_title = ?, page_short = ? WHERE page_id = ?" - short = self.get_short_title(title) - cursor.execute(query, (title, short, pageid)) + self.update_page_title(cursor, result, pageid, title) if m_id != result["page_modify_oldid"]: - query = """UPDATE page SET page_size = ?, page_modify_user = ?, - page_modify_time = ?, page_modify_oldid = ? - WHERE page_id = ?""" - cursor.execute(query, (size, m_user, m_time, m_id, pageid)) + self.update_page_modify(cursor, result, pageid, size, m_user, m_time, m_id) if status != result["page_status"]: - query1 = """UPDATE page JOIN row ON page_id = row_id - SET page_status = ?, row_chart = ? WHERE page_id = ?""" - query2 = """UPDATE page SET page_special_user = ?, - page_special_time = ?, page_special_oldid = ? - WHERE page_id = ?""" - cursor.execute(query1, (status, chart, pageid)) - s_user, s_time, s_id = self.get_special(page, status) - if s_id != result["page_special_oldid"]: - cursor.execute(query2, (s_user, s_time, s_id, pageid)) + self.update_page_special(cursor, result, pageid, status, chart, page) if notes != result["page_notes"]: - query = "UPDATE page SET page_notes = ? WHERE page_id = ?" - cursor.execute(query, (notes, pageid)) + self.update_page_notes(cursor, result, pageid, notes) - def split_title(self, title): - namespace, body = title.split(":", 1)[0] - if not body: - return 0, title - try: - ns = self.site.namespace_name_to_id(namespace) - except wiki.NamespaceNotFoundError: - return 0, title - return ns, body + def update_page_title(self, cursor, result, pageid, title): + query = "UPDATE page SET page_title = ?, page_short = ? WHERE page_id = ?" + short = self.get_short_title(title) + cursor.execute(query, (title, short, pageid)) + msg = "{0}: title: {1} -> {2}" + self.logger.debug(msg.format(pageid, result["page_title"], title)) + + def update_page_modify(self, cursor, result, pageid, size, m_user, m_time, m_id): + query = """UPDATE page SET page_size = ?, page_modify_user = ?, + page_modify_time = ?, page_modify_oldid = ? + WHERE page_id = ?""" + cursor.execute(query, (size, m_user, m_time, m_id, pageid)) + + msg = "{0}: modify: {1} / {2} / {3} -> {4} / {5} / {6}" + msg = msg.format(pageid, result["page_modify_user"], + result["page_modify_time"], + result["page_modify_oldid"], m_user, m_time, m_id) + self.logger.debug(msg) + + def update_page_special(self, cursor, result, pageid, status, chart, page): + query1 = """UPDATE page JOIN row ON page_id = row_id + SET page_status = ?, row_chart = ? WHERE page_id = ?""" + query2 = """UPDATE page SET page_special_user = ?, + page_special_time = ?, page_special_oldid = ? + WHERE page_id = ?""" + cursor.execute(query1, (status, chart, pageid)) + + msg = "{0}: status: {1} ({2}) -> {3} ({4})" + self.logger.debug(msg.format(pageid, result["page_status"], + result["row_chart"], status, chart)) + + s_user, s_time, s_id = self.get_special(page, status) + + if s_id != result["page_special_oldid"]: + cursor.execute(query2, (s_user, s_time, s_id, pageid)) + msg = "{0}: special: {1} / {2} / {3} -> {4} / {5} / {6}" + msg = msg.format(pageid, result["page_special_user"], + result["page_special_time"], + result["page_special_oldid"], m_user, m_time, m_id) + self.logger.debug(msg) + + def update_page_notes(self, cursor, result, pageid, notes): + query = "UPDATE page SET page_notes = ? WHERE page_id = ?" + cursor.execute(query, (notes, pageid)) + msg = "{0}: notes: {1} -> {2}" + self.logger.debug(msg.format(pageid, result["page_notes"], notes)) def get_status_and_chart(self, page): - content = page.get() + try: + content = page.get() + except wiki.PageNotFoundError: + msg = "Page [[{0}]] does not exist, but the server said it should!" + self.logger.warn(msg.format(page.title())) + return None, 0 + if page.is_redirect(): target = page.get_redirect_target() if self.site.get_page(target).namespace() == 0: @@ -343,13 +336,12 @@ class Task(BaseTask): return short def get_create(self, pageid): - query1 = "SELECT MIN(rev_id) FROM revision WHERE rev_page = ?" - query2 = "SELECT rev_user_text, rev_timestamp, rev_id FROM revision WHERE rev_id = ?" - result1 = self.site.sql_query(query1, (pageid,)) - rev_id = list(result1)[0][0] - result2 = self.site.sql_query(query2, (rev_id,)) - m_user, m_time, m_id = list(result2)[0] - return m_user, datetime.strptime(m_time, "%Y%m%d%H%M%S"), m_id + query = """SELECT rev_user_text, rev_timestamp, rev_id + FROM revision WHERE rev_id = + (SELECT MIN(rev_id) FROM revision WHERE rev_page = ?)""" + result = self.site.sql_query(query, (pageid,)) + c_user, c_time, c_id = list(result)[0] + return c_user, datetime.strptime(c_time, "%Y%m%d%H%M%S"), c_id def get_modify(self, pageid): query = """SELECT rev_user_text, rev_timestamp, rev_id FROM revision diff --git a/bot/wiki/category.py b/bot/wiki/category.py index 6d9b841..0fd0669 100644 --- a/bot/wiki/category.py +++ b/bot/wiki/category.py @@ -13,7 +13,7 @@ class Category(Page): because it accepts category names without the namespace prefix. Public methods: - members -- returns a list of titles in the category + members -- returns a list of pages in the category as (title, id) tuples """ def __repr__(self): @@ -26,7 +26,7 @@ class Category(Page): return ''.format(self.title(), str(self._site)) def members(self, limit=50): - """Returns a list of titles in the category. + """Returns a list of pages in the category as (title, pageid) tuples. If `limit` is provided, we will provide this many titles, or less if the category is too small. `limit` defaults to 50; normal users can go @@ -36,4 +36,4 @@ class Category(Page): "cmlimit": limit, "cmtitle": self._title} result = self._site._api_query(params) members = result['query']['categorymembers'] - return [member["title"] for member in members] + return [(member["title"], member["pageid"]) for member in members]