|
|
@@ -54,19 +54,32 @@ class Task(BaseTask): |
|
|
|
self.db_access_lock = Lock() |
|
|
|
|
|
|
|
def run(self, **kwargs): |
|
|
|
"""Entry point for a task event. |
|
|
|
|
|
|
|
Depending on the kwargs passed, we will either synchronize our local |
|
|
|
statistics database with the site (self.sync()) or save it to the wiki |
|
|
|
(self.save()). We will additionally create an SQL connection with our |
|
|
|
local database. |
|
|
|
""" |
|
|
|
self.site = wiki.get_site() |
|
|
|
self.conn = oursql.connect(**self.conn_data) |
|
|
|
|
|
|
|
action = kwargs.get("action") |
|
|
|
try: |
|
|
|
if action == "save": |
|
|
|
self.save() |
|
|
|
self.save(**kwargs) |
|
|
|
elif action == "sync": |
|
|
|
self.sync() |
|
|
|
self.sync(**kwargs) |
|
|
|
finally: |
|
|
|
self.conn.close() |
|
|
|
|
|
|
|
def save(self, **kwargs): |
|
|
|
"""Save our local statistics to the wiki. |
|
|
|
|
|
|
|
After checking for emergency shutoff, the statistics chart is compiled, |
|
|
|
and then saved to self.pagename using self.summary iff it has changed |
|
|
|
since last save. |
|
|
|
""" |
|
|
|
self.logger.info("Saving chart") |
|
|
|
if kwargs.get("fromIRC"): |
|
|
|
summary = " ".join((self.summary, "(!earwigbot)")) |
|
|
@@ -92,6 +105,7 @@ class Task(BaseTask): |
|
|
|
self.logger.info("Chart saved to [[{0}]]".format(page.title())) |
|
|
|
|
|
|
|
def compile_charts(self): |
|
|
|
"""Compile and return all statistics information from our local db.""" |
|
|
|
stats = "" |
|
|
|
with self.conn.cursor() as cursor, self.db_access_lock: |
|
|
|
cursor.execute("SELECT * FROM chart") |
|
|
@@ -100,6 +114,7 @@ class Task(BaseTask): |
|
|
|
return stats[:-1] # Drop the last newline |
|
|
|
|
|
|
|
def compile_chart(self, chart_info): |
|
|
|
"""Compile and return a single statistics chart.""" |
|
|
|
chart_id, chart_title, special_title = chart_info |
|
|
|
|
|
|
|
chart = "|".join((self.tl_header, chart_title)) |
|
|
@@ -117,6 +132,11 @@ class Task(BaseTask): |
|
|
|
return chart |
|
|
|
|
|
|
|
def compile_chart_row(self, page): |
|
|
|
"""Compile and return a single chart row. |
|
|
|
|
|
|
|
'page' is a dict of page information, taken as a row from the page |
|
|
|
table, where keys are column names and values are their cell contents. |
|
|
|
""" |
|
|
|
row = "{0}|s={page_status}|t={page_title}|h={page_short}|z={page_size}|" |
|
|
|
row += "cr={page_create_user}|cd={page_create_time}|ci={page_create_oldid}|" |
|
|
|
row += "mr={page_modify_user}|md={page_modify_time}|mi={page_modify_oldid}|" |
|
|
@@ -134,14 +154,27 @@ class Task(BaseTask): |
|
|
|
return "".join(("{{", row.format(self.tl_row, **page), "}}")) |
|
|
|
|
|
|
|
def format_time(self, timestamp): |
|
|
|
"""Format a datetime into the standard MediaWiki timestamp format.""" |
|
|
|
return timestamp.strftime("%H:%M, %d %B %Y") |
|
|
|
|
|
|
|
def sync(self, **kwargs): |
|
|
|
"""Synchronize our local statistics database with the site. |
|
|
|
|
|
|
|
Syncing involves, in order, updating tracked submissions that have |
|
|
|
been changed since last sync (self.update_tracked()), adding pending |
|
|
|
submissions that are not tracked (self.add_untracked()), and removing |
|
|
|
old submissions from the database (self.delete_old()). |
|
|
|
|
|
|
|
The sync will be canceled if SQL replication lag is greater than 600 |
|
|
|
seconds, because this will lead to potential problems and outdated |
|
|
|
data, not to mention putting demand on an already overloaded server. |
|
|
|
Giving sync the kwarg "ignore_replag" will go around this restriction. |
|
|
|
""" |
|
|
|
self.logger.info("Starting sync") |
|
|
|
|
|
|
|
replag = self.site.get_replag() |
|
|
|
self.logger.debug("Server replag is {0}".format(replag)) |
|
|
|
if replag > 600: |
|
|
|
if replag > 600 and not kwargs.get("ignore_replag"): |
|
|
|
msg = "Sync canceled as replag ({0} secs) is greater than ten minutes." |
|
|
|
self.logger.warn(msg.format(replag)) |
|
|
|
|
|
|
@@ -153,6 +186,16 @@ class Task(BaseTask): |
|
|
|
self.logger.info("Sync completed") |
|
|
|
|
|
|
|
def update_tracked(self, cursor): |
|
|
|
"""Update tracked submissions that have been changed since last sync. |
|
|
|
|
|
|
|
This is done by iterating through every page in our database and |
|
|
|
comparing our stored latest revision ID with the actual latest revision |
|
|
|
ID from an SQL query. If they differ, we will update our information |
|
|
|
about the page (self.update_page()). |
|
|
|
|
|
|
|
If the page does not exist, we will remove it from our database with |
|
|
|
self.untrack_page(). |
|
|
|
""" |
|
|
|
self.logger.debug("Updating tracked submissions") |
|
|
|
query1 = "SELECT page_id, page_title, page_modify_oldid FROM page" |
|
|
|
query2 = """SELECT page_latest, page_title, page_namespace FROM page |
|
|
@@ -174,6 +217,13 @@ class Task(BaseTask): |
|
|
|
self.update_page(cursor, pageid, real_title) |
|
|
|
|
|
|
|
def add_untracked(self, cursor): |
|
|
|
"""Add pending submissions that are not yet tracked. |
|
|
|
|
|
|
|
This is done by compiling a list of all currently tracked submissions |
|
|
|
and iterating through all members of self.pending_cat via SQL. If a |
|
|
|
page in the pending category is not tracked and is not in |
|
|
|
self.ignore_list, we will track it with self.track_page(). |
|
|
|
""" |
|
|
|
self.logger.debug("Adding untracked pending submissions") |
|
|
|
cursor.execute("SELECT page_id FROM page") |
|
|
|
tracked = [i[0] for i in cursor.fetchall()] |
|
|
@@ -188,20 +238,31 @@ class Task(BaseTask): |
|
|
|
self.track_page(cursor, pageid, title) |
|
|
|
|
|
|
|
def delete_old(self, cursor): |
|
|
|
"""Remove old submissions from the database. |
|
|
|
|
|
|
|
"Old" is defined as a submission that has been declined or accepted |
|
|
|
more than 36 hours ago. Pending submissions cannot be "old". |
|
|
|
""" |
|
|
|
self.logger.debug("Removing old submissions from chart") |
|
|
|
query = """DELETE FROM page, row USING page JOIN row |
|
|
|
ON page_id = row_id WHERE row_chart IN ? |
|
|
|
AND ADDTIME(page_special_time, '36:00:00') < NOW()""" |
|
|
|
cursor.execute(query, ((CHART_ACCEPT, CHART_DECLINE),)) |
|
|
|
old_charts = (CHART_ACCEPT, CHART_DECLINE) |
|
|
|
cursor.execute(query, (old_charts,)) |
|
|
|
|
|
|
|
def untrack_page(self, cursor, pageid): |
|
|
|
"""Remove a page, given by ID, from our database.""" |
|
|
|
self.logger.debug("Untracking page (id: {0})".format(pageid)) |
|
|
|
query = """DELETE FROM page, row USING page JOIN row |
|
|
|
ON page_id = row_id WHERE page_id = ?""" |
|
|
|
cursor.execute(query, (pageid,)) |
|
|
|
|
|
|
|
def track_page(self, cursor, pageid, title): |
|
|
|
"""Update hook for when page is not in our database.""" |
|
|
|
"""Update hook for when page is not in our database. |
|
|
|
|
|
|
|
A variety of SQL queries are used to gather information about the page, |
|
|
|
which are then saved to our database. |
|
|
|
""" |
|
|
|
msg = "Tracking page [[{0}]] (id: {1})".format(title, pageid) |
|
|
|
self.logger.debug(msg) |
|
|
|
|
|
|
@@ -227,7 +288,20 @@ class Task(BaseTask): |
|
|
|
s_user, s_time, s_id),)) |
|
|
|
|
|
|
|
def update_page(self, cursor, pageid, title): |
|
|
|
"""Update hook for when page is in our database.""" |
|
|
|
"""Update hook for when page is already in our database. |
|
|
|
|
|
|
|
A variety of SQL queries are used to gather information about the page, |
|
|
|
which is compared against our stored information. Differing information |
|
|
|
is then updated. |
|
|
|
|
|
|
|
If our page is now a redirect, we will determine the namespace it was |
|
|
|
moved to. If it was moved to the mainspace or template space, we will |
|
|
|
set the sub's status as accepted. If it was to the Project: or Project |
|
|
|
talk: namespaces, we'll merely update our stored title (this is likely |
|
|
|
to occur if a submission was moved from the userspace to the project |
|
|
|
space). If it was moved to another namespace, something unusual has |
|
|
|
happened, and we'll untrack the submission. |
|
|
|
""" |
|
|
|
msg = "Updating page [[{0}]] (id: {1})".format(title, pageid) |
|
|
|
self.logger.debug(msg) |
|
|
|
|
|
|
@@ -236,24 +310,27 @@ class Task(BaseTask): |
|
|
|
redirect_regex = wiki.Page.re_redirect |
|
|
|
target_title = re.findall(redirect_regex, content, flags=re.I)[0] |
|
|
|
except IndexError: |
|
|
|
pass |
|
|
|
status, chart = self.get_status_and_chart(content) |
|
|
|
if not status: |
|
|
|
self.untrack_page(cursor, pageid) |
|
|
|
return |
|
|
|
else: |
|
|
|
target_ns = self.site.get_page(target_title).namespace() |
|
|
|
if target_ns == wiki.NS_MAIN: |
|
|
|
if target_ns in [wiki.NS_MAIN, wiki.NS_TEMPLATE]: |
|
|
|
status, chart = "accept", CHART_ACCEPT |
|
|
|
elif target_ns in [wiki.NS_PROJECT, wiki.NS_PROJECT_TALK]: |
|
|
|
title = target_title |
|
|
|
content = self.get_content(title) |
|
|
|
status, chart = self.get_status_and_chart(content) |
|
|
|
if not status: |
|
|
|
self.untrack_page(cursor, pageid) |
|
|
|
return |
|
|
|
else: |
|
|
|
msg = "Page has moved to namespace {0}".format(target_ns) |
|
|
|
self.logger.debug(msg) |
|
|
|
self.untrack_page(cursor, pageid) |
|
|
|
return |
|
|
|
|
|
|
|
status, chart = self.get_status_and_chart(content) |
|
|
|
if not status: |
|
|
|
self.untrack_page(cursor, pageid) |
|
|
|
|
|
|
|
size = len(content) |
|
|
|
notes = self.get_notes(pageid) |
|
|
|
m_user, m_time, m_id = self.get_modify(pageid) |
|
|
@@ -270,12 +347,13 @@ class Task(BaseTask): |
|
|
|
self.update_page_modify(cursor, result, pageid, size, m_user, m_time, m_id) |
|
|
|
|
|
|
|
if status != result["page_status"]: |
|
|
|
self.update_page_special(cursor, result, pageid, status, chart, page) |
|
|
|
self.update_page_status(cursor, result, pageid, status, chart, page) |
|
|
|
|
|
|
|
if notes != result["page_notes"]: |
|
|
|
self.update_page_notes(cursor, result, pageid, notes) |
|
|
|
|
|
|
|
def update_page_title(self, cursor, result, pageid, title): |
|
|
|
"""Update the title and short_title of a page in our database.""" |
|
|
|
query = "UPDATE page SET page_title = ?, page_short = ? WHERE page_id = ?" |
|
|
|
short = self.get_short_title(title) |
|
|
|
cursor.execute(query, (title, short, pageid)) |
|
|
@@ -283,6 +361,7 @@ class Task(BaseTask): |
|
|
|
self.logger.debug(msg.format(pageid, result["page_title"], title)) |
|
|
|
|
|
|
|
def update_page_modify(self, cursor, result, pageid, size, m_user, m_time, m_id): |
|
|
|
"""Update the last modified information of a page in our database.""" |
|
|
|
query = """UPDATE page SET page_size = ?, page_modify_user = ?, |
|
|
|
page_modify_time = ?, page_modify_oldid = ? |
|
|
|
WHERE page_id = ?""" |
|
|
@@ -294,7 +373,8 @@ class Task(BaseTask): |
|
|
|
result["page_modify_oldid"], m_user, m_time, m_id) |
|
|
|
self.logger.debug(msg) |
|
|
|
|
|
|
|
def update_page_special(self, cursor, result, pageid, status, chart, page): |
|
|
|
def update_page_status(self, cursor, result, pageid, status, chart, page): |
|
|
|
"""Update the status and "specialed" information of a page.""" |
|
|
|
query1 = """UPDATE page JOIN row ON page_id = row_id |
|
|
|
SET page_status = ?, row_chart = ? WHERE page_id = ?""" |
|
|
|
query2 = """UPDATE page SET page_special_user = ?, |
|
|
@@ -317,12 +397,24 @@ class Task(BaseTask): |
|
|
|
self.logger.debug(msg) |
|
|
|
|
|
|
|
def update_page_notes(self, cursor, result, pageid, notes): |
|
|
|
"""Update the notes (or warnings) of a page in our database.""" |
|
|
|
query = "UPDATE page SET page_notes = ? WHERE page_id = ?" |
|
|
|
cursor.execute(query, (notes, pageid)) |
|
|
|
msg = "{0}: notes: {1} -> {2}" |
|
|
|
self.logger.debug(msg.format(pageid, result["page_notes"], notes)) |
|
|
|
|
|
|
|
def get_content(self, title): |
|
|
|
"""Get the current content of a page by title from SQL. |
|
|
|
|
|
|
|
The page's current revision ID is retrieved from SQL, and then |
|
|
|
site.get_revid_content() is called. |
|
|
|
|
|
|
|
The reason a more conventional method (i.e. site.get_page.get()) is |
|
|
|
avoided is that due to replication lag, a discrepancy between the live |
|
|
|
database (which the API uses) and the replicated database (which SQL |
|
|
|
uses) can lead to incorrect and very confusing data, such as missing |
|
|
|
pages that are supposed to exist, if both are used interchangeably. |
|
|
|
""" |
|
|
|
query = "SELECT page_latest FROM page WHERE page_title = ? AND page_namespace = ?" |
|
|
|
namespace, base = title.split(":", 1) |
|
|
|
try: |
|
|
@@ -336,6 +428,15 @@ class Task(BaseTask): |
|
|
|
return self.site.get_revid_content(revid) |
|
|
|
|
|
|
|
def get_status_and_chart(self, content): |
|
|
|
"""Determine the status and chart number of an AFC submission. |
|
|
|
|
|
|
|
The methodology used here is the same one I've been using for years |
|
|
|
(see also commands.afc_report), but with the new draft system taken |
|
|
|
into account. The order here is important: if there is more than one |
|
|
|
{{AFC submission}} template on a page, we need to know which one to |
|
|
|
use (revision history search to find the most recent isn't a viable |
|
|
|
idea :P). |
|
|
|
""" |
|
|
|
if re.search("\{\{afc submission\|r\|(.*?)\}\}", content, re.I): |
|
|
|
return "review", CHART_REVIEW |
|
|
|
elif re.search("\{\{afc submission\|h\|(.*?)\}\}", content, re.I): |
|
|
@@ -349,12 +450,23 @@ class Task(BaseTask): |
|
|
|
return None, CHART_NONE |
|
|
|
|
|
|
|
def get_short_title(self, title): |
|
|
|
"""Shorten a title so we can display it in a chart using less space. |
|
|
|
|
|
|
|
Basically, this just means removing the "Wikipedia talk:Articles for |
|
|
|
creation" part from the beginning. If it is longer than 50 characters, |
|
|
|
we'll shorten it down to 47 and add a poor-man's ellipsis at the end. |
|
|
|
""" |
|
|
|
short = re.sub("Wikipedia(\s*talk)?\:Articles\sfor\screation\/", "", title) |
|
|
|
if len(short) > 50: |
|
|
|
short = "".join((short[:47], "...")) |
|
|
|
return short |
|
|
|
|
|
|
|
def get_create(self, pageid): |
|
|
|
"""Return information about a page's first edit ("creation"). |
|
|
|
|
|
|
|
This consists of the page creator, creation time, and the earliest |
|
|
|
revision ID. |
|
|
|
""" |
|
|
|
query = """SELECT rev_user_text, rev_timestamp, rev_id |
|
|
|
FROM revision WHERE rev_id = |
|
|
|
(SELECT MIN(rev_id) FROM revision WHERE rev_page = ?)""" |
|
|
@@ -363,6 +475,11 @@ class Task(BaseTask): |
|
|
|
return c_user, datetime.strptime(c_time, "%Y%m%d%H%M%S"), c_id |
|
|
|
|
|
|
|
def get_modify(self, pageid): |
|
|
|
"""Return information about a page's last edit ("modification"). |
|
|
|
|
|
|
|
This consists of the most recent editor, modification time, and the |
|
|
|
latest revision ID. |
|
|
|
""" |
|
|
|
query = """SELECT rev_user_text, rev_timestamp, rev_id FROM revision |
|
|
|
JOIN page ON rev_id = page_latest WHERE page_id = ?""" |
|
|
|
result = self.site.sql_query(query, (pageid,)) |
|
|
@@ -370,7 +487,21 @@ class Task(BaseTask): |
|
|
|
return m_user, datetime.strptime(m_time, "%Y%m%d%H%M%S"), m_id |
|
|
|
|
|
|
|
def get_special(self, pageid, chart): |
|
|
|
if chart == CHART_PEND: |
|
|
|
"""Return information about a page's "special" edit. |
|
|
|
|
|
|
|
I tend to use the term "special" as a verb a lot, which is bound to |
|
|
|
cause confusion. It is merely a short way of saying "the edit in which |
|
|
|
a declined submission was declined, an accepted submission was |
|
|
|
accepted, a submission in review was set as such, and a pending draft |
|
|
|
was submitted." |
|
|
|
|
|
|
|
This "information" consists of the special edit's editor, its time, and |
|
|
|
its revision ID. If the page's status is not something that involves |
|
|
|
"special"-ing, we will return None for all three. The same will be |
|
|
|
returned if we cannot determine when the page was "special"-ed, or if |
|
|
|
it was "special"-ed more than 100 edits ago. |
|
|
|
""" |
|
|
|
if chart in [CHART_NONE, CHART_PEND]: |
|
|
|
return None, None, None |
|
|
|
elif chart == CHART_ACCEPT: |
|
|
|
return self.get_create(pageid) |
|
|
@@ -397,4 +528,8 @@ class Task(BaseTask): |
|
|
|
return None, None, None |
|
|
|
|
|
|
|
def get_notes(self, pageid): |
|
|
|
"""Return any special notes or warnings about this page. |
|
|
|
|
|
|
|
Currently unimplemented, so always returns None. |
|
|
|
""" |
|
|
|
return None |