Browse Source

Get special data more cheaply (closes #14); cleanup.

pull/15/head
Ben Kurtovic 11 years ago
parent
commit
e26ef6597e
2 changed files with 146 additions and 99 deletions
  1. +145
    -97
      tasks/afc_statistics.py
  2. +1
    -2
      tasks/afc_undated.py

+ 145
- 97
tasks/afc_statistics.py View File

@@ -108,6 +108,8 @@ class AFCStatistics(Task):
finally: finally:
self.db_access_lock.release() self.db_access_lock.release()


#################### CHART BUILDING AND SAVING METHODS ####################

def save(self, kwargs): def save(self, kwargs):
"""Save our local statistics to the wiki. """Save our local statistics to the wiki.


@@ -123,7 +125,7 @@ class AFCStatistics(Task):
return return
summary = self.summary summary = self.summary


statistics = self.compile_charts()
statistics = self._compile_charts()


page = self.site.get_page(self.pagename) page = self.site.get_page(self.pagename)
text = page.get() text = page.get()
@@ -140,16 +142,16 @@ class AFCStatistics(Task):
page.edit(newtext, summary, minor=True, bot=True) page.edit(newtext, summary, minor=True, bot=True)
self.logger.info(u"Chart saved to [[{0}]]".format(page.title)) self.logger.info(u"Chart saved to [[{0}]]".format(page.title))


def compile_charts(self):
def _compile_charts(self):
"""Compile and return all statistics information from our local db.""" """Compile and return all statistics information from our local db."""
stats = "" stats = ""
with self.conn.cursor() as cursor: with self.conn.cursor() as cursor:
cursor.execute("SELECT * FROM chart") cursor.execute("SELECT * FROM chart")
for chart in cursor: for chart in cursor:
stats += self.compile_chart(chart) + "\n"
stats += self._compile_chart(chart) + "\n"
return stats[:-1] # Drop the last newline return stats[:-1] # Drop the last newline


def compile_chart(self, chart_info):
def _compile_chart(self, chart_info):
"""Compile and return a single statistics chart.""" """Compile and return a single statistics chart."""
chart_id, chart_title, special_title = chart_info chart_id, chart_title, special_title = chart_info


@@ -162,12 +164,12 @@ class AFCStatistics(Task):
with self.conn.cursor(oursql.DictCursor) as cursor: with self.conn.cursor(oursql.DictCursor) as cursor:
cursor.execute(query, (chart_id,)) cursor.execute(query, (chart_id,))
for page in cursor.fetchall(): for page in cursor.fetchall():
chart += "\n" + self.compile_chart_row(page)
chart += "\n" + self._compile_chart_row(page)


chart += "\n{{" + self.tl_footer + "}}" chart += "\n{{" + self.tl_footer + "}}"
return chart return chart


def compile_chart_row(self, page):
def _compile_chart_row(self, page):
"""Compile and return a single chart row. """Compile and return a single chart row.


'page' is a dict of page information, taken as a row from the page 'page' is a dict of page information, taken as a row from the page
@@ -178,25 +180,27 @@ class AFCStatistics(Task):
row += "sr={page_special_user}|sd={page_special_time}|si={page_special_oldid}|" row += "sr={page_special_user}|sd={page_special_time}|si={page_special_oldid}|"
row += "mr={page_modify_user}|md={page_modify_time}|mi={page_modify_oldid}" row += "mr={page_modify_user}|md={page_modify_time}|mi={page_modify_oldid}"


page["page_special_time"] = self.format_time(page["page_special_time"])
page["page_modify_time"] = self.format_time(page["page_modify_time"])
page["page_special_time"] = self._fmt_time(page["page_special_time"])
page["page_modify_time"] = self._fmt_time(page["page_modify_time"])


if page["page_notes"]: if page["page_notes"]:
row += "|n=1{page_notes}" row += "|n=1{page_notes}"


return "{{" + row.format(self.tl_row, **page) + "}}" return "{{" + row.format(self.tl_row, **page) + "}}"


def format_time(self, dt):
def _fmt_time(self, date):
"""Format a datetime into the standard MediaWiki timestamp format.""" """Format a datetime into the standard MediaWiki timestamp format."""
return dt.strftime("%H:%M, %d %b %Y")
return date.strftime("%H:%M, %d %b %Y")

######################## PRIMARY SYNC ENTRY POINTS ########################


def sync(self, kwargs): def sync(self, kwargs):
"""Synchronize our local statistics database with the site. """Synchronize our local statistics database with the site.


Syncing involves, in order, updating tracked submissions that have Syncing involves, in order, updating tracked submissions that have
been changed since last sync (self.update_tracked()), adding pending
submissions that are not tracked (self.add_untracked()), and removing
old submissions from the database (self.delete_old()).
been changed since last sync (self._update_tracked()), adding pending
submissions that are not tracked (self._add_untracked()), and removing
old submissions from the database (self._delete_old()).


The sync will be canceled if SQL replication lag is greater than 600 The sync will be canceled if SQL replication lag is greater than 600
seconds, because this will lead to potential problems and outdated seconds, because this will lead to potential problems and outdated
@@ -213,22 +217,22 @@ class AFCStatistics(Task):
return return


with self.conn.cursor() as cursor: with self.conn.cursor() as cursor:
self.update_tracked(cursor)
self.add_untracked(cursor)
self.delete_old(cursor)
self._update_tracked(cursor)
self._add_untracked(cursor)
self._delete_old(cursor)


self.logger.info("Sync completed") self.logger.info("Sync completed")


def update_tracked(self, cursor):
def _update_tracked(self, cursor):
"""Update tracked submissions that have been changed since last sync. """Update tracked submissions that have been changed since last sync.


This is done by iterating through every page in our database and This is done by iterating through every page in our database and
comparing our stored latest revision ID with the actual latest revision comparing our stored latest revision ID with the actual latest revision
ID from an SQL query. If they differ, we will update our information ID from an SQL query. If they differ, we will update our information
about the page (self.update_page()).
about the page (self._update_page()).


If the page does not exist, we will remove it from our database with If the page does not exist, we will remove it from our database with
self.untrack_page().
self._untrack_page().
""" """
self.logger.debug("Updating tracked submissions") self.logger.debug("Updating tracked submissions")
query = """SELECT s.page_id, s.page_title, s.page_modify_oldid, query = """SELECT s.page_id, s.page_title, s.page_modify_oldid,
@@ -241,7 +245,7 @@ class AFCStatistics(Task):


for pageid, title, oldid, real_oldid, real_title, real_ns in cursor: for pageid, title, oldid, real_oldid, real_title, real_ns in cursor:
if not real_oldid: if not real_oldid:
self.untrack_page(cursor, pageid)
self._untrack_page(cursor, pageid)
continue continue
msg = u"Updating page [[{0}]] (id: {1}) @ {2}" msg = u"Updating page [[{0}]] (id: {1}) @ {2}"
self.logger.debug(msg.format(title, pageid, oldid)) self.logger.debug(msg.format(title, pageid, oldid))
@@ -252,18 +256,18 @@ class AFCStatistics(Task):
if ns: if ns:
real_title = u":".join((ns, real_title)) real_title = u":".join((ns, real_title))
try: try:
self.update_page(cursor, pageid, real_title)
self._update_page(cursor, pageid, real_title)
except Exception: except Exception:
e = u"Error updating page [[{0}]] (id: {1})" e = u"Error updating page [[{0}]] (id: {1})"
self.logger.exception(e.format(real_title, pageid)) self.logger.exception(e.format(real_title, pageid))


def add_untracked(self, cursor):
def _add_untracked(self, cursor):
"""Add pending submissions that are not yet tracked. """Add pending submissions that are not yet tracked.


This is done by compiling a list of all currently tracked submissions This is done by compiling a list of all currently tracked submissions
and iterating through all members of self.pending_cat via SQL. If a and iterating through all members of self.pending_cat via SQL. If a
page in the pending category is not tracked and is not in page in the pending category is not tracked and is not in
self.ignore_list, we will track it with self.track_page().
self.ignore_list, we will track it with self._track_page().
""" """
self.logger.debug("Adding untracked pending submissions") self.logger.debug("Adding untracked pending submissions")
query = """SELECT r.page_id, r.page_title, r.page_namespace query = """SELECT r.page_id, r.page_title, r.page_namespace
@@ -284,12 +288,12 @@ class AFCStatistics(Task):
msg = u"Tracking page [[{0}]] (id: {1})".format(title, pageid) msg = u"Tracking page [[{0}]] (id: {1})".format(title, pageid)
self.logger.debug(msg) self.logger.debug(msg)
try: try:
self.track_page(cursor, pageid, title)
self._track_page(cursor, pageid, title)
except Exception: except Exception:
e = u"Error tracking page [[{0}]] (id: {1})" e = u"Error tracking page [[{0}]] (id: {1})"
self.logger.exception(e.format(title, pageid)) self.logger.exception(e.format(title, pageid))


def delete_old(self, cursor):
def _delete_old(self, cursor):
"""Remove old submissions from the database. """Remove old submissions from the database.


"Old" is defined as a submission that has been declined or accepted "Old" is defined as a submission that has been declined or accepted
@@ -323,37 +327,39 @@ class AFCStatistics(Task):


msg = u"Updating page [[{0}]] (id: {1}) @ {2}" msg = u"Updating page [[{0}]] (id: {1}) @ {2}"
self.logger.info(msg.format(title, pageid, oldid)) self.logger.info(msg.format(title, pageid, oldid))
self.update_page(cursor, pageid, title)
self._update_page(cursor, pageid, title)

######################## PRIMARY PAGE ENTRY POINTS ########################


def untrack_page(self, cursor, pageid):
def _untrack_page(self, cursor, pageid):
"""Remove a page, given by ID, from our database.""" """Remove a page, given by ID, from our database."""
self.logger.debug("Untracking page (id: {0})".format(pageid)) self.logger.debug("Untracking page (id: {0})".format(pageid))
query = """DELETE FROM page, row USING page JOIN row query = """DELETE FROM page, row USING page JOIN row
ON page_id = row_id WHERE page_id = ?""" ON page_id = row_id WHERE page_id = ?"""
cursor.execute(query, (pageid,)) cursor.execute(query, (pageid,))


def track_page(self, cursor, pageid, title):
def _track_page(self, cursor, pageid, title):
"""Update hook for when page is not in our database. """Update hook for when page is not in our database.


A variety of SQL queries are used to gather information about the page, A variety of SQL queries are used to gather information about the page,
which is then saved to our database. which is then saved to our database.
""" """
content = self.get_content(title)
content = self._get_content(title)
if content is None: if content is None:
msg = u"Could not get page content for [[{0}]]".format(title) msg = u"Could not get page content for [[{0}]]".format(title)
self.logger.error(msg) self.logger.error(msg)
return return


namespace = self.site.get_page(title).namespace namespace = self.site.get_page(title).namespace
status, chart = self.get_status_and_chart(content, namespace)
status, chart = self._get_status_and_chart(content, namespace)
if chart == self.CHART_NONE: if chart == self.CHART_NONE:
msg = u"Could not find a status for [[{0}]]".format(title) msg = u"Could not find a status for [[{0}]]".format(title)
self.logger.warn(msg) self.logger.warn(msg)
return return


m_user, m_time, m_id = self.get_modify(pageid)
s_user, s_time, s_id = self.get_special(pageid, chart)
notes = self.get_notes(chart, content, m_time, s_user)
m_user, m_time, m_id = self._get_modify(pageid)
s_user, s_time, s_id = self._get_special(pageid, content, chart)
notes = self._get_notes(chart, content, m_time, s_user)


query1 = "INSERT INTO row VALUES (?, ?)" query1 = "INSERT INTO row VALUES (?, ?)"
query2 = "INSERT INTO page VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" query2 = "INSERT INTO page VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
@@ -361,23 +367,23 @@ class AFCStatistics(Task):
cursor.execute(query2, (pageid, status, title, len(content), notes, cursor.execute(query2, (pageid, status, title, len(content), notes,
m_user, m_time, m_id, s_user, s_time, s_id)) m_user, m_time, m_id, s_user, s_time, s_id))


def update_page(self, cursor, pageid, title):
def _update_page(self, cursor, pageid, title):
"""Update hook for when page is already in our database. """Update hook for when page is already in our database.


A variety of SQL queries are used to gather information about the page, A variety of SQL queries are used to gather information about the page,
which is compared against our stored information. Differing information which is compared against our stored information. Differing information
is then updated. is then updated.
""" """
content = self.get_content(title)
content = self._get_content(title)
if content is None: if content is None:
msg = u"Could not get page content for [[{0}]]".format(title) msg = u"Could not get page content for [[{0}]]".format(title)
self.logger.error(msg) self.logger.error(msg)
return return


namespace = self.site.get_page(title).namespace namespace = self.site.get_page(title).namespace
status, chart = self.get_status_and_chart(content, namespace)
status, chart = self._get_status_and_chart(content, namespace)
if chart == self.CHART_NONE: if chart == self.CHART_NONE:
self.untrack_page(cursor, pageid)
self._untrack_page(cursor, pageid)
return return


query = "SELECT * FROM page JOIN row ON page_id = row_id WHERE page_id = ?" query = "SELECT * FROM page JOIN row ON page_id = row_id WHERE page_id = ?"
@@ -385,27 +391,29 @@ class AFCStatistics(Task):
dict_cursor.execute(query, (pageid,)) dict_cursor.execute(query, (pageid,))
result = dict_cursor.fetchall()[0] result = dict_cursor.fetchall()[0]


m_user, m_time, m_id = self.get_modify(pageid)
m_user, m_time, m_id = self._get_modify(pageid)


if title != result["page_title"]: if title != result["page_title"]:
self.update_page_title(cursor, result, pageid, title)
self._update_page_title(cursor, result, pageid, title)


if m_id != result["page_modify_oldid"]: if m_id != result["page_modify_oldid"]:
self.update_page_modify(cursor, result, pageid, len(content),
m_user, m_time, m_id)
self._update_page_modify(cursor, result, pageid, len(content),
m_user, m_time, m_id)


if status != result["page_status"]: if status != result["page_status"]:
special = self.update_page_status(cursor, result, pageid, status,
chart)
special = self._update_page_status(cursor, result, pageid, content,
status, chart)
s_user = special[0] s_user = special[0]
else: else:
s_user = result["page_special_user"] s_user = result["page_special_user"]


notes = self.get_notes(chart, content, m_time, s_user)
notes = self._get_notes(chart, content, m_time, s_user)
if notes != result["page_notes"]: if notes != result["page_notes"]:
self.update_page_notes(cursor, result, pageid, notes)
self._update_page_notes(cursor, result, pageid, notes)


def update_page_title(self, cursor, result, pageid, title):
###################### PAGE ATTRIBUTE UPDATE METHODS ######################

def _update_page_title(self, cursor, result, pageid, title):
"""Update the title of a page in our database.""" """Update the title of a page in our database."""
query = "UPDATE page SET page_title = ? WHERE page_id = ?" query = "UPDATE page SET page_title = ? WHERE page_id = ?"
cursor.execute(query, (title, pageid)) cursor.execute(query, (title, pageid))
@@ -413,7 +421,7 @@ class AFCStatistics(Task):
msg = u" {0}: title: {1} -> {2}" msg = u" {0}: title: {1} -> {2}"
self.logger.debug(msg.format(pageid, result["page_title"], title)) self.logger.debug(msg.format(pageid, result["page_title"], title))


def update_page_modify(self, cursor, result, pageid, size, m_user, m_time, m_id):
def _update_page_modify(self, cursor, result, pageid, size, m_user, m_time, m_id):
"""Update the last modified information of a page in our database.""" """Update the last modified information of a page in our database."""
query = """UPDATE page SET page_size = ?, page_modify_user = ?, query = """UPDATE page SET page_size = ?, page_modify_user = ?,
page_modify_time = ?, page_modify_oldid = ? page_modify_time = ?, page_modify_oldid = ?
@@ -426,7 +434,7 @@ class AFCStatistics(Task):
result["page_modify_oldid"], m_user, m_time, m_id) result["page_modify_oldid"], m_user, m_time, m_id)
self.logger.debug(msg) self.logger.debug(msg)


def update_page_status(self, cursor, result, pageid, status, chart):
def _update_page_status(self, cursor, result, pageid, content, status, chart):
"""Update the status and "specialed" information of a page.""" """Update the status and "specialed" information of a page."""
query1 = """UPDATE page JOIN row ON page_id = row_id query1 = """UPDATE page JOIN row ON page_id = row_id
SET page_status = ?, row_chart = ? WHERE page_id = ?""" SET page_status = ?, row_chart = ? WHERE page_id = ?"""
@@ -439,7 +447,7 @@ class AFCStatistics(Task):
self.logger.debug(msg.format(pageid, result["page_status"], self.logger.debug(msg.format(pageid, result["page_status"],
result["row_chart"], status, chart)) result["row_chart"], status, chart))


s_user, s_time, s_id = self.get_special(pageid, chart)
s_user, s_time, s_id = self._get_special(pageid, content, chart)
if s_id != result["page_special_oldid"]: if s_id != result["page_special_oldid"]:
cursor.execute(query2, (s_user, s_time, s_id, pageid)) cursor.execute(query2, (s_user, s_time, s_id, pageid))
msg = u" {0}: special: {1} / {2} / {3} -> {4} / {5} / {6}" msg = u" {0}: special: {1} / {2} / {3} -> {4} / {5} / {6}"
@@ -450,14 +458,16 @@ class AFCStatistics(Task):


return s_user, s_time, s_id return s_user, s_time, s_id


def update_page_notes(self, cursor, result, pageid, notes):
def _update_page_notes(self, cursor, result, pageid, notes):
"""Update the notes (or warnings) of a page in our database.""" """Update the notes (or warnings) of a page in our database."""
query = "UPDATE page SET page_notes = ? WHERE page_id = ?" query = "UPDATE page SET page_notes = ? WHERE page_id = ?"
cursor.execute(query, (notes, pageid)) cursor.execute(query, (notes, pageid))
msg = " {0}: notes: {1} -> {2}" msg = " {0}: notes: {1} -> {2}"
self.logger.debug(msg.format(pageid, result["page_notes"], notes)) self.logger.debug(msg.format(pageid, result["page_notes"], notes))


def get_content(self, title):
###################### DATA RETRIEVAL HELPER METHODS ######################

def _get_content(self, title):
"""Get the current content of a page by title from the API. """Get the current content of a page by title from the API.


The page's current revision ID is retrieved from SQL, and then The page's current revision ID is retrieved from SQL, and then
@@ -482,9 +492,9 @@ class AFCStatistics(Task):
revid = int(list(result)[0][0]) revid = int(list(result)[0][0])
except IndexError: except IndexError:
return None return None
return self.get_revision_content(revid)
return self._get_revision_content(revid)


def get_revision_content(self, revid, tries=1):
def _get_revision_content(self, revid, tries=1):
"""Get the content of a revision by ID from the API.""" """Get the content of a revision by ID from the API."""
if revid in self.revision_cache: if revid in self.revision_cache:
return self.revision_cache[revid] return self.revision_cache[revid]
@@ -495,11 +505,11 @@ class AFCStatistics(Task):
except KeyError: except KeyError:
if tries > 0: if tries > 0:
sleep(5) sleep(5)
return self.get_revision_content(revid, tries=tries - 1)
return self._get_revision_content(revid, tries=tries - 1)
self.revision_cache[revid] = content self.revision_cache[revid] = content
return content return content


def get_status_and_chart(self, content, namespace):
def _get_status_and_chart(self, content, namespace):
"""Determine the status and chart number of an AFC submission. """Determine the status and chart number of an AFC submission.


The methodology used here is the same one I've been using for years The methodology used here is the same one I've been using for years
@@ -549,7 +559,7 @@ class AFCStatistics(Task):
statuses.append(aliases[name]) statuses.append(aliases[name])
return statuses return statuses


def get_modify(self, pageid):
def _get_modify(self, pageid):
"""Return information about a page's last edit ("modification"). """Return information about a page's last edit ("modification").


This consists of the most recent editor, modification time, and the This consists of the most recent editor, modification time, and the
@@ -562,7 +572,7 @@ class AFCStatistics(Task):
timestamp = datetime.strptime(m_time, "%Y%m%d%H%M%S") timestamp = datetime.strptime(m_time, "%Y%m%d%H%M%S")
return m_user.decode("utf8"), timestamp, m_id return m_user.decode("utf8"), timestamp, m_id


def get_special(self, pageid, chart):
def _get_special(self, pageid, content, chart):
"""Return information about a page's "special" edit. """Return information about a page's "special" edit.


I tend to use the term "special" as a verb a lot, which is bound to I tend to use the term "special" as a verb a lot, which is bound to
@@ -574,28 +584,80 @@ class AFCStatistics(Task):
This "information" consists of the special edit's editor, its time, and This "information" consists of the special edit's editor, its time, and
its revision ID. If the page's status is not something that involves its revision ID. If the page's status is not something that involves
"special"-ing, we will return None for all three. The same will be "special"-ing, we will return None for all three. The same will be
returned if we cannot determine when the page was "special"-ed, or if
it was "special"-ed more than 100 edits ago.
returned if we cannot determine when the page was "special"-ed.
""" """
if chart == self.CHART_NONE:
charts = {
self.CHART_NONE: (lambda pageid, content: None, None, None),
self.CHART_MISPLACE: self.get_create,
self.CHART_ACCEPT: self.get_accepted,
self.CHART_REVIEW: self.get_reviewing,
self.CHART_PEND: self.get_pending,
self.CHART_DECLINE: self.get_decline
}
return charts[chart](pageid, content)

def get_create(self, pageid, content=None):
    """Look up the earliest revision of a page.

    Runs a single SQL query for the revision of *pageid* with the lowest
    revision ID and returns a (creator, creation datetime, revision ID)
    tuple. *content* is accepted for signature parity with the other
    "special" getters and is not used here.
    """
    sql = """SELECT rev_user_text, rev_timestamp, rev_id
FROM revision WHERE rev_id =
(SELECT MIN(rev_id) FROM revision WHERE rev_page = ?)"""
    rows = list(self.site.sql_query(sql, (pageid,)))
    creator, raw_stamp, revid = rows[0]
    # Parse the timestamp before decoding the user name, as before.
    created = datetime.strptime(raw_stamp, "%Y%m%d%H%M%S")
    return creator.decode("utf8"), created, revid

def get_accepted(self, pageid, content=None):
"""Return (acceptor, accept_ts, accept_revid) for the given page."""
query = """SELECT rev_user_text, rev_timestamp, rev_id FROM revision
WHERE rev_comment LIKE "% moved page [[%]] to [[%]]%"
AND rev_page = ? ORDER BY rev_timestamp DESC LIMIT 1"""
result = self.site.sql_query(query, (pageid,))
try:
a_user, a_time, a_id = list(result)[0]
except IndexError:
return None, None, None return None, None, None
elif chart == self.CHART_MISPLACE:
return self.get_create(pageid)
elif chart == self.CHART_ACCEPT:
search_with = []
search_without = ["R", "P", "T", "D"]
elif chart == self.CHART_PEND:
search_with = ["P"]
search_without = []
elif chart == self.CHART_REVIEW:
search_with = ["R"]
search_without = []
elif chart == self.CHART_DECLINE:
search_with = ["D"]
search_without = ["R", "P", "T"]
return self.search_history(pageid, chart, search_with, search_without)

def search_history(self, pageid, chart, search_with, search_without):
timestamp = datetime.strptime(a_time, "%Y%m%d%H%M%S")
return a_user.decode("utf8"), timestamp, a_id

def get_reviewing(self, pageid, content=None):
    """Return (reviewer, review_ts, review_revid) for the given page.

    Delegates to self._search_history(), looking for the "R" status with
    no excluded statuses. *content* is accepted but not used.
    """
    wanted = ["R"]
    excluded = []
    return self._search_history(pageid, self.CHART_REVIEW, wanted, excluded)

def get_pending(self, pageid, content):
    """Return (submitter, submit_ts, submit_revid) for the given page.

    First tries self._get_status_helper(), reading the "u" and "ts"
    parameters of submission templates accepted by the predicate below.
    If that yields nothing, falls back to self._search_history() for the
    "P" status.
    """
    def is_pending(tmpl):
        # A template with no first parameter counts as pending.
        return not tmpl.has(1) or tmpl.get(1).value.strip().upper() == "P"

    found = self._get_status_helper(pageid, content, is_pending, "u", "ts")
    if found:
        return found
    return self._search_history(pageid, self.CHART_PEND, ["P"], [])

def get_decline(self, pageid, content):
    """Return (decliner, decline_ts, decline_revid) for the given page.

    First tries self._get_status_helper(), reading the "decliner" and
    "declinets" parameters of submission templates whose first parameter
    is "D". If that yields nothing, falls back to self._search_history()
    for the "D" status, excluding "R", "P" and "T".
    """
    def is_declined(tmpl):
        # Unlike the pending case, the first parameter must be present.
        if not tmpl.has(1):
            return False
        return tmpl.get(1).value.strip().upper() == "D"

    found = self._get_status_helper(
        pageid, content, is_declined, "decliner", "declinets")
    if found:
        return found
    excluded = ["R", "P", "T"]
    return self._search_history(pageid, self.CHART_DECLINE, ["D"], excluded)

def _get_status_helper(self, pageid, content, check, param_u, param_ts):
    """Find the latest matching submission template and its revision.

    Helper function for get_pending() and get_decline(). Parses *content*
    and collects every "afc submission" template that *check* accepts and
    that carries both the *param_u* (user) and *param_ts* (timestamp)
    parameters. The pair with the greatest timestamp is then looked up in
    the revision table for *pageid*.

    Returns a (user, timestamp as datetime, revision ID) tuple, or None
    when no template qualifies, the timestamp cannot be parsed, or no
    revision matches, so that callers can fall back to a history search.
    """
    submits = []
    code = mwparserfromhell.parse(content)
    for tmpl in code.filter_templates():
        if tmpl.name.strip().lower() != "afc submission" or not check(tmpl):
            continue
        if tmpl.has(param_u) and tmpl.has(param_ts):
            # Keep the stripped parameter values, not the Parameter objects,
            # so they order as text and can be bound as SQL parameters.
            submits.append((tmpl.get(param_u).value.strip(),
                            tmpl.get(param_ts).value.strip()))
    if not submits:
        return None

    # The key function must be passed by keyword; passed positionally,
    # max() would compare the list against the function itself.
    user, stamp = max(submits, key=lambda pair: pair[1])

    # NOTE(review): assumes the template timestamp uses the same
    # YYYYMMDDHHMMSS format as rev_timestamp, which the equality match in
    # the query below already relies on -- confirm against real templates.
    try:
        timestamp = datetime.strptime(stamp, "%Y%m%d%H%M%S")
    except ValueError:
        return None

    query = """SELECT rev_id FROM revision WHERE rev_page = ?
AND rev_user_text = ? AND rev_timestamp = ?"""
    rows = list(self.site.sql_query(query, (pageid, user, stamp)))
    if not rows:
        return None
    # Each row has one column (rev_id); return the ID, not the row.
    return user, timestamp, rows[0][0]

def _search_history(self, pageid, chart, search_with, search_without):
"""Search through a page's history to find when a status was set. """Search through a page's history to find when a status was set.


Linear search backwards in time for the edit right after the most Linear search backwards in time for the edit right after the most
@@ -616,9 +678,9 @@ class AFCStatistics(Task):
self.logger.warn(msg.format(pageid, chart)) self.logger.warn(msg.format(pageid, chart))
return None, None, None return None, None, None
try: try:
content = self.get_revision_content(revid)
content = self._get_revision_content(revid)
except exceptions.APIError: except exceptions.APIError:
msg = "API error interrupted SQL query in search_history() for page (id: {0}, chart: {1})"
msg = "API error interrupted SQL query in _search_history() for page (id: {0}, chart: {1})"
self.logger.exception(msg.format(pageid, chart)) self.logger.exception(msg.format(pageid, chart))
return None, None, None return None, None, None
statuses = self.get_statuses(content) statuses = self.get_statuses(content)
@@ -630,21 +692,7 @@ class AFCStatistics(Task):


return last return last


def get_create(self, pageid):
"""Return information about a page's first edit ("creation").

This consists of the page creator, creation time, and the earliest
revision ID.
"""
query = """SELECT rev_user_text, rev_timestamp, rev_id
FROM revision WHERE rev_id =
(SELECT MIN(rev_id) FROM revision WHERE rev_page = ?)"""
result = self.site.sql_query(query, (pageid,))
c_user, c_time, c_id = list(result)[0]
timestamp = datetime.strptime(c_time, "%Y%m%d%H%M%S")
return c_user.decode("utf8"), timestamp, c_id

def get_notes(self, chart, content, m_time, s_user):
def _get_notes(self, chart, content, m_time, s_user):
"""Return any special notes or warnings about this page. """Return any special notes or warnings about this page.


copyvio: submission is a suspected copyright violation copyvio: submission is a suspected copyright violation
@@ -662,12 +710,12 @@ class AFCStatistics(Task):
return notes return notes


copyvios = self.config.tasks.get("afc_copyvios", {}) copyvios = self.config.tasks.get("afc_copyvios", {})
regex = "\{\{\s*" + copyvios.get("template", "AfC suspected copyvio")
regex = r"\{\{s*" + copyvios.get("template", "AfC suspected copyvio")
if re.search(regex, content): if re.search(regex, content):
notes += "|nc=1" # Submission is a suspected copyvio notes += "|nc=1" # Submission is a suspected copyvio


if not re.search("\<ref\s*(.*?)\>(.*?)\</ref\>", content, re.I | re.S):
regex = "(https?:)|\[//(?!{0})([^ \]\\t\\n\\r\\f\\v]+?)"
if not re.search(r"\<ref\s*(.*?)\>(.*?)\</ref\>", content, re.I|re.S):
regex = r"(https?:)|\[//(?!{0})([^ \]\t\n\r\f\v]+?)"
sitedomain = re.escape(self.site.domain) sitedomain = re.escape(self.site.domain)
if re.search(regex.format(sitedomain), content, re.I | re.S): if re.search(regex.format(sitedomain), content, re.I | re.S):
notes += "|ni=1" # Submission has no inline citations notes += "|ni=1" # Submission has no inline citations


+ 1
- 2
tasks/afc_undated.py View File

@@ -154,8 +154,7 @@ class AFCUndated(Task):
if subject.namespace == NS_FILE: if subject.namespace == NS_FILE:
return self.get_filedata(subject) return self.get_filedata(subject)
self.logger.debug(u"[[{0}]]: Getting talkdata".format(page.title)) self.logger.debug(u"[[{0}]]: Getting talkdata".format(page.title))
chart = self.statistics.CHART_ACCEPT
user, ts, revid = self.statistics.get_special(subject.pageid, chart)
user, ts, revid = self.statistics.get_accepted(subject.pageid)
if not ts: if not ts:
log = u"Couldn't get talkdata for [[{0}]]" log = u"Couldn't get talkdata for [[{0}]]"
self.logger.warn(log.format(page.title)) self.logger.warn(log.format(page.title))


Loading…
Cancel
Save