|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799 |
- # -*- coding: utf-8 -*-
- #
- # Copyright (C) 2009-2019 Ben Kurtovic <ben.kurtovic@gmail.com>
- #
- # Permission is hereby granted, free of charge, to any person obtaining a copy
- # of this software and associated documentation files (the "Software"), to deal
- # in the Software without restriction, including without limitation the rights
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- # copies of the Software, and to permit persons to whom the Software is
- # furnished to do so, subject to the following conditions:
- #
- # The above copyright notice and this permission notice shall be included in
- # all copies or substantial portions of the Software.
- #
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- # SOFTWARE.
-
- from collections import OrderedDict
- from datetime import datetime
- import re
- from os.path import expanduser
- from threading import Lock
- from time import sleep
-
- import mwparserfromhell
- import oursql
-
- from earwigbot import exceptions
- from earwigbot import wiki
- from earwigbot.tasks import Task
-
# Skeleton wikitext used by _save_page() when a chart subpage does not exist
# yet. It carries the "stat begin"/"stat end" and "sig begin"/"sig end" HTML
# comment markers between which the chart and the signature are substituted.
# The trailing backslashes are line continuations inside the string literal,
# so those physical lines are joined without a newline.
_DEFAULT_PAGE_TEXT = """<noinclude><!-- You can edit anything on this page \
except for content inside of <!-- stat begin/end -> and <!-- sig begin/end -> \
without causing problems. Most of the chart can be modified by editing the \
templates it uses, documented in [[Template:AFC statistics/doc]]. -->
{{NOINDEX}}</noinclude>
<!-- stat begin --><!-- stat end -->
<span style="font-style: italic; font-weight: bold;">Last updated by \
{{#ifeq:{{REVISIONUSER:Template:AFC statistics}}|EarwigBot|<!-- sig begin -->\
<!-- sig end -->|{{User|{{REVISIONUSER:Template:AFC statistics}}}} at \
{{#time:H:i, j F Y "(UTC)"|{{REVISIONTIMESTAMP:Template:AFC statistics}}}}}}\
</span><noinclude>
{{Documentation}}</noinclude>
"""
-
- class AFCStatistics(Task):
- """A task to generate statistics for WikiProject Articles for Creation.
-
- Statistics are stored in a MySQL database ("u_earwig_afc_statistics")
- accessed with oursql. Statistics are synchronized with the live database
- every four minutes and saved once an hour, on the hour, to subpages of
- self.pageroot. In the live bot, this is "Template:AFC statistics".
- """
- name = "afc_statistics"
- number = 2
-
- # Chart status number constants:
- CHART_NONE = 0
- CHART_PEND = 1
- CHART_REVIEW = 3
- CHART_ACCEPT = 4
- CHART_DECLINE = 5
- CHART_MISPLACE = 6
-
def setup(self):
    """Read this task's configuration and initialise per-task state.

    Everything is taken from the ``self.name`` section of the bot's task
    config, with a hard-coded default for each missing key. Sets the site
    handle, an empty revision cache, the wiki page/category settings, the
    edit summary, the chart template names, the SQL connection keyword
    arguments and the lock that serialises database access in run().
    """
    task_cfg = self.config.tasks.get(self.name, {})
    self.cfg = task_cfg
    self.site = self.bot.wiki.get_site()
    self.revision_cache = {}

    # Wiki-related settings:
    self.pageroot = task_cfg.get("page", "Template:AFC statistics")
    self.pending_cat = task_cfg.get("pending", "Pending AfC submissions")
    self.ignore_list = task_cfg.get("ignoreList", [])
    fallback_summary = "Updating statistics for [[WP:WPAFC|WikiProject Articles for creation]]."
    raw_summary = task_cfg.get("summary", fallback_summary)
    self.summary = self.make_summary(raw_summary)

    # Names of the templates the chart is assembled from:
    tl_cfg = task_cfg.get("templates", {})
    self.tl_header = tl_cfg.get("header", "AFC statistics/header")
    self.tl_row = tl_cfg.get("row", "#invoke:AfC|row")
    self.tl_footer = tl_cfg.get("footer", "AFC statistics/footer")

    # Keyword arguments later handed to oursql.connect() in run():
    sql_options = task_cfg.get("sql", {})
    sql_options["read_default_file"] = expanduser("~/.my.cnf")
    self.conn_data = sql_options
    self.db_access_lock = Lock()
-
def run(self, **kwargs):
    """Handle one task event: either a "sync" or a "save".

    The ``action`` kwarg selects self.sync() or self.save(); any other
    value opens and closes the database connection without doing work.
    Database access is serialised through self.db_access_lock: a "sync"
    that finds the lock taken is dropped, any other action waits for it.
    The connection is always closed and the lock always released.
    """
    action = kwargs.get("action")
    lock = self.db_access_lock
    acquired = lock.acquire(False)  # Non-blocking attempt
    if not acquired:
        if action == "sync":
            self.logger.info("A sync is already ongoing; aborting")
            return
        self.logger.info("Waiting for database access lock")
        lock.acquire()

    try:
        self.site = self.bot.wiki.get_site()
        self.conn = oursql.connect(**self.conn_data)
        self.revision_cache = {}
        try:
            if action == "sync":
                self.sync(kwargs)
            elif action == "save":
                self.save(kwargs)
        finally:
            self.conn.close()
    finally:
        lock.release()
-
- #################### CHART BUILDING AND SAVING METHODS ####################
-
def save(self, kwargs):
    """Save our local statistics to the wiki.

    When triggered from IRC (``kwargs["fromIRC"]``) the emergency shutoff
    check is skipped and " (!earwigbot)" is appended to the edit summary.
    Otherwise the save is abandoned if self.shutoff_enabled() is true.
    Every chart compiled by self._compile_charts() is then written to its
    own subpage of self.pageroot via self._save_page().
    """
    self.logger.info("Saving chart")
    if kwargs.get("fromIRC"):
        summary = self.summary + " (!earwigbot)"
    else:
        if self.shutoff_enabled():
            return
        summary = self.summary

    statistics = self._compile_charts()
    # .items() rather than .iteritems(): same pairs, works on any dict type.
    for name, chart in statistics.items():
        # The summary chosen above must be handed over explicitly;
        # _save_page() has no other way of seeing it.
        self._save_page(name, chart, summary)

def _save_page(self, name, chart, summary=None):
    """Save a statistics chart to a single page.

    name:    subpage name; the target is "<self.pageroot>/<name>".
    chart:   compiled chart wikitext, placed between the stat markers.
    summary: edit summary; defaults to self.summary when omitted.

    A missing page starts from _DEFAULT_PAGE_TEXT. Nothing is saved if
    inserting the chart leaves the text unchanged.
    """
    if summary is None:
        summary = self.summary

    page = self.site.get_page(u"{}/{}".format(self.pageroot, name))
    try:
        text = page.get()
    except exceptions.PageNotFoundError:
        text = _DEFAULT_PAGE_TEXT

    stat_block = "<!-- stat begin -->\n" + chart + "\n<!-- stat end -->"
    # A callable replacement inserts the chart verbatim; a plain string
    # would have its backslashes interpreted as regex template escapes.
    newtext = re.sub(u"<!-- stat begin -->(.*?)<!-- stat end -->",
                     lambda match: stat_block, text, flags=re.DOTALL)
    if newtext == text:
        self.logger.info(u"Chart for {} unchanged; not saving".format(name))
        return

    # Only refresh the signature once we know the chart actually changed.
    newtext = re.sub("<!-- sig begin -->(.*?)<!-- sig end -->",
                     "<!-- sig begin -->~~~ at ~~~~~<!-- sig end -->",
                     newtext)
    page.edit(newtext, summary, minor=True, bot=True)
    self.logger.info(u"Chart for {} saved to [[{}]]".format(name, page.title))
-
def _compile_charts(self):
    """Build every chart listed in the local ``chart`` table.

    Returns an OrderedDict mapping each chart's ``chart_name`` to its
    compiled wikitext, in the order the rows come back from the query.
    """
    compiled = OrderedDict()
    with self.conn.cursor(oursql.DictCursor) as cursor:
        cursor.execute("SELECT * FROM chart")
        for chart_row in cursor:
            chart_name = chart_row['chart_name']
            compiled[chart_name] = self._compile_chart(chart_row)
    return compiled
-
def _compile_chart(self, chart_info):
    """Build the wikitext of one chart.

    ``chart_info`` is a row of the ``chart`` table as a dict. The result
    is a header template call (title plus optional special title), one
    line per page whose ``row_chart`` equals this chart's ID, and a footer
    template call, joined by newlines.
    """
    header_parts = [self.tl_header, chart_info['chart_title']]
    if chart_info['chart_special_title']:
        header_parts.append(chart_info['chart_special_title'])
    lines = ["{{" + "|".join(header_parts) + "}}"]

    query = "SELECT * FROM page JOIN row ON page_id = row_id WHERE row_chart = ?"
    with self.conn.cursor(oursql.DictCursor) as cursor:
        cursor.execute(query, (chart_info['chart_id'],))
        for page in cursor.fetchall():
            lines.append(self._compile_chart_row(page))

    lines.append("{{" + self.tl_footer + "}}")
    return "\n".join(lines)
-
- def _compile_chart_row(self, page):
- """Compile and return a single chart row.
-
- 'page' is a dict of page information, taken as a row from the page
- table, where keys are column names and values are their cell contents.
- """
- row = u"{0}|s={page_status}|t={page_title}|z={page_size}|"
- if page["page_special_oldid"]:
- row += "sr={page_special_user}|sd={page_special_time}|si={page_special_oldid}|"
- row += "mr={page_modify_user}|md={page_modify_time}|mi={page_modify_oldid}"
-
- page["page_special_time"] = self._fmt_time(page["page_special_time"])
- page["page_modify_time"] = self._fmt_time(page["page_modify_time"])
-
- if page["page_notes"]:
- row += "|n=1{page_notes}"
-
- return "{{" + row.format(self.tl_row, **page) + "}}"
-
- def _fmt_time(self, date):
- """Format a datetime into the standard MediaWiki timestamp format."""
- return date.strftime("%H:%M, %d %b %Y")
-
- ######################## PRIMARY SYNC ENTRY POINTS ########################
-
def sync(self, kwargs):
    """Bring the local statistics database up to date with the site.

    Runs four passes on one cursor, in this order: self._update_tracked(),
    self._add_untracked(), self._update_stale() and self._delete_old().

    Nothing is done if the site's replication lag exceeds 600 seconds,
    unless the caller passes a true "ignore_replag" entry in kwargs.
    """
    self.logger.info("Starting sync")

    replag = self.site.get_replag()
    self.logger.debug("Server replag is {0}".format(replag))
    too_lagged = replag > 600
    if too_lagged and not kwargs.get("ignore_replag"):
        msg = "Sync canceled as replag ({0} secs) is greater than ten minutes"
        self.logger.warn(msg.format(replag))
        return

    passes = (
        self._update_tracked,
        self._add_untracked,
        self._update_stale,
        self._delete_old,
    )
    with self.conn.cursor() as cursor:
        for sync_pass in passes:
            sync_pass(cursor)

    self.logger.info("Sync completed")
-
def _update_tracked(self, cursor):
    """Refresh every tracked page whose latest revision has changed.

    For each page in the local database, the stored latest revision ID is
    compared with the one returned by a query against the site. A page the
    site no longer knows is removed with self._untrack_page(); a page with
    a different revision ID is refreshed with self._update_page(). Errors
    while refreshing one page are logged and do not stop the loop.
    """
    self.logger.debug("Updating tracked submissions")
    local_query = """SELECT page_id, page_title, page_modify_oldid
                     FROM page"""
    live_query = """SELECT page_latest, page_title, page_namespace
                    FROM page WHERE page_id = ?"""

    cursor.execute(local_query)
    for pageid, title, oldid in cursor.fetchall():
        live_rows = list(self.site.sql_query(live_query, (pageid,)))
        if not live_rows:
            self._untrack_page(cursor, pageid)
            continue
        live_oldid, live_title, live_ns = live_rows[0]
        if oldid == live_oldid:
            continue

        self.logger.debug(
            u"Updating page [[{0}]] (id: {1}) @ {2}".format(title, pageid, oldid))
        self.logger.debug(
            u" {0}: oldid: {1} -> {2}".format(pageid, oldid, live_oldid))

        # Rebuild the full title: spaces for underscores, plus the
        # namespace name as a prefix when there is one.
        live_title = live_title.decode("utf8").replace("_", " ")
        ns_name = self.site.namespace_id_to_name(live_ns)
        if ns_name:
            live_title = u":".join((ns_name, live_title))
        try:
            self._update_page(cursor, pageid, live_title)
        except Exception:
            err = u"Error updating page [[{0}]] (id: {1})"
            self.logger.exception(err.format(live_title, pageid))
-
def _add_untracked(self, cursor):
    """Add pending submissions that are not yet tracked.

    The IDs of all tracked pages are collected, then every member of
    self.pending_cat is fetched from the site via SQL. A member that is
    not tracked, is not in self.ignore_list and is not itself a category
    page is tracked with self._track_page(). Errors while tracking one
    page are logged and do not stop the loop.
    """
    self.logger.debug("Adding untracked pending submissions")
    query1 = "SELECT page_id FROM page"
    query2 = """SELECT page_id, page_title, page_namespace
                FROM page
                INNER JOIN categorylinks ON page_id = cl_from
                WHERE cl_to = ?"""

    cursor.execute(query1)
    # A set gives constant-time membership tests in the loop below; a list
    # would be rescanned for every member of the category.
    tracked = {pid for (pid,) in cursor.fetchall()}
    pend_cat = self.pending_cat.replace(" ", "_")

    for pageid, title, ns in self.site.sql_query(query2, (pend_cat,)):
        if pageid in tracked:
            continue

        title = title.decode("utf8").replace("_", " ")
        ns_name = self.site.namespace_id_to_name(ns)
        if ns_name:
            title = u":".join((ns_name, title))
        if title in self.ignore_list or ns == wiki.NS_CATEGORY:
            continue
        msg = u"Tracking page [[{0}]] (id: {1})".format(title, pageid)
        self.logger.debug(msg)
        try:
            self._track_page(cursor, pageid, title)
        except Exception:
            e = u"Error tracking page [[{0}]] (id: {1})"
            self.logger.exception(e.format(title, pageid))
-
def _update_stale(self, cursor):
    """Update submissions that haven't been updated in a long time.

    This is intended to update notes that change without typical update
    triggers, like when submitters are blocked. It also resolves conflicts
    when pages are tracked during high replag, potentially causing data to
    be inaccurate (like a missed decline). It updates no more than the ten
    stalest pages that haven't been updated in two days. Errors while
    updating one page are logged and do not stop the loop.
    """
    self.logger.debug("Updating stale submissions")
    query = """SELECT page_id, page_title, page_modify_oldid
               FROM page JOIN updatelog ON page_id = update_id
               WHERE ADDTIME(update_time, '48:00:00') < NOW()
               ORDER BY update_time ASC LIMIT 10"""
    cursor.execute(query)

    # Fetch all rows up front: self._update_page() runs further statements
    # on this same cursor, which would replace the result set being
    # iterated (self._update_tracked() uses fetchall() for the same reason).
    for pageid, title, oldid in cursor.fetchall():
        msg = u"Updating page [[{0}]] (id: {1}) @ {2}"
        self.logger.debug(msg.format(title, pageid, oldid))
        try:
            self._update_page(cursor, pageid, title)
        except Exception:
            e = u"Error updating page [[{0}]] (id: {1})"
            self.logger.exception(e.format(title, pageid))
-
def _delete_old(self, cursor):
    """Drop submissions that have outlived their place in the chart.

    A single multi-table DELETE removes the page, row and updatelog
    entries of every page in the accepted or declined chart whose
    page_special_time lies more than 36 hours in the past.
    """
    self.logger.debug("Removing old submissions from chart")
    finished_charts = (self.CHART_ACCEPT, self.CHART_DECLINE)
    delete_sql = """DELETE FROM page, row, updatelog USING page JOIN row
                    ON page_id = row_id JOIN updatelog ON page_id = update_id
                    WHERE row_chart IN (?, ?)
                    AND ADDTIME(page_special_time, '36:00:00') < NOW()"""
    cursor.execute(delete_sql, finished_charts)
-
- ######################## PRIMARY PAGE ENTRY POINTS ########################
-
def _untrack_page(self, cursor, pageid):
    """Delete the page, row and updatelog entries for one page ID."""
    self.logger.debug("Untracking page (id: {0})".format(pageid))
    delete_sql = """DELETE FROM page, row, updatelog USING page JOIN row
                    ON page_id = row_id JOIN updatelog ON page_id = update_id
                    WHERE page_id = ?"""
    params = (pageid,)
    cursor.execute(delete_sql, params)
-
def _track_page(self, cursor, pageid, title):
    """Start tracking a page that is not yet in our database.

    Gathers the page's content, status/chart, last-edit data, "special"
    edit data and notes, then inserts one record each into the row, page
    and updatelog tables. Returns without inserting (after logging) if the
    content cannot be fetched or no chart applies to the page.
    """
    content = self._get_content(pageid)
    if content is None:
        self.logger.error(
            u"Could not get page content for [[{0}]]".format(title))
        return

    namespace = self.site.get_page(title).namespace
    status, chart = self._get_status_and_chart(content, namespace)
    if chart == self.CHART_NONE:
        self.logger.warn(
            u"Could not find a status for [[{0}]]".format(title))
        return

    m_user, m_time, m_id = self._get_modify(pageid)
    s_user, s_time, s_id = self._get_special(pageid, content, chart)
    notes = self._get_notes(chart, content, m_time, s_user)

    page_values = (pageid, status, title, len(content), notes,
                   m_user, m_time, m_id, s_user, s_time, s_id)
    cursor.execute("INSERT INTO row VALUES (?, ?)", (pageid, chart))
    cursor.execute(
        "INSERT INTO page VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
        page_values)
    cursor.execute("INSERT INTO updatelog VALUES (?, ?)",
                   (pageid, datetime.utcnow()))
-
def _update_page(self, cursor, pageid, title):
    """Refresh the stored data of a page that is already tracked.

    Current data is gathered from the site and compared field by field
    with the stored record; the title, last-edit data, status and notes
    are each rewritten only when they differ. A page to which no chart
    applies any more is untracked instead. On success the page's
    updatelog timestamp is set to the current UTC time.
    """
    content = self._get_content(pageid)
    if content is None:
        self.logger.error(
            u"Could not get page content for [[{0}]]".format(title))
        return

    namespace = self.site.get_page(title).namespace
    status, chart = self._get_status_and_chart(content, namespace)
    if chart == self.CHART_NONE:
        self._untrack_page(cursor, pageid)
        return

    select_sql = "SELECT * FROM page JOIN row ON page_id = row_id WHERE page_id = ?"
    with self.conn.cursor(oursql.DictCursor) as dict_cursor:
        dict_cursor.execute(select_sql, (pageid,))
        stored = dict_cursor.fetchall()[0]

    m_user, m_time, m_id = self._get_modify(pageid)

    if title != stored["page_title"]:
        self._update_page_title(cursor, stored, pageid, title)

    if m_id != stored["page_modify_oldid"]:
        self._update_page_modify(cursor, stored, pageid, len(content),
                                 m_user, m_time, m_id)

    # The notes need the "special" user: the fresh one if the status
    # changed, otherwise the one already on record.
    if status == stored["page_status"]:
        s_user = stored["page_special_user"]
    else:
        s_user = self._update_page_status(cursor, stored, pageid, content,
                                          status, chart)[0]

    notes = self._get_notes(chart, content, m_time, s_user)
    if notes != stored["page_notes"]:
        self._update_page_notes(cursor, stored, pageid, notes)

    touch_sql = "UPDATE updatelog SET update_time = ? WHERE update_id = ?"
    cursor.execute(touch_sql, (datetime.utcnow(), pageid))
-
- ###################### PAGE ATTRIBUTE UPDATE METHODS ######################
-
def _update_page_title(self, cursor, result, pageid, title):
    """Store a new title for the page and log the old and new values."""
    cursor.execute("UPDATE page SET page_title = ? WHERE page_id = ?",
                   (title, pageid))

    old_title = result["page_title"]
    self.logger.debug(
        u" {0}: title: {1} -> {2}".format(pageid, old_title, title))
-
def _update_page_modify(self, cursor, result, pageid, size, m_user, m_time, m_id):
    """Store new size and last-edit data for the page and log the change.

    ``result`` is the previously stored record; it is only read for the
    old values shown in the debug message.
    """
    update_sql = """UPDATE page SET page_size = ?, page_modify_user = ?,
                    page_modify_time = ?, page_modify_oldid = ?
                    WHERE page_id = ?"""
    cursor.execute(update_sql, (size, m_user, m_time, m_id, pageid))

    template = u" {0}: modify: {1} / {2} / {3} -> {4} / {5} / {6}"
    previous = (result["page_modify_user"], result["page_modify_time"],
                result["page_modify_oldid"])
    self.logger.debug(
        template.format(pageid, previous[0], previous[1], previous[2],
                        m_user, m_time, m_id))
-
def _update_page_status(self, cursor, result, pageid, content, status, chart):
    """Store a new status/chart for the page and refresh its special edit.

    The status and chart are always written. The "special" user, time and
    revision ID are looked up via self._get_special() and written only if
    the revision ID differs from the stored one. Returns the looked-up
    (user, time, revision ID) triple either way.
    """
    status_sql = """UPDATE page JOIN row ON page_id = row_id
                    SET page_status = ?, row_chart = ? WHERE page_id = ?"""
    special_sql = """UPDATE page SET page_special_user = ?,
                     page_special_time = ?, page_special_oldid = ?
                     WHERE page_id = ?"""
    cursor.execute(status_sql, (status, chart, pageid))

    self.logger.debug(
        " {0}: status: {1} ({2}) -> {3} ({4})".format(
            pageid, result["page_status"], result["row_chart"],
            status, chart))

    s_user, s_time, s_id = self._get_special(pageid, content, chart)
    if s_id == result["page_special_oldid"]:
        return s_user, s_time, s_id

    cursor.execute(special_sql, (s_user, s_time, s_id, pageid))
    template = u" {0}: special: {1} / {2} / {3} -> {4} / {5} / {6}"
    self.logger.debug(
        template.format(pageid, result["page_special_user"],
                        result["page_special_time"],
                        result["page_special_oldid"],
                        s_user, s_time, s_id))
    return s_user, s_time, s_id
-
def _update_page_notes(self, cursor, result, pageid, notes):
    """Store new notes for the page and log the old and new values."""
    cursor.execute("UPDATE page SET page_notes = ? WHERE page_id = ?",
                   (notes, pageid))
    old_notes = result["page_notes"]
    self.logger.debug(
        " {0}: notes: {1} -> {2}".format(pageid, old_notes, notes))
-
- ###################### DATA RETRIEVAL HELPER METHODS ######################
-
def _get_content(self, pageid):
    """Return the text of a page's latest revision, or None.

    The latest revision ID is read from the site via SQL and its content
    is then fetched with self._get_revision_content(). None is returned
    when the SQL query yields no row for the page ID.
    """
    lookup_sql = "SELECT page_latest FROM page WHERE page_id = ?"
    rows = self.site.sql_query(lookup_sql, (pageid,))
    try:
        latest = list(rows)[0][0]
    except IndexError:
        return None
    return self._get_revision_content(int(latest))
-
def _get_revision_content(self, revid, tries=1):
    """Return the main-slot text of a revision, using the revision cache.

    A cached result is returned without querying. Otherwise the API is
    queried; if the response lacks an expected key, the call sleeps five
    seconds and retries with ``tries`` reduced by one, re-raising the
    KeyError once ``tries`` has reached zero. Successful results are
    cached by revision ID.
    """
    cache = self.revision_cache
    if revid in cache:
        return cache[revid]

    response = self.site.api_query(
        action="query", prop="revisions", rvprop="content",
        rvslots="main", revids=revid)
    try:
        page_data = response["query"]["pages"].values()[0]
        content = page_data["revisions"][0]["slots"]["main"]["*"]
    except KeyError:
        if tries == 0:
            raise
        sleep(5)
        return self._get_revision_content(revid, tries=tries - 1)

    cache[revid] = content
    return content
-
def _get_status_and_chart(self, content, namespace):
    """Return (status letter, chart constant) for a submission.

    A main-namespace page is "misplaced" (no status letter) if it still
    carries any submission status, otherwise accepted. Elsewhere the first
    status present in the order R, P, T, D decides: reviewing, pending,
    no chart (a draft), declined. A page with none of them gets no chart.
    """
    statuses = self.get_statuses(content)
    if namespace == wiki.NS_MAIN:
        if statuses:
            return None, self.CHART_MISPLACE
        return "a", self.CHART_ACCEPT

    # Checked top to bottom; the first status found on the page wins.
    precedence = (
        ("R", ("r", self.CHART_REVIEW)),
        ("P", ("p", self.CHART_PEND)),
        ("T", (None, self.CHART_NONE)),
        ("D", ("d", self.CHART_DECLINE)),
    )
    for code, outcome in precedence:
        if code in statuses:
            return outcome
    return None, self.CHART_NONE
-
def get_statuses(self, content):
    """List the AFC submission status of every relevant template.

    Each {{afc submission}} template contributes its first parameter,
    upper-cased, when that is one of P, R, T or D, and "P" otherwise
    (including when the parameter is absent or empty). Templates named
    after one of the known aliases contribute the alias's status. The
    list follows the order of the templates in the text.
    """
    recognised = ("P", "R", "T", "D")
    alias_status = {
        "submit": "P",
        "afc submission/submit": "P",
        "afc submission/reviewing": "R",
        "afc submission/pending": "P",
        "afc submission/draft": "T",
        "afc submission/declined": "D"
    }
    found = []
    for template in mwparserfromhell.parse(content).filter_templates():
        name = template.name.strip().lower()
        if name in alias_status:
            found.append(alias_status[name])
            continue
        if name != "afc submission":
            continue
        status = "P"
        if template.has(1, ignore_empty=True):
            given = template.get(1).value.strip().upper()
            if given in recognised:
                status = given
        found.append(status)
    return found
-
def _get_modify(self, pageid):
    """Return (editor, timestamp, revision ID) of a page's latest edit.

    The data comes from one SQL query against the site. The editor name
    is decoded from UTF-8 and the timestamp parsed from the
    "%Y%m%d%H%M%S" form into a datetime.
    """
    latest_sql = """SELECT actor_name, rev_timestamp, rev_id
                    FROM revision
                    JOIN page ON rev_id = page_latest
                    JOIN actor ON rev_actor = actor_id
                    WHERE page_id = ?"""
    rows = list(self.site.sql_query(latest_sql, (pageid,)))
    editor, raw_time, revid = rows[0]
    when = datetime.strptime(raw_time, "%Y%m%d%H%M%S")
    return editor.decode("utf8"), when, revid
-
- def _get_special(self, pageid, content, chart):
- """Return information about a page's "special" edit.
-
- I tend to use the term "special" as a verb a lot, which is bound to
- cause confusion. It is merely a short way of saying "the edit in which
- a declined submission was declined, an accepted submission was
- accepted, a submission in review was set as such, a pending submission
- was submitted, and a "misplaced" submission was created."
-
- This "information" consists of the special edit's editor, its time, and
- its revision ID. If the page's status is not something that involves
- "special"-ing, we will return None for all three. The same will be
- returned if we cannot determine when the page was "special"-ed.
- """
- charts = {
- self.CHART_NONE: (lambda pageid, content: None, None, None),
- self.CHART_MISPLACE: self.get_create,
- self.CHART_ACCEPT: self.get_accepted,
- self.CHART_REVIEW: self.get_reviewing,
- self.CHART_PEND: self.get_pending,
- self.CHART_DECLINE: self.get_decline
- }
- return charts[chart](pageid, content)
-
def get_create(self, pageid, content=None):
    """Return (editor, timestamp, revision ID) of the page's first revision.

    The first revision is the one with the lowest rev_id for the page.
    ``content`` is accepted for signature parity with the other lookup
    methods and is not used.
    """
    first_rev_sql = """SELECT actor_name, rev_timestamp, rev_id
                       FROM revision
                       JOIN actor ON rev_actor = actor_id
                       WHERE rev_id = (SELECT MIN(rev_id) FROM revision WHERE rev_page = ?)"""
    rows = list(self.site.sql_query(first_rev_sql, (pageid,)))
    creator, raw_time, revid = rows[0]
    created = datetime.strptime(raw_time, "%Y%m%d%H%M%S")
    return creator.decode("utf8"), created, revid
-
def get_accepted(self, pageid, content=None):
    """Return (editor, timestamp, revision ID) of the accepting edit.

    The accepting edit is taken to be the page's most recent revision
    whose edit summary matches the "moved page [[...]] to [[...]]"
    pattern. Returns (None, None, None) if there is no such revision.
    ``content`` is not used.
    """
    move_sql = """SELECT actor_name, rev_timestamp, rev_id
                  FROM revision
                  JOIN actor ON rev_actor = actor_id
                  JOIN comment ON rev_comment_id = comment_id
                  WHERE rev_page = ?
                  AND comment_text LIKE "% moved page [[%]] to [[%]]%"
                  ORDER BY rev_timestamp DESC LIMIT 1"""
    rows = list(self.site.sql_query(move_sql, (pageid,)))
    try:
        acceptor, raw_time, revid = rows[0]
    except IndexError:
        return None, None, None
    accepted = datetime.strptime(raw_time, "%Y%m%d%H%M%S")
    return acceptor.decode("utf8"), accepted, revid
-
def get_reviewing(self, pageid, content=None):
    """Return (editor, timestamp, revision ID) of the edit that set review.

    Found by searching the page history for the "R" status; ``content``
    is not used.
    """
    wanted, unwanted = ["R"], []
    return self._search_history(pageid, self.CHART_REVIEW, wanted, unwanted)
-
def get_pending(self, pageid, content):
    """Return (editor, timestamp, revision ID) of the submitting edit.

    The "u" and "ts" parameters of a pending (or status-less) submission
    template are tried first; if that gives nothing, the page history is
    searched for the "P" status instead.
    """
    from_template = self._get_status_helper(
        pageid, content, ("P", ""), ("u", "ts"))
    if from_template:
        return from_template
    return self._search_history(pageid, self.CHART_PEND, ["P"], [])
-
def get_decline(self, pageid, content):
    """Return (decliner, decline_ts, decline_revid) for the given page.

    The "decliner" and "declinets" parameters of a declined submission
    template are tried first; if that gives nothing, the page history is
    searched for a revision with the "D" status and none of R, P or T.
    """
    params = ("decliner", "declinets")
    # One-element tuple, not the bare string "D": against a string,
    # ``status in statuses`` is a substring test that "" also passes.
    statuses = ("D",)
    res = self._get_status_helper(pageid, content, statuses, params)
    return res or self._search_history(
        pageid, self.CHART_DECLINE, ["D"], ["R", "P", "T"])
-
def _get_status_helper(self, pageid, content, statuses, params):
    """Locate a status-setting edit from template parameters.

    Used by get_pending() and get_decline(). Every {{afc submission}}
    template whose status is in ``statuses`` and which has non-empty
    values for both names in ``params`` (user, timestamp) is a candidate;
    the one with the greatest timestamp string is kept. The revision by
    that user closest to the timestamp (within 60) is then looked up.

    Returns (user, datetime, revision ID), or None if there is no
    candidate, the timestamp cannot be parsed, or no revision matches.
    """
    candidates = []
    for tmpl in mwparserfromhell.parse(content).filter_templates():
        if tmpl.name.strip().lower() != "afc submission":
            continue
        # A template without a first parameter counts as pending.
        status = tmpl.get(1).value.strip().upper() if tmpl.has(1) else "P"
        has_all = all(tmpl.has(par, ignore_empty=True) for par in params)
        if has_all and status in statuses:
            candidates.append([unicode(tmpl.get(par).value) for par in params])
    if not candidates:
        return None
    user, stamp = max(candidates, key=lambda entry: entry[1])

    revision_sql = """SELECT rev_id
                      FROM revision_userindex
                      JOIN actor ON rev_actor = actor_id
                      WHERE rev_page = ? AND actor_name = ? AND ABS(rev_timestamp - ?) <= 60
                      ORDER BY ABS(rev_timestamp - ?) ASC LIMIT 1"""
    rows = self.site.sql_query(revision_sql, (pageid, user, stamp, stamp))
    try:
        when = datetime.strptime(stamp, "%Y%m%d%H%M%S")
        revid = list(rows)[0][0]
    except (ValueError, IndexError):
        return None
    return user, when, revid
-
def _search_history(self, pageid, chart, search_with, search_without):
    """Walk a page's history backwards to find when a status was set.

    Revisions are visited newest first. A revision "passes" when it has
    at least one status from ``search_with`` (if that list is non-empty)
    and none from ``search_without``. The walk stops at the first revision
    that does not pass and returns the (editor, datetime, revision ID) of
    the passing revision visited just before it, or of the oldest
    revision if all pass. (None, None, None) is returned when the newest
    revision already fails, after more than 50 content lookups, or when
    the API raises an error.
    """
    history_sql = """SELECT actor_name, rev_timestamp, rev_id
                     FROM revision
                     JOIN actor ON rev_actor = actor_id
                     WHERE rev_page = ? ORDER BY rev_id DESC"""
    revisions = self.site.sql_query(history_sql, (pageid,))

    earliest_pass = (None, None, None)
    for lookups, (user, ts, revid) in enumerate(revisions, 1):
        if lookups > 50:
            msg = "Exceeded 50 content lookups while searching history of page (id: {0}, chart: {1})"
            self.logger.warn(msg.format(pageid, chart))
            return None, None, None
        try:
            content = self._get_revision_content(revid)
        except exceptions.APIError:
            msg = "API error interrupted SQL query in _search_history() for page (id: {0}, chart: {1})"
            self.logger.exception(msg.format(pageid, chart))
            return None, None, None

        statuses = self.get_statuses(content)
        lacks_wanted = search_with and not any(
            s in statuses for s in search_with)
        has_unwanted = any(s in statuses for s in search_without)
        if has_unwanted or lacks_wanted:
            return earliest_pass

        when = datetime.strptime(ts, "%Y%m%d%H%M%S")
        earliest_pass = (user.decode("utf8"), when, revid)

    return earliest_pass
-
- def _get_notes(self, chart, content, m_time, s_user):
- """Return any special notes or warnings about this page.
-
- copyvio: submission is a suspected copyright violation
- unsourced: submission lacks references completely
- no-inline: submission has no inline citations
- short: submission is less than a kilobyte in length
- resubmit: submission was resubmitted after a previous decline
- old: submission has not been touched in > 4 days
- rejected: submission was rejected, a more severe form of declined
- blocked: submitter is currently blocked
- """
- notes = ""
-
- ignored_charts = [self.CHART_NONE, self.CHART_ACCEPT]
- if chart in ignored_charts:
- return notes
-
- if chart == self.CHART_DECLINE:
- # Decline is special, as only the rejected note is meaningful
- code = mwparserfromhell.parse(content)
- for tmpl in code.filter_templates():
- if tmpl.name.strip().lower() == "afc submission":
- if tmpl.has("reject") and tmpl.get("reject").value:
- notes += "|nj=1" # Submission was rejected
- break
- return notes
-
- copyvios = self.config.tasks.get("afc_copyvios", {})
- regex = r"\{\{s*" + copyvios.get("template", "AfC suspected copyvio")
- if re.search(regex, content):
- notes += "|nc=1" # Submission is a suspected copyvio
-
- if not re.search(r"\<ref\s*(.*?)\>(.*?)\</ref\>", content, re.I|re.S):
- regex = r"(https?:)|\[//(?!{0})([^ \]\t\n\r\f\v]+?)"
- sitedomain = re.escape(self.site.domain)
- if re.search(regex.format(sitedomain), content, re.I | re.S):
- notes += "|ni=1" # Submission has no inline citations
- else:
- notes += "|nu=1" # Submission is completely unsourced
-
- if len(content) < 1000:
- notes += "|ns=1" # Submission is short
-
- statuses = self.get_statuses(content)
- if "D" in statuses and chart != self.CHART_MISPLACE:
- notes += "|nr=1" # Submission was resubmitted
-
- time_since_modify = (datetime.utcnow() - m_time).total_seconds()
- max_time = 4 * 24 * 60 * 60
- if time_since_modify > max_time:
- notes += "|no=1" # Submission hasn't been touched in over 4 days
-
- if chart == self.CHART_PEND and s_user:
- submitter = self.site.get_user(s_user)
- try:
- if submitter.blockinfo:
- notes += "|nb=1" # Submitter is blocked
- except exceptions.UserNotFoundError: # Likely an IP
- pass
-
- return notes
|