# -*- coding: utf-8 -*-
"""Calculates the "transclusion impact factor" (TIF) of a template on the
English Wikipedia: the combined average views per minute of every mainspace
page that transcludes it, based on cached pageview statistics."""

from datetime import datetime, timedelta
from gzip import GzipFile
from json import loads
from os.path import expanduser
from StringIO import StringIO
from urllib import quote
from urllib2 import URLError

from earwigbot.bot import Bot
from oursql import connect

__all__ = ["calculate_tif"]

SITE_DB = "enwiki_p"


def _get_db(bot):
    """Open a connection to the replica database using the bot's config."""
    args = bot.config.wiki["_tifSQL"]
    args["read_default_file"] = expanduser("~/.my.cnf")
    args["autoping"] = True
    args["autoreconnect"] = True
    return connect(**args)


def _count_transclusions(cursor, title, ns):
    """Count the mainspace pages that transclude the given template."""
    query = """SELECT COUNT(*)
        FROM {0}.templatelinks
        WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0"""
    cursor.execute(query.format(SITE_DB), (title, ns))
    return cursor.fetchall()[0][0]


def _count_views(cursor, title, ns):
    """Sum the cached view rates of all transcluding pages, along with the
    timestamp of the oldest cache entry involved."""
    query = """SELECT SUM(cache_views), MIN(cache_time)
        FROM {0}.templatelinks
        INNER JOIN cache ON tl_from = cache_id
        WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0"""
    cursor.execute(query.format(SITE_DB), (title, ns))
    return cursor.fetchall()[0]


def _get_avg_views(site, article):
    """Return the article's average views per minute over the last 30 days,
    per the Wikimedia pageviews API, or None on failure."""
    url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
           "{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
    days = 30
    slug = quote(article, safe="")
    start = (datetime.utcnow() - timedelta(days=days)).strftime("%Y%m%d")
    end = datetime.utcnow().strftime("%Y%m%d")
    query = url.format(site.lang, site.project, slug, start, end)
    try:
        response = site._opener.open(query)  # We're terrible
    except URLError:
        return None
    result = response.read()
    if response.headers.get("Content-Encoding") == "gzip":
        stream = StringIO(result)
        gzipper = GzipFile(fileobj=stream)
        result = gzipper.read()
    try:
        res = loads(result)
    except ValueError:
        return None
    if "items" not in res:
        return None
    total_views = sum(item["views"] for item in res["items"])
    return total_views / (float(days) * 24 * 60)
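

# For reference, a sketch of the tool's own `cache` table as implied by the
# queries in this module; the column types are assumptions inferred from
# usage, not copied from the real schema:
#
#   CREATE TABLE cache (
#       cache_id    INT UNSIGNED NOT NULL PRIMARY KEY,  -- page_id of a transcluding page
#       cache_views FLOAT NOT NULL,       -- average views per minute (_get_avg_views)
#       cache_time  DATETIME NOT NULL     -- when the row was last refreshed
#   );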
def _update_views(cursor, site, title, ns):
    """Refresh the view-rate cache for every mainspace page transcluding the
    template, fetching fresh numbers for pages with no live cache row."""
    cache_life = "7 DAY"
    query1 = """DELETE FROM cache
        WHERE cache_time < DATE_SUB(NOW(), INTERVAL {0})"""
    query2 = """SELECT tl_from, page_title
        FROM {0}.templatelinks
        LEFT JOIN {0}.page ON tl_from = page_id
        LEFT JOIN cache ON tl_from = cache_id
        WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0
        AND cache_id IS NULL"""
    query3 = """INSERT INTO cache (cache_id, cache_views, cache_time)
        VALUES (?, ?, NOW())
        ON DUPLICATE KEY UPDATE cache_views = ?, cache_time = NOW()"""
    cursor.execute(query1.format(cache_life))  # Expire stale cache rows
    cursor.execute(query2.format(SITE_DB), (title, ns))
    while True:
        # Fill in view rates for uncached pages, in batches of 1024
        titles = cursor.fetchmany(1024)
        if not titles:
            break
        viewcounts = [(pageid, _get_avg_views(site, name))
                      for (pageid, name) in titles]
        parambatch = [(i, v, v) for (i, v) in viewcounts if v is not None]
        cursor.executemany(query3, parambatch)


def _compute_stats(db, page):
    """Return (tif, transclusion count, oldest cache time) for the page."""
    title = page.title.split(":", 1)[-1].replace(" ", "_")
    title = title[0].upper() + title[1:]  # DB titles are first-letter capitalized
    with db.cursor() as cursor:
        transclusions = _count_transclusions(cursor, title, page.namespace)
        _update_views(cursor, page.site, title, page.namespace)
        tif, cache_time = _count_views(cursor, title, page.namespace)
    return tif, transclusions, cache_time


def _format_time(cache_time):
    """Render the age of the cache as a human-readable string."""
    formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
    diff = datetime.utcnow() - cache_time
    total_seconds = diff.days * 86400 + diff.seconds
    if total_seconds > 86400:
        return formatter(total_seconds / 86400, "day")
    if total_seconds > 3600:
        return formatter(total_seconds / 3600, "hour")
    if total_seconds > 60:
        return formatter(total_seconds / 60, "minute")
    return formatter(total_seconds, "second")


def _get_protection(page):
    """Return the page's edit-protection entry, or None if unprotected."""
    edit = [prot for prot in page.protection if prot["type"] == "edit"]
    return edit[0] if edit else None


def calculate_tif(title):
    """Calculate template transclusion statistics for the given title,
    returned as a dict."""
    bot = Bot(".earwigbot")
    db = _get_db(bot)
    site = bot.wiki.get_site()
    page = site.get_page(title)

    result = {"title": title, "page": page}
    if page.exists != page.PAGE_EXISTS:
        result["error"] = "no page"
        return result

    tif, transclusions, cache_time = _compute_stats(db, page)
    result["tif"] = tif
    result["transclusions"] = transclusions
    result["protection"] = _get_protection(page)

    if cache_time:
        result["cache_time"] = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result["cache_age"] = _format_time(cache_time)
    return result
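

# Example usage, as a minimal sketch: it assumes a Python 2 environment with
# earwigbot and oursql installed, a configured ".earwigbot" directory whose
# wiki config contains the "_tifSQL" connection settings, and that this
# module is importable as `tif` (the module name here is hypothetical):
#
#     >>> from tif import calculate_tif
#     >>> stats = calculate_tif("Template:Citation needed")
#     >>> stats["transclusions"], stats["tif"], stats.get("cache_age")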