From bfcd4cbaab11cd067891d573b25cd027e72332c3 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Tue, 9 Feb 2016 00:57:29 -0600
Subject: [PATCH] Implement pageviews API requests.

---
 app.py               |  0
 schema.sql           |  4 +--
 templates/index.mako |  2 +-
 tif/calc.py          | 91 +++++++++++++++++++++++++++++++++++++++++++++------
 4 files changed, 83 insertions(+), 14 deletions(-)
 mode change 100644 => 100755 app.py

diff --git a/app.py b/app.py
old mode 100644
new mode 100755
diff --git a/schema.sql b/schema.sql
index 99b5d0d..8edb002 100644
--- a/schema.sql
+++ b/schema.sql
@@ -3,7 +3,7 @@ CREATE DATABASE `sXXXXX__tif`;
 DROP TABLE IF EXISTS `cache`;
 CREATE TABLE `cache` (
     `cache_id` INT(10) UNSIGNED NOT NULL,
-    `cache_views` INT UNSIGNED NOT NULL,
-    `cache_time` TIMESTAMP NOT NULL,
+    `cache_views` FLOAT NOT NULL,
+    `cache_time` TIMESTAMP NOT NULL DEFAULT '0000-00-00 00:00:00',
     PRIMARY KEY (`cache_id`)
 ) ENGINE=InnoDB;
diff --git a/templates/index.mako b/templates/index.mako
index b8bb765..b5043bf 100644
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -24,7 +24,7 @@
             <table>
                 <tr>
                     <th>TIF</th>
-                    <td>${result["tif"]}</td>
+                    <td>${"{0:.2f}".format(result["tif"])}</td>
                 </tr>
                 <tr>
                     <th>Transclusions</th>
diff --git a/tif/calc.py b/tif/calc.py
index cc47dfb..66596d6 100644
--- a/tif/calc.py
+++ b/tif/calc.py
@@ -1,7 +1,12 @@
 # -*- coding: utf-8 -*-
 
-from datetime import datetime
+from datetime import datetime, timedelta
+from gzip import GzipFile
+from json import loads
 from os.path import expanduser
+from StringIO import StringIO
+from urllib import quote
+from urllib2 import URLError
 
 from earwigbot.bot import Bot
 from oursql import connect
@@ -15,16 +20,80 @@ def _get_db(bot):
     args["autoreconnect"] = True
     return connect(**args)
 
-def _compute_stats(page, db):
-    with db.cursor() as cursor:
-        query = """SELECT COUNT(*) FROM templatelinks WHERE tl_title = ?
-                   AND tl_namespace = 10 AND tl_from_namespace = 0"""
-        cursor.execute(query, (page.title.replace(" ", "_"),))
-        transclusions = cursor.fetchall()[0][0]
+def _count_transclusions(cursor, title):
+    query = """SELECT COUNT(*)
+               FROM templatelinks
+               WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0"""
+    cursor.execute(query, (title,))
+    return cursor.fetchall()[0][0]
+
+def _count_views(cursor, title, dbname):
+    query = """SELECT SUM(cache_views), MIN(cache_time)
+               FROM templatelinks
+               INNER JOIN {0}.cache ON tl_from = cache_id
+               WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0"""
+    cursor.execute(query.format(dbname), (title,))
+    return cursor.fetchall()[0]
+
+def _get_avg_views(site, article):
+    url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
+           "{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
+    days = 30
+    slug = quote(article.replace(" ", "_"), safe="")
+    start = (datetime.utcnow() - timedelta(days=days)).strftime("%Y%m%d")
+    end = datetime.utcnow().strftime("%Y%m%d")
+    query = url.format(site.lang, site.project, slug, start, end)
+
+    try:
+        response = site._opener.open(query)  # We're terrible
+    except URLError:
+        return None
+
+    result = response.read()
+    if response.headers.get("Content-Encoding") == "gzip":
+        stream = StringIO(result)
+        gzipper = GzipFile(fileobj=stream)
+        result = gzipper.read()
+
+    try:
+        res = loads(result)
+    except ValueError:
+        return None
+
+    if "items" not in res:
+        return None
+    return sum(item["views"] for item in res["items"]) / float(days)
+
+def _update_views(cursor, site, title, dbname):
+    cache_life = "7 DAY"
+    query1 = """SELECT tl_from, page_title
+                FROM templatelinks
+                INNER JOIN page ON tl_from = page_id
+                LEFT JOIN {0}.cache ON tl_from = cache_id
+                WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0
+                AND (cache_id IS NULL
+                OR DATE_SUB(NOW(), INTERVAL {1}) > cache_time)"""
+    query2 = """INSERT INTO {0}.cache (cache_id, cache_views, cache_time)
+                VALUES (?, ?, NOW()) ON DUPLICATE KEY
+                UPDATE cache_views = ?, cache_time = NOW()""".format(dbname)
+
+    cursor.execute(query1.format(dbname, cache_life), (title,))
+    pages = cursor.fetchall()  # Drain before reusing the cursor for writes
+    for i in xrange(0, len(pages), 1024):
+        viewcounts = [(pageid, _get_avg_views(site, ptitle))
+                      for (pageid, ptitle) in pages[i:i + 1024]]
+        parambatch = [(p, v, v) for (p, v) in viewcounts if v is not None]
+        if parambatch:
+            cursor.executemany(query2, parambatch)
+
+def _compute_stats(bot, db, page):
+    dbname = bot.config.wiki["_tifSQL"]["db"]
+    title = page.title.replace(" ", "_")
 
-        # TODO
-        tif = 0.0
-        cache_time = None
+    with db.cursor() as cursor:
+        transclusions = _count_transclusions(cursor, title)
+        _update_views(cursor, page.site, title, dbname)
+        tif, cache_time = _count_views(cursor, title, dbname)
 
     return tif, transclusions, cache_time
 
@@ -48,7 +117,7 @@ def calculate_tif(title):
         result["error"] = "no page"
         return result
 
-    tif, transclusions, cache_time = _compute_stats(page, db)
+    tif, transclusions, cache_time = _compute_stats(bot, db, page)
     result["tif"] = tif
     result["transclusions"] = transclusions
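
For testing, the REST request that _get_avg_views() builds can be exercised
on its own. A minimal standalone sketch follows (Python 2, stdlib only; the
project "en.wikipedia", the article "Earth", and the function name are
illustrative values, not part of the patch):

    # -*- coding: utf-8 -*-
    # Standalone check of the request _get_avg_views() builds. Python 2,
    # stdlib only; the project, article, and function name are examples.

    from datetime import datetime, timedelta
    from json import loads
    from urllib import quote
    from urllib2 import urlopen

    URL = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
           "{0}/all-access/user/{1}/daily/{2}/{3}")

    def average_daily_views(project, article, days=30):
        slug = quote(article.replace(" ", "_"), safe="")
        end = datetime.utcnow()
        start = end - timedelta(days=days)
        query = URL.format(project, slug, start.strftime("%Y%m%d"),
                           end.strftime("%Y%m%d"))
        res = loads(urlopen(query).read())
        # The API returns one {"views": N, ...} item per day that has data
        return sum(item["views"] for item in res["items"]) / float(days)

    print average_daily_views("en.wikipedia", "Earth")

The sketch skips the gzip branch: plain urllib2.urlopen() does not send
Accept-Encoding: gzip, so the response arrives uncompressed. The patch needs
GzipFile only because site._opener presumably requests gzip encoding.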
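Similarly, the cache flow in _update_views() boils down to: select pages
whose cache row is missing or older than cache_life, recompute each value,
and upsert the results in batches. A toy sqlite3 version of that flow (stdlib
only, so it runs anywhere; every name and value here is made up, and sqlite
spells the upsert INSERT OR REPLACE rather than MySQL's
INSERT ... ON DUPLICATE KEY UPDATE):

    # -*- coding: utf-8 -*-
    # Toy version of the cache flow in _update_views(), using stdlib sqlite3.

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE pages (page_id INTEGER PRIMARY KEY)")
    db.execute("CREATE TABLE cache (cache_id INTEGER PRIMARY KEY,"
               " cache_views REAL NOT NULL)")
    db.executemany("INSERT INTO pages VALUES (?)",
                   [(i,) for i in xrange(3000)])

    def fake_views(page_id):
        # Stand-in for _get_avg_views()
        return page_id * 0.1

    # Pages lacking a cache row (the 7-day staleness test is omitted here)
    stale = db.execute("""SELECT page_id FROM pages
                          LEFT JOIN cache ON page_id = cache_id
                          WHERE cache_id IS NULL""").fetchall()

    for i in xrange(0, len(stale), 1024):
        batch = [(pid, fake_views(pid)) for (pid,) in stale[i:i + 1024]]
        db.executemany("INSERT OR REPLACE INTO cache (cache_id, cache_views)"
                       " VALUES (?, ?)", batch)

    print db.execute("SELECT COUNT(*) FROM cache").fetchone()[0]  # -> 3000

Chunking at 1024 keeps each executemany() batch small; the patch drains the
SELECT with fetchall() first because reusing a cursor for writes discards its
pending result set.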