Implement pageviews API requests.

Ben Kurtovic committed 9 years ago · branch master · commit bfcd4cbaab
4 changed files with 83 additions and 14 deletions
  1. +0   -0
  2. +2   -2    schema.sql
  3. +1   -1    templates/index.mako
  4. +80  -11   tif/calc.py

schema.sql  +2 -2

@@ -3,7 +3,7 @@ CREATE DATABASE `sXXXXX__tif`;
 DROP TABLE IF EXISTS `cache`;
 CREATE TABLE `cache` (
     `cache_id` INT(10) UNSIGNED NOT NULL,
-    `cache_views` INT UNSIGNED NOT NULL,
-    `cache_time` TIMESTAMP NOT NULL,
+    `cache_views` FLOAT NOT NULL,
+    `cache_time` TIMESTAMP NOT NULL DEFAULT '0000-00-00 00:00:00',
     PRIMARY KEY (`cache_id`)
 ) ENGINE=InnoDB;
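
A note on the column change: cache_views now holds an average daily pageview count (see _get_avg_views in tif/calc.py below), and a 30-day average is fractional, hence the switch from INT to FLOAT. A minimal sketch of the kind of value that lands in the column; the page ID and view total are invented for illustration, and the actual upsert is query2 in tif/calc.py:

    # Hypothetical numbers: 223 views over a 30-day window for page ID 12345.
    total_views = 223
    days = 30
    avg_views = total_views / float(days)  # 7.43..., which an INT column would truncate

    # Parameter tuple that would feed the ON DUPLICATE KEY upsert in tif/calc.py
    # (oursql-style "?" placeholders: cache_id, cache_views, cache_views again).
    params = (12345, avg_views, avg_views)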

templates/index.mako  +1 -1

@@ -24,7 +24,7 @@
     <table>
         <tr>
             <td>TIF</td>
-            <td>${result["tif"]}</td>
+            <td>${"{0:.2f}".format(result["tif"])}</td>
         </tr>
         <tr>
             <td>Transclusions</td>
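
The template change is purely presentational: result["tif"] is a float after this commit, so the page now rounds it to two decimal places. The equivalent formatting outside of Mako:

    tif = 601.0 / 30
    formatted = "{0:.2f}".format(tif)  # "20.03" rather than the full float repr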


tif/calc.py  +80 -11

@@ -1,7 +1,12 @@
 # -*- coding: utf-8 -*-

-from datetime import datetime
+from datetime import datetime, timedelta
+from gzip import GzipFile
+from json import loads
 from os.path import expanduser
+from StringIO import StringIO
+from urllib import quote
+from urllib2 import URLError

 from earwigbot.bot import Bot
 from oursql import connect
@@ -15,16 +20,80 @@ def _get_db(bot):
args["autoreconnect"] = True
return connect(**args)

def _compute_stats(page, db):
with db.cursor() as cursor:
query = """SELECT COUNT(*) FROM templatelinks WHERE tl_title = ?
AND tl_namespace = 10 AND tl_from_namespace = 0"""
cursor.execute(query, (page.title.replace(" ", "_"),))
transclusions = cursor.fetchall()[0][0]
def _count_transclusions(cursor, title):
query = """SELECT COUNT(*)
FROM templatelinks
WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0"""
cursor.execute(query, (title,))
return cursor.fetchall()[0][0]

def _count_views(cursor, title, dbname):
query = """SELECT SUM(cache_views), MIN(cache_time)
FROM templatelinks
INNER JOIN {0}.cache ON tl_from = cache_id
WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0"""
cursor.execute(query.format(dbname), (title,))
return cursor.fetchall()[0]

def _get_avg_views(site, article):
url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
"{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
days = 30
slug = quote(article.replace(" ", "_"), safe="")
+    start = (datetime.utcnow() - timedelta(days=days)).strftime("%Y%m%d")
+    end = datetime.utcnow().strftime("%Y%m%d")
+    query = url.format(site.lang, site.project, slug, start, end)

+    try:
+        response = site._opener.open(query)  # We're terrible
+    except URLError:
+        return None

+    result = response.read()
+    if response.headers.get("Content-Encoding") == "gzip":
+        stream = StringIO(result)
+        gzipper = GzipFile(fileobj=stream)
+        result = gzipper.read()

+    try:
+        res = loads(result)
+    except ValueError:
+        return None

-    # TODO
-    tif = 0.0
-    cache_time = None
+    if "items" not in res:
+        return None
+    return sum(item["views"] for item in res["items"]) / float(days)

+def _update_views(cursor, site, title, dbname):
+    cache_life = "7 DAY"
query1 = """SELECT tl_from
FROM templatelinks
LEFT JOIN {0}.cache ON tl_from = cache_id
WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0
AND cache_id IS NULL
OR DATE_SUB(NOW(), INTERVAL {1}) > cache_time"""
query2 = """INSERT INTO {0}.cache (cache_id, cache_views, cache_time)
VALUES (?, ?, NOW()) ON DUPLICATE KEY
UPDATE cache_views = ?, cache_time = NOW()""".format(dbname)

cursor.execute(query1.format(dbname, cache_life), (title,))
while True:
titles = cursor.fetchmany(1024)
if not titles:
break

viewcounts = [(t, _get_avg_views(site, t)) for t in titles]
parambatch = [(t, v, v) for (t, v) in viewcounts if v is not None]
cursor.executemany(query2, parambatch)

def _compute_stats(bot, db, page):
dbname = bot.config.wiki["_tifSQL"]["db"]
title = page.title.replace(" ", "_")

with db.cursor() as cursor:
transclusions = _count_transclusions(cursor, title)
_update_views(cursor, page.site, title, dbname)
tif, cache_time = _count_views(cursor, title, dbname)

return tif, transclusions, cache_time

@@ -48,7 +117,7 @@ def calculate_tif(title):
result["error"] = "no page"
return result

tif, transclusions, cache_time = _compute_stats(page, db)
tif, transclusions, cache_time = _compute_stats(bot, db, page)

result["tif"] = tif
result["transclusions"] = transclusions


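
Putting the pieces together: _update_views caches each transcluding page's 30-day average daily views (refreshing entries older than 7 days), _count_views sums those cached averages for the template, and that sum is the TIF rendered by templates/index.mako. A tiny worked example with invented numbers:

    # Hypothetical: a template transcluded on three mainspace pages whose cached
    # 30-day average daily views are 12.5, 0.3, and 7.2.
    cached_views = [12.5, 0.3, 7.2]

    tif = sum(cached_views)            # 20.0, rendered as "20.00" by index.mako
    transclusions = len(cached_views)  # what _count_transclusions reports: 3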