Ver código fonte

Implement pageviews API requests.

master
Ben Kurtovic 8 anos atrás
pai
commit
bfcd4cbaab
4 arquivos alterados com 83 adições e 14 exclusões
  1. +0
    -0
     
  2. +2
    -2
      schema.sql
  3. +1
    -1
      templates/index.mako
  4. +80
    -11
      tif/calc.py

+ 0
- 0
Ver arquivo


+ 2
- 2
schema.sql Ver arquivo

@@ -3,7 +3,7 @@ CREATE DATABASE `sXXXXX__tif`;
DROP TABLE IF EXISTS `cache`; DROP TABLE IF EXISTS `cache`;
CREATE TABLE `cache` ( CREATE TABLE `cache` (
`cache_id` INT(10) UNSIGNED NOT NULL, `cache_id` INT(10) UNSIGNED NOT NULL,
`cache_views` INT UNSIGNED NOT NULL,
`cache_time` TIMESTAMP NOT NULL,
`cache_views` FLOAT NOT NULL,
`cache_time` TIMESTAMP NOT NULL DEFAULT '0000-00-00 00:00:00',
PRIMARY KEY (`cache_id`) PRIMARY KEY (`cache_id`)
) ENGINE=InnoDB; ) ENGINE=InnoDB;

+ 1
- 1
templates/index.mako Ver arquivo

@@ -24,7 +24,7 @@
<table> <table>
<tr> <tr>
<td>TIF</td> <td>TIF</td>
<td>${result["tif"]}</td>
<td>${"{0:.2f}".format(result["tif"])}</td>
</tr> </tr>
<tr> <tr>
<td>Transclusions</td> <td>Transclusions</td>


+ 80
- 11
tif/calc.py Ver arquivo

@@ -1,7 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-


from datetime import datetime
from datetime import datetime, timedelta
from gzip import GzipFile
from json import loads
from os.path import expanduser from os.path import expanduser
from StringIO import StringIO
from urllib import quote
from urllib2 import URLError


from earwigbot.bot import Bot from earwigbot.bot import Bot
from oursql import connect from oursql import connect
@@ -15,16 +20,80 @@ def _get_db(bot):
args["autoreconnect"] = True args["autoreconnect"] = True
return connect(**args) return connect(**args)


def _compute_stats(page, db):
with db.cursor() as cursor:
query = """SELECT COUNT(*) FROM templatelinks WHERE tl_title = ?
AND tl_namespace = 10 AND tl_from_namespace = 0"""
cursor.execute(query, (page.title.replace(" ", "_"),))
transclusions = cursor.fetchall()[0][0]
def _count_transclusions(cursor, title):
query = """SELECT COUNT(*)
FROM templatelinks
WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0"""
cursor.execute(query, (title,))
return cursor.fetchall()[0][0]

def _count_views(cursor, title, dbname):
query = """SELECT SUM(cache_views), MIN(cache_time)
FROM templatelinks
INNER JOIN {0}.cache ON tl_from = cache_id
WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0"""
cursor.execute(query.format(dbname), (title,))
return cursor.fetchall()[0]

def _get_avg_views(site, article):
    """Return the average daily pageviews of *article* over the last 30 days.

    Queries the Wikimedia REST pageviews API for per-day, all-access,
    user-agent=user counts. Returns None if the request fails, the body
    cannot be parsed as JSON, or the response carries no "items".
    """
    url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
           "{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
    days = 30
    slug = quote(article.replace(" ", "_"), safe="")
    # The API wants YYYYMMDD timestamps with the *earlier* date first.
    # (The original used "%Y%M%D" -- minutes and mm/dd/yy -- and also
    # swapped start/end, so the requested range was always invalid.)
    end = datetime.utcnow()
    start = end - timedelta(days=days)
    query = url.format(site.lang, site.project, slug,
                       start.strftime("%Y%m%d"), end.strftime("%Y%m%d"))

    try:
        response = site._opener.open(query)  # We're terrible
    except URLError:
        return None

    result = response.read()
    if response.headers.get("Content-Encoding") == "gzip":
        stream = StringIO(result)
        gzipper = GzipFile(fileobj=stream)
        result = gzipper.read()

    try:
        res = loads(result)
    except ValueError:
        return None

    if "items" not in res:
        return None
    # Average over the full window, not just the days the API reported,
    # so missing days count as zero views.
    return sum(item["views"] for item in res["items"]) / float(days)

def _update_views(cursor, site, title, dbname):
cache_life = "7 DAY"
query1 = """SELECT tl_from
FROM templatelinks
LEFT JOIN {0}.cache ON tl_from = cache_id
WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0
AND cache_id IS NULL
OR DATE_SUB(NOW(), INTERVAL {1}) > cache_time"""
query2 = """INSERT INTO {0}.cache (cache_id, cache_views, cache_time)
VALUES (?, ?, NOW()) ON DUPLICATE KEY
UPDATE cache_views = ?, cache_time = NOW()""".format(dbname)

cursor.execute(query1.format(dbname, cache_life), (title,))
while True:
titles = cursor.fetchmany(1024)
if not titles:
break

viewcounts = [(t, _get_avg_views(site, t)) for t in titles]
parambatch = [(t, v, v) for (t, v) in viewcounts if v is not None]
cursor.executemany(query2, parambatch)

def _compute_stats(bot, db, page):
    """Gather TIF statistics for *page*.

    Returns a (tif, transclusions, cache_time) tuple: the summed cached
    views, the mainspace transclusion count, and the oldest cache entry's
    timestamp.
    """
    dbname = bot.config.wiki["_tifSQL"]["db"]
    db_title = page.title.replace(" ", "_")

    with db.cursor() as cursor:
        count = _count_transclusions(cursor, db_title)
        _update_views(cursor, page.site, db_title, dbname)
        views, stamp = _count_views(cursor, db_title, dbname)

    return views, count, stamp


@@ -48,7 +117,7 @@ def calculate_tif(title):
result["error"] = "no page" result["error"] = "no page"
return result return result


tif, transclusions, cache_time = _compute_stats(page, db)
tif, transclusions, cache_time = _compute_stats(bot, db, page)


result["tif"] = tif result["tif"] = tif
result["transclusions"] = transclusions result["transclusions"] = transclusions


Carregando…
Cancelar
Salvar