A tool that evaluates high-risk Wikipedia templates https://tools.wmflabs.org/earwig-dev/tif

# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
from gzip import GzipFile
from json import loads
from os.path import expanduser
from StringIO import StringIO  # Python 2 only, like the rest of this module
from urllib import quote
from urllib2 import URLError

from earwigbot.bot import Bot
from oursql import connect

__all__ = ["calculate_tif"]

def _get_db(bot):
    """Open a connection to the tool's SQL database."""
    args = bot.config.wiki["_tifSQL"]
    args["read_default_file"] = expanduser("~/.my.cnf")
    args["autoping"] = True
    args["autoreconnect"] = True
    return connect(**args)
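
# A sketch of the expected bot.config.wiki["_tifSQL"] section (hypothetical
# values; everything here is passed straight through to oursql.connect(), and
# "db" is also reused by _compute_stats() to qualify the cache table):
#
#     _tifSQL:
#         host: enwiki.labsdb   # hypothetical hostname
#         db: enwiki_p          # hypothetical database name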

def _count_transclusions(cursor, title):
    """Count the mainspace pages that transclude the given template."""
    query = """SELECT COUNT(*)
        FROM templatelinks
        WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0"""
    cursor.execute(query, (title,))
    return cursor.fetchall()[0][0]

def _count_views(cursor, title, dbname):
    """Sum the cached average daily views over all of the template's
    mainspace transcluders; also return the oldest cache timestamp."""
    query = """SELECT SUM(cache_views), MIN(cache_time)
        FROM templatelinks
        INNER JOIN {0}.cache ON tl_from = cache_id
        WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0"""
    cursor.execute(query.format(dbname), (title,))
    return cursor.fetchall()[0]

def _get_avg_views(site, article):
    """Return the article's average daily views over the last 30 days, or
    None if the pageviews API is unreachable or returns unusable data."""
    url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
           "{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
    days = 30
    slug = quote(article.replace(" ", "_"), safe="")
    end = datetime.utcnow()
    start = end - timedelta(days=days)  # the API wants start <= end
    query = url.format(site.lang, site.project, slug,
                       start.strftime("%Y%m%d"), end.strftime("%Y%m%d"))
    try:
        response = site._opener.open(query)  # We're terrible
    except URLError:
        return None
    result = response.read()
    if response.headers.get("Content-Encoding") == "gzip":
        stream = StringIO(result)
        gzipper = GzipFile(fileobj=stream)
        result = gzipper.read()
    try:
        res = loads(result)
    except ValueError:
        return None
    if "items" not in res:
        return None
    return sum(item["views"] for item in res["items"]) / float(days)
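
# For reference, a trimmed sketch of the pageviews API's JSON response; the
# code above relies only on the "items" and "views" fields:
#
#     {"items": [{"project": "en.wikipedia", "article": "...",
#                 "granularity": "daily", "timestamp": "2016010100",
#                 "access": "all-access", "agent": "user", "views": 123},
#                ...]}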

def _update_views(cursor, site, title, dbname):
    """Refresh the cached view average of every transcluder whose cache
    entry is missing or older than the cache lifetime."""
    cache_life = "7 DAY"
    # tl_from is a page ID, but the pageviews API needs a title, so join
    # against the page table to fetch both at once.
    query1 = """SELECT tl_from, page_title
        FROM templatelinks
        INNER JOIN page ON tl_from = page_id
        LEFT JOIN {0}.cache ON tl_from = cache_id
        WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0
        AND (cache_id IS NULL
        OR DATE_SUB(NOW(), INTERVAL {1}) > cache_time)"""
    query2 = """INSERT INTO {0}.cache (cache_id, cache_views, cache_time)
        VALUES (?, ?, NOW()) ON DUPLICATE KEY
        UPDATE cache_views = ?, cache_time = NOW()""".format(dbname)
    cursor.execute(query1.format(dbname, cache_life), (title,))
    while True:
        rows = cursor.fetchmany(1024)
        if not rows:
            break
        viewcounts = [(pageid, _get_avg_views(site, ptitle))
                      for (pageid, ptitle) in rows]
        parambatch = [(p, v, v) for (p, v) in viewcounts if v is not None]
        cursor.executemany(query2, parambatch)
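
# A hypothetical definition of the cache table assumed by the queries above
# (only the column names and the unique key on cache_id, which the
# ON DUPLICATE KEY clause requires, are implied by the code):
#
#     CREATE TABLE cache (
#         cache_id    INT UNSIGNED NOT NULL PRIMARY KEY,  -- page ID
#         cache_views FLOAT NOT NULL,                     -- avg daily views
#         cache_time  DATETIME NOT NULL                   -- last refresh
#     );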

def _compute_stats(bot, db, page):
    """Return (tif, transclusions, cache_time) for the given template."""
    dbname = bot.config.wiki["_tifSQL"]["db"]
    title = page.title.replace(" ", "_")
    with db.cursor() as cursor:
        transclusions = _count_transclusions(cursor, title)
        _update_views(cursor, page.site, title, dbname)
        tif, cache_time = _count_views(cursor, title, dbname)
    return tif, transclusions, cache_time

def _format_time(cache_time):
    """Describe the age of the cache in human-readable terms."""
    formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
    diff = datetime.utcnow() - cache_time
    seconds = int(diff.total_seconds())  # .seconds alone would drop whole days
    if seconds > 3600:
        return formatter(seconds / 3600, "hour")
    if seconds > 60:
        return formatter(seconds / 60, "minute")
    return formatter(seconds, "second")

def calculate_tif(title):
    """Calculate a template's transclusion impact factor: the summed average
    daily pageviews of every mainspace page that transcludes it."""
    bot = Bot(".earwigbot")
    db = _get_db(bot)
    site = bot.wiki.get_site()
    page = site.get_page(title)

    result = {"title": title, "page": page}
    if page.exists != page.PAGE_EXISTS:
        result["error"] = "no page"
        return result

    tif, transclusions, cache_time = _compute_stats(bot, db, page)
    result["tif"] = tif
    result["transclusions"] = transclusions
    result["protection"] = page.protection

    if cache_time:
        result["cache_time"] = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result["cache_ago"] = _format_time(cache_time)
    return result
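
# A minimal usage sketch (assumes a configured ".earwigbot" project and the
# database setup above; the template title below is hypothetical):
#
#     result = calculate_tif("Template:Citation needed")
#     if "error" not in result:
#         print "%.1f daily views across %d transclusions" % (
#             result["tif"], result["transclusions"])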