A tool that evaluates high-risk Wikipedia templates https://tools.wmflabs.org/earwig-dev/tif
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

127 lines
4.1 KiB

  1. # -*- coding: utf-8 -*-
  2. from datetime import datetime, timedelta
  3. from gzip import GzipFile
  4. from json import loads
  5. from os.path import expanduser
  6. from StringIO import StringIO
  7. from urllib import quote
  8. from urllib2 import URLError
  9. from earwigbot.bot import Bot
  10. from oursql import connect
  # Names exported by ``from <module> import *``.
  __all__ = ["calculate_tif"]
  # Database name substituted into the ``{0}.templatelinks`` table references.
  SITE_DB = "enwiki_p"
  def _get_db(bot):
      """Open a connection to the tool's SQL database.

      Connection arguments come from the ``_tifSQL`` entry of the bot's wiki
      configuration.  On top of those, the MySQL defaults file ``~/.my.cnf``
      is used and ``autoping`` / ``autoreconnect`` are switched on.

      :param bot: object whose ``config.wiki`` mapping has a ``_tifSQL`` entry.
      :returns: whatever ``connect`` returns for the assembled arguments.
      """
      # Work on a copy so repeated calls never mutate the shared bot config.
      args = dict(bot.config.wiki["_tifSQL"])
      args["read_default_file"] = expanduser("~/.my.cnf")
      args["autoping"] = True
      args["autoreconnect"] = True
      return connect(**args)
  def _count_transclusions(cursor, title):
      """Count the ``templatelinks`` rows that point at the template *title*.

      Only rows with ``tl_namespace = 10`` and ``tl_from_namespace = 0`` are
      counted.

      :param cursor: database cursor used to run the query.
      :param title: value matched against ``tl_title``.
      :returns: the ``COUNT(*)`` value taken from the first result row.
      """
      sql = """SELECT COUNT(*)
          FROM {0}.templatelinks
          WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0"""
      statement = sql.format(SITE_DB)
      cursor.execute(statement, (title,))
      rows = cursor.fetchall()
      first_row = rows[0]
      return first_row[0]
  def _count_views(cursor, title):
      """Sum the cached view counts of the pages linked to template *title*.

      Joins ``templatelinks`` to the ``cache`` table on the linking page's ID
      and aggregates over rows with ``tl_namespace = 10`` and
      ``tl_from_namespace = 0``.

      :param cursor: database cursor used to run the query.
      :param title: value matched against ``tl_title``.
      :returns: the first result row, ``(SUM(cache_views), MIN(cache_time))``.
      """
      sql = """SELECT SUM(cache_views), MIN(cache_time)
          FROM {0}.templatelinks
          INNER JOIN cache ON tl_from = cache_id
          WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0"""
      statement = sql.format(SITE_DB)
      cursor.execute(statement, (title,))
      rows = cursor.fetchall()
      return rows[0]
  def _get_avg_views(site, article):
      """Return the average daily pageviews of *article* over the last 30 days.

      The Wikimedia REST pageviews endpoint is queried through the site's
      opener for daily, all-access, user-only counts.

      :param site: object providing ``lang``, ``project`` and ``_opener``.
      :param article: page title; spaces are turned into underscores.
      :returns: total views divided by 30 as a float, or ``None`` when the
          request fails, the body is not valid JSON, or it has no "items".
      """
      url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
             "{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
      days = 30
      slug = quote(article.replace(" ", "_"), safe="")

      # The endpoint takes YYYYMMDD stamps with the older date first.  The
      # directives must be lowercase %m/%d: %M is minutes and %D is not a
      # portable strftime directive.
      now = datetime.utcnow()
      start = (now - timedelta(days=days)).strftime("%Y%m%d")
      end = now.strftime("%Y%m%d")
      query = url.format(site.lang, site.project, slug, start, end)

      try:
          response = site._opener.open(query)  # We're terrible
      except URLError:
          return None

      result = response.read()
      if response.headers.get("Content-Encoding") == "gzip":
          # Transparently decompress a gzip-encoded body.
          stream = StringIO(result)
          gzipper = GzipFile(fileobj=stream)
          result = gzipper.read()

      try:
          res = loads(result)
      except ValueError:
          return None

      if "items" not in res:
          return None
      return sum(item["views"] for item in res["items"]) / float(days)
  def _update_views(cursor, site, title):
      """Refresh cached view counts for the pages linked to template *title*.

      Selects linking pages that have no ``cache`` row, or whose row is older
      than seven days, fetches each page's average views and upserts the
      value into ``cache``.  Pages whose view lookup returns ``None`` are
      skipped.

      :param cursor: database cursor used for both the select and the upsert.
      :param site: site object handed on to ``_get_avg_views``.
      :param title: value matched against ``tl_title``.
      """
      cache_life = "7 DAY"
      # The page title is selected as well as the ID: the pageviews lookup
      # needs a title, while the cache table is keyed by page ID.
      query1 = """SELECT tl_from, page_title
          FROM {0}.templatelinks
          INNER JOIN {0}.page ON tl_from = page_id
          LEFT JOIN cache ON tl_from = cache_id
          WHERE tl_title = ? AND tl_namespace = 10 AND tl_from_namespace = 0 AND
          (cache_id IS NULL OR cache_time < DATE_SUB(NOW(), INTERVAL {1}))"""
      query2 = """INSERT INTO cache (cache_id, cache_views, cache_time)
          VALUES (?, ?, NOW()) ON DUPLICATE KEY
          UPDATE cache_views = ?, cache_time = NOW()"""
      cursor.execute(query1.format(SITE_DB, cache_life), (title,))

      # Drain the result set before writing: running query2 on this same
      # cursor would throw away any rows of query1 not yet fetched.
      stale = cursor.fetchall()
      batch_size = 1024
      for offset in range(0, len(stale), batch_size):
          parambatch = []
          for page_id, page_title in stale[offset:offset + batch_size]:
              views = _get_avg_views(site, page_title)
              if views is not None:
                  parambatch.append((page_id, views, views))
          if parambatch:
              cursor.executemany(query2, parambatch)
  def _compute_stats(db, page):
      """Gather the transclusion count and view statistics for *page*.

      The page title is converted to its underscore form, then a single
      cursor is used to count transclusions, refresh the view cache and read
      the aggregated views.

      :param db: connection whose ``cursor()`` works as a context manager.
      :param page: object providing ``title`` and ``site``.
      :returns: ``(tif, transclusions, cache_time)``.
      """
      db_title = page.title.replace(" ", "_")
      with db.cursor() as cur:
          link_count = _count_transclusions(cur, db_title)
          _update_views(cur, page.site, db_title)
          view_total, oldest_cache = _count_views(cur, db_title)
      return view_total, link_count, oldest_cache
  def _format_time(cache_time):
      """Describe how long ago *cache_time* was, e.g. ``"3 hours"``.

      :param cache_time: naive ``datetime`` compared with ``datetime.utcnow()``.
      :returns: the age as a whole number of the largest unit it exceeds
          (days, hours, minutes, else seconds), pluralised unless it is 1.
      """
      def formatter(n, w):
          return "{0} {1}{2}".format(n, w, "" if n == 1 else "s")

      diff = datetime.utcnow() - cache_time
      # timedelta.seconds is only the sub-day remainder, so fold the days in;
      # clamp at zero in case cache_time lies slightly in the future.
      elapsed = max(0, diff.days * 86400 + diff.seconds)

      # Floor division keeps the counts integral on Python 2 and 3 alike.
      if elapsed > 86400:
          return formatter(elapsed // 86400, "day")
      if elapsed > 3600:
          return formatter(elapsed // 3600, "hour")
      if elapsed > 60:
          return formatter(elapsed // 60, "minute")
      return formatter(elapsed, "second")
  def calculate_tif(title):
      """Compute the statistics of the template page *title*.

      :param title: title of the page to look up on the bot's default site.
      :returns: a dict that always has ``"title"`` and ``"page"``.  If the
          page does not exist it also has ``"error": "no page"`` and nothing
          else.  Otherwise it has ``"tif"``, ``"transclusions"`` and
          ``"protection"``, plus ``"cache_time"`` and ``"cache_ago"`` when a
          cache timestamp is available.
      """
      bot = Bot(".earwigbot")
      site = bot.wiki.get_site()
      page = site.get_page(title)
      result = {"title": title, "page": page}

      if page.exists != page.PAGE_EXISTS:
          result["error"] = "no page"
          return result

      # Connect only once the page is known to exist, and always release the
      # connection so repeated calls do not leak database handles.
      db = _get_db(bot)
      try:
          tif, transclusions, cache_time = _compute_stats(db, page)
      finally:
          db.close()

      result["tif"] = tif
      result["transclusions"] = transclusions
      result["protection"] = page.protection
      if cache_time:
          result["cache_time"] = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
          result["cache_ago"] = _format_time(cache_time)
      return result