A tool that evaluates high-risk Wikipedia templates https://tools.wmflabs.org/earwig-dev/tif
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

132 lines
4.4 KiB

  1. # -*- coding: utf-8 -*-
  2. from datetime import datetime, timedelta
  3. from gzip import GzipFile
  4. from json import loads
  5. from os.path import expanduser
  6. from StringIO import StringIO
  7. from urllib import quote
  8. from urllib2 import URLError
  9. from earwigbot.bot import Bot
  10. from oursql import connect
# Names exported by ``from <module> import *``.
__all__ = ["calculate_tif"]

# Database name substituted for the ``{0}`` prefix in this module's SQL queries.
SITE_DB = "enwiki_p"
  13. def _get_db(bot):
  14. args = bot.config.wiki["_tifSQL"]
  15. args["read_default_file"] = expanduser("~/.my.cnf")
  16. args["autoping"] = True
  17. args["autoreconnect"] = True
  18. return connect(**args)
  19. def _count_transclusions(cursor, title, ns):
  20. query = """SELECT COUNT(*)
  21. FROM {0}.templatelinks
  22. WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0"""
  23. cursor.execute(query.format(SITE_DB), (title, ns))
  24. return cursor.fetchall()[0][0]
  25. def _count_views(cursor, title, ns):
  26. query = """SELECT SUM(cache_views), MIN(cache_time)
  27. FROM {0}.templatelinks
  28. INNER JOIN cache ON tl_from = cache_id
  29. WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0"""
  30. cursor.execute(query.format(SITE_DB), (title, ns))
  31. return cursor.fetchall()[0]
  32. def _get_avg_views(site, article):
  33. url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
  34. "{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
  35. days = 30
  36. slug = quote(article, safe="")
  37. start = (datetime.utcnow() - timedelta(days=days)).strftime("%Y%m%d")
  38. end = datetime.utcnow().strftime("%Y%m%d")
  39. query = url.format(site.lang, site.project, slug, start, end)
  40. try:
  41. response = site._opener.open(query) # We're terrible
  42. except URLError:
  43. return None
  44. result = response.read()
  45. if response.headers.get("Content-Encoding") == "gzip":
  46. stream = StringIO(result)
  47. gzipper = GzipFile(fileobj=stream)
  48. result = gzipper.read()
  49. try:
  50. res = loads(result)
  51. except ValueError:
  52. return None
  53. if "items" not in res:
  54. return None
  55. total_views = sum(item["views"] for item in res["items"])
  56. return total_views / (float(days) * 24 * 60)
def _update_views(cursor, site, title, ns):
    """Refresh cached view counts for pages that link to a template.

    Selects linking pages from namespace 0 that have no ``cache`` row or
    whose ``cache_time`` is older than ``cache_life``, looks up each page's
    average views with ``_get_avg_views()``, and upserts the results into
    the ``cache`` table. Pages whose lookup returns ``None`` are skipped.

    :param cursor: database cursor used for both the select and the upserts.
    :param site: passed through to ``_get_avg_views()``.
    :param title: value matched against ``tl_title``.
    :param ns: value matched against ``tl_namespace``.
    """
    # Interpolated into ``INTERVAL {1}`` of query1.
    cache_life = "7 DAY"
    # page is LEFT JOINed, so page_title may come back as NULL.
    query1 = """SELECT tl_from, page_title
                FROM {0}.templatelinks
                LEFT JOIN {0}.page ON tl_from = page_id
                LEFT JOIN cache ON tl_from = cache_id
                WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0 AND
                (cache_id IS NULL OR cache_time < DATE_SUB(NOW(), INTERVAL {1}))"""
    # Upsert: the view count is bound twice (INSERT value and UPDATE value).
    query2 = """INSERT INTO cache (cache_id, cache_views, cache_time)
                VALUES (?, ?, NOW()) ON DUPLICATE KEY
                UPDATE cache_views = ?, cache_time = NOW()"""
    cursor.execute(query1.format(SITE_DB, cache_life), (title, ns))
    while True:
        # Process the stale pages in batches of up to 1024 rows.
        titles = cursor.fetchmany(1024)
        if not titles:
            break
        viewcounts = [(pageid, _get_avg_views(site, name))
                      for (pageid, name) in titles]
        parambatch = [(i, v, v) for (i, v) in viewcounts if v is not None]
        # NOTE(review): this runs on the same cursor that fetchmany() is
        # still reading from; presumably that discards the remaining rows of
        # query1, limiting each call to one batch - confirm against the
        # driver's cursor semantics.
        cursor.executemany(query2, parambatch)
  77. def _compute_stats(db, page):
  78. title = page.title.split(":", 1)[-1].replace(" ", "_")
  79. title = title[0].upper() + title[1:]
  80. with db.cursor() as cursor:
  81. transclusions = _count_transclusions(cursor, title, page.namespace)
  82. _update_views(cursor, page.site, title, page.namespace)
  83. tif, cache_time = _count_views(cursor, title, page.namespace)
  84. return tif, transclusions, cache_time
  85. def _format_time(cache_time):
  86. formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
  87. diff = datetime.utcnow() - cache_time
  88. if diff.seconds > 3600:
  89. return formatter(diff.seconds / 3600, "hour")
  90. if diff.seconds > 60:
  91. return formatter(diff.seconds / 60, "minute")
  92. return formatter(diff.seconds, "second")
  93. def calculate_tif(title):
  94. bot = Bot(".earwigbot")
  95. db = _get_db(bot)
  96. site = bot.wiki.get_site()
  97. page = site.get_page(title)
  98. result = {"title": title, "page": page}
  99. if page.exists != page.PAGE_EXISTS:
  100. result["error"] = "no page"
  101. return result
  102. tif, transclusions, cache_time = _compute_stats(db, page)
  103. result["tif"] = tif
  104. result["transclusions"] = transclusions
  105. result["protection"] = page.protection
  106. if cache_time:
  107. result["cache_time"] = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
  108. result["cache_ago"] = _format_time(cache_time)
  109. return result