A tool that evaluates high-risk Wikipedia templates https://tools.wmflabs.org/earwig-dev/tif
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

131 lines
4.3 KiB

  1. # -*- coding: utf-8 -*-
  2. from datetime import datetime, timedelta
  3. from gzip import GzipFile
  4. from json import loads
  5. from os.path import expanduser
  6. from StringIO import StringIO
  7. from urllib import quote
  8. from urllib2 import URLError
  9. from earwigbot.bot import Bot
  10. from oursql import connect
  # Names exported by a star-import of this module; calculate_tif is the only
  # public entry point.
  11. __all__ = ["calculate_tif"]
  # Database name used to qualify the templatelinks table in the SQL queries
  # below (formatted in as ``{0}.templatelinks``).
  12. SITE_DB = "enwiki_p"
  def _get_db(bot):
      """Open a database connection from the bot's ``_tifSQL`` settings.

      The settings are copied before the extra connection options are added,
      so the bot's shared configuration dict is left untouched.

      :param bot: object whose ``config.wiki["_tifSQL"]`` is a mapping of
          keyword arguments for ``connect``.
      :returns: the connection object returned by ``connect``.
      """
      # Copy first: writing into the config mapping itself would leak these
      # keys into every later reader of bot.config.wiki["_tifSQL"].
      args = dict(bot.config.wiki["_tifSQL"])
      args["read_default_file"] = expanduser("~/.my.cnf")
      args["autoping"] = True
      args["autoreconnect"] = True
      return connect(**args)
  def _count_transclusions(cursor, title, ns):
      """Count the namespace-0 pages that link to the given template.

      :param cursor: database cursor used to run the query.
      :param title: value matched against ``tl_title``.
      :param ns: value matched against ``tl_namespace``.
      :returns: the ``COUNT(*)`` value from the first result row.
      """
      sql = """SELECT COUNT(*)
          FROM {0}.templatelinks
          WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0"""
      params = (title, ns)
      cursor.execute(sql.format(SITE_DB), params)
      rows = cursor.fetchall()
      first_row = rows[0]
      return first_row[0]
  def _count_views(cursor, title, ns):
      """Sum the cached views of namespace-0 pages linking to a template.

      :param cursor: database cursor used to run the query.
      :param title: value matched against ``tl_title``.
      :param ns: value matched against ``tl_namespace``.
      :returns: the first result row, i.e. ``(SUM(cache_views),
          MIN(cache_time))``.
      """
      sql = """SELECT SUM(cache_views), MIN(cache_time)
          FROM {0}.templatelinks
          INNER JOIN cache ON tl_from = cache_id
          WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0"""
      params = (title, ns)
      cursor.execute(sql.format(SITE_DB), params)
      rows = cursor.fetchall()
      return rows[0]
  def _get_avg_views(site, article, days=30):
      """Return the average daily user pageviews of *article*.

      Requests the Wikimedia REST pageviews endpoint for the range from
      ``days`` days ago until now (UTC) and divides the summed ``views`` of
      the returned items by ``days``.

      :param site: object providing ``lang``, ``project`` and an ``_opener``
          whose ``open(url)`` returns a file-like response with ``read()``,
          ``headers`` and ``close()``.
      :param article: page title; it is fully percent-encoded, slashes
          included.
      :param days: length of the look-back window in days (default 30).
      :returns: average views per day as a float, or ``None`` if the request
          fails or the response body cannot be decoded.
      :raises ValueError: if ``days`` is not positive.
      """
      if days <= 0:
          raise ValueError("days must be positive")

      url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
             "{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
      slug = quote(article, safe="")
      # Sample the clock once so both ends of the range share one instant.
      now = datetime.utcnow()
      start = (now - timedelta(days=days)).strftime("%Y%m%d")
      end = now.strftime("%Y%m%d")
      query = url.format(site.lang, site.project, slug, start, end)

      try:
          response = site._opener.open(query)  # We're terrible
      except URLError:
          return None
      try:
          result = response.read()
          encoding = response.headers.get("Content-Encoding")
      finally:
          # Release the underlying connection even when reading fails.
          response.close()

      if encoding == "gzip":
          try:
              result = GzipFile(fileobj=StringIO(result)).read()
          except (IOError, EOFError):
              # Corrupt or truncated gzip body: treat like any other bad reply.
              return None

      try:
          res = loads(result)
      except ValueError:
          return None
      # Error replies and unexpected payloads carry no "items" list.
      if not isinstance(res, dict) or "items" not in res:
          return None
      return sum(item["views"] for item in res["items"]) / float(days)
  def _update_views(cursor, site, title, ns):
      """Refresh stale cached view counts for pages linking to a template.

      Selects the namespace-0 pages linking to the template whose ``cache``
      row is missing or older than the cache lifetime, looks up each page's
      average views, and upserts the results into ``cache``.

      :param cursor: database cursor used for both the SELECT and the upserts.
      :param site: site object passed through to ``_get_avg_views``.
      :param title: value matched against ``tl_title``.
      :param ns: value matched against ``tl_namespace``.
      """
      cache_life = "7 DAY"
      batch_size = 1024
      query1 = """SELECT tl_from, page_title
          FROM {0}.templatelinks
          LEFT JOIN page ON tl_from = page_id
          LEFT JOIN cache ON tl_from = cache_id
          WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0 AND
          (cache_id IS NULL OR cache_time < DATE_SUB(NOW(), INTERVAL {1}))"""
      query2 = """INSERT INTO cache (cache_id, cache_views, cache_time)
          VALUES (?, ?, NOW()) ON DUPLICATE KEY
          UPDATE cache_views = ?, cache_time = NOW()"""

      cursor.execute(query1.format(SITE_DB, cache_life), (title, ns))
      # Drain the SELECT before writing. The upserts run on this same cursor,
      # and executing a new statement replaces the pending result set, so
      # interleaving fetchmany() with executemany() loses every row after the
      # first batch.
      stale = cursor.fetchall()

      for offset in range(0, len(stale), batch_size):
          parambatch = []
          for pageid, name in stale[offset:offset + batch_size]:
              if name is None:
                  # The LEFT JOIN found no page row, so there is no title to
                  # look up.
                  continue
              views = _get_avg_views(site, name)
              if views is not None:
                  # The value appears twice: once for INSERT, once for UPDATE.
                  parambatch.append((pageid, views, views))
          if parambatch:
              cursor.executemany(query2, parambatch)
  def _compute_stats(db, page):
      """Gather the transclusion count and cached view total for *page*.

      The page title is reduced to the text after its first colon (the whole
      title when there is none), spaces become underscores, and the first
      character is upper-cased before it is used in the queries.

      :param db: connection object providing ``cursor()`` as a context manager.
      :param page: object providing ``title``, ``namespace`` and ``site``.
      :returns: tuple ``(tif, transclusions, cache_time)``.
      """
      _, colon, remainder = page.title.partition(":")
      name = remainder if colon else page.title
      name = name.replace(" ", "_")
      name = name[0].upper() + name[1:]

      with db.cursor() as cur:
          transclusions = _count_transclusions(cur, name, page.namespace)
          # Refresh stale cache rows before summing them.
          _update_views(cur, page.site, name, page.namespace)
          view_row = _count_views(cur, name, page.namespace)
          tif, cache_time = view_row
      return tif, transclusions, cache_time
  def _format_time(cache_time):
      """Describe how long ago *cache_time* was, in its largest whole unit.

      :param cache_time: naive ``datetime``; it is compared against
          ``datetime.utcnow()``.
      :returns: a string such as ``"2 days"``, ``"1 hour"`` or
          ``"30 seconds"``. Timestamps in the future are reported as
          ``"0 seconds"``.
      """
      # total_seconds() includes the days component. timedelta.seconds alone
      # wraps around every 24 hours and would under-report older timestamps.
      elapsed = max(int((datetime.utcnow() - cache_time).total_seconds()), 0)

      for size, unit in ((86400, "day"), (3600, "hour"), (60, "minute")):
          if elapsed >= size:
              # Floor division keeps the count an integer on Python 2 and 3.
              count = elapsed // size
              break
      else:
          count, unit = elapsed, "second"
      return "{0} {1}{2}".format(count, unit, "" if count == 1 else "s")
  def calculate_tif(title):
      """Compute the transclusion statistics for the page named *title*.

      :param title: title of the page to evaluate.
      :returns: dict with ``"title"`` and ``"page"``. If the page does not
          exist it also has ``"error": "no page"``. Otherwise it has
          ``"tif"``, ``"transclusions"`` and ``"protection"``, plus
          ``"cache_time"`` and ``"cache_ago"`` when a cache timestamp is
          available.
      """
      bot = Bot(".earwigbot")
      site = bot.wiki.get_site()
      page = site.get_page(title)
      result = {"title": title, "page": page}

      if page.exists != page.PAGE_EXISTS:
          result["error"] = "no page"
          return result

      # Connect only once the page is known to exist, and always release the
      # connection, including when the statistics queries raise.
      db = _get_db(bot)
      try:
          tif, transclusions, cache_time = _compute_stats(db, page)
      finally:
          db.close()

      result["tif"] = tif
      result["transclusions"] = transclusions
      result["protection"] = page.protection
      if cache_time:
          result["cache_time"] = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
          result["cache_ago"] = _format_time(cache_time)
      return result