A tool that evaluates high-risk Wikipedia templates https://tools.wmflabs.org/earwig-dev/tif
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

142 lines
4.7 KiB

  1. # -*- coding: utf-8 -*-
  2. from datetime import datetime, timedelta
  3. from gzip import GzipFile
  4. from json import loads
  5. from os.path import expanduser
  6. from StringIO import StringIO
  7. from urllib import quote
  8. from urllib2 import URLError
  9. from earwigbot.bot import Bot
  10. from oursql import connect
# Public API: ``calculate_tif`` is the only name exported by
# ``from <module> import *``.
__all__ = ["calculate_tif"]

# Database name substituted into the ``{0}.templatelinks`` and ``{0}.page``
# table references of the SQL queries in this module.
SITE_DB = "enwiki_p"
  13. def _get_db(bot):
  14. args = bot.config.wiki["_tifSQL"]
  15. args["read_default_file"] = expanduser("~/.my.cnf")
  16. args["autoping"] = True
  17. args["autoreconnect"] = True
  18. return connect(**args)
  19. def _count_transclusions(cursor, title, ns):
  20. query = """SELECT COUNT(*)
  21. FROM {0}.templatelinks
  22. WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0"""
  23. cursor.execute(query.format(SITE_DB), (title, ns))
  24. return cursor.fetchall()[0][0]
  25. def _count_views(cursor, title, ns):
  26. query = """SELECT SUM(cache_views), MIN(cache_time)
  27. FROM {0}.templatelinks
  28. INNER JOIN cache ON tl_from = cache_id
  29. WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0"""
  30. cursor.execute(query.format(SITE_DB), (title, ns))
  31. return cursor.fetchall()[0]
  32. def _get_avg_views(site, article):
  33. url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
  34. "{0}.{1}/all-access/user/{2}/daily/{3}/{4}")
  35. days = 30
  36. slug = quote(article, safe="")
  37. start = (datetime.utcnow() - timedelta(days=days)).strftime("%Y%m%d")
  38. end = datetime.utcnow().strftime("%Y%m%d")
  39. query = url.format(site.lang, site.project, slug, start, end)
  40. try:
  41. response = site._opener.open(query) # We're terrible
  42. except URLError:
  43. return None
  44. result = response.read()
  45. if response.headers.get("Content-Encoding") == "gzip":
  46. stream = StringIO(result)
  47. gzipper = GzipFile(fileobj=stream)
  48. result = gzipper.read()
  49. try:
  50. res = loads(result)
  51. except ValueError:
  52. return None
  53. if "items" not in res:
  54. return None
  55. total_views = sum(item["views"] for item in res["items"])
  56. return total_views / (float(days) * 24 * 60)
  57. def _update_views(cursor, site, title, ns):
  58. cache_life = "7 DAY"
  59. query1 = """DELETE FROM cache
  60. WHERE cache_time < DATE_SUB(NOW(), INTERVAL {0})"""
  61. query2 = """SELECT tl_from, page_title
  62. FROM {0}.templatelinks
  63. LEFT JOIN {0}.page ON tl_from = page_id
  64. LEFT JOIN cache ON tl_from = cache_id
  65. WHERE tl_title = ? AND tl_namespace = ? AND tl_from_namespace = 0
  66. AND cache_id IS NULL"""
  67. query3 = """INSERT INTO cache (cache_id, cache_views, cache_time)
  68. VALUES (?, ?, NOW()) ON DUPLICATE KEY
  69. UPDATE cache_views = ?, cache_time = NOW()"""
  70. cursor.execute(query1.format(cache_life))
  71. cursor.execute(query2.format(SITE_DB), (title, ns))
  72. while True:
  73. titles = cursor.fetchmany(1024)
  74. if not titles:
  75. break
  76. viewcounts = [(pageid, _get_avg_views(site, name))
  77. for (pageid, name) in titles]
  78. parambatch = [(i, v, v) for (i, v) in viewcounts if v is not None]
  79. cursor.executemany(query3, parambatch)
  80. def _compute_stats(db, page):
  81. title = page.title.split(":", 1)[-1].replace(" ", "_")
  82. title = title[0].upper() + title[1:]
  83. with db.cursor() as cursor:
  84. transclusions = _count_transclusions(cursor, title, page.namespace)
  85. _update_views(cursor, page.site, title, page.namespace)
  86. tif, cache_time = _count_views(cursor, title, page.namespace)
  87. return tif, transclusions, cache_time
  88. def _format_time(cache_time):
  89. formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
  90. diff = datetime.utcnow() - cache_time
  91. total_seconds = diff.days * 86400 + diff.seconds
  92. if total_seconds > 86400:
  93. return formatter(total_seconds / 86400, "day")
  94. if total_seconds > 3600:
  95. return formatter(total_seconds / 3600, "hour")
  96. if total_seconds > 60:
  97. return formatter(total_seconds / 60, "minute")
  98. return formatter(total_seconds, "second")
  99. def _get_protection(page):
  100. edit = [prot for prot in page.protection if prot["type"] == "edit"]
  101. return edit[0] if edit else None
  102. def calculate_tif(title):
  103. bot = Bot(".earwigbot")
  104. db = _get_db(bot)
  105. site = bot.wiki.get_site()
  106. page = site.get_page(title)
  107. result = {"title": title, "page": page}
  108. if page.exists != page.PAGE_EXISTS:
  109. result["error"] = "no page"
  110. return result
  111. tif, transclusions, cache_time = _compute_stats(db, page)
  112. result["tif"] = tif
  113. result["transclusions"] = transclusions
  114. result["protection"] = _get_protection(page)
  115. if cache_time:
  116. result["cache_time"] = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
  117. result["cache_age"] = _format_time(cache_time)
  118. return result