A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
# -*- coding: utf-8 -*-

from datetime import datetime
from hashlib import sha256
from urlparse import urlparse

from earwigbot import exceptions

from .misc import Query, get_cache_db
from .sites import get_site, get_sites

__all__ = ["do_check"]
def do_check():
    # Build a Query from the incoming request parameters, normalize the
    # language/project fields, and run the check if enough was specified:
    query = Query()
    if query.lang:
        query.lang = query.orig_lang = query.lang.lower()
        if "::" in query.lang:
            query.lang, query.name = query.lang.split("::", 1)
    if query.project:
        query.project = query.project.lower()
    query.all_langs, query.all_projects = get_sites()

    if query.project and query.lang and (query.title or query.oldid):
        query.site = get_site(query)
        if query.site:
            _get_results(query)
    return query
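
# For context: do_check() reads its parameters from the incoming request via
# Query and hands back the same object with .result or .error filled in (or
# neither, if the input was incomplete). A minimal sketch of a hypothetical
# Flask-style caller -- the app and template names here are assumptions, not
# part of this module:
#
#     from flask import Flask, render_template
#     app = Flask(__name__)
#
#     @app.route("/")
#     def index():
#         query = do_check()
#         return render_template("index.html", query=query)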
def _get_results(query):
    if query.oldid:
        page = query.page = _get_page_by_revid(query.site, query.oldid)
        if not page:
            return
    else:
        page = query.page = query.site.get_page(query.title)
        try:
            page.get()  # Make sure that the page exists before we check it!
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            return

    if query.url:
        # Compare the page against one specific URL:
        if urlparse(query.url).scheme not in ["http", "https"]:
            query.error = "bad URI"
            return
        result = page.copyvio_compare(query.url)
        if result.source_chain is page.EMPTY:
            query.error = "no data"
            return
        query.result = result
        query.result.cached = False
    else:
        # Run a full check, reusing a recent cached result when allowed:
        conn = get_cache_db()
        if not query.nocache:
            query.result = _get_cached_results(page, conn)
        if not query.result:
            query.result = page.copyvio_check(max_queries=10, max_time=45)
            query.result.cached = False
            _cache_result(page, query.result, conn)
def _get_page_by_revid(site, revid):
    res = site.api_query(action="query", prop="info|revisions", revids=revid,
                         rvprop="content|timestamp", inprop="protection|url")
    try:
        page_data = res["query"]["pages"].values()[0]
        title = page_data["title"]
        # Only need to check that these exist:
        page_data["revisions"][0]["*"]
        page_data["revisions"][0]["timestamp"]
    except KeyError:
        return
    page = site.get_page(title)
    # EarwigBot doesn't understand old revisions of pages, so we use a somewhat
    # dirty hack to make this work:
    page._load_attributes(res)
    page._load_content(res)
    return page
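
# For reference, _get_page_by_revid() indexes into a standard MediaWiki API
# result for action=query with prop=info|revisions. The shape it assumes is
# roughly the following (page ID, title, and content are placeholders):
#
#     res = {
#         "query": {
#             "pages": {
#                 "12345": {  # keyed by page ID
#                     "title": "Example title",
#                     "revisions": [{
#                         "timestamp": "2014-01-01T00:00:00Z",
#                         "*": "...wikitext of the requested revision...",
#                     }],
#                 },
#             },
#         },
#     }
#
# A bad revision ID typically yields a response without these keys, which the
# KeyError handler above turns into a None return.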
def _get_cached_results(page, conn):
    # Purge entries older than three days, then look for a cached result
    # keyed on the page ID and a hash of the page's current text:
    query1 = """DELETE FROM cache
                WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"""
    query2 = """SELECT cache_url, cache_time, cache_queries, cache_process_time
                FROM cache
                WHERE cache_id = ? AND cache_hash = ?"""
    shahash = sha256(page.get().encode("utf8")).hexdigest()

    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (page.pageid, shahash))
        results = cursor.fetchall()
    if not results:
        return None

    url, cache_time, num_queries, original_time = results[0]
    result = page.copyvio_compare(url)
    result.cached = True
    result.queries = num_queries
    result.original_time = original_time
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = _format_date(cache_time)
    return result
def _format_date(cache_time):
    diff = datetime.utcnow() - cache_time
    # timedelta.seconds discards whole days, which would mislabel entries
    # older than 24 hours; total_seconds() counts the full interval:
    seconds = int(diff.total_seconds())
    if seconds > 3600:
        return "{0} hours".format(seconds // 3600)
    if seconds > 60:
        return "{0} minutes".format(seconds // 60)
    return "{0} seconds".format(seconds)
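
# A quick illustration of the output (note the floor division and the
# unconditional plural):
#
#     >>> from datetime import timedelta
#     >>> _format_date(datetime.utcnow() - timedelta(minutes=90))
#     '1 hours'
#     >>> _format_date(datetime.utcnow() - timedelta(seconds=42))
#     '42 seconds'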
def _cache_result(page, result, conn):
    # Upsert: insert a fresh row, or refresh the existing one for this page.
    query = """INSERT INTO cache
               VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)
               ON DUPLICATE KEY UPDATE
               cache_url = ?, cache_time = CURRENT_TIMESTAMP,
               cache_queries = ?, cache_process_time = ?"""
    shahash = sha256(page.get().encode("utf8")).hexdigest()
    args = (page.pageid, shahash, result.url, result.queries, result.time,
            result.url, result.queries, result.time)
    with conn.cursor() as cursor:
        cursor.execute(query, args)
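
# Both cache functions assume a six-column cache table in the order implied
# by the INSERT above; DATE_SUB and ON DUPLICATE KEY UPDATE imply MySQL or
# MariaDB. A plausible definition -- the column types are assumptions, and
# only the names and their order are dictated by the queries:
#
#     CREATE TABLE cache (
#         cache_id           INTEGER UNSIGNED NOT NULL,  -- page ID
#         cache_hash         CHAR(64) NOT NULL,   -- SHA-256 of the page text
#         cache_url          VARCHAR(512),        -- best-match source URL
#         cache_time         TIMESTAMP NOT NULL,  -- when the result was cached
#         cache_queries      INTEGER,             -- search queries used
#         cache_process_time FLOAT,               -- seconds the check took
#         PRIMARY KEY (cache_id)
#     )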