A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/

# -*- coding: utf-8 -*-
from datetime import datetime
from hashlib import sha256
from urlparse import urlparse

from earwigbot import exceptions

from .misc import Query, get_cache_db
from .sites import get_site, get_sites

__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]

T_POSSIBLE = 0.3
T_SUSPECT = 0.6
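
# Note (added): T_POSSIBLE and T_SUSPECT are confidence cutoffs for the
# copyvio results produced below. A plausible reading, based on their names
# (the consuming templates are not part of this file):
#
#     if result.confidence >= T_SUSPECT:      # treat as a suspected violation
#         ...
#     elif result.confidence >= T_POSSIBLE:   # flag as merely possible
#         ...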

def do_check():
    # Build a Query from the incoming request, normalize its language and
    # project fields, resolve the target wiki site, and run the check.
    query = Query()
    if query.lang:
        query.lang = query.orig_lang = query.lang.lower()
        if "::" in query.lang:
            query.lang, query.name = query.lang.split("::", 1)
    if query.project:
        query.project = query.project.lower()
    query.all_langs, query.all_projects = get_sites()
    if query.project and query.lang and (query.title or query.oldid):
        query.site = get_site(query)
        if query.site:
            _get_results(query, follow=query.noredirect is None)
    return query
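
# Illustrative usage sketch (added; assumes Query() reads its fields from the
# active web request, as the bare constructor above suggests, and
# render_template is a hypothetical caller-side helper):
#
#     query = do_check()
#     if query.error:
#         body = render_template("error.html", error=query.error)
#     elif query.result:
#         body = render_template("result.html", result=query.result)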

def _get_results(query, follow=True):
    if query.oldid:
        page = query.page = _get_page_by_revid(query.site, query.oldid)
        if not page:
            return
    else:
        page = query.page = query.site.get_page(query.title)
        try:
            page.get()  # Make sure that the page exists before we check it!
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            return
        if page.is_redirect and follow:
            try:
                query.title = page.get_redirect_target()
            except exceptions.RedirectError:
                pass  # Something's wrong. Continue checking the original page.
            else:
                query.redirected_from = page
                return _get_results(query, follow=False)

    # With a specific URL, run a direct one-on-one comparison; otherwise run
    # a full search-engine check, consulting the result cache first.
    if query.url:
        if urlparse(query.url).scheme not in ["http", "https"]:
            query.error = "bad URI"
            return
        result = _do_copyvio_compare(query, page, query.url)
        if result:
            query.result = result
            query.result.cached = False
    else:
        conn = get_cache_db()
        if not query.nocache:
            query.result = _get_cached_results(page, conn, query)
        if not query.result:
            query.result = page.copyvio_check(
                min_confidence=T_SUSPECT, max_queries=10, max_time=45)
            query.result.cached = False
            _cache_result(page, query.result, conn)

def _get_page_by_revid(site, revid):
    res = site.api_query(action="query", prop="info|revisions", revids=revid,
                         rvprop="content|timestamp", inprop="protection|url")
    try:
        page_data = res["query"]["pages"].values()[0]
        title = page_data["title"]
        page_data["revisions"][0]["*"]  # Only need to check that these exist
        page_data["revisions"][0]["timestamp"]
    except KeyError:
        return
    page = site.get_page(title)
    # EarwigBot doesn't understand old revisions of pages, so we use a somewhat
    # dirty hack to make this work:
    page._load_attributes(res)
    page._load_content(res)
    return page
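
# For reference (added): the MediaWiki API response parsed above has roughly
# this shape; the inner key is the page ID, which is why the code takes
# .values()[0] instead of indexing by a known key:
#
#     {"query": {"pages": {"12345": {
#         "title": "Example",
#         "revisions": [{"*": "<wikitext>", "timestamp": "2014-01-01T00:00:00Z"}],
#     }}}}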

def _get_cached_results(page, conn, query):
    # Purge entries older than three days, then look this page up by ID and
    # by the SHA-256 hash of its current text.
    query1 = """DELETE FROM cache
                WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"""
    query2 = """SELECT cache_url, cache_time, cache_queries, cache_process_time
                FROM cache
                WHERE cache_id = ? AND cache_hash = ?"""
    shahash = sha256(page.get().encode("utf8")).hexdigest()

    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (page.pageid, shahash))
        results = cursor.fetchall()
    if not results:
        return None

    url, cache_time, num_queries, original_time = results[0]
    result = _do_copyvio_compare(query, page, url)
    if result:
        result.cached = True
        result.queries = num_queries
        result.original_time = original_time
        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result.cache_age = _format_date(cache_time)
    return result
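
# Note (added): keying the cache on (page ID, SHA-256 of the page text) means
# an entry is only reused while the wikitext is unchanged; any edit to the
# page alters the hash, so the lookup simply misses and a fresh check runs.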

def _do_copyvio_compare(query, page, url):
    result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
    if result.source_chain is not page.EMPTY:
        return result
    query.error = "timeout" if result.time > 30 else "no data"

def _format_date(cache_time):
    diff = datetime.utcnow() - cache_time
    # timedelta.seconds alone ignores whole days (entries may be up to three
    # days old before the purge), so use the full duration instead.
    seconds = int(diff.total_seconds())
    if seconds > 86400:
        return "{0} days".format(seconds / 86400)
    if seconds > 3600:
        return "{0} hours".format(seconds / 3600)
    if seconds > 60:
        return "{0} minutes".format(seconds / 60)
    return "{0} seconds".format(seconds)
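
# Worked example (added): an entry cached 7500 seconds ago renders as
# "2 hours" (Python 2 integer division truncates); one cached 90 seconds ago
# renders as "1 minutes", since the format strings never singularize.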

def _cache_result(page, result, conn):
    query = """INSERT INTO cache
               VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)
               ON DUPLICATE KEY UPDATE
               cache_url = ?, cache_time = CURRENT_TIMESTAMP,
               cache_queries = ?, cache_process_time = ?"""
    shahash = sha256(page.get().encode("utf8")).hexdigest()
    # Each value is passed twice: once for the INSERT row and once for the
    # ON DUPLICATE KEY UPDATE clause.
    args = (page.pageid, shahash, result.url, result.queries, result.time,
            result.url, result.queries, result.time)
    with conn.cursor() as cursor:
        cursor.execute(query, args)
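
# A minimal sketch (added; an assumption, not the project's actual DDL) of the
# cache table implied by the statements above. ON DUPLICATE KEY UPDATE points
# to MySQL, and since the UPDATE clause never rewrites cache_hash, the unique
# key is presumably the (cache_id, cache_hash) pair:
#
#     CREATE TABLE cache (
#         cache_id           INTEGER  NOT NULL,  -- page ID
#         cache_hash         CHAR(64) NOT NULL,  -- SHA-256 of the page text
#         cache_url          TEXT,               -- best-match source URL
#         cache_time         TIMESTAMP,          -- when the result was stored
#         cache_queries      INTEGER,            -- search queries consumed
#         cache_process_time FLOAT,              -- seconds the check took
#         PRIMARY KEY (cache_id, cache_hash)
#     );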