A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
# -*- coding: utf-8 -*-
from datetime import datetime
from hashlib import sha256
from urlparse import urlparse

from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult

from .misc import Query, get_cache_db
from .sites import get_site, get_sites

__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]

T_POSSIBLE = 0.4
T_SUSPECT = 0.75

def _coerce_bool(val):
    return val and val not in ("0", "false")
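# Quick sketch of the resulting truth table (not part of the original file):
# query parameters arrive as strings, so "0" and "false" mean off, any other
# non-empty value means on, and None/"" stay falsy.
#
#     _coerce_bool(None)    -> None (falsy)
#     _coerce_bool("0")     -> False
#     _coerce_bool("false") -> False
#     _coerce_bool("yes")   -> True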

def do_check(query=None):
    if not query:
        query = Query()

    if query.lang:
        query.lang = query.orig_lang = query.lang.lower()
        if "::" in query.lang:
            query.lang, query.name = query.lang.split("::", 1)
    if query.project:
        query.project = query.project.lower()

    query.all_langs, query.all_projects = get_sites()
    query.submitted = query.project and query.lang and (query.title or query.oldid)
    if query.submitted:
        query.site = get_site(query)
        if query.site:
            _get_results(query, follow=not _coerce_bool(query.noredirect))
    return query
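# Usage sketch (an assumption, not taken from this file): the web frontend is
# expected to build a Query from the request parameters and pass it to
# do_check(). The attribute assignments below are hypothetical; they simply
# mirror the fields read above (lang, project, title, oldid, noredirect).
#
#     query = Query()
#     query.lang, query.project = "en", "wikipedia"
#     query.title = "Example article"
#     query = do_check(query)
#     if query.submitted and query.result:
#         print(query.result.confidence >= T_SUSPECT)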

def _get_results(query, follow=True):
    if query.oldid:
        page = query.page = _get_page_by_revid(query.site, query.oldid)
        if not page:
            return
    else:
        page = query.page = query.site.get_page(query.title)
        try:
            page.get()  # Make sure that the page exists before we check it!
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            return

        if page.is_redirect and follow:
            try:
                query.title = page.get_redirect_target()
            except exceptions.RedirectError:
                pass  # Something's wrong. Continue checking the original page.
            else:
                query.redirected_from = page
                return _get_results(query, follow=False)

    if not query.action:
        query.action = "compare" if query.url else "search"

    if query.action == "search":
        conn = get_cache_db()
        use_engine = 0 if query.use_engine in ("0", "false") else 1
        use_links = 0 if query.use_links in ("0", "false") else 1
        if not use_engine and not use_links:
            query.error = "no search method"
            return
        mode = "{0}:{1}:".format(use_engine, use_links)
        if not _coerce_bool(query.nocache):
            query.result = _get_cached_results(
                page, conn, mode, _coerce_bool(query.noskip))
        if not query.result:
            try:
                query.result = page.copyvio_check(
                    min_confidence=T_SUSPECT, max_queries=10, max_time=45,
                    no_searches=not use_engine, no_links=not use_links,
                    short_circuit=not query.noskip)
            except exceptions.SearchQueryError as exc:
                query.error = "search error"
                query.exception = exc
                return
            query.result.cached = False
            _cache_result(page, query.result, conn, mode)

    elif query.action == "compare":
        if not query.url:
            query.error = "no URL"
            return
        scheme = urlparse(query.url).scheme
        if not scheme and query.url[0] not in ":/":
            query.url = "http://" + query.url
        elif scheme not in ["http", "https"]:
            query.error = "bad URI"
            return
        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
                                      max_time=30)
        if result.best.chains[0] is EMPTY:
            query.error = "timeout" if result.time > 30 else "no data"
            return
        query.result = result
        query.result.cached = False

    else:
        query.error = "bad action"
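# For reference, _get_results() reports failures by setting query.error to one
# of the string codes used above: "no search method", "search error", "no URL",
# "bad URI", "timeout", "no data", or "bad action". How the frontend renders
# these codes is outside this file.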

def _get_page_by_revid(site, revid):
    res = site.api_query(action="query", prop="info|revisions", revids=revid,
                         rvprop="content|timestamp", inprop="protection|url")
    try:
        page_data = res["query"]["pages"].values()[0]
        title = page_data["title"]
        page_data["revisions"][0]["*"]  # Only need to check that these exist
        page_data["revisions"][0]["timestamp"]
    except KeyError:
        return
    page = site.get_page(title)
    # EarwigBot doesn't understand old revisions of pages, so we use a somewhat
    # dirty hack to make this work:
    page._load_attributes(res)
    page._load_content(res)
    return page
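# The response consumed above is assumed to follow the standard MediaWiki
# prop=info|revisions shape; only the keys touched inside the try block need
# to exist. An illustrative (made-up) example:
#
#     res = {"query": {"pages": {"12345": {
#         "title": "Example article",
#         "revisions": [{"*": "...wikitext...",
#                        "timestamp": "2015-01-01T00:00:00Z"}],
#     }}}}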

def _get_cached_results(page, conn, mode, noskip):
    query1 = """DELETE FROM cache
                WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"""
    query2 = """SELECT cache_time, cache_queries, cache_process_time
                FROM cache
                WHERE cache_id = ?"""
    query3 = """SELECT cdata_url, cdata_confidence, cdata_skipped
                FROM cache_data
                WHERE cdata_cache_id = ?"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())

    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (cache_id,))
        results = cursor.fetchall()
        if not results:
            return None
        cache_time, queries, check_time = results[0]
        cursor.execute(query3, (cache_id,))
        data = cursor.fetchall()

    if not data:  # TODO: do something less hacky for this edge case
        artchain = MarkovChain(ArticleTextParser(page.get()).strip())
        result = CopyvioCheckResult(False, [], queries, check_time, artchain)
        result.cached = True
        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result.cache_age = _format_date(cache_time)
        return result

    url, confidence, skipped = data.pop(0)
    if skipped:  # Should be impossible: data must be bad; run a new check
        return None
    result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
    if abs(result.confidence - confidence) >= 0.0001:
        return None

    for url, confidence, skipped in data:
        if noskip and skipped:
            return None
        source = CopyvioSource(None, url)
        source.confidence = confidence
        source.skipped = bool(skipped)
        result.sources.append(source)

    result.queries = queries
    result.time = check_time
    result.cached = True
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = _format_date(cache_time)
    return result

def _format_date(cache_time):
    format = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
    diff = datetime.utcnow() - cache_time
    # Use total_seconds() so entries more than a day old are still reported
    # correctly; timedelta.seconds alone ignores full days.
    seconds = int(diff.total_seconds())
    if seconds > 3600:
        return format(seconds / 3600, "hour")
    if seconds > 60:
        return format(seconds / 60, "minute")
    return format(seconds, "second")
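# Illustrative outputs: a cache entry from 90 seconds ago renders as
# "1 minute", one from two hours ago as "2 hours", and one from 30 seconds
# ago as "30 seconds".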

def _cache_result(page, result, conn, mode):
    query1 = "DELETE FROM cache WHERE cache_id = ?"
    query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)"
    query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)"
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
    data = [(cache_id, source.url, source.confidence, source.skipped)
            for source in result.sources]
    with conn.cursor() as cursor:
        cursor.execute("START TRANSACTION")
        cursor.execute(query1, (cache_id,))
        cursor.execute(query2, (cache_id, result.queries, result.time))
        cursor.executemany(query3, data)
        cursor.execute("COMMIT")
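
# The cache schema is not defined in this file. The sketch below is inferred
# from the SELECT and INSERT statements above (column names and order come
# from the queries; the types and key constraints are assumptions):
#
#     CREATE TABLE cache (
#         cache_id BLOB PRIMARY KEY,   -- sha256 of mode + article text
#         cache_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
#         cache_queries INTEGER,
#         cache_process_time FLOAT
#     );
#     CREATE TABLE cache_data (
#         cdata_id INTEGER PRIMARY KEY AUTO_INCREMENT,
#         cdata_cache_id BLOB,         -- references cache.cache_id
#         cdata_url TEXT,
#         cdata_confidence FLOAT,
#         cdata_skipped BOOLEAN
#     );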