A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/

# -*- coding: utf-8 -*-
from datetime import datetime
from hashlib import sha256
from urlparse import urlparse

from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult

from .misc import Query, get_cache_db
from .sites import get_site, get_sites

__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]
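
# Confidence thresholds for classifying matches. T_SUSPECT is passed as
# min_confidence to the checks below; T_POSSIBLE is exported (via __all__)
# for callers that want to flag weaker, merely possible matches.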
T_POSSIBLE = 0.4
T_SUSPECT = 0.75
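
# Entry point for the web interface: normalize the submitted Query, resolve
# the target wiki site, and run the actual check once a project, language,
# and page title (or revision ID) have all been supplied.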
def do_check(query=None):
    if not query:
        query = Query()
    if query.lang:
        query.lang = query.orig_lang = query.lang.lower()
        if "::" in query.lang:
            query.lang, query.name = query.lang.split("::", 1)
    if query.project:
        query.project = query.project.lower()
    query.all_langs, query.all_projects = get_sites()
    query.submitted = query.project and query.lang and (query.title or query.oldid)
    if query.submitted:
        query.site = get_site(query)
        if query.site:
            _get_results(query, follow=query.noredirect is None)
    return query
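
# Load the requested page (by title or revision ID), follow a redirect at
# most once, then dispatch to a search-engine-backed check or a direct
# comparison against a user-supplied URL.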
def _get_results(query, follow=True):
    if query.oldid:
        page = query.page = _get_page_by_revid(query.site, query.oldid)
        if not page:
            return
    else:
        page = query.page = query.site.get_page(query.title)
        try:
            page.get()  # Make sure that the page exists before we check it!
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            return
        if page.is_redirect and follow:
            try:
                query.title = page.get_redirect_target()
            except exceptions.RedirectError:
                pass  # Something's wrong. Continue checking the original page.
            else:
                query.redirected_from = page
                return _get_results(query, follow=False)
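
    # With the page loaded, pick the check to run: "compare" against a given
    # URL if one was supplied, otherwise a full "search".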
    if not query.action:
        query.action = "compare" if query.url else "search"
    if query.action == "search":
        conn = get_cache_db()
        use_engine = 0 if query.use_engine == "0" else 1
        use_links = 0 if query.use_links == "0" else 1
        if not use_engine and not use_links:
            query.error = "no search method"
            return
        mode = "{0}:{1}:".format(use_engine, use_links)
        if not query.nocache:
            query.result = _get_cached_results(page, conn, mode, query.noskip)
        if not query.result:
            try:
                query.result = page.copyvio_check(
                    min_confidence=T_SUSPECT, max_queries=10, max_time=45,
                    no_searches=not use_engine, no_links=not use_links,
                    short_circuit=not query.noskip)
            except exceptions.SearchQueryError as exc:
                query.error = "search error"
                query.exception = exc
                return
            query.result.cached = False
            _cache_result(page, query.result, conn, mode)
    elif query.action == "compare":
        if not query.url:
            query.error = "no URL"
            return
        scheme = urlparse(query.url).scheme
        if not scheme and query.url[0] not in ":/":
            query.url = "http://" + query.url
        elif scheme not in ["http", "https"]:
            query.error = "bad URI"
            return
        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
                                      max_time=30)
        if result.best.chains[0] is EMPTY:
            query.error = "timeout" if result.time > 30 else "no data"
            return
        query.result = result
        query.result.cached = False
    else:
        query.error = "bad action"
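
# Build a Page object for a specific revision via a direct API query, since
# EarwigBot's Page class only models current revisions on its own.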
def _get_page_by_revid(site, revid):
    res = site.api_query(action="query", prop="info|revisions", revids=revid,
                         rvprop="content|timestamp", inprop="protection|url")
    try:
        page_data = res["query"]["pages"].values()[0]
        title = page_data["title"]
        page_data["revisions"][0]["*"]  # Only need to check that these exist
        page_data["revisions"][0]["timestamp"]
    except KeyError:
        return
    page = site.get_page(title)
    # EarwigBot doesn't understand old revisions of pages, so we use a somewhat
    # dirty hack to make this work:
    page._load_attributes(res)
    page._load_content(res)
    return page
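
# Look up a previous check in the cache database, keyed by a SHA-256 hash of
# the check mode plus the page text. Stale entries (older than three days)
# are purged first, and a hit is revalidated by re-comparing the best source
# and discarding the entry if its confidence has drifted.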
def _get_cached_results(page, conn, mode, noskip):
    query1 = """DELETE FROM cache
                WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"""
    query2 = """SELECT cache_time, cache_queries, cache_process_time
                FROM cache
                WHERE cache_id = ?"""
    query3 = """SELECT cdata_url, cdata_confidence, cdata_skipped
                FROM cache_data
                WHERE cdata_cache_id = ?"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())

    with conn.cursor() as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (cache_id,))
        results = cursor.fetchall()
        if not results:
            return None
        cache_time, queries, check_time = results[0]
        cursor.execute(query3, (cache_id,))
        data = cursor.fetchall()

    if not data:  # TODO: do something less hacky for this edge case
        artchain = MarkovChain(ArticleTextParser(page.get()).strip())
        result = CopyvioCheckResult(False, [], queries, check_time, artchain)
        result.cached = True
        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result.cache_age = _format_date(cache_time)
        return result

    url, confidence, skipped = data.pop(0)
    if skipped:  # Should be impossible: data must be bad; run a new check
        return None
    result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
    if abs(result.confidence - confidence) >= 0.0001:
        return None
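
    # The top source still matches, so rebuild the remaining sources from the
    # cached rows instead of re-checking each one against the page.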
    for url, confidence, skipped in data:
        if noskip and skipped:
            return None
        source = CopyvioSource(None, url)
        source.confidence = confidence
        source.skipped = bool(skipped)
        result.sources.append(source)

    result.queries = queries
    result.time = check_time
    result.cached = True
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = _format_date(cache_time)
    return result
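
# Render the age of a cache entry as a human-readable string such as
# "5 hours" or "1 minute".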
def _format_date(cache_time):
    format = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
    diff = datetime.utcnow() - cache_time
    if diff.seconds > 3600:
        return format(diff.seconds / 3600, "hour")
    if diff.seconds > 60:
        return format(diff.seconds / 60, "minute")
    return format(diff.seconds, "second")
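
# Store a fresh result in the cache: replace any existing entry for this
# page/mode hash inside a single transaction, writing one cache_data row
# per detected source.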
def _cache_result(page, result, conn, mode):
    query1 = "DELETE FROM cache WHERE cache_id = ?"
    query2 = "INSERT INTO cache VALUES (?, DEFAULT, ?, ?)"
    query3 = "INSERT INTO cache_data VALUES (DEFAULT, ?, ?, ?, ?)"
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
    data = [(cache_id, source.url, source.confidence, source.skipped)
            for source in result.sources]
    with conn.cursor() as cursor:
        cursor.execute("START TRANSACTION")
        cursor.execute(query1, (cache_id,))
        cursor.execute(query2, (cache_id, result.queries, result.time))
        cursor.executemany(query3, data)
        cursor.execute("COMMIT")