A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
# -*- coding: utf-8 -*-
from datetime import datetime
from hashlib import sha256
from urlparse import urlparse

from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult

from .misc import Query, get_db, get_cursor, sql_dialect
from .sites import get_site
from .turnitin import search_turnitin

__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]

# Confidence thresholds for flagging a result as a possible or a suspected
# copyright violation.
T_POSSIBLE = 0.4
T_SUSPECT = 0.75


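# Interpret form values such as "0", "false", or an empty string as False.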
def _coerce_bool(val):
    return val and val not in ("0", "false")


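# Entry point: normalize the submitted query, resolve the target wiki site,
# and run a check if enough parameters (project, language, title or revision
# ID) were provided. Returns the query object with results attached.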
def do_check(query=None):
    if not query:
        query = Query()
    if query.lang:
        query.lang = query.orig_lang = query.lang.lower()
        if "::" in query.lang:
            query.lang, query.name = query.lang.split("::", 1)
    if query.project:
        query.project = query.project.lower()

    query.submitted = query.project and query.lang and (query.title or query.oldid)
    if query.submitted:
        query.site = get_site(query)
        if query.site:
            _get_results(query, follow=not _coerce_bool(query.noredirect))
    return query


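# Load the requested page (optionally following a single redirect), then run
# either a search-engine-based check or a direct URL comparison, depending on
# the requested action.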
def _get_results(query, follow=True):
    if query.oldid:
        page = query.page = _get_page_by_revid(query.site, query.oldid)
        if not page:
            return
    else:
        page = query.page = query.site.get_page(query.title)
        try:
            page.get()  # Make sure that the page exists before we check it!
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            return

    if page.is_redirect and follow:
        try:
            query.title = page.get_redirect_target()
        except exceptions.RedirectError:
            pass  # Something's wrong. Continue checking the original page.
        else:
            query.redirected_from = page
            return _get_results(query, follow=False)

    if not query.action:
        query.action = "compare" if query.url else "search"

    if query.action == "search":
        conn = get_db()
        use_engine = 0 if query.use_engine in ("0", "false") else 1
        use_links = 0 if query.use_links in ("0", "false") else 1
        use_turnitin = 1 if query.turnitin in ("1", "true") else 0
        if not use_engine and not use_links and not use_turnitin:
            query.error = "no search method"
            return

        # Handle the turnitin check
        if use_turnitin:
            query.turnitin_result = search_turnitin(page.title, query.lang)

        # Handle the copyvio check
        mode = "{0}:{1}:".format(use_engine, use_links)
        if not _coerce_bool(query.nocache):
            query.result = _get_cached_results(
                page, conn, mode, _coerce_bool(query.noskip))
        if not query.result:
            try:
                query.result = page.copyvio_check(
                    min_confidence=T_SUSPECT, max_queries=8, max_time=45,
                    no_searches=not use_engine, no_links=not use_links,
                    short_circuit=not query.noskip)
            except exceptions.SearchQueryError as exc:
                query.error = "search error"
                query.exception = exc
                return
            query.result.cached = False
            _cache_result(page, query.result, conn, mode)

    elif query.action == "compare":
        if not query.url:
            query.error = "no URL"
            return
        scheme = urlparse(query.url).scheme
        if not scheme and query.url[0] not in ":/":
            query.url = "http://" + query.url
        elif scheme not in ["http", "https"]:
            query.error = "bad URI"
            return
        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
                                      max_time=30)
        if result.best.chains[0] is EMPTY:
            query.error = "timeout" if result.time > 30 else "no data"
            return
        query.result = result
        query.result.cached = False

    else:
        query.error = "bad action"


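# Fetch a page by revision ID via the MediaWiki API; returns None if the
# revision or its content cannot be retrieved.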
def _get_page_by_revid(site, revid):
    res = site.api_query(action="query", prop="info|revisions", revids=revid,
                         rvprop="content|timestamp", inprop="protection|url")
    try:
        page_data = res["query"]["pages"].values()[0]
        title = page_data["title"]
        page_data["revisions"][0]["*"]  # Only need to check that these exist
        page_data["revisions"][0]["timestamp"]
    except KeyError:
        return None
    page = site.get_page(title)

    # EarwigBot doesn't understand old revisions of pages, so we use a somewhat
    # dirty hack to make this work:
    page._load_attributes(res)
    page._load_content(res)
    return page


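# Look up a previous check of this page text and mode in the cache; returns
# None when no usable entry exists, so the caller runs a fresh check instead.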
def _get_cached_results(page, conn, mode, noskip):
    expiry = sql_dialect(mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)",
                         sqlite="STRFTIME('%s', 'now', '-3 days')")
    query1 = "DELETE FROM cache WHERE cache_time < %s" % expiry
    query2 = """SELECT cache_time, cache_queries, cache_process_time,
                       cache_possible_miss
                FROM cache
                WHERE cache_id = ?"""
    query3 = """SELECT cdata_url, cdata_confidence, cdata_skipped, cdata_excluded
                FROM cache_data
                WHERE cdata_cache_id = ?"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())

    with get_cursor(conn) as cursor:
        cursor.execute(query1)
        cursor.execute(query2, (cache_id,))
        results = cursor.fetchall()
        if not results:
            return None

        cache_time, queries, check_time, possible_miss = results[0]
        if possible_miss and noskip:
            return None
        if not isinstance(cache_time, datetime):
            cache_time = datetime.utcfromtimestamp(cache_time)

        cursor.execute(query3, (cache_id,))
        data = cursor.fetchall()

    if not data:  # TODO: do something less hacky for this edge case
        article_chain = MarkovChain(ArticleTextParser(page.get()).strip())
        result = CopyvioCheckResult(False, [], queries, check_time,
                                    article_chain, possible_miss)
        result.cached = True
        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result.cache_age = _format_date(cache_time)
        return result

    url, confidence, skipped, excluded = data.pop(0)
    if skipped:  # Should be impossible: data must be bad; run a new check
        return None
    result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
    if abs(result.confidence - confidence) >= 0.0001:
        return None

    for url, confidence, skipped, excluded in data:
        if noskip and skipped:
            return None
        source = CopyvioSource(None, url)
        source.confidence = confidence
        source.skipped = bool(skipped)
        source.excluded = bool(excluded)
        result.sources.append(source)

    result.queries = queries
    result.time = check_time
    result.possible_miss = possible_miss
    result.cached = True
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = _format_date(cache_time)
    return result


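# Render the age of a cached result as a human-readable string.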
def _format_date(cache_time):
    formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
    diff = datetime.utcnow() - cache_time
    total_seconds = diff.days * 86400 + diff.seconds
    if total_seconds > 3600:
        return formatter(total_seconds / 3600, "hour")
    if total_seconds > 60:
        return formatter(total_seconds / 60, "minute")
    return formatter(total_seconds, "second")


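# Store a completed check in the cache, replacing any existing entry for the
# same page text and mode.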
def _cache_result(page, result, conn, mode):
    query1 = "DELETE FROM cache WHERE cache_id = ?"
    query2 = """INSERT INTO cache (cache_id, cache_queries, cache_process_time,
                                   cache_possible_miss) VALUES (?, ?, ?, ?)"""
    query3 = """INSERT INTO cache_data (cdata_cache_id, cdata_url,
                                        cdata_confidence, cdata_skipped,
                                        cdata_excluded) VALUES (?, ?, ?, ?, ?)"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
    data = [(cache_id, source.url[:1024], source.confidence, source.skipped,
             source.excluded)
            for source in result.sources]
    with get_cursor(conn) as cursor:
        cursor.execute(query1, (cache_id,))
        cursor.execute(query2, (cache_id, result.queries, result.time,
                                result.possible_miss))
        cursor.executemany(query3, data)