A copyright violation detector running on Wikimedia Cloud Services: https://tools.wmflabs.org/copyvios/
# -*- coding: utf-8 -*-

from datetime import datetime, timedelta
from hashlib import sha256
from logging import getLogger
from urlparse import urlparse

from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult

from .misc import Query, get_db, get_cursor, get_sql_error, sql_dialect
from .sites import get_site
from .turnitin import search_turnitin

__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]

T_POSSIBLE = 0.4
T_SUSPECT = 0.75

_LOGGER = getLogger("copyvios.checker")


def _coerce_bool(val):
    return val and val not in ("0", "false")


def do_check(query=None):
    if not query:
        query = Query()
    if query.lang:
        query.lang = query.orig_lang = query.lang.lower()
        if "::" in query.lang:
            query.lang, query.name = query.lang.split("::", 1)
    if query.project:
        query.project = query.project.lower()

    query.submitted = query.project and query.lang and (query.title or query.oldid)
    if query.submitted:
        query.site = get_site(query)
        if query.site:
            _get_results(query, follow=not _coerce_bool(query.noredirect))
    return query


def _get_results(query, follow=True):
    if query.oldid:
        page = query.page = _get_page_by_revid(query.site, query.oldid)
        if not page:
            return
    else:
        page = query.page = query.site.get_page(query.title)
        try:
            page.get()  # Make sure that the page exists before we check it!
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            return

    if page.is_redirect and follow:
        try:
            query.title = page.get_redirect_target()
        except exceptions.RedirectError:
            pass  # Something's wrong. Continue checking the original page.
        else:
            query.redirected_from = page
            _get_results(query, follow=False)
            return

    if not query.action:
        query.action = "compare" if query.url else "search"

    if query.action == "search":
        use_engine = 0 if query.use_engine in ("0", "false") else 1
        use_links = 0 if query.use_links in ("0", "false") else 1
        use_turnitin = 1 if query.turnitin in ("1", "true") else 0
        if not use_engine and not use_links and not use_turnitin:
            query.error = "no search method"
            return

        # Handle the turnitin check
        if use_turnitin:
            query.turnitin_result = search_turnitin(page.title, query.lang)

        # Handle the copyvio check
        _perform_check(query, page, use_engine, use_links)
    elif query.action == "compare":
        if not query.url:
            query.error = "no URL"
            return
        scheme = urlparse(query.url).scheme
        if not scheme and query.url[0] not in ":/":
            query.url = "http://" + query.url
        elif scheme not in ["http", "https"]:
            query.error = "bad URI"
            return
        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
                                      max_time=30)
        if result.best.chains[0] is EMPTY:
            query.error = "timeout" if result.time > 30 else "no data"
            return
        query.result = result
        query.result.cached = False
    else:
        query.error = "bad action"


def _get_page_by_revid(site, revid):
    res = site.api_query(action="query", prop="info|revisions", revids=revid,
                         rvprop="content|timestamp", inprop="protection|url",
                         rvslots="main")
    try:
        page_data = res["query"]["pages"].values()[0]
        title = page_data["title"]
        # Only need to check that these exist:
        revision = page_data["revisions"][0]
        revision["slots"]["main"]["*"]
        revision["timestamp"]
    except (KeyError, IndexError):
        return None
    page = site.get_page(title)

    # EarwigBot doesn't understand old revisions of pages, so we use a somewhat
    # dirty hack to make this work:
    page._load_attributes(res)
    page._load_content(res)
    return page


def _perform_check(query, page, use_engine, use_links):
    conn = get_db()
    sql_error = get_sql_error()
    mode = "{0}:{1}:".format(use_engine, use_links)

    if not _coerce_bool(query.nocache):
        try:
            query.result = _get_cached_results(
                page, conn, mode, _coerce_bool(query.noskip))
        except sql_error:
            _LOGGER.exception("Failed to retrieve cached results")

    if not query.result:
        try:
            query.result = page.copyvio_check(
                min_confidence=T_SUSPECT, max_queries=8, max_time=45,
                no_searches=not use_engine, no_links=not use_links,
                short_circuit=not query.noskip)
        except exceptions.SearchQueryError as exc:
            query.error = "search error"
            query.exception = exc
            return
        query.result.cached = False
        try:
            _cache_result(page, query.result, conn, mode)
        except sql_error:
            _LOGGER.exception("Failed to cache results")


def _get_cached_results(page, conn, mode, noskip):
    query1 = """SELECT cache_time, cache_queries, cache_process_time,
                       cache_possible_miss
                FROM cache
                WHERE cache_id = ?"""
    query2 = """SELECT cdata_url, cdata_confidence, cdata_skipped, cdata_excluded
                FROM cache_data
                WHERE cdata_cache_id = ?"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
    cursor = conn.cursor()
    cursor.execute(query1, (cache_id,))
    results = cursor.fetchall()
    if not results:
        return None

    cache_time, queries, check_time, possible_miss = results[0]
    if possible_miss and noskip:
        return None
    if not isinstance(cache_time, datetime):
        cache_time = datetime.utcfromtimestamp(cache_time)
    if datetime.utcnow() - cache_time > timedelta(days=3):
        return None

    cursor.execute(query2, (cache_id,))
    data = cursor.fetchall()

    if not data:  # TODO: do something less hacky for this edge case
        article_chain = MarkovChain(ArticleTextParser(page.get()).strip())
        result = CopyvioCheckResult(False, [], queries, check_time,
                                    article_chain, possible_miss)
        result.cached = True
        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result.cache_age = _format_date(cache_time)
        return result

    url, confidence, skipped, excluded = data.pop(0)
    if skipped:  # Should be impossible: data must be bad; run a new check
        return None
    result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
    if abs(result.confidence - confidence) >= 0.0001:
        return None

    for url, confidence, skipped, excluded in data:
        if noskip and skipped:
            return None
        source = CopyvioSource(None, url)
        source.confidence = confidence
        source.skipped = bool(skipped)
        source.excluded = bool(excluded)
        result.sources.append(source)

    result.queries = queries
    result.time = check_time
    result.possible_miss = possible_miss
    result.cached = True
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = _format_date(cache_time)
    return result


def _format_date(cache_time):
    formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
    diff = datetime.utcnow() - cache_time
    total_seconds = diff.days * 86400 + diff.seconds
    if total_seconds > 3600:
        return formatter(total_seconds / 3600, "hour")
    if total_seconds > 60:
        return formatter(total_seconds / 60, "minute")
    return formatter(total_seconds, "second")


def _cache_result(page, result, conn, mode):
    expiry = sql_dialect(mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)",
                         sqlite="STRFTIME('%s', 'now', '-3 days')")
    query1 = "DELETE FROM cache WHERE cache_id = ?"
    query2 = "DELETE FROM cache WHERE cache_time < %s" % expiry
    query3 = """INSERT INTO cache (cache_id, cache_queries, cache_process_time,
                                   cache_possible_miss) VALUES (?, ?, ?, ?)"""
    query4 = """INSERT INTO cache_data (cdata_cache_id, cdata_url,
                                        cdata_confidence, cdata_skipped,
                                        cdata_excluded) VALUES (?, ?, ?, ?, ?)"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
    data = [(cache_id, source.url[:1024], source.confidence, source.skipped,
             source.excluded)
            for source in result.sources]
    with get_cursor(conn) as cursor:
        cursor.execute(query1, (cache_id,))
        cursor.execute(query2)
        cursor.execute(query3, (cache_id, result.queries, result.time,
                                result.possible_miss))
        cursor.executemany(query4, data)
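For orientation, the entry point above could be driven roughly as follows. This is a minimal, hypothetical sketch and not part of the module: it assumes the package is importable as copyvios, that a bare Query() can have its fields assigned directly (in the web app it is populated from request arguments), and that unset fields read back as a false value. It relies only on names defined in this file: do_check, the T_POSSIBLE/T_SUSPECT thresholds, and the attributes the checker itself sets (query.error, query.result, result.confidence).

    # Hypothetical driver (not part of this module): run a URL comparison and
    # classify the outcome with the module's exported thresholds.
    from copyvios.checker import do_check, T_POSSIBLE, T_SUSPECT  # assumed package path
    from copyvios.misc import Query

    query = Query()                           # in the web app, filled from request args
    query.project = "wikipedia"
    query.lang = "en"
    query.title = "Example article"
    query.url = "http://example.com/source"   # a URL makes do_check pick action == "compare"

    query = do_check(query)
    if query.error:
        print "check failed: %s" % query.error
    elif query.result:
        conf = query.result.confidence
        if conf >= T_SUSPECT:
            label = "suspected violation"
        elif conf >= T_POSSIBLE:
            label = "possible violation"
        else:
            label = "no violation"
        print "%s (confidence %.3f)" % (label, conf)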