A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/

checker.py 9.2 KiB

# -*- coding: utf-8 -*-

from datetime import datetime, timedelta
from hashlib import sha256
from logging import getLogger
import re
from urlparse import urlparse

from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import EMPTY, MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult

from .misc import Query, get_db, get_cursor, get_sql_error, sql_dialect
from .sites import get_site
from .turnitin import search_turnitin

__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]

T_POSSIBLE = 0.4
T_SUSPECT = 0.75

_LOGGER = getLogger("copyvios.checker")

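# Query parameters arrive as strings, so checkbox-style options must treat
# "0" and "false" (as well as empty values) as false.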
def _coerce_bool(val):
    return val and val not in ("0", "false")

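# Entry point for the web frontend: normalize the incoming Query (language,
# project, title or oldid), resolve the wiki site, run the requested check,
# and return the same Query object with results or an error attached.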
def do_check(query=None):
    if not query:
        query = Query()
    if query.lang:
        query.lang = query.orig_lang = query.lang.strip().lower()
        if "::" in query.lang:
            query.lang, query.name = query.lang.split("::", 1)
    if query.project:
        query.project = query.project.strip().lower()
    if query.oldid:
        query.oldid = query.oldid.strip().lstrip("0")

    query.submitted = query.project and query.lang and (query.title or query.oldid)
    if query.submitted:
        query.site = get_site(query)
        if query.site:
            _get_results(query, follow=not _coerce_bool(query.noredirect))
    return query

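# Resolve the target page (by title or revision ID), follow a redirect at most
# once, then dispatch to a search-engine check or a direct URL comparison
# depending on query.action.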
def _get_results(query, follow=True):
    if query.oldid:
        if not re.match(r"^\d+$", query.oldid):
            query.error = "bad oldid"
            return
        page = query.page = _get_page_by_revid(query.site, query.oldid)
        if not page:
            return
    else:
        page = query.page = query.site.get_page(query.title)
        try:
            page.get()  # Make sure that the page exists before we check it!
        except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
            return

    if page.is_redirect and follow:
        try:
            query.title = page.get_redirect_target()
        except exceptions.RedirectError:
            pass  # Something's wrong. Continue checking the original page.
        else:
            query.redirected_from = page
            _get_results(query, follow=False)
            return

    if not query.action:
        query.action = "compare" if query.url else "search"

    if query.action == "search":
        use_engine = 0 if query.use_engine in ("0", "false") else 1
        use_links = 0 if query.use_links in ("0", "false") else 1
        use_turnitin = 1 if query.turnitin in ("1", "true") else 0
        if not use_engine and not use_links and not use_turnitin:
            query.error = "no search method"
            return

        # Handle the Turnitin check
        if use_turnitin:
            query.turnitin_result = search_turnitin(page.title, query.lang)

        # Handle the copyvio check
        _perform_check(query, page, use_engine, use_links)
    elif query.action == "compare":
        if not query.url:
            query.error = "no URL"
            return
        scheme = urlparse(query.url).scheme
        if not scheme and query.url[0] not in ":/":
            query.url = "http://" + query.url
        elif scheme not in ["http", "https"]:
            query.error = "bad URI"
            return
        result = page.copyvio_compare(query.url, min_confidence=T_SUSPECT,
                                      max_time=30)
        if result.best.chains[0] is EMPTY:
            query.error = "timeout" if result.time > 30 else "no data"
            return
        query.result = result
        query.result.cached = False
    else:
        query.error = "bad action"

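# Fetch a specific revision's content and metadata directly from the MediaWiki
# API and load it into a Page object, since EarwigBot pages normally track only
# the latest revision.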
def _get_page_by_revid(site, revid):
    try:
        res = site.api_query(action="query", prop="info|revisions", revids=revid,
                             rvprop="content|timestamp", inprop="protection|url",
                             rvslots="main")
        page_data = res["query"]["pages"].values()[0]
        title = page_data["title"]
        # Only need to check that these exist:
        revision = page_data["revisions"][0]
        revision["slots"]["main"]["*"]
        revision["timestamp"]
    except (exceptions.APIError, KeyError, IndexError):
        return None
    page = site.get_page(title)

    # EarwigBot doesn't understand old revisions of pages, so we use a somewhat
    # dirty hack to make this work:
    page._load_attributes(res)
    page._load_content(res)
    return page

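# Try to serve a cached result first (unless the user disabled caching); fall
# back to a live copyvio check and store the fresh result for later requests.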
def _perform_check(query, page, use_engine, use_links):
    conn = get_db()
    sql_error = get_sql_error()
    mode = "{0}:{1}:".format(use_engine, use_links)

    if not _coerce_bool(query.nocache):
        try:
            query.result = _get_cached_results(
                page, conn, mode, _coerce_bool(query.noskip))
        except sql_error:
            _LOGGER.exception("Failed to retrieve cached results")

    if not query.result:
        try:
            query.result = page.copyvio_check(
                min_confidence=T_SUSPECT, max_queries=8, max_time=45,
                no_searches=not use_engine, no_links=not use_links,
                short_circuit=not query.noskip)
        except exceptions.SearchQueryError as exc:
            query.error = "search error"
            query.exception = exc
            return
        query.result.cached = False

        try:
            _cache_result(page, query.result, conn, mode)
        except sql_error:
            _LOGGER.exception("Failed to cache results")

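# Look up a previous result keyed by a hash of the check mode plus the page
# text. Entries older than three days are ignored, and the top source is
# re-compared to verify that the cached confidence still holds.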
def _get_cached_results(page, conn, mode, noskip):
    query1 = """SELECT cache_time, cache_queries, cache_process_time,
                       cache_possible_miss
                FROM cache
                WHERE cache_id = ?"""
    query2 = """SELECT cdata_url, cdata_confidence, cdata_skipped, cdata_excluded
                FROM cache_data
                WHERE cdata_cache_id = ?"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
    cursor = conn.cursor()
    cursor.execute(query1, (cache_id,))
    results = cursor.fetchall()
    if not results:
        return None

    cache_time, queries, check_time, possible_miss = results[0]
    if possible_miss and noskip:
        return None
    if not isinstance(cache_time, datetime):
        cache_time = datetime.utcfromtimestamp(cache_time)
    if datetime.utcnow() - cache_time > timedelta(days=3):
        return None

    cursor.execute(query2, (cache_id,))
    data = cursor.fetchall()
    if not data:  # TODO: do something less hacky for this edge case
        article_chain = MarkovChain(ArticleTextParser(page.get()).strip())
        result = CopyvioCheckResult(False, [], queries, check_time,
                                    article_chain, possible_miss)
        result.cached = True
        result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
        result.cache_age = _format_date(cache_time)
        return result

    url, confidence, skipped, excluded = data.pop(0)
    if skipped:  # Should be impossible: data must be bad; run a new check
        return None
    result = page.copyvio_compare(url, min_confidence=T_SUSPECT, max_time=30)
    if abs(result.confidence - confidence) >= 0.0001:
        return None

    for url, confidence, skipped, excluded in data:
        if noskip and skipped:
            return None
        source = CopyvioSource(None, url)
        source.confidence = confidence
        source.skipped = bool(skipped)
        source.excluded = bool(excluded)
        result.sources.append(source)

    result.queries = queries
    result.time = check_time
    result.possible_miss = possible_miss
    result.cached = True
    result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
    result.cache_age = _format_date(cache_time)
    return result

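# Render the cache age as a rough human-readable duration ("5 hours",
# "1 minute"), using integer division to pick the largest whole unit.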
def _format_date(cache_time):
    formatter = lambda n, w: "{0} {1}{2}".format(n, w, "" if n == 1 else "s")
    diff = datetime.utcnow() - cache_time
    total_seconds = diff.days * 86400 + diff.seconds
    if total_seconds > 3600:
        return formatter(total_seconds / 3600, "hour")
    if total_seconds > 60:
        return formatter(total_seconds / 60, "minute")
    return formatter(total_seconds, "second")

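# Replace any existing cache entry for this page and mode, purge expired rows,
# and store the new result with one cache_data row per source.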
def _cache_result(page, result, conn, mode):
    expiry = sql_dialect(mysql="DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)",
                         sqlite="STRFTIME('%s', 'now', '-3 days')")
    query1 = "DELETE FROM cache WHERE cache_id = ?"
    query2 = "DELETE FROM cache WHERE cache_time < %s" % expiry
    query3 = """INSERT INTO cache (cache_id, cache_queries, cache_process_time,
                                   cache_possible_miss) VALUES (?, ?, ?, ?)"""
    query4 = """INSERT INTO cache_data (cdata_cache_id, cdata_url,
                                        cdata_confidence, cdata_skipped,
                                        cdata_excluded) VALUES (?, ?, ?, ?, ?)"""
    cache_id = buffer(sha256(mode + page.get().encode("utf8")).digest())
    data = [(cache_id, source.url[:1024], source.confidence, source.skipped,
             source.excluded)
            for source in result.sources]
    with get_cursor(conn) as cursor:
        cursor.execute(query1, (cache_id,))
        cursor.execute(query2)
        cursor.execute(query3, (cache_id, result.queries, result.time,
                                result.possible_miss))
        cursor.executemany(query4, data)

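if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: drive do_check()
    # directly rather than through the web frontend. Assumptions not confirmed
    # by this file: Query() can be constructed outside a web request, its
    # parameters are plain settable attributes (the code above reads and
    # assigns them that way), and unset attributes default to something falsy.
    query = Query()
    query.project = "wikipedia"
    query.lang = "en"
    query.title = "Example article"  # hypothetical page title
    query.action = "search"
    query.use_engine = "1"
    query.use_links = "1"
    query.turnitin = "0"

    do_check(query)
    if query.error:
        print "Check failed: %s" % query.error
    elif query.result:
        print "Copyvio confidence: %.3f" % query.result.confidence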