A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

97 regels
3.5 KiB

  1. # -*- coding: utf-8 -*-
  2. from datetime import datetime
  3. from hashlib import sha256
  4. from time import time
  5. from earwigbot import exceptions
  6. def get_results(context, bot, lang, project, name, all_projects, title, url, query):
  7. site = get_site(bot, lang, project, name, all_projects)
  8. if not site:
  9. return None, None, None
  10. page = site.get_page(title)
  11. try:
  12. page.get() # Make sure that the page exists before we check it!
  13. except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
  14. return site, page, None
  15. # if url:
  16. # result = get_url_specific_results(page, url)
  17. # else:
  18. # conn = open_sql_connection(bot, "copyvioCache")
  19. # if not query.get("nocache"):
  20. # result = get_cached_results(page, conn)
  21. # if query.get("nocache") or not result:
  22. # result = get_fresh_results(page, conn)
  23. tstart = time()
  24. mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
  25. mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
  26. mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
  27. result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
  28. True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
  29. result.cached = False
  30. result.tdiff = time() - tstart
  31. # END TEST BLOCK
  32. return site, page, result
  33. def get_url_specific_results(page, url):
  34. t_start = time()
  35. result = page.copyvio_compare(url)
  36. result.cached = False
  37. result.tdiff = time() - t_start
  38. return result
  39. def get_cached_results(page, conn):
  40. query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
  41. query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
  42. pageid = page.pageid()
  43. hash = sha256(page.get()).hexdigest()
  44. t_start = time()
  45. with conn.cursor() as cursor:
  46. cursor.execute(query1)
  47. cursor.execute(query2, (pageid, hash))
  48. results = cursor.fetchall()
  49. if not results:
  50. return None
  51. url, cache_time, num_queries, original_tdiff = results[0]
  52. result = page.copyvio_compare(url)
  53. result.cached = True
  54. result.queries = num_queries
  55. result.tdiff = time() - t_start
  56. result.original_tdiff = original_tdiff
  57. result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
  58. result.cache_age = format_date(cache_time)
  59. return result
  60. def format_date(cache_time):
  61. diff = datetime.utcnow() - cache_time
  62. if diff.seconds > 3600:
  63. return "{0} hours".format(diff.seconds / 3600)
  64. if diff.seconds > 60:
  65. return "{0} minutes".format(diff.seconds / 60)
  66. return "{0} seconds".format(diff.seconds)
  67. def get_fresh_results(page, conn):
  68. t_start = time()
  69. result = page.copyvio_check(max_queries=10)
  70. result.cached = False
  71. result.tdiff = time() - t_start
  72. cache_result(page, result, conn)
  73. return result
  74. def cache_result(page, result, conn):
  75. pageid = page.pageid()
  76. hash = sha256(page.get()).hexdigest()
  77. query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
  78. query2 = "DELETE FROM cache WHERE cache_id = ?"
  79. query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
  80. with conn.cursor() as cursor:
  81. cursor.execute(query1, (pageid,))
  82. if cursor.fetchall():
  83. cursor.execute(query2, (pageid,))
  84. cursor.execute(query3, (pageid, hash, result.url, result.queries,
  85. result.tdiff))