A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
Não pode escolher mais do que 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.
 
 
 
 
 

96 linhas
3.4 KiB

  1. # -*- coding: utf-8 -*-
  2. from datetime import datetime
  3. from hashlib import sha256
  4. from time import time
  5. from earwigbot import exceptions
  6. from ..misc import open_sql_connection
  7. def get_results(bot, site, title, url, query):
  8. page = site.get_page(title)
  9. try:
  10. page.get() # Make sure that the page exists before we check it!
  11. except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
  12. return page, None
  13. # if url:
  14. # result = _get_url_specific_results(page, url)
  15. # else:
  16. # conn = open_sql_connection(bot, "copyvioCache")
  17. # if not query.get("nocache"):
  18. # result = _get_cached_results(page, conn)
  19. # if query.get("nocache") or not result:
  20. # result = _get_fresh_results(page, conn)
  21. tstart = time()
  22. mc1 = __import__("earwigbot").wiki.copyvios.MarkovChain(page.get())
  23. mc2 = __import__("earwigbot").wiki.copyvios.MarkovChain(u"This is some random textual content for a page.")
  24. mci = __import__("earwigbot").wiki.copyvios.MarkovChainIntersection(mc1, mc2)
  25. result = __import__("earwigbot").wiki.copyvios.CopyvioCheckResult(
  26. True, 0.67123, "http://example.com/", 7, mc1, (mc2, mci))
  27. result.cached = False
  28. result.tdiff = time() - tstart
  29. # END TEST BLOCK
  30. return page, result
  31. def _get_url_specific_results(page, url):
  32. t_start = time()
  33. result = page.copyvio_compare(url)
  34. result.cached = False
  35. result.tdiff = time() - t_start
  36. return result
  37. def _get_cached_results(page, conn):
  38. query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
  39. query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
  40. pageid = page.pageid()
  41. hash = sha256(page.get()).hexdigest()
  42. t_start = time()
  43. with conn.cursor() as cursor:
  44. cursor.execute(query1)
  45. cursor.execute(query2, (pageid, hash))
  46. results = cursor.fetchall()
  47. if not results:
  48. return None
  49. url, cache_time, num_queries, original_tdiff = results[0]
  50. result = page.copyvio_compare(url)
  51. result.cached = True
  52. result.queries = num_queries
  53. result.tdiff = time() - t_start
  54. result.original_tdiff = original_tdiff
  55. result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
  56. result.cache_age = _format_date(cache_time)
  57. return result
  58. def _format_date(cache_time):
  59. diff = datetime.utcnow() - cache_time
  60. if diff.seconds > 3600:
  61. return "{0} hours".format(diff.seconds / 3600)
  62. if diff.seconds > 60:
  63. return "{0} minutes".format(diff.seconds / 60)
  64. return "{0} seconds".format(diff.seconds)
  65. def _get_fresh_results(page, conn):
  66. t_start = time()
  67. result = page.copyvio_check(max_queries=10)
  68. result.cached = False
  69. result.tdiff = time() - t_start
  70. _cache_result(page, result, conn)
  71. return result
  72. def _cache_result(page, result, conn):
  73. pageid = page.pageid()
  74. hash = sha256(page.get()).hexdigest()
  75. query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
  76. query2 = "DELETE FROM cache WHERE cache_id = ?"
  77. query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
  78. with conn.cursor() as cursor:
  79. cursor.execute(query1, (pageid,))
  80. if cursor.fetchall():
  81. cursor.execute(query2, (pageid,))
  82. cursor.execute(query3, (pageid, hash, result.url, result.queries,
  83. result.tdiff))