A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

checker.py 2.7 KiB

12 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
11 years ago
11 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. # -*- coding: utf-8 -*-
  2. from datetime import datetime
  3. from hashlib import sha256
  4. from urlparse import urlparse
  5. from earwigbot import exceptions
  6. from .misc import open_sql_connection
  7. def get_results(query):
  8. page = query.page = query.site.get_page(query.title)
  9. try:
  10. page.get() # Make sure that the page exists before we check it!
  11. except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
  12. return
  13. if query.url:
  14. if urlparse(query.url).scheme not in ["http", "https"]:
  15. query.result = "bad URI"
  16. return
  17. query.result = page.copyvio_compare(query.url)
  18. query.result.cached = False
  19. else:
  20. conn = open_sql_connection(query.bot, "cache")
  21. if not query.nocache:
  22. query.result = _get_cached_results(page, conn)
  23. if not query.result:
  24. query.result = page.copyvio_check(max_queries=10, max_time=45)
  25. query.result.cached = False
  26. _cache_result(page, query.result, conn)
  27. def _get_cached_results(page, conn):
  28. query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
  29. query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
  30. shahash = sha256(page.get().encode("utf8")).hexdigest()
  31. with conn.cursor() as cursor:
  32. cursor.execute(query1)
  33. cursor.execute(query2, (page.pageid, shahash))
  34. results = cursor.fetchall()
  35. if not results:
  36. return None
  37. url, cache_time, num_queries, original_time = results[0]
  38. result = page.copyvio_compare(url)
  39. result.cached = True
  40. result.queries = num_queries
  41. result.original_time = original_time
  42. result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
  43. result.cache_age = _format_date(cache_time)
  44. return result
  45. def _format_date(cache_time):
  46. diff = datetime.utcnow() - cache_time
  47. if diff.seconds > 3600:
  48. return "{0} hours".format(diff.seconds / 3600)
  49. if diff.seconds > 60:
  50. return "{0} minutes".format(diff.seconds / 60)
  51. return "{0} seconds".format(diff.seconds)
  52. def _cache_result(page, result, conn):
  53. pageid = page.pageid
  54. shahash = sha256(page.get().encode("utf8")).hexdigest()
  55. query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
  56. query2 = "DELETE FROM cache WHERE cache_id = ?"
  57. query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
  58. with conn.cursor() as cursor:
  59. cursor.execute(query1, (pageid,))
  60. if cursor.fetchall():
  61. cursor.execute(query2, (pageid,))
  62. cursor.execute(query3, (pageid, shahash, result.url, result.queries,
  63. result.time))