A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

checker.py 3.3 KiB

12 years ago
10 years ago
10 years ago
10 years ago
10 years ago
11 years ago
11 years ago
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. # -*- coding: utf-8 -*-
  2. from datetime import datetime
  3. from hashlib import sha256
  4. from urlparse import urlparse
  5. from earwigbot import exceptions
  6. from .misc import Query, get_cache_db
  7. from .sites import get_site, get_sites
  8. __all__ = ["do_check"]
  9. def do_check():
  10. query = Query()
  11. if query.lang:
  12. query.lang = query.orig_lang = query.lang.lower()
  13. if "::" in query.lang:
  14. query.lang, query.name = query.lang.split("::", 1)
  15. if query.project:
  16. query.project = query.project.lower()
  17. query.all_langs, query.all_projects = get_sites()
  18. if query.project and query.lang and (query.title or query.oldid):
  19. query.site = get_site(query)
  20. if query.site:
  21. if query.title:
  22. _get_results(query)
  23. elif query.oldid:
  24. pass
  25. return query
  26. def _get_results(query):
  27. page = query.page = query.site.get_page(query.title)
  28. try:
  29. page.get() # Make sure that the page exists before we check it!
  30. except (exceptions.PageNotFoundError, exceptions.InvalidPageError):
  31. return
  32. if query.url:
  33. if urlparse(query.url).scheme not in ["http", "https"]:
  34. query.error = "bad URI"
  35. return
  36. query.result = page.copyvio_compare(query.url)
  37. query.result.cached = False
  38. else:
  39. conn = get_cache_db()
  40. if not query.nocache:
  41. query.result = _get_cached_results(page, conn)
  42. if not query.result:
  43. query.result = page.copyvio_check(max_queries=10, max_time=45)
  44. query.result.cached = False
  45. _cache_result(page, query.result, conn)
  46. def _get_cached_results(page, conn):
  47. query1 = "DELETE FROM cache WHERE cache_time < DATE_SUB(CURRENT_TIMESTAMP, INTERVAL 3 DAY)"
  48. query2 = "SELECT cache_url, cache_time, cache_queries, cache_process_time FROM cache WHERE cache_id = ? AND cache_hash = ?"
  49. shahash = sha256(page.get().encode("utf8")).hexdigest()
  50. with conn.cursor() as cursor:
  51. cursor.execute(query1)
  52. cursor.execute(query2, (page.pageid, shahash))
  53. results = cursor.fetchall()
  54. if not results:
  55. return None
  56. url, cache_time, num_queries, original_time = results[0]
  57. result = page.copyvio_compare(url)
  58. result.cached = True
  59. result.queries = num_queries
  60. result.original_time = original_time
  61. result.cache_time = cache_time.strftime("%b %d, %Y %H:%M:%S UTC")
  62. result.cache_age = _format_date(cache_time)
  63. return result
  64. def _format_date(cache_time):
  65. diff = datetime.utcnow() - cache_time
  66. if diff.seconds > 3600:
  67. return "{0} hours".format(diff.seconds / 3600)
  68. if diff.seconds > 60:
  69. return "{0} minutes".format(diff.seconds / 60)
  70. return "{0} seconds".format(diff.seconds)
  71. def _cache_result(page, result, conn):
  72. pageid = page.pageid
  73. shahash = sha256(page.get().encode("utf8")).hexdigest()
  74. query1 = "SELECT 1 FROM cache WHERE cache_id = ?"
  75. query2 = "DELETE FROM cache WHERE cache_id = ?"
  76. query3 = "INSERT INTO cache VALUES (?, ?, ?, CURRENT_TIMESTAMP, ?, ?)"
  77. with conn.cursor() as cursor:
  78. cursor.execute(query1, (pageid,))
  79. if cursor.fetchall():
  80. cursor.execute(query2, (pageid,))
  81. cursor.execute(query3, (pageid, shahash, result.url, result.queries,
  82. result.time))