A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

108 lines
3.8 KiB

  1. # -*- coding: utf-8 -*-
  2. from ast import literal_eval
  3. import re
  4. import requests
  5. from .misc import parse_wiki_timestamp
  6. __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
  7. TURNITIN_API_ENDPOINT = 'https://eranbot.toolforge.org/plagiabot/api.py'
  8. def search_turnitin(page_title, lang):
  9. """ Search the Plagiabot database for Turnitin reports for a page.
  10. Keyword arguments:
  11. page_title -- string containing the page title
  12. lang -- string containing the page's project language code
  13. Return a TurnitinResult (contains a list of TurnitinReports).
  14. """
  15. return TurnitinResult(_make_api_request(page_title, lang))
  16. def _make_api_request(page_title, lang):
  17. """ Query the plagiabot API for Turnitin reports for a given page.
  18. """
  19. stripped_page_title = page_title.replace(' ', '_')
  20. api_parameters = {'action': 'suspected_diffs',
  21. 'page_title': stripped_page_title,
  22. 'lang': lang,
  23. 'report': 1}
  24. result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters)
  25. # use literal_eval to *safely* parse the resulting dict-containing string
  26. try:
  27. parsed_api_result = literal_eval(result.text)
  28. except (SyntaxError, ValueError):
  29. parsed_api_result = []
  30. return parsed_api_result
  31. class TurnitinResult(object):
  32. """ Container class for TurnitinReports. Each page may have zero or
  33. more reports of plagiarism. The list will have multiple
  34. TurnitinReports if plagiarism has been detected for more than one
  35. revision.
  36. TurnitinResult.reports -- list containing >= 0 TurnitinReport items
  37. """
  38. def __init__(self, turnitin_data):
  39. """
  40. Keyword argument:
  41. turnitin_data -- plagiabot API result
  42. """
  43. self.reports = []
  44. for item in turnitin_data:
  45. report = TurnitinReport(
  46. item['diff_timestamp'], item['diff'], item['report'])
  47. self.reports.append(report)
  48. def __repr__(self):
  49. return str(self.__dict__)
  50. class TurnitinReport(object):
  51. """ Contains data for each Turnitin report (one on each potentially
  52. plagiarized revision).
  53. TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
  54. TurnitinReport.diffid -- diff ID from Wikipedia database
  55. TurnitinReport.time_posted -- datetime of the time the diff posted
  56. TurnitinReport.sources -- list of dicts with information on:
  57. percent -- percent of revision found in source as well
  58. words -- number of words found in both source and revision
  59. url -- url for the possibly-plagiarized source
  60. """
  61. def __init__(self, timestamp, diffid, report):
  62. """
  63. Keyword argument:
  64. timestamp -- diff timestamp from Wikipedia database
  65. diffid -- diff ID from Wikipedia database
  66. report -- Turnitin report from the plagiabot database
  67. """
  68. self.report_data = self._parse_report(report)
  69. self.reportid = self.report_data[0]
  70. self.diffid = diffid
  71. self.time_posted = parse_wiki_timestamp(timestamp)
  72. self.sources = []
  73. for item in self.report_data[1]:
  74. source = {'percent': item[0],
  75. 'words': item[1],
  76. 'url': item[2]}
  77. self.sources.append(source)
  78. def __repr__(self):
  79. return str(self.__dict__)
  80. def _parse_report(self, report_text):
  81. # extract report ID
  82. report_id_pattern = re.compile(r'\?rid=(\d*)')
  83. report_id = report_id_pattern.search(report_text).groups()[0]
  84. # extract percent match, words, and URL for each source in the report
  85. extract_info_pattern = re.compile(
  86. r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
  87. results = extract_info_pattern.findall(report_text)
  88. return (report_id, results)