A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.
 
 
 
 
 

111 líneas
3.8 KiB

  1. # -*- coding: utf-8 -*-
  2. from ast import literal_eval
  3. import re
  4. import requests
# Names exported by a star-import of this module.
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
# URL of the plagiabot API script that _make_api_request sends its GET
# request to.
TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
  7. def search_turnitin(page_title, lang):
  8. """ Search the Plagiabot database for Turnitin reports for a page.
  9. Keyword arguments:
  10. page_title -- string containing the page title
  11. lang -- string containing the page's project language code
  12. Return a TurnitinResult (containing a list of TurnitinReports, with
  13. report ID and source data).
  14. """
  15. turnitin_data = _parse_plagiabot_result(_make_api_request(
  16. page_title, lang))
  17. turnitin_result = TurnitinResult(turnitin_data)
  18. return turnitin_result
  19. def _make_api_request(page_title, lang):
  20. """ Query the plagiabot API for Turnitin reports for a given page.
  21. """
  22. stripped_page_title = page_title.replace(' ', '_')
  23. api_parameters = {'action': 'suspected_diffs',
  24. 'page_title': stripped_page_title,
  25. 'lang': lang,
  26. 'report': 1}
  27. result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters)
  28. # use literal_eval to *safely* parse the resulting dict-containing string
  29. parsed_api_result = literal_eval(result.text)
  30. return parsed_api_result
  31. def _parse_plagiabot_result(turnitin_api_result):
  32. result_data = []
  33. for item in turnitin_api_result:
  34. result_data.append(_parse_report(item['report']))
  35. return result_data
  36. def _parse_report(report):
  37. # extract report ID
  38. report_id_pattern = re.compile(r'\?rid=(\d*)')
  39. report_id = report_id_pattern.search(report).groups()[0]
  40. # extract percent match, words, and URL for each source in the report
  41. extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
  42. results = extract_info_pattern.findall(report)
  43. return (report_id, results)
  44. class TurnitinResult:
  45. """ Container class for TurnitinReports. Each page may have zero or
  46. more reports of plagiarism. The list will have multiple
  47. TurnitinReports if plagiarism has been detected for more than one
  48. revision.
  49. TurnitinResult.reports -- list containing >= 0 TurnitinReport items
  50. """
  51. def __init__(self, turnitin_data):
  52. """
  53. Keyword argument:
  54. turnitin_data -- list of tuples with data on each report; see
  55. TurnitinReport.__init__ for the contents.
  56. """
  57. self.reports = []
  58. for item in turnitin_data:
  59. report = TurnitinReport(item)
  60. self.reports.append(report)
  61. def __repr__(self):
  62. return str(self.__dict__)
  63. class TurnitinReport:
  64. """ Contains data for each Turnitin report (one on each potentially
  65. plagiarized revision).
  66. TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
  67. TurnitinReport.sources -- list of dicts with information on:
  68. percent -- percent of revision found in source as well
  69. words -- number of words found in both source and revision
  70. url -- url for the possibly-plagiarized source
  71. """
  72. def __init__(self, data):
  73. """
  74. Keyword argument:
  75. data -- tuple containing report data. All values are strings.
  76. data[0] -- turnitin report ID
  77. data[1] -- list of tuples with data on each source in the
  78. report
  79. data[<index>][0] -- percent of revision found in source
  80. data[<index>][1] -- number of words matching the source
  81. data[<index>][2] -- url for the matched source
  82. """
  83. self.reportid = data[0]
  84. self.sources = []
  85. for item in data[1]:
  86. source = {'percent': item[0],
  87. 'words': item[1],
  88. 'url': item[2]}
  89. self.sources.append(source)
  90. def __repr__(self):
  91. return str(self.__dict__)