A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You can't select more than 25 topics. Topics must start with a letter or number, can include hyphens ('-'), and can be up to 35 characters long.
 
 
 
 
 

105 lines
3.7 KiB

# -*- coding: utf-8 -*-
import json
import re
import requests
from .misc import parse_wiki_timestamp

# Public names exported by this module.
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']

# Plagiabot web API that serves Turnitin plagiarism-report data.
TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
  8. def search_turnitin(page_title, lang):
  9. """ Search the Plagiabot database for Turnitin reports for a page.
  10. Keyword arguments:
  11. page_title -- string containing the page title
  12. lang -- string containing the page's project language code
  13. Return a TurnitinResult (contains a list of TurnitinReports).
  14. """
  15. return TurnitinResult(_make_api_request(page_title, lang))
  16. def _make_api_request(page_title, lang):
  17. """ Query the plagiabot API for Turnitin reports for a given page.
  18. """
  19. stripped_page_title = page_title.replace(' ', '_')
  20. api_parameters = {'action': 'suspected_diffs',
  21. 'page_title': stripped_page_title,
  22. 'lang': lang,
  23. 'report': 1}
  24. result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters)
  25. # use json.loads to *safely* parse the resulting dict-containing string
  26. parsed_api_result = json.loads(result.text)
  27. return parsed_api_result
  28. class TurnitinResult(object):
  29. """ Container class for TurnitinReports. Each page may have zero or
  30. more reports of plagiarism. The list will have multiple
  31. TurnitinReports if plagiarism has been detected for more than one
  32. revision.
  33. TurnitinResult.reports -- list containing >= 0 TurnitinReport items
  34. """
  35. def __init__(self, turnitin_data):
  36. """
  37. Keyword argument:
  38. turnitin_data -- plagiabot API result
  39. """
  40. self.reports = []
  41. for item in turnitin_data:
  42. report = TurnitinReport(
  43. item['diff_timestamp'], item['diff'], item['report'])
  44. self.reports.append(report)
  45. def __repr__(self):
  46. return str(self.__dict__)
  47. class TurnitinReport(object):
  48. """ Contains data for each Turnitin report (one on each potentially
  49. plagiarized revision).
  50. TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
  51. TurnitinReport.diffid -- diff ID from Wikipedia database
  52. TurnitinReport.time_posted -- datetime of the time the diff posted
  53. TurnitinReport.sources -- list of dicts with information on:
  54. percent -- percent of revision found in source as well
  55. words -- number of words found in both source and revision
  56. url -- url for the possibly-plagiarized source
  57. """
  58. def __init__(self, timestamp, diffid, report):
  59. """
  60. Keyword argument:
  61. timestamp -- diff timestamp from Wikipedia database
  62. diffid -- diff ID from Wikipedia database
  63. report -- Turnitin report from the plagiabot database
  64. """
  65. self.report_data = self._parse_report(report)
  66. self.reportid = self.report_data[0]
  67. self.diffid = diffid
  68. self.time_posted = parse_wiki_timestamp(timestamp)
  69. self.sources = []
  70. for item in self.report_data[1]:
  71. source = {'percent': item[0],
  72. 'words': item[1],
  73. 'url': item[2]}
  74. self.sources.append(source)
  75. def __repr__(self):
  76. return str(self.__dict__)
  77. def _parse_report(self, report_text):
  78. # extract report ID
  79. report_id_pattern = re.compile(r'\?rid=(\d*)')
  80. report_id = report_id_pattern.search(report_text).groups()[0]
  81. # extract percent match, words, and URL for each source in the report
  82. extract_info_pattern = re.compile(
  83. r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
  84. results = extract_info_pattern.findall(report_text)
  85. return (report_id, results)