A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

69 lines
2.2 KiB

  1. # -*- coding: utf-8 -*-
  2. """TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)"""
  3. from ast import literal_eval
  4. import re
  5. import requests
# Names exported by a star-import of this module.
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
# URL that _make_api_request() sends its GET request to.
TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
  8. def search_turnitin(page_title, lang):
  9. """ returns a list of tuples, one per report, each containing report id and data from the report"""
  10. turnitin_data = _parse_reports(_make_api_request('The quick brown fox jumps over the lazy dog', lang))
  11. turnitin_result = TurnitinResult(turnitin_data)
  12. return turnitin_result
  13. def _make_api_request(page_title, lang):
  14. stripped_page_title = page_title.replace(' ', '_')
  15. api_parameters = {'action': 'suspected_diffs',
  16. 'page_title': stripped_page_title,
  17. 'lang': lang,
  18. 'report': 1}
  19. result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters)
  20. parsed_result = literal_eval(result.text) # should be ok with encoding, content-type utf-8
  21. return parsed_result
  22. def _parse_reports(turnitin_api_result):
  23. reports_data = []
  24. for item in turnitin_api_result:
  25. reports_data.append(_regex_magic(item['report']))
  26. return reports_data
  27. def _regex_magic(report):
  28. # ~magic~
  29. report_id_pattern = re.compile(r'\?rid=(\d*)')
  30. report_id = report_id_pattern.search(report).groups()[0]
  31. extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
  32. results = extract_info_pattern.findall(report)
  33. return (report_id, results)
  34. class TurnitinResult:
  35. def __init__(self, turnitin_data):
  36. self.reports = []
  37. for item in turnitin_data:
  38. report = TurnitinReport(item)
  39. self.reports.append(report)
  40. def __repr__(self):
  41. return str(self.__dict__)
  42. class TurnitinReport:
  43. def __init__(self, data):
  44. self.reportid = data[0]
  45. self.sources = []
  46. for item in data[1]:
  47. source = {'percent': item[0],
  48. 'words': item[1],
  49. 'url': item[2]}
  50. self.sources.append(source)
  51. def __repr__(self):
  52. return str(self.__dict__)