|
|
@@ -0,0 +1,68 @@ |
|
|
|
# -*- coding: utf-8 -*- |
|
|
|
"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)""" |
|
|
|
|
|
|
|
from ast import literal_eval |
|
|
|
import re |
|
|
|
|
|
|
|
import requests |
|
|
|
|
|
|
|
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] |
|
|
|
|
|
|
|
TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py' |
|
|
|
|
|
|
|
def search_turnitin(page_title, lang): |
|
|
|
""" returns a list of tuples, one per report, each containing report id and data from the report""" |
|
|
|
turnitin_data = _parse_reports(_make_api_request('The quick brown fox jumps over the lazy dog', lang)) |
|
|
|
turnitin_result = TurnitinResult(turnitin_data) |
|
|
|
return turnitin_result |
|
|
|
|
|
|
|
def _make_api_request(page_title, lang): |
|
|
|
stripped_page_title = page_title.replace(' ', '_') |
|
|
|
api_parameters = {'action': 'suspected_diffs', |
|
|
|
'page_title': stripped_page_title, |
|
|
|
'lang': lang, |
|
|
|
'report': 1} |
|
|
|
|
|
|
|
result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters) |
|
|
|
parsed_result = literal_eval(result.text) # should be ok with encoding, content-type utf-8 |
|
|
|
return parsed_result |
|
|
|
|
|
|
|
def _parse_reports(turnitin_api_result): |
|
|
|
reports_data = [] |
|
|
|
for item in turnitin_api_result: |
|
|
|
reports_data.append(_regex_magic(item['report'])) |
|
|
|
return reports_data |
|
|
|
|
|
|
|
def _regex_magic(report): |
|
|
|
# ~magic~ |
|
|
|
report_id_pattern = re.compile(r'\?rid=(\d*)') |
|
|
|
report_id = report_id_pattern.search(report).groups()[0] |
|
|
|
|
|
|
|
extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ') |
|
|
|
results = extract_info_pattern.findall(report) |
|
|
|
|
|
|
|
return (report_id, results) |
|
|
|
|
|
|
|
class TurnitinResult: |
|
|
|
def __init__(self, turnitin_data): |
|
|
|
self.reports = [] |
|
|
|
for item in turnitin_data: |
|
|
|
report = TurnitinReport(item) |
|
|
|
self.reports.append(report) |
|
|
|
|
|
|
|
def __repr__(self): |
|
|
|
return str(self.__dict__) |
|
|
|
|
|
|
|
class TurnitinReport: |
|
|
|
def __init__(self, data): |
|
|
|
self.reportid = data[0] |
|
|
|
|
|
|
|
self.sources = [] |
|
|
|
for item in data[1]: |
|
|
|
source = {'percent': item[0], |
|
|
|
'words': item[1], |
|
|
|
'url': item[2]} |
|
|
|
self.sources.append(source) |
|
|
|
|
|
|
|
def __repr__(self): |
|
|
|
return str(self.__dict__) |