diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py
index 0e99d92..a5f46de 100644
--- a/copyvios/turnitin.py
+++ b/copyvios/turnitin.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)"""
+"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures)"""
 
 from ast import literal_eval
 import re
@@ -11,12 +11,17 @@ __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
 TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
 
 def search_turnitin(page_title, lang):
-    """ returns a list of tuples, one per report, each containing report id and data from the report"""
-    turnitin_data = _parse_reports(_make_api_request('The quick brown fox jumps over the lazy dog', lang))
+    """ returns a TurnitinResult, containing a list of TurnitinReport items, each containing report id and a list of dicts with data from the report"""
+    turnitin_data = _parse_plagiabot_result(_make_api_request(
+        'The quick brown fox jumps over the lazy dog', lang))  # FIXME: replace with page_title when the earwigbot dev setup is working properly
     turnitin_result = TurnitinResult(turnitin_data)
     return turnitin_result
 
 def _make_api_request(page_title, lang):
+    """ Query the plagiabot API for Turnitin reports for a given page
+    page_title : string containing title of the page in question
+    lang : string containing language code for the current project
+    """
     stripped_page_title = page_title.replace(' ', '_')
     api_parameters = {'action': 'suspected_diffs',
                       'page_title': stripped_page_title,
@@ -24,16 +29,19 @@ def _make_api_request(page_title, lang):
                       'report': 1}
 
     result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters)
-    parsed_result = literal_eval(result.text)  # should be ok with encoding, content-type utf-8
-    return parsed_result
+    # use literal_eval to *safely* parse the resulting dict-containing string
+    parsed_api_result = literal_eval(result.text)
+    return parsed_api_result
 
-def _parse_reports(turnitin_api_result):
-    reports_data = []
+def _parse_plagiabot_result(turnitin_api_result):
+    result_data = []
     for item in turnitin_api_result:
-        reports_data.append(_regex_magic(item['report']))
-    return reports_data
+        result_data.append(_parse_report(item['report']))
+    return result_data
 
-def _regex_magic(report):
+def _parse_report(report):
+    """ Given the "report" bit from the plagiabot API, extract the report ID and the percent/words/url
+    """
     # ~magic~
     report_id_pattern = re.compile(r'\?rid=(\d*)')
     report_id = report_id_pattern.search(report).groups()[0]
@@ -44,6 +52,12 @@ def _regex_magic(report):
     return (report_id, results)
 
 class TurnitinResult:
+    """ Container class for TurnitinReports. Each page may have zero or
+    more reports of plagiarism, if plagiarism has been detected for
+    different revisions.
+
+    TurnitinResult.reports : list containing zero or more TurnitinReports
+    """
     def __init__(self, turnitin_data):
         self.reports = []
         for item in turnitin_data:
@@ -54,6 +68,10 @@ class TurnitinResult:
         return str(self.__dict__)
 
 class TurnitinReport:
+    """ Contains data for each Turnitin report.
+    TurnitinReport.sources : list of dicts with info from each source
+    TurnitinReport.reportid : Turnitin report ID, taken from plagiabot
+    """
     def __init__(self, data):
         self.reportid = data[0]