[WIP] improve docstrings and naming, mark TODO

9 years ago · bf0aa22fa9
--- a/copyvios/turnitin.py
+++ b/copyvios/turnitin.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
 """TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)"""
 """TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures)"""

 from ast import literal_eval
 import re
@@ -11,12 +11,17 @@ __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
 TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'

 def search_turnitin(page_title, lang):
    """ returns a list of tuples, one per report, each containing report id and data from the report"""
    turnitin_data = _parse_reports(_make_api_request('The quick brown fox jumps over the lazy dog', lang))
    """ returns a TurnitinResult, containing a list of TurnitinReport items, each containing report id and a list of dicts with data from the report"""
    turnitin_data = _parse_plagiabot_result(_make_api_request(
        'The quick brown fox jumps over the lazy dog', lang))  # FIXME: replace with page_title when the earwigbot dev setup is working properly
    turnitin_result = TurnitinResult(turnitin_data)
    return turnitin_result

 def _make_api_request(page_title, lang):
    """ Query the plagiabot API for Turnitin reports for a given page
    page_title : string containing title of the page in question
    lang : string containing language code for the current project
    """
    stripped_page_title = page_title.replace(' ', '_')
    api_parameters = {'action': 'suspected_diffs',
                      'page_title': stripped_page_title,
@@ -24,16 +29,19 @@ def _make_api_request(page_title, lang):
                      'report': 1}

    result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters)
    parsed_result = literal_eval(result.text)  # should be ok with encoding, content-type utf-8
    return parsed_result
    # use literal_eval to *safely* parse the resulting dict-containing string
    parsed_api_result = literal_eval(result.text)
    return parsed_api_result

 def _parse_reports(turnitin_api_result):
    reports_data = []
 def _parse_plagiabot_result(turnitin_api_result):
    result_data = []
    for item in turnitin_api_result:
        reports_data.append(_regex_magic(item['report']))
    return reports_data
        reports_data.append(_parse_report(item['report']))
    return result_data

 def _regex_magic(report):
 def _parse_report(report):
    """ Given the "report" bit from the plagiabot API, extract the report ID and the percent/words/url
    """
    # ~magic~
    report_id_pattern = re.compile(r'\?rid=(\d*)')
    report_id = report_id_pattern.search(report).groups()[0]
@@ -44,6 +52,12 @@ def _regex_magic(report):
    return (report_id, results)

 class TurnitinResult:
    """ Container class for TurnitinReports. Each page may have zero or
    more reports of plagiarism, if plagiarism has been detected for
    different revisions.

    TurnitinResult.reports : list containing zero or more TurnitinReports
    """
    def __init__(self, turnitin_data):
        self.reports = []
        for item in turnitin_data:
@@ -54,6 +68,10 @@ class TurnitinResult:
        return str(self.__dict__)

 class TurnitinReport:
    """ Contains data for each Turnitin report.
    TurnitinReport.sources : list of dicts with info from each source
    TurnitinReport.reportid : Turnitin report ID, taken from plagiabot
    """
    def __init__(self, data):
        self.reportid = data[0]