# ben/copyvios -- mirror of https://github.com/earwig/copyvios


			
				
					
						
						
							
							# -*- coding: utf-8 -*-
from ast import literal_eval
import re

import requests

# Names exported when this module is star-imported.
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']

# URL of the Plagiabot API that _make_api_request() sends its GET request to.
TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'

def search_turnitin(page_title, lang):
    """ Look up Turnitin reports for a page in the Plagiabot database.

    Keyword arguments:
    page_title -- string containing the page title
    lang       -- string containing the page's project language code

    Return a TurnitinResult holding one TurnitinReport (report ID plus
    source data) for each report the API returned.
    """
    # fetch the raw API response, then reduce it to (id, sources) tuples
    raw_api_result = _make_api_request(page_title, lang)
    parsed_reports = _parse_plagiabot_result(raw_api_result)
    return TurnitinResult(parsed_reports)

def _make_api_request(page_title, lang, timeout=30):
    """ Query the plagiabot API for Turnitin reports for a given page.

    Keyword arguments:
    page_title -- string containing the page title; spaces are replaced
                  with underscores before the request is sent
    lang       -- string containing the page's project language code
    timeout    -- seconds to wait for the API before requests gives up
                  (default 30); pass None to wait indefinitely

    Return the response body parsed as a Python literal. literal_eval
    raises (e.g. ValueError or SyntaxError) if the body is not a valid
    literal; requests raises its own exceptions on connection failure
    or timeout.
    """
    stripped_page_title = page_title.replace(' ', '_')
    api_parameters = {'action': 'suspected_diffs',
                      'page_title': stripped_page_title,
                      'lang': lang,
                      'report': 1}

    # always bound the wait: without a timeout an unresponsive endpoint
    # would block the caller forever
    result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters,
                          timeout=timeout)
    # use literal_eval to *safely* parse the resulting dict-containing string
    parsed_api_result = literal_eval(result.text)
    return parsed_api_result

def _parse_plagiabot_result(turnitin_api_result):
    result_data = []
    for item in turnitin_api_result:
        result_data.append(_parse_report(item['report']))
    return result_data

def _parse_report(report):
    # extract report ID
    report_id_pattern = re.compile(r'\?rid=(\d*)')
    report_id = report_id_pattern.search(report).groups()[0]

    # extract percent match, words, and URL for each source in the report
    extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
    results = extract_info_pattern.findall(report)

    return (report_id, results)

class TurnitinResult:
    """ Container for the TurnitinReports found for a page. A page may
    have no reports at all, or several of them when plagiarism has been
    detected for more than one revision.

    TurnitinResult.reports -- list containing >= 0 TurnitinReport items
    """
    def __init__(self, turnitin_data):
        """
        Keyword argument:
        turnitin_data -- list of tuples, one per report; each tuple is
                         handed unchanged to TurnitinReport.__init__.
        """
        self.reports = [TurnitinReport(entry) for entry in turnitin_data]

    def __repr__(self):
        # show the instance attributes as a plain dict string
        return str(vars(self))

class TurnitinReport:
    """ Data for a single Turnitin report (there is one report for each
    potentially plagiarized revision).

    TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
    TurnitinReport.sources -- list of dicts, one per source, with keys:
        percent -- percent of revision found in source as well
        words   -- number of words found in both source and revision
        url     -- url for the possibly-plagiarized source
    """
    def __init__(self, data):
        """
        Keyword argument:
        data -- tuple containing report data. All values are strings.
            data[0] -- turnitin report ID
            data[1] -- list of tuples, one for each source in the report
               data[<index>][0] -- percent of revision found in source
               data[<index>][1] -- number of words matching the source
               data[<index>][2] -- url for the matched source
        """
        self.reportid = data[0]
        # turn each positional source tuple into a keyed dict
        self.sources = [
            {'percent': entry[0], 'words': entry[1], 'url': entry[2]}
            for entry in data[1]
        ]

    def __repr__(self):
        # show the instance attributes as a plain dict string
        return str(vars(self))