Refactor turnitin.py, incorporate diff link/timestamp

* Add a wiki timestamp parser to copyvios/misc.py
* Refactor copyvios/turnitin.py for a more sensible structure
* Update templates/index.mako to incorporate the diff link/timestamp and make it clearer that Turnitin checking is revision-based
9 年前 · 4e994f1302
--- a/copyvios/misc.py
+++ b/copyvios/misc.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8  -*-

 import datetime
 from os.path import expanduser

 from flask import g, request
@@ -64,6 +65,9 @@ def httpsfix(context, url):
        url = url[len("http:"):]
    return url

 def parse_wiki_timestamp(timestamp):
    """Convert a wiki timestamp string into a datetime object.

    timestamp -- string laid out as YYYYMMDDHHMMSS

    Return a naive datetime.datetime. strptime raises ValueError when the
    string does not fit the format.
    """
    wiki_format = '%Y%m%d%H%M%S'
    return datetime.datetime.strptime(timestamp, wiki_format)

 def urlstrip(context, url):
    if url.startswith("http://"):
        url = url[7:]
--- a/copyvios/turnitin.py
+++ b/copyvios/turnitin.py
@@ -4,6 +4,8 @@ import re

 import requests

 from .misc import parse_wiki_timestamp

 __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']

 TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
@@ -15,13 +17,9 @@ def search_turnitin(page_title, lang):
    page_title -- string containing the page title
    lang       -- string containing the page's project language code

    Return a TurnitinResult (containing a list of TurnitinReports, with
    report ID and source data).
    Return a TurnitinResult (contains a list of TurnitinReports).
    """
    turnitin_data = _parse_plagiabot_result(_make_api_request(
        page_title, lang))
    turnitin_result = TurnitinResult(turnitin_data)
    return turnitin_result
    return TurnitinResult(_make_api_request(page_title, lang))

 def _make_api_request(page_title, lang):
    """ Query the plagiabot API for Turnitin reports for a given page.
@@ -37,23 +35,6 @@ def _make_api_request(page_title, lang):
    parsed_api_result = literal_eval(result.text)
    return parsed_api_result

 def _parse_plagiabot_result(turnitin_api_result):
    result_data = []
    for item in turnitin_api_result:
        result_data.append(_parse_report(item['report']))
    return result_data

 def _parse_report(report):
    # extract report ID
    report_id_pattern = re.compile(r'\?rid=(\d*)')
    report_id = report_id_pattern.search(report).groups()[0]

    # extract percent match, words, and URL for each source in the report
    extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
    results = extract_info_pattern.findall(report)

    return (report_id, results)

 class TurnitinResult:
    """ Container class for TurnitinReports. Each page may have zero or
    more reports of plagiarism. The list will have multiple
@@ -65,12 +46,12 @@ class TurnitinResult:
    def __init__(self, turnitin_data):
        """
        Keyword argument:
        turnitin_data -- list of tuples with data on each report; see
                         TurnitinReport.__init__ for the contents.
        turnitin_data -- plagiabot API result
        """
        self.reports = []
        for item in turnitin_data:
            report = TurnitinReport(item)
            report = TurnitinReport(
                item['diff_timestamp'], item['diff'], item['report'])
            self.reports.append(report)

    def __repr__(self):
@@ -80,27 +61,28 @@ class TurnitinReport:
    """ Contains data for each Turnitin report (one on each potentially
    plagiarized revision).

    TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
    TurnitinReport.sources -- list of dicts with information on:
    TurnitinReport.reportid  -- Turnitin report ID, taken from plagiabot
    TurnitinReport.diffid    -- diff ID from Wikipedia database
    TurnitinReport.time_posted -- datetime of the time the diff posted
    TurnitinReport.sources   -- list of dicts with information on:
        percent -- percent of revision found in source as well
        words   -- number of words found in both source and revision
        url     -- url for the possibly-plagiarized source
    """
    def __init__(self, data):
    def __init__(self, timestamp, diffid, report):
        """
        Keyword argument:
        data -- tuple containing report data. All values are strings.
            data[0] -- turnitin report ID
            data[1] -- list of tuples with data on each source in the
                       report
               data[<index>][0] -- percent of revision found in source
               data[<index>][1] -- number of words matching the source
               data[<index>][2] -- url for the matched source
        timestamp  -- diff timestamp from Wikipedia database
        diffid     -- diff ID from Wikipedia database
        report     -- Turnitin report from the plagiabot database
        """
        self.reportid = data[0]
        self.report_data = self._parse_report(report)
        self.reportid = self.report_data[0]
        self.diffid = diffid
        self.time_posted = parse_wiki_timestamp(timestamp)

        self.sources = []
        for item in data[1]:
        for item in self.report_data[1]:
            source = {'percent': item[0],
                      'words': item[1],
                      'url': item[2]}
@@ -108,3 +90,15 @@ class TurnitinReport:

    def __repr__(self):
        return str(self.__dict__)

    def _parse_report(self, report_text):
        # extract report ID
        report_id_pattern = re.compile(r'\?rid=(\d*)')
        report_id = report_id_pattern.search(report_text).groups()[0]

        # extract percent match, words, and URL for each source in the report
        extract_info_pattern = re.compile(
            r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
        results = extract_info_pattern.findall(report_text)

        return (report_id, results)
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -172,10 +172,8 @@
                <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>

                <table id="turnitin-table"><tbody>
                ## TODO: make this prettier/tabular
                %for report in turnitin_result.reports:
                    <tr><td><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid} for text added in revision ${loop.index}</a>
 ## TODO: Rework this to something like: [Turnitin report](link) for [revision at timestamp](diff link). Requires API-result-parsing/TurnitinReport changes. Shouldn't be too bad. Reason: needs to make it clear that Turnitin is looking at individual revisions; current report does not.
                    <tr><td id="turnitin-table-cell"><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid}</a> for text added <a href="https://${query.lang}.wikipedia.org/w/index.php?title=${query.title}&diff=${report.diffid}"> at ${report.time_posted}</a>:
                    <ul>
                    % for source in report.sources:
                          <li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url']}">${source['url']}</a></li>
@@ -183,7 +181,6 @@
                    </ul></td></tr>
                %endfor
                </tbody></table>

            % else:
                <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
            % endif