From 4e994f13022e7a0a38d93403a246468964f0ce54 Mon Sep 17 00:00:00 2001
From: Frances Hocutt
Date: Tue, 22 Dec 2015 15:56:27 -0800
Subject: [PATCH] Refactor turnitin.py, incorporate diff link/timestamp

* Add a wiki timestamp parser to copyvios/misc.py
* Refactor copyvios/turnitin.py for more sensible structure
* Update templates/index.mako to incorporate diff link/timestamp and make
  it clearer that Turnitin is revision-based checking
---
 copyvios/misc.py     |  4 +++
 copyvios/turnitin.py | 70 ++++++++++++++++++++++++----------------------
 templates/index.mako |  5 +---
 3 files changed, 37 insertions(+), 42 deletions(-)

diff --git a/copyvios/misc.py b/copyvios/misc.py
index 045386a..9c4d824 100644
--- a/copyvios/misc.py
+++ b/copyvios/misc.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+import datetime
 from os.path import expanduser
 
 from flask import g, request
@@ -64,6 +65,9 @@ def httpsfix(context, url):
         url = url[len("http:"):]
     return url
 
+def parse_wiki_timestamp(timestamp):
+    return datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S')
+
 def urlstrip(context, url):
     if url.startswith("http://"):
         url = url[7:]
diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py
index 801b312..07a3b8a 100644
--- a/copyvios/turnitin.py
+++ b/copyvios/turnitin.py
@@ -4,6 +4,8 @@ import re
 
 import requests
 
+from .misc import parse_wiki_timestamp
+
 __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
 
 TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
@@ -15,13 +17,9 @@ def search_turnitin(page_title, lang):
     page_title -- string containing the page title
     lang -- string containing the page's project language code
 
-    Return a TurnitinResult (containing a list of TurnitinReports, with
-    report ID and source data).
+    Return a TurnitinResult (contains a list of TurnitinReports).
     """
-    turnitin_data = _parse_plagiabot_result(_make_api_request(
-        page_title, lang))
-    turnitin_result = TurnitinResult(turnitin_data)
-    return turnitin_result
+    return TurnitinResult(_make_api_request(page_title, lang))
 
 def _make_api_request(page_title, lang):
     """ Query the plagiabot API for Turnitin reports for a given page.
@@ -37,23 +35,6 @@ def _make_api_request(page_title, lang):
     parsed_api_result = literal_eval(result.text)
     return parsed_api_result
 
-def _parse_plagiabot_result(turnitin_api_result):
-    result_data = []
-    for item in turnitin_api_result:
-        result_data.append(_parse_report(item['report']))
-    return result_data
-
-def _parse_report(report):
-    # extract report ID
-    report_id_pattern = re.compile(r'\?rid=(\d*)')
-    report_id = report_id_pattern.search(report).groups()[0]
-
-    # extract percent match, words, and URL for each source in the report
-    extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
-    results = extract_info_pattern.findall(report)
-
-    return (report_id, results)
-
 class TurnitinResult:
     """ Container class for TurnitinReports. Each page may have
     zero or more reports of plagiarism. The list will have multiple
@@ -65,12 +46,12 @@ class TurnitinResult:
     def __init__(self, turnitin_data):
         """ Keyword argument:
-        turnitin_data -- list of tuples with data on each report; see
-        TurnitinReport.__init__ for the contents.
+        turnitin_data -- plagiabot API result
         """
         self.reports = []
         for item in turnitin_data:
-            report = TurnitinReport(item)
+            report = TurnitinReport(
+                item['diff_timestamp'], item['diff'], item['report'])
             self.reports.append(report)
 
     def __repr__(self):
         return str(self.__dict__)
@@ -80,27 +61,28 @@ class TurnitinReport:
     """ Contains data for each Turnitin report (one on each
     potentially plagiarized revision).
 
-    TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
-    TurnitinReport.sources -- list of dicts with information on:
+    TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
+    TurnitinReport.diffid -- diff ID from Wikipedia database
+    TurnitinReport.time_posted -- datetime of the time the diff posted
+    TurnitinReport.sources -- list of dicts with information on:
         percent -- percent of revision found in source as well
         words -- number of words found in both source and revision
        url -- url for the possibly-plagiarized source
     """
-    def __init__(self, data):
+    def __init__(self, timestamp, diffid, report):
         """ Keyword argument:
-        data -- tuple containing report data. All values are strings.
-        data[0] -- turnitin report ID
-        data[1] -- list of tuples with data on each source in the
-        report
-        data[][0] -- percent of revision found in source
-        data[][1] -- number of words matching the source
-        data[][2] -- url for the matched source
+        timestamp -- diff timestamp from Wikipedia database
+        diffid -- diff ID from Wikipedia database
+        report -- Turnitin report from the plagiabot database
         """
-        self.reportid = data[0]
+        self.report_data = self._parse_report(report)
+        self.reportid = self.report_data[0]
+        self.diffid = diffid
+        self.time_posted = parse_wiki_timestamp(timestamp)
         self.sources = []
-        for item in data[1]:
+        for item in self.report_data[1]:
             source = {'percent': item[0],
                       'words': item[1],
                       'url': item[2]}
@@ -108,3 +90,15 @@
 
     def __repr__(self):
         return str(self.__dict__)
+
+    def _parse_report(self, report_text):
+        # extract report ID
+        report_id_pattern = re.compile(r'\?rid=(\d*)')
+        report_id = report_id_pattern.search(report_text).groups()[0]
+
+        # extract percent match, words, and URL for each source in the report
+        extract_info_pattern = re.compile(
+            r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
+        results = extract_info_pattern.findall(report_text)
+
+        return (report_id, results)
diff --git a/templates/index.mako b/templates/index.mako
index 1e4340e..5522517 100644
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -172,10 +172,8 @@

             Turnitin (through EranBot) found revisions that may have been plagiarized. Please review them.
 
-            ## TODO: make this prettier/tabular
             %for report in turnitin_result.reports:
-                Turnitin report ${report.reportid} for text added in revision ${loop.index}
-## TODO: Rework this to something like: [Turnitin report](link) for [revision at timestamp](diff link). Requires API-result-parsing/TurnitinReport changes. Shouldn't be too bad. Reason: needs to make it clear that Turnitin is looking at individual revisions; current report does not.
+                Turnitin report ${report.reportid} for text added at ${report.time_posted}:
                 % for source in report.sources:
                     ${source['percent']}% of revision text (${source['words']} words) found at ${source['url']}
                 % endfor
@@ -183,7 +181,6 @@
             %endfor
-
         % else:
 
             Turnitin (through EranBot) found no matching sources.
 
         % endif
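
Usage sketch (illustration only, not part of the patch): the refactored turnitin.py above is expected to be driven roughly as follows. The page title and language code are placeholders, and the snippet assumes the plagiabot API returns dicts with 'diff', 'diff_timestamp', and 'report' keys, which is what the new TurnitinResult constructor expects.

    from copyvios.turnitin import search_turnitin

    result = search_turnitin("Example article", "en")  # placeholder title/lang
    for report in result.reports:
        # Each TurnitinReport now exposes the diff ID and a parsed datetime
        # alongside the Turnitin report ID extracted by _parse_report().
        line = "report %s covers diff %s made at %s" % (
            report.reportid, report.diffid, report.time_posted)
        for source in report.sources:
            # Each source is a dict with 'percent', 'words', and 'url' keys.
            line += "\n  %s%% (%s words) at %s" % (
                source['percent'], source['words'], source['url'])
        print(line)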