Browse Source

Refactor turnitin.py, incorporate diff link/timestamp

* Add a wiki timestamp parser to copyvios/misc.py
* Refactor copyvios/turnitin.py for more sensible structure
* Update templates/index.mako to incorporate diff link/timestamp and
  make it clearer that Turnitin is revision-based checking
pull/24/head
Frances Hocutt 9 years ago
parent
commit
4e994f1302
3 changed files with 37 additions and 42 deletions
  1. +4
    -0
      copyvios/misc.py
  2. +32
    -38
      copyvios/turnitin.py
  3. +1
    -4
      templates/index.mako

+ 4
- 0
copyvios/misc.py View File

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-


import datetime
from os.path import expanduser from os.path import expanduser


from flask import g, request from flask import g, request
@@ -64,6 +65,9 @@ def httpsfix(context, url):
url = url[len("http:"):] url = url[len("http:"):]
return url return url


def parse_wiki_timestamp(timestamp):
    """Convert a compact wiki timestamp into a datetime.

    Keyword argument:
    timestamp -- 14-digit 'YYYYMMDDHHMMSS' timestamp, as a string or as
                 an integer holding the same digits

    Return a naive datetime.datetime (no tzinfo is attached).
    Raises ValueError if the text does not match the expected format.
    """
    # strptime() only accepts text; a timestamp that was parsed out of an
    # API response as a number would otherwise fail with a TypeError.
    if isinstance(timestamp, int):
        timestamp = str(timestamp)
    return datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S')

def urlstrip(context, url): def urlstrip(context, url):
if url.startswith("http://"): if url.startswith("http://"):
url = url[7:] url = url[7:]


+ 32
- 38
copyvios/turnitin.py View File

@@ -4,6 +4,8 @@ import re


import requests import requests


from .misc import parse_wiki_timestamp

__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']


TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py' TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
@@ -15,13 +17,9 @@ def search_turnitin(page_title, lang):
page_title -- string containing the page title page_title -- string containing the page title
lang -- string containing the page's project language code lang -- string containing the page's project language code


Return a TurnitinResult (containing a list of TurnitinReports, with
report ID and source data).
Return a TurnitinResult (contains a list of TurnitinReports).
""" """
turnitin_data = _parse_plagiabot_result(_make_api_request(
page_title, lang))
turnitin_result = TurnitinResult(turnitin_data)
return turnitin_result
return TurnitinResult(_make_api_request(page_title, lang))


def _make_api_request(page_title, lang): def _make_api_request(page_title, lang):
""" Query the plagiabot API for Turnitin reports for a given page. """ Query the plagiabot API for Turnitin reports for a given page.
@@ -37,23 +35,6 @@ def _make_api_request(page_title, lang):
parsed_api_result = literal_eval(result.text) parsed_api_result = literal_eval(result.text)
return parsed_api_result return parsed_api_result


def _parse_plagiabot_result(turnitin_api_result):
result_data = []
for item in turnitin_api_result:
result_data.append(_parse_report(item['report']))
return result_data

def _parse_report(report):
# extract report ID
report_id_pattern = re.compile(r'\?rid=(\d*)')
report_id = report_id_pattern.search(report).groups()[0]

# extract percent match, words, and URL for each source in the report
extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
results = extract_info_pattern.findall(report)

return (report_id, results)

class TurnitinResult: class TurnitinResult:
""" Container class for TurnitinReports. Each page may have zero or """ Container class for TurnitinReports. Each page may have zero or
more reports of plagiarism. The list will have multiple more reports of plagiarism. The list will have multiple
@@ -65,12 +46,12 @@ class TurnitinResult:
def __init__(self, turnitin_data): def __init__(self, turnitin_data):
""" """
Keyword argument: Keyword argument:
turnitin_data -- list of tuples with data on each report; see
TurnitinReport.__init__ for the contents.
turnitin_data -- plagiabot API result
""" """
self.reports = [] self.reports = []
for item in turnitin_data: for item in turnitin_data:
report = TurnitinReport(item)
report = TurnitinReport(
item['diff_timestamp'], item['diff'], item['report'])
self.reports.append(report) self.reports.append(report)


def __repr__(self): def __repr__(self):
@@ -80,27 +61,28 @@ class TurnitinReport:
""" Contains data for each Turnitin report (one on each potentially """ Contains data for each Turnitin report (one on each potentially
plagiarized revision). plagiarized revision).


TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
TurnitinReport.sources -- list of dicts with information on:
TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
TurnitinReport.diffid -- diff ID from Wikipedia database
TurnitinReport.time_posted -- datetime of the time the diff posted
TurnitinReport.sources -- list of dicts with information on:
percent -- percent of revision found in source as well percent -- percent of revision found in source as well
words -- number of words found in both source and revision words -- number of words found in both source and revision
url -- url for the possibly-plagiarized source url -- url for the possibly-plagiarized source
""" """
def __init__(self, data):
def __init__(self, timestamp, diffid, report):
""" """
Keyword argument: Keyword argument:
data -- tuple containing report data. All values are strings.
data[0] -- turnitin report ID
data[1] -- list of tuples with data on each source in the
report
data[<index>][0] -- percent of revision found in source
data[<index>][1] -- number of words matching the source
data[<index>][2] -- url for the matched source
timestamp -- diff timestamp from Wikipedia database
diffid -- diff ID from Wikipedia database
report -- Turnitin report from the plagiabot database
""" """
self.reportid = data[0]
self.report_data = self._parse_report(report)
self.reportid = self.report_data[0]
self.diffid = diffid
self.time_posted = parse_wiki_timestamp(timestamp)


self.sources = [] self.sources = []
for item in data[1]:
for item in self.report_data[1]:
source = {'percent': item[0], source = {'percent': item[0],
'words': item[1], 'words': item[1],
'url': item[2]} 'url': item[2]}
@@ -108,3 +90,15 @@ class TurnitinReport:


def __repr__(self): def __repr__(self):
return str(self.__dict__) return str(self.__dict__)

def _parse_report(self, report_text):
# extract report ID
report_id_pattern = re.compile(r'\?rid=(\d*)')
report_id = report_id_pattern.search(report_text).groups()[0]

# extract percent match, words, and URL for each source in the report
extract_info_pattern = re.compile(
r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
results = extract_info_pattern.findall(report_text)

return (report_id, results)

+ 1
- 4
templates/index.mako View File

@@ -172,10 +172,8 @@
<p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p> <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>


<table id="turnitin-table"><tbody> <table id="turnitin-table"><tbody>
## TODO: make this prettier/tabular
%for report in turnitin_result.reports: %for report in turnitin_result.reports:
<tr><td><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid} for text added in revision ${loop.index}</a>
## TODO: Rework this to something like: [Turnitin report](link) for [revision at timestamp](diff link). Requires API-result-parsing/TurnitinReport changes. Shouldn't be too bad. Reason: needs to make it clear that Turnitin is looking at individual revisions; current report does not.
<tr><td id="turnitin-table-cell"><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid}</a> for text added <a href="https://${query.lang}.wikipedia.org/w/index.php?title=${query.title}&diff=${report.diffid}"> at ${report.time_posted}</a>:
<ul> <ul>
% for source in report.sources: % for source in report.sources:
<li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url']}">${source['url']}</a></li> <li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url']}">${source['url']}</a></li>
@@ -183,7 +181,6 @@
</ul></td></tr> </ul></td></tr>
%endfor %endfor
</tbody></table> </tbody></table>

% else: % else:
<p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p> <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
% endif % endif


Loading…
Cancel
Save