Ver código fonte

Refactor turnitin.py, incorporate diff link/timestamp

* Add a wiki timestamp parser to copyvios/misc.py
* Refactor copyvios/turnitin.py for more sensible structure
* Update templates/index.mako to incorporate diff link/timestamp and
  make it clearer that Turnitin is revision-based checking
pull/24/head
Frances Hocutt 9 anos atrás
pai
commit
4e994f1302
3 arquivos alterados com 37 adições e 42 exclusões
  1. +4
    -0
      copyvios/misc.py
  2. +32
    -38
      copyvios/turnitin.py
  3. +1
    -4
      templates/index.mako

+ 4
- 0
copyvios/misc.py Ver arquivo

@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-

import datetime
from os.path import expanduser

from flask import g, request
@@ -64,6 +65,9 @@ def httpsfix(context, url):
url = url[len("http:"):]
return url

def parse_wiki_timestamp(timestamp):
return datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S')

def urlstrip(context, url):
if url.startswith("http://"):
url = url[7:]


+ 32
- 38
copyvios/turnitin.py Ver arquivo

@@ -4,6 +4,8 @@ import re

import requests

from .misc import parse_wiki_timestamp

__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']

TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
@@ -15,13 +17,9 @@ def search_turnitin(page_title, lang):
page_title -- string containing the page title
lang -- string containing the page's project language code

Return a TurnitinResult (containing a list of TurnitinReports, with
report ID and source data).
Return a TurnitinResult (contains a list of TurnitinReports).
"""
turnitin_data = _parse_plagiabot_result(_make_api_request(
page_title, lang))
turnitin_result = TurnitinResult(turnitin_data)
return turnitin_result
return TurnitinResult(_make_api_request(page_title, lang))

def _make_api_request(page_title, lang):
""" Query the plagiabot API for Turnitin reports for a given page.
@@ -37,23 +35,6 @@ def _make_api_request(page_title, lang):
parsed_api_result = literal_eval(result.text)
return parsed_api_result

def _parse_plagiabot_result(turnitin_api_result):
result_data = []
for item in turnitin_api_result:
result_data.append(_parse_report(item['report']))
return result_data

def _parse_report(report):
# extract report ID
report_id_pattern = re.compile(r'\?rid=(\d*)')
report_id = report_id_pattern.search(report).groups()[0]

# extract percent match, words, and URL for each source in the report
extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
results = extract_info_pattern.findall(report)

return (report_id, results)

class TurnitinResult:
""" Container class for TurnitinReports. Each page may have zero or
more reports of plagiarism. The list will have multiple
@@ -65,12 +46,12 @@ class TurnitinResult:
def __init__(self, turnitin_data):
"""
Keyword argument:
turnitin_data -- list of tuples with data on each report; see
TurnitinReport.__init__ for the contents.
turnitin_data -- plagiabot API result
"""
self.reports = []
for item in turnitin_data:
report = TurnitinReport(item)
report = TurnitinReport(
item['diff_timestamp'], item['diff'], item['report'])
self.reports.append(report)

def __repr__(self):
@@ -80,27 +61,28 @@ class TurnitinReport:
""" Contains data for each Turnitin report (one on each potentially
plagiarized revision).

TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
TurnitinReport.sources -- list of dicts with information on:
TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
TurnitinReport.diffid -- diff ID from Wikipedia database
TurnitinReport.time_posted -- datetime of the time the diff posted
TurnitinReport.sources -- list of dicts with information on:
percent -- percent of revision found in source as well
words -- number of words found in both source and revision
url -- url for the possibly-plagiarized source
"""
def __init__(self, data):
def __init__(self, timestamp, diffid, report):
"""
Keyword argument:
data -- tuple containing report data. All values are strings.
data[0] -- turnitin report ID
data[1] -- list of tuples with data on each source in the
report
data[<index>][0] -- percent of revision found in source
data[<index>][1] -- number of words matching the source
data[<index>][2] -- url for the matched source
timestamp -- diff timestamp from Wikipedia database
diffid -- diff ID from Wikipedia database
report -- Turnitin report from the plagiabot database
"""
self.reportid = data[0]
self.report_data = self._parse_report(report)
self.reportid = self.report_data[0]
self.diffid = diffid
self.time_posted = parse_wiki_timestamp(timestamp)

self.sources = []
for item in data[1]:
for item in self.report_data[1]:
source = {'percent': item[0],
'words': item[1],
'url': item[2]}
@@ -108,3 +90,15 @@ class TurnitinReport:

def __repr__(self):
return str(self.__dict__)

def _parse_report(self, report_text):
# extract report ID
report_id_pattern = re.compile(r'\?rid=(\d*)')
report_id = report_id_pattern.search(report_text).groups()[0]

# extract percent match, words, and URL for each source in the report
extract_info_pattern = re.compile(
r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
results = extract_info_pattern.findall(report_text)

return (report_id, results)

+ 1
- 4
templates/index.mako Ver arquivo

@@ -172,10 +172,8 @@
<p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>

<table id="turnitin-table"><tbody>
## TODO: make this prettier/tabular
%for report in turnitin_result.reports:
<tr><td><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid} for text added in revision ${loop.index}</a>
## TODO: Rework this to something like: [Turnitin report](link) for [revision at timestamp](diff link). Requires API-result-parsing/TurnitinReport changes. Shouldn't be too bad. Reason: needs to make it clear that Turnitin is looking at individual revisions; current report does not.
<tr><td id="turnitin-table-cell"><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid}</a> for text added <a href="https://${query.lang}.wikipedia.org/w/index.php?title=${query.title}&diff=${report.diffid}"> at ${report.time_posted}</a>:
<ul>
% for source in report.sources:
<li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url']}">${source['url']}</a></li>
@@ -183,7 +181,6 @@
</ul></td></tr>
%endfor
</tbody></table>

% else:
<p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
% endif


Carregando…
Cancelar
Salvar