Add integration with turnitin/plagiabot/EranBot (copyvios-ng)
@@ -103,7 +103,8 @@ def index(): | |||
update_sites() | |||
query = do_check() | |||
return render_template( | |||
"index.mako", notice=notice, query=query, result=query.result) | |||
"index.mako", notice=notice, query=query, result=query.result, | |||
turnitin_result=query.turnitin_result) | |||
@app.route("/settings", methods=["GET", "POST"]) | |||
@catch_errors | |||
@@ -11,6 +11,7 @@ from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult | |||
from .misc import Query, get_db | |||
from .sites import get_site | |||
from .turnitin import search_turnitin | |||
__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"] | |||
@@ -63,9 +64,16 @@ def _get_results(query, follow=True): | |||
conn = get_db() | |||
use_engine = 0 if query.use_engine in ("0", "false") else 1 | |||
use_links = 0 if query.use_links in ("0", "false") else 1 | |||
use_turnitin = 0 if query.turnitin in ("0", "false") else 1 | |||
if not use_engine and not use_links: | |||
query.error = "no search method" | |||
return | |||
# Handle the turnitin check | |||
if use_turnitin: | |||
query.turnitin_result = search_turnitin(query.title, query.lang) | |||
# Handle the copyvio check | |||
mode = "{0}:{1}:".format(use_engine, use_links) | |||
if not _coerce_bool(query.nocache): | |||
query.result = _get_cached_results( | |||
@@ -1,5 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
import datetime | |||
from os.path import expanduser | |||
from flask import g, request | |||
@@ -64,6 +65,9 @@ def httpsfix(context, url): | |||
url = url[len("http:"):] | |||
return url | |||
def parse_wiki_timestamp(timestamp):
    """Convert a wiki database timestamp string into a datetime.

    Keyword arguments:
    timestamp -- string of 14 digits in the form YYYYMMDDHHMMSS

    Return a naive datetime.datetime. A string that does not match the
    expected format makes strptime raise ValueError.
    """
    return datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S')
def urlstrip(context, url): | |||
if url.startswith("http://"): | |||
url = url[7:] | |||
@@ -0,0 +1,104 @@ | |||
# -*- coding: utf-8 -*- | |||
from ast import literal_eval | |||
import re | |||
import requests | |||
from .misc import parse_wiki_timestamp | |||
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] | |||
# Plagiabot API queried for Turnitin reports. HTTPS is used so the lookup is
# not sent in clear text; the results template already links to this same
# host over HTTPS.
TURNITIN_API_ENDPOINT = 'https://tools.wmflabs.org/eranbot/plagiabot/api.py'
def search_turnitin(page_title, lang):
    """ Look up Turnitin reports for a page in the Plagiabot database.

    Keyword arguments:
    page_title -- string containing the page title
    lang       -- string containing the page's project language code

    Return a TurnitinResult (contains a list of TurnitinReports).
    """
    api_data = _make_api_request(page_title, lang)
    return TurnitinResult(api_data)
def _make_api_request(page_title, lang, timeout=10):
    """ Query the plagiabot API for Turnitin reports for a given page.

    Keyword arguments:
    page_title -- string containing the page title; spaces are replaced
                  with underscores before the request is sent
    lang       -- string containing the page's project language code
    timeout    -- seconds to wait for the API before giving up, so that a
                  stalled API cannot hang the caller indefinitely

    Return the parsed API result. If the response body is not a valid
    Python literal (e.g. an error page), return an empty list so callers
    see "no reports" instead of crashing. Network failures, including
    timeouts, still propagate as requests exceptions.
    """
    api_parameters = {'action': 'suspected_diffs',
                      'page_title': page_title.replace(' ', '_'),
                      'lang': lang,
                      'report': 1}

    response = requests.get(
        TURNITIN_API_ENDPOINT, params=api_parameters, timeout=timeout)

    # literal_eval only evaluates Python literals, never arbitrary code, so
    # it is safe to run on the remote response text.
    try:
        return literal_eval(response.text)
    except (SyntaxError, ValueError, TypeError):
        # Malformed or non-literal body: degrade to an empty result.
        return []
class TurnitinResult(object):
    """ Container class for TurnitinReports. Each page may have zero or
    more reports of plagiarism. The list will have multiple
    TurnitinReports if plagiarism has been detected for more than one
    revision.

    TurnitinResult.reports -- list containing >= 0 TurnitinReport items
    """

    def __init__(self, turnitin_data):
        """
        Keyword argument:
        turnitin_data -- plagiabot API result: an iterable of dicts, each
                         with 'diff_timestamp', 'diff' and 'report' keys.
                         None or an empty result yields no reports.
        """
        # `or ()` lets a missing/None API result mean "no reports" rather
        # than raising TypeError when iterated.
        self.reports = [
            TurnitinReport(
                item['diff_timestamp'], item['diff'], item['report'])
            for item in turnitin_data or ()]

    def __repr__(self):
        return str(self.__dict__)
class TurnitinReport(object):
    """ Contains data for each Turnitin report (one on each potentially
    plagiarized revision).

    TurnitinReport.reportid    -- Turnitin report ID, taken from plagiabot
                                  (None if the report text contains no ID)
    TurnitinReport.diffid      -- diff ID from Wikipedia database
    TurnitinReport.time_posted -- datetime of the time the diff posted
    TurnitinReport.sources     -- list of dicts with information on:
        percent -- percent of revision found in source as well
        words   -- number of words found in both source and revision
        url     -- url for the possibly-plagiarized source
      (percent and words are kept as the strings captured from the report)
    """

    # Compiled once at class-definition time rather than for every report.
    _REPORT_ID_PATTERN = re.compile(r'\?rid=(\d*)')
    _SOURCE_INFO_PATTERN = re.compile(
        r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')

    def __init__(self, timestamp, diffid, report):
        """
        Keyword argument:
        timestamp -- diff timestamp from Wikipedia database
        diffid    -- diff ID from Wikipedia database
        report    -- Turnitin report from the plagiabot database
        """
        self.report_data = self._parse_report(report)
        self.reportid = self.report_data[0]
        self.diffid = diffid
        self.time_posted = parse_wiki_timestamp(timestamp)
        self.sources = [
            {'percent': percent, 'words': words, 'url': url}
            for percent, words, url in self.report_data[1]]

    def __repr__(self):
        return str(self.__dict__)

    def _parse_report(self, report_text):
        """ Extract the report ID and per-source match data from the text.

        Return a tuple (report_id, results): report_id is the digit string
        following '?rid=' (or None when the text has no such marker), and
        results is a list of (percent, words, url) string tuples, one per
        source line found in the report.
        """
        # A report without a '?rid=' marker must not crash the whole check.
        id_match = self._REPORT_ID_PATTERN.search(report_text)
        report_id = id_match.group(1) if id_match else None

        # extract percent match, words, and URL for each source in the report
        results = self._SOURCE_INFO_PATTERN.findall(report_text)

        return (report_id, results)
@@ -63,6 +63,17 @@ div#info-box { | |||
margin: 10px 5px; | |||
} | |||
/* Box that wraps the Turnitin results section. */
div#turnitin-container {
    margin: 15px 5px 10px;
    padding: 5px 10px;
}
/* Centered, bold title line of the Turnitin results box. */
div#turnitin-title {
    font-weight: bold;
    text-align: center;
    margin-bottom: -5px;
}
div#cv-result { | |||
padding: 5px; | |||
margin: 10px 5px; | |||
@@ -113,6 +113,9 @@ | |||
<input class="cv-search" type="hidden" name="use_links" value="0" /> | |||
<input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} /> | |||
<label for="cv-cb-links">Use links in page</label> | |||
## Hidden fallback so that an unchecked box still submits turnitin=0
## (same pattern as the use_links checkbox).
<input class="cv-search" type="hidden" name="turnitin" value="0" />
<span style="white-space:nowrap"><input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/>
<label for="cv-cb-turnitin">Use Turnitin database</label></span>
</td> | |||
</tr> | |||
<tr> | |||
@@ -146,6 +149,7 @@ | |||
</tr> | |||
</table> | |||
</form> | |||
% if result: | |||
<div id="generation-time"> | |||
Results | |||
@@ -160,6 +164,29 @@ | |||
% endif | |||
<a href="${request.script_root | h}?lang=${query.lang | h}&project=${query.project | h}&oldid=${query.oldid or query.page.lastrevid | h}&action=${query.action | h}&${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a> | |||
</div> | |||
## Show the Turnitin box only when a Turnitin lookup actually produced a
## result object; the raw query flag is a string and "0" would be truthy.
% if turnitin_result:
    <div id="turnitin-container" class="${'red' if turnitin_result.reports else 'green'}-box">
        <div id="turnitin-title">Turnitin Results</div>
        % if turnitin_result.reports:
            <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>
            <table id="turnitin-table"><tbody>
                % for report in turnitin_result.reports:
                    <tr><td id="turnitin-table-cell"><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid | h}">Turnitin report ${report.reportid | h}</a> for text added <a href="https://${query.lang | h}.wikipedia.org/w/index.php?title=${query.title | u}&diff=${report.diffid | h}"> at ${report.time_posted | h}</a>:
                        <ul>
                            ## Source data comes from an external report, so escape it.
                            % for source in report.sources:
                                <li> ${source['percent'] | h}% of revision text (${source['words'] | h} words) found at <a href="${source['url'] | h}">${source['url'] | h}</a></li>
                            % endfor
                        </ul></td></tr>
                % endfor
            </tbody></table>
        % else:
            <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
        % endif
    </div>
% endif
<div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box"> | |||
<table id="cv-result-head-table"> | |||
<colgroup> | |||