Add integration with Turnitin/plagiabot/EranBot (copyvios-ng)
@@ -103,7 +103,8 @@ def index(): | |||||
update_sites() | update_sites() | ||||
query = do_check() | query = do_check() | ||||
return render_template( | return render_template( | ||||
"index.mako", notice=notice, query=query, result=query.result) | |||||
"index.mako", notice=notice, query=query, result=query.result, | |||||
turnitin_result=query.turnitin_result) | |||||
@app.route("/settings", methods=["GET", "POST"]) | @app.route("/settings", methods=["GET", "POST"]) | ||||
@catch_errors | @catch_errors | ||||
@@ -11,6 +11,7 @@ from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult | |||||
from .misc import Query, get_db | from .misc import Query, get_db | ||||
from .sites import get_site | from .sites import get_site | ||||
from .turnitin import search_turnitin | |||||
__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"] | __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"] | ||||
@@ -63,9 +64,16 @@ def _get_results(query, follow=True): | |||||
conn = get_db() | conn = get_db() | ||||
use_engine = 0 if query.use_engine in ("0", "false") else 1 | use_engine = 0 if query.use_engine in ("0", "false") else 1 | ||||
use_links = 0 if query.use_links in ("0", "false") else 1 | use_links = 0 if query.use_links in ("0", "false") else 1 | ||||
use_turnitin = 0 if query.turnitin in ("0", "false") else 1 | |||||
if not use_engine and not use_links: | if not use_engine and not use_links: | ||||
query.error = "no search method" | query.error = "no search method" | ||||
return | return | ||||
# Handle the turnitin check | |||||
if use_turnitin: | |||||
query.turnitin_result = search_turnitin(query.title, query.lang) | |||||
# Handle the copyvio check | |||||
mode = "{0}:{1}:".format(use_engine, use_links) | mode = "{0}:{1}:".format(use_engine, use_links) | ||||
if not _coerce_bool(query.nocache): | if not _coerce_bool(query.nocache): | ||||
query.result = _get_cached_results( | query.result = _get_cached_results( | ||||
@@ -1,5 +1,6 @@ | |||||
# -*- coding: utf-8 -*- | # -*- coding: utf-8 -*- | ||||
import datetime | |||||
from os.path import expanduser | from os.path import expanduser | ||||
from flask import g, request | from flask import g, request | ||||
@@ -64,6 +65,9 @@ def httpsfix(context, url): | |||||
url = url[len("http:"):] | url = url[len("http:"):] | ||||
return url | return url | ||||
def parse_wiki_timestamp(timestamp):
    """Turn a wiki timestamp string into a naive datetime object.

    timestamp -- string laid out as year, month, day, hour, minute and
                 second with no separators (strptime format
                 '%Y%m%d%H%M%S'), e.g. '20150102030405'

    strptime raises ValueError if the string does not fit that format.
    """
    wiki_format = '%Y%m%d%H%M%S'
    parsed = datetime.datetime.strptime(timestamp, wiki_format)
    return parsed
def urlstrip(context, url): | def urlstrip(context, url): | ||||
if url.startswith("http://"): | if url.startswith("http://"): | ||||
url = url[7:] | url = url[7:] | ||||
@@ -0,0 +1,104 @@ | |||||
# -*- coding: utf-8 -*- | |||||
from ast import literal_eval | |||||
import re | |||||
import requests | |||||
from .misc import parse_wiki_timestamp | |||||
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] | |||||
TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py' | |||||
def search_turnitin(page_title, lang):
    """ Look up the Turnitin reports that Plagiabot holds for one page.

    Keyword arguments:
    page_title -- string containing the title of the page to look up
    lang -- string containing the language code of the page's project

    Return a TurnitinResult, which wraps a (possibly empty) list of
    TurnitinReports built from the plagiabot API response.
    """
    api_response = _make_api_request(page_title, lang)
    return TurnitinResult(api_response)
def _make_api_request(page_title, lang, timeout=30):
    """ Query the plagiabot API for Turnitin reports for a given page.

    Keyword arguments:
    page_title -- string containing the page title; spaces are replaced
                  with underscores before it is sent
    lang -- string containing the page's project language code
    timeout -- seconds to wait on the plagiabot server before requests
               raises a Timeout. Without a timeout, requests waits
               forever, so a stalled endpoint would hang the whole page.

    Return the Python object parsed from the response body.
    TurnitinResult iterates over it and reads the 'diff_timestamp',
    'diff' and 'report' keys of each item.

    Raises whatever requests raises on connection failure or timeout,
    and ValueError/SyntaxError from literal_eval if the body is not a
    valid Python literal.
    """
    api_parameters = {
        'action': 'suspected_diffs',
        'page_title': page_title.replace(' ', '_'),
        'lang': lang,
        'report': 1,
    }
    response = requests.get(
        TURNITIN_API_ENDPOINT, params=api_parameters, timeout=timeout)

    # The body is parsed as a Python literal, not as JSON. literal_eval
    # only accepts literal syntax (strings, numbers, lists, dicts, ...),
    # so it cannot execute code from the response.
    # NOTE(review): the HTTP status is not checked; an error page will
    # surface as a SyntaxError/ValueError from literal_eval.
    return literal_eval(response.text)
class TurnitinResult:
    """ Holds every TurnitinReport found for a single page.

    A page can have no reports at all, or several when plagiarism was
    flagged on more than one revision.

    TurnitinResult.reports -- list of zero or more TurnitinReport items
    """

    def __init__(self, turnitin_data):
        """
        Keyword argument:
        turnitin_data -- plagiabot API result: an iterable of mappings,
                         each providing the 'diff_timestamp', 'diff'
                         and 'report' keys
        """
        self.reports = [
            TurnitinReport(
                entry['diff_timestamp'], entry['diff'], entry['report'])
            for entry in turnitin_data]

    def __repr__(self):
        # Dump the instance attributes; handy when debugging.
        return str(vars(self))
class TurnitinReport:
    """ One Turnitin report, i.e. the findings for a single revision
    that may contain plagiarized text.

    TurnitinReport.report_data -- raw (report ID, source matches) tuple
                                  returned by _parse_report
    TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
    TurnitinReport.diffid -- diff ID from Wikipedia database
    TurnitinReport.time_posted -- datetime of the time the diff posted
    TurnitinReport.sources -- list of dicts with information on:
        percent -- percent of revision found in source as well
        words -- number of words found in both source and revision
        url -- url for the possibly-plagiarized source
    The percent, words and url values are the strings captured from the
    report text; they are not converted to numbers.
    """

    def __init__(self, timestamp, diffid, report):
        """
        Keyword argument:
        timestamp -- diff timestamp from Wikipedia database
        diffid -- diff ID from Wikipedia database
        report -- Turnitin report text from the plagiabot database
        """
        parsed = self._parse_report(report)
        self.report_data = parsed
        self.reportid = parsed[0]
        self.diffid = diffid
        self.time_posted = parse_wiki_timestamp(timestamp)
        self.sources = [
            {'percent': match[0], 'words': match[1], 'url': match[2]}
            for match in parsed[1]]

    def __repr__(self):
        # Dump the instance attributes; handy when debugging.
        return str(vars(self))

    def _parse_report(self, report_text):
        """ Pull the report ID and the per-source matches out of the
        report text.

        Return a (report_id, matches) tuple, where matches is a list of
        (percent, words, url) string tuples, one per source line found.
        """
        # The report ID is the run of digits following '?rid='.
        id_match = re.search(r'\?rid=(\d*)', report_text)
        report_id = id_match.group(1)

        # Each source is a bullet line of the form
        # '* <char> <percent>% <count> words at [<url> ...'.
        source_matches = re.findall(
            r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ', report_text)
        return (report_id, source_matches)
@@ -63,6 +63,17 @@ div#info-box { | |||||
margin: 10px 5px; | margin: 10px 5px; | ||||
} | } | ||||
/* Box that wraps the Turnitin results panel. */
div#turnitin-container {
    margin: 15px 5px 10px;
    padding: 5px 10px;
}
/* Centered, bold heading at the top of the Turnitin results panel. */
div#turnitin-title {
    font-weight: bold;
    text-align: center;
    margin-bottom: -5px;
}
div#cv-result { | div#cv-result { | ||||
padding: 5px; | padding: 5px; | ||||
margin: 10px 5px; | margin: 10px 5px; | ||||
@@ -113,6 +113,9 @@ | |||||
<input class="cv-search" type="hidden" name="use_links" value="0" /> | <input class="cv-search" type="hidden" name="use_links" value="0" /> | ||||
<input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} /> | <input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} /> | ||||
<label for="cv-cb-links">Use links in page</label> | <label for="cv-cb-links">Use links in page</label> | ||||
## Hidden fallback so that an unchecked box still submits turnitin=0,
## the same hidden/checkbox pairing used for use_links above. It must be
## named "turnitin": naming it "use_links" posts a stray use_links=0
## after the real use_links checkbox and never posts turnitin=0.
<input class="cv-search" type="hidden" name="turnitin" value="0" />
<span style="white-space:nowrap"><input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/>
<label for="cv-cb-turnitin">Use Turnitin database</label></span>
</td> | </td> | ||||
</tr> | </tr> | ||||
<tr> | <tr> | ||||
@@ -146,6 +149,7 @@ | |||||
</tr> | </tr> | ||||
</table> | </table> | ||||
</form> | </form> | ||||
% if result: | % if result: | ||||
<div id="generation-time"> | <div id="generation-time"> | ||||
Results | Results | ||||
@@ -160,6 +164,29 @@ | |||||
% endif | % endif | ||||
<a href="${request.script_root | h}?lang=${query.lang | h}&project=${query.project | h}&oldid=${query.oldid or query.page.lastrevid | h}&action=${query.action | h}&${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a> | <a href="${request.script_root | h}?lang=${query.lang | h}&project=${query.project | h}&oldid=${query.oldid or query.page.lastrevid | h}&action=${query.action | h}&${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a> | ||||
</div> | </div> | ||||
## Turnitin panel. Guard on the result object passed to the template,
## not on the raw "turnitin" request parameter: the string "0" is truthy
## and would render this panel with no result to read .reports from,
## while a missing parameter would hide a result that was computed.
% if turnitin_result:
    <div id="turnitin-container" class="${'red' if turnitin_result.reports else 'green'}-box">
        <div id="turnitin-title">Turnitin Results</div>
        % if turnitin_result.reports:
            <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>
            <table id="turnitin-table"><tbody>
            % for report in turnitin_result.reports:
                ## One row per flagged revision; request- and API-derived values are escaped before going into markup.
                <tr><td id="turnitin-table-cell"><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid | h}">Turnitin report ${report.reportid | h}</a> for text added <a href="https://${query.lang | h}.wikipedia.org/w/index.php?title=${query.title | u}&diff=${report.diffid | h}"> at ${report.time_posted}</a>:
                <ul>
                % for source in report.sources:
                    <li> ${source['percent'] | h}% of revision text (${source['words'] | h} words) found at <a href="${source['url'] | h}">${source['url'] | h}</a></li>
                % endfor
                </ul></td></tr>
            % endfor
            </tbody></table>
        % else:
            <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
        % endif
    </div>
% endif
<div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box"> | <div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box"> | ||||
<table id="cv-result-head-table"> | <table id="cv-result-head-table"> | ||||
<colgroup> | <colgroup> | ||||