Add a checkbox to allow searching the EranBot/plagiabot database for Turnitin results, and display them in a similar form to the on-wiki EranBot reports if they exist. Add a new module (copyvios/turnitin.py) to handle fetching and parsing the EranBot results. Bug: https://phabricator.wikimedia.org/T110144 TODO: tweak display HTML/CSS; refactor/clean up turnitin.py; improve dev set-up so it doesn't always default to testwiki and can test without hardcoding the page title. (Ref: pull/24/head)
@@ -103,7 +103,8 @@ def index(): | |||
update_sites() | |||
query = do_check() | |||
return render_template( | |||
"index.mako", notice=notice, query=query, result=query.result) | |||
"index.mako", notice=notice, query=query, result=query.result, | |||
turnitin_result=query.turnitin_result) | |||
@app.route("/settings", methods=["GET", "POST"]) | |||
@catch_errors | |||
@@ -11,6 +11,7 @@ from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult | |||
from .misc import Query, get_db | |||
from .sites import get_site | |||
from .turnitin import search_turnitin | |||
__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"] | |||
@@ -63,9 +64,16 @@ def _get_results(query, follow=True): | |||
conn = get_db() | |||
use_engine = 0 if query.use_engine in ("0", "false") else 1 | |||
use_links = 0 if query.use_links in ("0", "false") else 1 | |||
use_turnitin = 0 if query.turnitin in ("0", "false") else 1 | |||
if not use_engine and not use_links: | |||
query.error = "no search method" | |||
return | |||
# Handle the turnitin check | |||
if use_turnitin: | |||
query.turnitin_result = search_turnitin(query.title, query.lang) | |||
# Handle the copyvio check | |||
mode = "{0}:{1}:".format(use_engine, use_links) | |||
if not _coerce_bool(query.nocache): | |||
query.result = _get_cached_results( | |||
@@ -0,0 +1,68 @@ | |||
# -*- coding: utf-8 -*- | |||
"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)""" | |||
from ast import literal_eval | |||
import re | |||
import requests | |||
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] | |||
TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py' | |||
def search_turnitin(page_title, lang):
    """Search the EranBot/plagiabot database for Turnitin reports on a page.

    Arguments:
    page_title -- title of the page to look up
    lang -- language value forwarded unchanged to the API request

    Returns a TurnitinResult holding one TurnitinReport per report that the
    API returned for the page.
    """
    # Query for the page that was actually requested instead of a fixed
    # placeholder title, so the reports belong to the page being checked.
    api_result = _make_api_request(page_title, lang)
    return TurnitinResult(_parse_reports(api_result))
def _make_api_request(page_title, lang, timeout=30):
    """Fetch the 'suspected_diffs' reports for a page from the plagiabot API.

    Arguments:
    page_title -- page title; spaces are replaced with underscores before
                  the title is sent
    lang -- value sent as the API's 'lang' parameter
    timeout -- seconds to wait on the server before requests gives up and
               raises a timeout error; without it an unresponsive API would
               block the caller indefinitely

    Returns the response text parsed as a Python literal.
    """
    api_parameters = {'action': 'suspected_diffs',
                      'page_title': page_title.replace(' ', '_'),
                      'lang': lang,
                      'report': 1}
    result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters,
                          timeout=timeout)
    # literal_eval only accepts Python literals, so the remote text is parsed
    # but never executed. Should be ok with encoding, content-type utf-8.
    return literal_eval(result.text)
def _parse_reports(turnitin_api_result):
    """Extract the parsed data for every entry of an API result.

    Each entry's 'report' value is run through _regex_magic; the outputs are
    returned as a list in the same order as the entries.
    """
    return [_regex_magic(entry['report']) for entry in turnitin_api_result]
def _regex_magic(report):
    """Pull the report id and the per-source figures out of a report's text.

    Returns a tuple (report_id, matches). report_id is the run of digits that
    follows '?rid=' in the text. matches is a list of 3-tuples of strings,
    one per line of the form '* <char> <digits>% <digits> words at [<text> ',
    holding the two digit runs and the text after the opening bracket.
    """
    rid_match = re.search(r'\?rid=(\d*)', report)
    report_id = rid_match.group(1)
    matches = re.findall(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ', report)
    return (report_id, matches)
class TurnitinResult:
    """Collection of the Turnitin reports found for one page.

    Attributes:
    reports -- list of TurnitinReport objects, one per item of the data the
               result was built from

    An instance is truthy only when it holds at least one report, so a plain
    ``if result:`` check distinguishes "reports found" from "nothing found".
    """

    def __init__(self, turnitin_data):
        """Wrap every item of turnitin_data in a TurnitinReport."""
        self.reports = [TurnitinReport(item) for item in turnitin_data]

    def __bool__(self):
        """Return True when at least one report is present."""
        return bool(self.reports)

    # Python 2 looks up __nonzero__ rather than __bool__ for truth testing.
    __nonzero__ = __bool__

    def __repr__(self):
        return str(self.__dict__)
class TurnitinReport:
    """One Turnitin report: its id together with the sources it lists."""

    def __init__(self, data):
        """Build the report from a two-element sequence.

        data[0] is stored as reportid. Every entry of data[1] is turned into
        a dict whose 'percent', 'words' and 'url' keys hold the entry's
        first, second and third elements respectively.
        """
        self.reportid = data[0]
        self.sources = [
            {'percent': entry[0], 'words': entry[1], 'url': entry[2]}
            for entry in data[1]
        ]

    def __repr__(self):
        return str(vars(self))
@@ -113,6 +113,10 @@ | |||
<input class="cv-search" type="hidden" name="use_links" value="0" /> | |||
<input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} /> | |||
<label for="cv-cb-links">Use links in page</label> | |||
<input class="cv-search" type="hidden" name="turnitin" value="0" /> | |||
<br> | |||
<input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/> | |||
<label for="cv-cb-turnitin">Find reports through Turnitin</label> | |||
</td> | |||
</tr> | |||
<tr> | |||
@@ -160,6 +164,31 @@ | |||
% endif | |||
<a href="${request.script_root | h}?lang=${query.lang | h}&project=${query.project | h}&oldid=${query.oldid or query.page.lastrevid | h}&action=${query.action | h}&${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a> | |||
</div> | |||
% if query.turnitin: | |||
<div id="turnitin-result" class="${'red' if query.turnitin_result else 'green'}-box"> | |||
<p>Turnitin results (this should be centered like "checked sources")</p> | |||
% if query.turnitin_result: | |||
Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them. (Does this need some sort of p tag or something?) | |||
<ul> | |||
%for report in turnitin_result.reports: | |||
<li><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report</a> | |||
<ul> | |||
% for source in report.sources: | |||
<li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url']}">${source['url']}</a></li> | |||
% endfor | |||
</ul></li> | |||
%endfor | |||
</ul> | |||
${turnitin_result} | |||
% else: | |||
Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources. | |||
% endif | |||
</div> | |||
% endif | |||
<div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box"> | |||
<table id="cv-result-head-table"> | |||
<colgroup> | |||