Add a checkbox to allow searching the EranBot/plagiabot database for Turnitin results, and display them in a similar form to the on-wiki EranBot reports if they exist. Add a new module (copyvios/turnitin.py) to handle fetching and parsing the EranBot results.
Bug: https://phabricator.wikimedia.org/T110144
TODO: tweak display HTML/CSS; refactor/clean up turnitin.py; improve dev set-up so it doesn't always default to testwiki and can test without hardcoding page title
pull/24/head
@@ -103,7 +103,8 @@ def index(): | |||||
update_sites() | update_sites() | ||||
query = do_check() | query = do_check() | ||||
return render_template( | return render_template( | ||||
"index.mako", notice=notice, query=query, result=query.result) | |||||
"index.mako", notice=notice, query=query, result=query.result, | |||||
turnitin_result=query.turnitin_result) | |||||
@app.route("/settings", methods=["GET", "POST"]) | @app.route("/settings", methods=["GET", "POST"]) | ||||
@catch_errors | @catch_errors | ||||
@@ -11,6 +11,7 @@ from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult | |||||
from .misc import Query, get_db | from .misc import Query, get_db | ||||
from .sites import get_site | from .sites import get_site | ||||
from .turnitin import search_turnitin | |||||
__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"] | __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"] | ||||
@@ -63,9 +64,16 @@ def _get_results(query, follow=True): | |||||
conn = get_db() | conn = get_db() | ||||
use_engine = 0 if query.use_engine in ("0", "false") else 1 | use_engine = 0 if query.use_engine in ("0", "false") else 1 | ||||
use_links = 0 if query.use_links in ("0", "false") else 1 | use_links = 0 if query.use_links in ("0", "false") else 1 | ||||
use_turnitin = 0 if query.turnitin in ("0", "false") else 1 | |||||
if not use_engine and not use_links: | if not use_engine and not use_links: | ||||
query.error = "no search method" | query.error = "no search method" | ||||
return | return | ||||
# Handle the turnitin check | |||||
if use_turnitin: | |||||
query.turnitin_result = search_turnitin(query.title, query.lang) | |||||
# Handle the copyvio check | |||||
mode = "{0}:{1}:".format(use_engine, use_links) | mode = "{0}:{1}:".format(use_engine, use_links) | ||||
if not _coerce_bool(query.nocache): | if not _coerce_bool(query.nocache): | ||||
query.result = _get_cached_results( | query.result = _get_cached_results( | ||||
@@ -0,0 +1,68 @@ | |||||
# -*- coding: utf-8 -*- | |||||
"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)""" | |||||
from ast import literal_eval | |||||
import re | |||||
import requests | |||||
__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] | |||||
TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py' | |||||
def search_turnitin(page_title, lang):
    """Search the EranBot/plagiabot database for Turnitin reports on a page.

    Arguments:
    page_title -- title of the page to look up (spaces are converted to
                  underscores by _make_api_request before the request)
    lang       -- value sent to the API as its 'lang' parameter

    Returns a TurnitinResult wrapping one TurnitinReport per report entry
    returned by the API.
    """
    # Query for the requested page; a fixed placeholder title was previously
    # sent here, so the caller's page_title was never used.
    api_result = _make_api_request(page_title, lang)
    return TurnitinResult(_parse_reports(api_result))
def _make_api_request(page_title, lang, timeout=30):
    """Ask the plagiabot API for suspected diffs (with reports) of a page.

    Arguments:
    page_title -- page title; spaces are replaced by underscores
    lang       -- value sent as the 'lang' request parameter
    timeout    -- seconds to wait on the HTTP request before requests raises
                  a timeout error (keeps a stalled upstream from hanging the
                  caller indefinitely)

    Returns the response body evaluated as a Python literal.
    """
    api_parameters = {
        'action': 'suspected_diffs',
        'page_title': page_title.replace(' ', '_'),
        'lang': lang,
        'report': 1,
    }
    response = requests.get(
        TURNITIN_API_ENDPOINT, params=api_parameters, timeout=timeout)
    # NOTE(review): the body is parsed with ast.literal_eval, which only
    # accepts literals (no arbitrary code execution) but raises
    # ValueError/SyntaxError on anything else, e.g. an HTML error page.
    # Presumably the API emits a Python repr rather than JSON -- confirm
    # before switching parsers.
    return literal_eval(response.text)
def _parse_reports(turnitin_api_result): | |||||
reports_data = [] | |||||
for item in turnitin_api_result: | |||||
reports_data.append(_regex_magic(item['report'])) | |||||
return reports_data | |||||
def _regex_magic(report): | |||||
# ~magic~ | |||||
report_id_pattern = re.compile(r'\?rid=(\d*)') | |||||
report_id = report_id_pattern.search(report).groups()[0] | |||||
extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ') | |||||
results = extract_info_pattern.findall(report) | |||||
return (report_id, results) | |||||
class TurnitinResult:
    """Collection of the Turnitin reports found for a page.

    Attributes:
    reports -- list of TurnitinReport objects, one per item of the
               turnitin_data passed to the constructor

    An instance is truthy only when it holds at least one report, so
    ``if result:`` distinguishes "reports found" from "nothing found".
    """

    def __init__(self, turnitin_data):
        """Build one TurnitinReport per (report id, sources) item."""
        self.reports = [TurnitinReport(item) for item in turnitin_data]

    def __bool__(self):
        # Without this, every instance is truthy and an empty result would
        # be treated as a positive match by truthiness checks.
        return bool(self.reports)

    __nonzero__ = __bool__  # Python 2 name for the truthiness hook

    def __repr__(self):
        return str(self.__dict__)
class TurnitinReport:
    """A single Turnitin report: its id plus the sources it matched.

    Attributes:
    reportid -- first element of the data passed to the constructor
    sources  -- list of dicts with 'percent', 'words' and 'url' keys, built
                from the entries of the data's second element
    """

    def __init__(self, data):
        """Unpack a (report id, [(percent, words, url), ...]) pair."""
        self.reportid = data[0]
        self.sources = [
            {'percent': match[0], 'words': match[1], 'url': match[2]}
            for match in data[1]
        ]

    def __repr__(self):
        return str(vars(self))
@@ -113,6 +113,10 @@ | |||||
<input class="cv-search" type="hidden" name="use_links" value="0" /> | <input class="cv-search" type="hidden" name="use_links" value="0" /> | ||||
<input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} /> | <input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} /> | ||||
<label for="cv-cb-links">Use links in page</label> | <label for="cv-cb-links">Use links in page</label> | ||||
<br>
## Hidden fallback so an unchecked box still submits turnitin=0; it must share
## the checkbox's name (it was previously a second "use_links" field).
<input class="cv-search" type="hidden" name="turnitin" value="0" />
<input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""} />
<label for="cv-cb-turnitin">Find reports through Turnitin</label>
</td> | </td> | ||||
</tr> | </tr> | ||||
<tr> | <tr> | ||||
@@ -160,6 +164,31 @@ | |||||
% endif | % endif | ||||
<a href="${request.script_root | h}?lang=${query.lang | h}&project=${query.project | h}&oldid=${query.oldid or query.page.lastrevid | h}&action=${query.action | h}&${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a> | <a href="${request.script_root | h}?lang=${query.lang | h}&project=${query.project | h}&oldid=${query.oldid or query.page.lastrevid | h}&action=${query.action | h}&${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a> | ||||
</div> | </div> | ||||
## Turnitin result box. Shown only when the Turnitin search was requested;
## "0" and "false" mean it was switched off (same convention as do_check).
% if query.turnitin and query.turnitin not in ("0", "false"):
    ## Check the report list explicitly rather than the result object's truthiness.
    <div id="turnitin-result" class="${'red' if (turnitin_result and turnitin_result.reports) else 'green'}-box">
        ## TODO: center this heading like the "checked sources" heading.
        <p>Turnitin results</p>
        % if turnitin_result and turnitin_result.reports:
            <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>
            ## One outer list for all reports; each report nests its own source list.
            <ul>
            % for report in turnitin_result.reports:
                <li><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid | h}">Turnitin report</a>
                    <ul>
                    % for source in report.sources:
                        ## Values come from externally generated reports, so escape them.
                        <li>${source['percent'] | h}% of revision text (${source['words'] | h} words) found at <a href="${source['url'] | h}">${source['url'] | h}</a></li>
                    % endfor
                    </ul>
                </li>
            % endfor
            </ul>
        % else:
            <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
        % endif
    </div>
% endif
<div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box"> | <div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box"> | ||||
<table id="cv-result-head-table"> | <table id="cv-result-head-table"> | ||||
<colgroup> | <colgroup> | ||||