Bladeren bron

[WIP] Basic working integration of Turnitin

Add a checkbox to allow searching the EranBot/plagiabot database for
Turnitin results, and display them in a similar form to the on-wiki
EranBot reports if they exist.

Add a new module (copyvios/turnitin.py) to handle fetching and parsing
the EranBot results.

Bug: https://phabricator.wikimedia.org/T110144

TODO: tweak display HTML/CSS; refactor/clean up turnitin.py;
      improve dev setup so it doesn't always default to testwiki
      and can be tested without hardcoding the page title
pull/24/head
Frances Hocutt 9 jaren geleden
bovenliggende
commit
efe2300250
4 gewijzigde bestanden met toevoegingen van 107 en 1 verwijderingen
  1. +2
    -1
      app.py
  2. +8
    -0
      copyvios/checker.py
  3. +68
    -0
      copyvios/turnitin.py
  4. +29
    -0
      templates/index.mako

+ 2
- 1
app.py Bestand weergeven

@@ -103,7 +103,8 @@ def index():
update_sites()
query = do_check()
return render_template(
"index.mako", notice=notice, query=query, result=query.result)
"index.mako", notice=notice, query=query, result=query.result,
turnitin_result=query.turnitin_result)

@app.route("/settings", methods=["GET", "POST"])
@catch_errors


+ 8
- 0
copyvios/checker.py Bestand weergeven

@@ -11,6 +11,7 @@ from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult

from .misc import Query, get_db
from .sites import get_site
from .turnitin import search_turnitin

__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]

@@ -63,9 +64,16 @@ def _get_results(query, follow=True):
conn = get_db()
use_engine = 0 if query.use_engine in ("0", "false") else 1
use_links = 0 if query.use_links in ("0", "false") else 1
use_turnitin = 0 if query.turnitin in ("0", "false") else 1
if not use_engine and not use_links:
query.error = "no search method"
return

# Handle the turnitin check
if use_turnitin:
query.turnitin_result = search_turnitin(query.title, query.lang)

# Handle the copyvio check
mode = "{0}:{1}:".format(use_engine, use_links)
if not _coerce_bool(query.nocache):
query.result = _get_cached_results(


+ 68
- 0
copyvios/turnitin.py Bestand weergeven

@@ -0,0 +1,68 @@
# -*- coding: utf-8 -*-
"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)"""

from ast import literal_eval
import re

import requests

__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']

TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'

def search_turnitin(page_title, lang):
    """Look up Turnitin reports for a page via the EranBot/plagiabot API.

    Arguments:
    page_title -- title of the page whose reports should be fetched
    lang       -- value forwarded to the API as its ``lang`` parameter

    Returns a TurnitinResult built from the parsed API response; its
    ``reports`` list holds one TurnitinReport per report returned.
    """
    # Query the page the caller asked about rather than a fixed placeholder
    # title, so results actually correspond to the requested page.
    turnitin_data = _parse_reports(_make_api_request(page_title, lang))
    return TurnitinResult(turnitin_data)

def _make_api_request(page_title, lang, timeout=30):
    """Request suspected-diff reports for a page from the plagiabot API.

    Arguments:
    page_title -- page title; spaces are converted to underscores before
                  being sent
    lang       -- value sent as the API's ``lang`` parameter
    timeout    -- seconds to wait for the HTTP request before requests
                  raises a timeout error (default 30)

    Returns whatever Python literal the response body evaluates to.
    _parse_reports iterates over it and reads a 'report' key from each
    item, so it is presumably a list of dicts -- confirm against the API.
    """
    api_parameters = {
        'action': 'suspected_diffs',
        'page_title': page_title.replace(' ', '_'),
        'lang': lang,
        'report': 1,
    }

    # Without a timeout a stalled upstream would block this call (and the
    # web request that triggered it) indefinitely.
    result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters,
                          timeout=timeout)
    # The body is evaluated as a Python literal, not decoded as JSON.
    # literal_eval only accepts literal syntax, so it cannot run code, but it
    # raises ValueError/SyntaxError if the body is not a valid literal.
    # should be ok with encoding, content-type utf-8
    parsed_result = literal_eval(result.text)
    return parsed_result

def _parse_reports(turnitin_api_result):
reports_data = []
for item in turnitin_api_result:
reports_data.append(_regex_magic(item['report']))
return reports_data

def _regex_magic(report):
# ~magic~
report_id_pattern = re.compile(r'\?rid=(\d*)')
report_id = report_id_pattern.search(report).groups()[0]

extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
results = extract_info_pattern.findall(report)

return (report_id, results)

class TurnitinResult:
    """Collection of the Turnitin reports found for a page.

    ``reports`` holds one TurnitinReport per item of *turnitin_data*.

    An instance with no reports is falsy, so ``if result:`` tells callers
    whether any report was found instead of always being true.
    """

    def __init__(self, turnitin_data):
        self.reports = [TurnitinReport(item) for item in turnitin_data]

    def __len__(self):
        """Return the number of reports; this also determines truthiness."""
        return len(self.reports)

    def __repr__(self):
        return str(self.__dict__)

class TurnitinReport:
    """A single Turnitin report: its id plus the sources it matched.

    *data* is indexed as ``data[0]`` (stored as ``reportid``) and ``data[1]``
    (an iterable of indexable rows). Each row becomes a dict in ``sources``
    with keys ``'percent'``, ``'words'`` and ``'url'`` taken from row
    positions 0, 1 and 2 respectively.
    """

    def __init__(self, data):
        self.reportid = data[0]
        self.sources = [
            {'percent': row[0], 'words': row[1], 'url': row[2]}
            for row in data[1]
        ]

    def __repr__(self):
        return str(vars(self))

+ 29
- 0
templates/index.mako Bestand weergeven

@@ -113,6 +113,10 @@
<input class="cv-search" type="hidden" name="use_links" value="0" />
<input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} />
<label for="cv-cb-links">Use&nbsp;links&nbsp;in&nbsp;page</label>
<input class="cv-search" type="hidden" name="use_links" value="0" />
<br>
<input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/>
<label for="cv-cb-turnitin">Find&nbsp;reports&nbsp;through&nbsp;Turnitin</label>
</td>
</tr>
<tr>
@@ -160,6 +164,31 @@
% endif
<a href="${request.script_root | h}?lang=${query.lang | h}&amp;project=${query.project | h}&amp;oldid=${query.oldid or query.page.lastrevid | h}&amp;action=${query.action | h}&amp;${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a>
</div>

% if query.turnitin:
<div id="turnitin-result" class="${'red' if query.turnitin_result else 'green'}-box">
<p>Turnitin results (this should be centered like "checked sources")</p>
% if query.turnitin_result:
Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them. (Does this need some sort of p tag or something?)

%for report in turnitin_result.reports:
<ul>
<li><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report</a>
<ul>
% for source in report.sources:
<li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url']}">${source['url']}</a></li>
% endfor
</ul></li>
%endfor
</ul>
${turnitin_result}

% else:
Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.
% endif
</div>
% endif

<div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box">
<table id="cv-result-head-table">
<colgroup>


Laden…
Annuleren
Opslaan