diff --git a/app.py b/app.py
index 5e80db9..3df644a 100755
--- a/app.py
+++ b/app.py
@@ -103,7 +103,8 @@ def index():
     update_sites()
     query = do_check()
     return render_template(
-        "index.mako", notice=notice, query=query, result=query.result)
+        "index.mako", notice=notice, query=query, result=query.result,
+        turnitin_result=query.turnitin_result)
 
 @app.route("/settings", methods=["GET", "POST"])
 @catch_errors
diff --git a/copyvios/checker.py b/copyvios/checker.py
index bf0b278..dab8526 100644
--- a/copyvios/checker.py
+++ b/copyvios/checker.py
@@ -11,6 +11,7 @@ from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult
 
 from .misc import Query, get_db
 from .sites import get_site
+from .turnitin import search_turnitin
 
 __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]
 
@@ -63,9 +64,16 @@ def _get_results(query, follow=True):
     conn = get_db()
     use_engine = 0 if query.use_engine in ("0", "false") else 1
     use_links = 0 if query.use_links in ("0", "false") else 1
+    use_turnitin = 0 if query.turnitin in ("0", "false") else 1
     if not use_engine and not use_links:
         query.error = "no search method"
         return
+
+    # Handle the turnitin check
+    if use_turnitin:
+        query.turnitin_result = search_turnitin(query.title, query.lang)
+
+    # Handle the copyvio check
     mode = "{0}:{1}:".format(use_engine, use_links)
     if not _coerce_bool(query.nocache):
         query.result = _get_cached_results(
diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py
new file mode 100644
index 0000000..0e99d92
--- /dev/null
+++ b/copyvios/turnitin.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+"""Look up Turnitin reports for a page through EranBot's plagiabot API.
+
+TODO: tests, and fuller documentation of the nested input/output formats.
+"""
+
+from ast import literal_eval
+import re
+
+import requests
+
+__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
+
+TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
+
+# Compiled once at import time rather than on every parsed report.
+_REPORT_ID_PATTERN = re.compile(r'\?rid=(\d*)')
+_SOURCE_INFO_PATTERN = re.compile(
+    r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
+
+
+def search_turnitin(page_title, lang):
+    """Return a TurnitinResult holding every report found for a page.
+
+    page_title is the title of the page to look up and lang is passed to
+    the API as its 'lang' parameter.
+    """
+    api_result = _make_api_request(page_title, lang)
+    return TurnitinResult(_parse_reports(api_result))
+
+
+def _make_api_request(page_title, lang):
+    """Request 'suspected_diffs' (with report=1) for a page from the API.
+
+    Returns the response body parsed as a Python literal.
+    """
+    api_parameters = {'action': 'suspected_diffs',
+                      'page_title': page_title.replace(' ', '_'),
+                      'lang': lang,
+                      'report': 1}
+
+    result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters)
+    # The response body is parsed as a Python literal; literal_eval accepts
+    # only literals, so it cannot run arbitrary code from the response.
+    # Encoding should be fine: the content-type is utf-8.
+    return literal_eval(result.text)
+
+
+def _parse_reports(turnitin_api_result):
+    """Return one (report_id, sources) tuple per item in the API result."""
+    return [_regex_magic(item['report']) for item in turnitin_api_result]
+
+
+def _regex_magic(report):
+    """Extract the report id and source matches from one report's text.
+
+    Returns (report_id, sources), where sources is a list of
+    (percent, words, url) string tuples. report_id is None when the text
+    contains no '?rid=' parameter.
+    """
+    id_match = _REPORT_ID_PATTERN.search(report)
+    report_id = id_match.group(1) if id_match else None
+    return (report_id, _SOURCE_INFO_PATTERN.findall(report))
+
+
+class TurnitinResult(object):
+    """Container for the TurnitinReport objects built from parsed data."""
+
+    def __init__(self, turnitin_data):
+        self.reports = [TurnitinReport(item) for item in turnitin_data]
+
+    def __repr__(self):
+        return str(self.__dict__)
+
+
+class TurnitinReport(object):
+    """A single report: its id plus the sources it matched."""
+
+    def __init__(self, data):
+        # data is a (report_id, sources) tuple as built by _regex_magic().
+        self.reportid = data[0]
+        self.sources = [{'percent': item[0],
+                         'words': item[1],
+                         'url': item[2]} for item in data[1]]
+
+    def __repr__(self):
+        return str(self.__dict__)
diff --git a/templates/index.mako b/templates/index.mako
index 4709b3c..2795fea 100644
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -113,6 +113,10 @@
+
+
+ + @@ -160,6 +164,31 @@ % endif Permalink. + + % if query.turnitin: +
+

Turnitin results (this should be centered like "checked sources")

+ % if query.turnitin_result:
+ Turnitin (through EranBot) found revisions that may have been plagiarized. Please review them. (Does this need some sort of p tag or something?)
+
+ % for report in turnitin_result.reports:
+
+ ${report}
+
+ % endfor
+ % else:
+ Turnitin (through EranBot) found no matching sources.
+ % endif
+
+ % endif +