diff --git a/app.py b/app.py
index 5e80db9..3df644a 100755
--- a/app.py
+++ b/app.py
@@ -103,7 +103,8 @@ def index():
update_sites()
query = do_check()
return render_template(
- "index.mako", notice=notice, query=query, result=query.result)
+ "index.mako", notice=notice, query=query, result=query.result,
+ turnitin_result=query.turnitin_result)
@app.route("/settings", methods=["GET", "POST"])
@catch_errors
diff --git a/copyvios/checker.py b/copyvios/checker.py
index bf0b278..dab8526 100644
--- a/copyvios/checker.py
+++ b/copyvios/checker.py
@@ -11,6 +11,7 @@ from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult
from .misc import Query, get_db
from .sites import get_site
+from .turnitin import search_turnitin
__all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]
@@ -63,9 +64,16 @@ def _get_results(query, follow=True):
conn = get_db()
use_engine = 0 if query.use_engine in ("0", "false") else 1
use_links = 0 if query.use_links in ("0", "false") else 1
+ use_turnitin = 0 if query.turnitin in ("0", "false") else 1
if not use_engine and not use_links:
query.error = "no search method"
return
+
+ # Handle the turnitin check
+ if use_turnitin:
+ query.turnitin_result = search_turnitin(query.title, query.lang)
+
+ # Handle the copyvio check
mode = "{0}:{1}:".format(use_engine, use_links)
if not _coerce_bool(query.nocache):
query.result = _get_cached_results(
diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py
new file mode 100644
index 0000000..0e99d92
--- /dev/null
+++ b/copyvios/turnitin.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+"""Fetch and parse Turnitin (EranBot plagiabot) reports for a wiki page. TODO: tests; document the nested report data format."""
+
+from ast import literal_eval
+import re
+
+import requests
+
+__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
+
+TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
+
+def search_turnitin(page_title, lang):
+    """Return a TurnitinResult holding one TurnitinReport per suspected diff
+    that EranBot's plagiabot API reports for *page_title* on the *lang* wiki."""
+    turnitin_data = _parse_reports(_make_api_request(page_title, lang))
+    return TurnitinResult(turnitin_data)
+
+def _make_api_request(page_title, lang):
+    """Query the plagiabot API for suspected diffs of *page_title* (with full
+    report text) and return the parsed response: a list of dicts."""
+    api_parameters = {'action': 'suspected_diffs',
+                      'page_title': page_title.replace(' ', '_'),
+                      'lang': lang,
+                      'report': 1}
+    # timeout keeps a hung tool-labs endpoint from stalling the whole check
+    result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters, timeout=30)
+    return literal_eval(result.text)  # API emits a Python literal, not JSON; literal_eval parses it without executing code
+
+def _parse_reports(turnitin_api_result):
+    """Return a list of (report_id, match_data) tuples, one per API entry."""
+    # Each entry carries a wikitext 'report' blob; parse every one.
+    return [_regex_magic(entry['report'])
+            for entry in turnitin_api_result]
+
+def _regex_magic(report):
+    # Parse a wikitext report blob into (report_id, [(percent, words, url), ...])
+    report_id_pattern = re.compile(r'\?rid=(\d*)')
+    report_id = report_id_pattern.search(report).groups()[0]  # NOTE(review): raises AttributeError if no '?rid=' link present -- confirm every report has one
+
+    extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
+    results = extract_info_pattern.findall(report)  # one (percent, words, url) triple per matched source line
+
+    return (report_id, results)
+
+class TurnitinResult:
+    """Result of a Turnitin lookup: wraps a list of TurnitinReport objects."""
+
+    def __init__(self, turnitin_data):
+        # one TurnitinReport per (report_id, match_data) tuple from the API
+        self.reports = [TurnitinReport(datum) for datum in turnitin_data]
+
+    def __repr__(self):
+        return str(self.__dict__)
+
+class TurnitinReport:
+    """A single Turnitin report: its id plus the sources it matched."""
+
+    def __init__(self, data):
+        # data is (report_id, [(percent, words, url), ...]) from _regex_magic
+        self.reportid, matches = data
+        self.sources = [
+            {'percent': percent, 'words': words, 'url': url}
+            for percent, words, url in matches
+        ]
+
+    def __repr__(self):
+        return str(self.__dict__)
diff --git a/templates/index.mako b/templates/index.mako
index 4709b3c..2795fea 100644
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -113,6 +113,10 @@
+
+
+
+
+        Turnitin results
+        % if query.turnitin_result:
+            <p>Turnitin (through EranBot) found revisions that may have been plagiarized. Please review them.</p>
+            % for report in turnitin_result.reports: