From efe23002502a2d4faf521e0a0811e7c7e1565cb4 Mon Sep 17 00:00:00 2001 From: Frances Hocutt Date: Wed, 16 Dec 2015 20:48:36 -0800 Subject: [PATCH 1/8] [WIP] Basic working integration of turnitin Add a checkbox to allow searching the EranBot/plagiabot database for Turnitin results, and display them in a similar form to the on-wiki EranBot reports if they exist. Add a new module (copyvios/turnitin.py) to handle fetching and parsing the EranBot results. Bug: https://phabricator.wikimedia.org/T110144 TODO: tweak display HTML/CSS; refactor/clean up turnitin.py; improve dev set-up so it doesn't always default to testwiki and can test without hardcoding page title --- app.py | 3 ++- copyvios/checker.py | 8 +++++++ copyvios/turnitin.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++ templates/index.mako | 29 ++++++++++++++++++++++ 4 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 copyvios/turnitin.py diff --git a/app.py b/app.py index 5e80db9..3df644a 100755 --- a/app.py +++ b/app.py @@ -103,7 +103,8 @@ def index(): update_sites() query = do_check() return render_template( - "index.mako", notice=notice, query=query, result=query.result) + "index.mako", notice=notice, query=query, result=query.result, + turnitin_result=query.turnitin_result) @app.route("/settings", methods=["GET", "POST"]) @catch_errors diff --git a/copyvios/checker.py b/copyvios/checker.py index bf0b278..dab8526 100644 --- a/copyvios/checker.py +++ b/copyvios/checker.py @@ -11,6 +11,7 @@ from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult from .misc import Query, get_db from .sites import get_site +from .turnitin import search_turnitin __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"] @@ -63,9 +64,16 @@ def _get_results(query, follow=True): conn = get_db() use_engine = 0 if query.use_engine in ("0", "false") else 1 use_links = 0 if query.use_links in ("0", "false") else 1 + use_turnitin = 0 if query.turnitin in ("0", "false") else 1 if not use_engine and not use_links: query.error = "no search method" return + + # Handle the turnitin check + if use_turnitin: + query.turnitin_result = search_turnitin(query.title, query.lang) + + # Handle the copyvio check mode = "{0}:{1}:".format(use_engine, use_links) if not _coerce_bool(query.nocache): query.result = _get_cached_results( diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py new file mode 100644 index 0000000..0e99d92 --- /dev/null +++ b/copyvios/turnitin.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)""" + +from ast import literal_eval +import re + +import requests + +__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] + +TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py' + +def search_turnitin(page_title, lang): + """ returns a list of tuples, one per report, each containing report id and data from the report""" + turnitin_data = _parse_reports(_make_api_request('The quick brown fox jumps over the lazy dog', lang)) + turnitin_result = TurnitinResult(turnitin_data) + return turnitin_result + +def _make_api_request(page_title, lang): + stripped_page_title = page_title.replace(' ', '_') + api_parameters = {'action': 'suspected_diffs', + 'page_title': stripped_page_title, + 'lang': lang, + 'report': 1} + + result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters) + parsed_result = literal_eval(result.text) # should be ok with encoding, content-type utf-8 + return parsed_result + +def _parse_reports(turnitin_api_result): + reports_data = [] + for item in turnitin_api_result: + reports_data.append(_regex_magic(item['report'])) + return reports_data + +def _regex_magic(report): + # ~magic~ + report_id_pattern = re.compile(r'\?rid=(\d*)') + report_id = report_id_pattern.search(report).groups()[0] + + extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ') + results = extract_info_pattern.findall(report) + + return (report_id, results) + +class TurnitinResult: + def __init__(self, turnitin_data): + self.reports = [] + for item in turnitin_data: + report = TurnitinReport(item) + self.reports.append(report) + + def __repr__(self): + return str(self.__dict__) + +class TurnitinReport: + def __init__(self, data): + self.reportid = data[0] + + self.sources = [] + for item in data[1]: + source = {'percent': item[0], + 'words': item[1], + 'url': item[2]} + self.sources.append(source) + + def __repr__(self): + return str(self.__dict__) diff --git a/templates/index.mako b/templates/index.mako index 4709b3c..2795fea 100644 --- a/templates/index.mako +++ b/templates/index.mako @@ -113,6 +113,10 @@ + +
+ + @@ -160,6 +164,31 @@ % endif Permalink. + + % if query.turnitin: +
+

Turnitin results (this should be centered like "checked sources")

+ % if query.turnitin_result: + Turnitin (through EranBot) found revisions that may have been plagiarized. Please review them. (Does this need some sort of p tag or something?) + + %for report in turnitin_result.reports: + + ${turnitin_result} + + % else: + Turnitin (through EranBot) found no matching sources. + % endif +
+ % endif +
From f0bbb29621b9f770ef8f0f9cf4c60d351595318e Mon Sep 17 00:00:00 2001 From: Frances Hocutt Date: Thu, 17 Dec 2015 16:18:11 -0800 Subject: [PATCH 2/8] [WIP] Improve style and turnitin report display --- static/style.css | 11 +++++++++++ templates/index.mako | 14 +++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/static/style.css b/static/style.css index ecc7dad..cb0fa11 100644 --- a/static/style.css +++ b/static/style.css @@ -63,6 +63,17 @@ div#info-box { margin: 10px 5px; } +div#turnitin-container { + padding: 5px 10px; + margin: 15px 10px 10px 5px; +} + +div#turnitin-title { + margin-bottom: -5px; + text-align: center; + font-weight: bold; +} + div#cv-result { padding: 5px; margin: 10px 5px; diff --git a/templates/index.mako b/templates/index.mako index 2795fea..dafc474 100644 --- a/templates/index.mako +++ b/templates/index.mako @@ -150,6 +150,7 @@
+ % if result:
Results @@ -166,14 +167,14 @@
% if query.turnitin: -
-

Turnitin results (this should be centered like "checked sources")

- % if query.turnitin_result: - Turnitin (through EranBot) found revisions that may have been plagiarized. Please review them. (Does this need some sort of p tag or something?) +
+
Turnitin Results
+ % if query.turnitin_result.reports: +

Turnitin (through EranBot) found revisions that may have been plagiarized. Please review them.

%for report in turnitin_result.reports: - ${turnitin_result} % else: - Turnitin (through EranBot) found no matching sources. +

Turnitin (through EranBot) found no matching sources.

% endif
% endif From bf0aa22fa9f0a990f27d55798bf8fcac98990b10 Mon Sep 17 00:00:00 2001 From: Frances Hocutt Date: Thu, 17 Dec 2015 19:02:25 -0800 Subject: [PATCH 3/8] [WIP] improve docstrings and naming, mark TODO --- copyvios/turnitin.py | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py index 0e99d92..a5f46de 100644 --- a/copyvios/turnitin.py +++ b/copyvios/turnitin.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)""" +"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures)""" from ast import literal_eval import re @@ -11,12 +11,17 @@ __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py' def search_turnitin(page_title, lang): - """ returns a list of tuples, one per report, each containing report id and data from the report""" - turnitin_data = _parse_reports(_make_api_request('The quick brown fox jumps over the lazy dog', lang)) + """ returns a TurnitinResult, containing a list of TurnitinReport items, each containing report id and a list of dicts with data from the report""" + turnitin_data = _parse_plagiabot_result(_make_api_request( + 'The quick brown fox jumps over the lazy dog', lang)) # FIXME: replace with page_title when the earwigbot dev setup is working properly turnitin_result = TurnitinResult(turnitin_data) return turnitin_result def _make_api_request(page_title, lang): + """ Query the plagiabot API for Turnitin reports for a given page + page_title : string containing title of the page in question + lang : string containing language code for the current project + """ stripped_page_title = page_title.replace(' ', '_') api_parameters = {'action': 'suspected_diffs', 'page_title': stripped_page_title, @@ -24,16 +29,19 @@ def _make_api_request(page_title, lang): 'report': 1} result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters) - parsed_result = literal_eval(result.text) # should be ok with encoding, content-type utf-8 - return parsed_result + # use literal_eval to *safely* parse the resulting dict-containing string + parsed_api_result = literal_eval(result.text) + return parsed_api_result -def _parse_reports(turnitin_api_result): - reports_data = [] +def _parse_plagiabot_result(turnitin_api_result): + result_data = [] for item in turnitin_api_result: - reports_data.append(_regex_magic(item['report'])) - return reports_data + reports_data.append(_parse_report(item['report'])) + return result_data -def _regex_magic(report): +def _parse_report(report): + """ Given the "report" bit from the plagiabot API, extract the report ID and the percent/words/url + """ # ~magic~ report_id_pattern = re.compile(r'\?rid=(\d*)') report_id = report_id_pattern.search(report).groups()[0] @@ -44,6 +52,12 @@ def _regex_magic(report): return (report_id, results) class TurnitinResult: + """ Container class for TurnitinReports. Each page may have zero or + more reports of plagiarism, if plagiarism has been detected for + different revisions. + + TurnitinResult.reports : list containing zero or more TurnitinReports + """ def __init__(self, turnitin_data): self.reports = [] for item in turnitin_data: @@ -54,6 +68,10 @@ class TurnitinResult: return str(self.__dict__) class TurnitinReport: + """ Contains data for each Turnitin report. + TurnitinReport.sources : list of dicts with info from each source + TurnitinReport.reportid : Turnitin report ID, taken from plagiabot + """ def __init__(self, data): self.reportid = data[0] From 1ffa87da0b2a6503f74082bf869ebac7ed8cad56 Mon Sep 17 00:00:00 2001 From: Frances Hocutt Date: Fri, 18 Dec 2015 16:24:31 -0800 Subject: [PATCH 4/8] Improve turnitin.py docstrings, fix bugs --- copyvios/turnitin.py | 58 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py index a5f46de..801b312 100644 --- a/copyvios/turnitin.py +++ b/copyvios/turnitin.py @@ -1,6 +1,4 @@ # -*- coding: utf-8 -*- -"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures)""" - from ast import literal_eval import re @@ -11,16 +9,22 @@ __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py' def search_turnitin(page_title, lang): - """ returns a TurnitinResult, containing a list of TurnitinReport items, each containing report id and a list of dicts with data from the report""" + """ Search the Plagiabot database for Turnitin reports for a page. + + Keyword arguments: + page_title -- string containing the page title + lang -- string containing the page's project language code + + Return a TurnitinResult (containing a list of TurnitinReports, with + report ID and source data). + """ turnitin_data = _parse_plagiabot_result(_make_api_request( - 'The quick brown fox jumps over the lazy dog', lang)) # FIXME: replace with page_title when the earwigbot dev setup is working properly + page_title, lang)) turnitin_result = TurnitinResult(turnitin_data) return turnitin_result def _make_api_request(page_title, lang): - """ Query the plagiabot API for Turnitin reports for a given page - page_title : string containing title of the page in question - lang : string containing language code for the current project + """ Query the plagiabot API for Turnitin reports for a given page. """ stripped_page_title = page_title.replace(' ', '_') api_parameters = {'action': 'suspected_diffs', @@ -36,16 +40,15 @@ def _make_api_request(page_title, lang): def _parse_plagiabot_result(turnitin_api_result): result_data = [] for item in turnitin_api_result: - reports_data.append(_parse_report(item['report'])) + result_data.append(_parse_report(item['report'])) return result_data def _parse_report(report): - """ Given the "report" bit from the plagiabot API, extract the report ID and the percent/words/url - """ - # ~magic~ + # extract report ID report_id_pattern = re.compile(r'\?rid=(\d*)') report_id = report_id_pattern.search(report).groups()[0] + # extract percent match, words, and URL for each source in the report extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ') results = extract_info_pattern.findall(report) @@ -53,12 +56,18 @@ def _parse_report(report): class TurnitinResult: """ Container class for TurnitinReports. Each page may have zero or - more reports of plagiarism, if plagiarism has been detected for - different revisions. + more reports of plagiarism. The list will have multiple + TurnitinReports if plagiarism has been detected for more than one + revision. - TurnitinResult.reports : list containing zero or more TurnitinReports + TurnitinResult.reports -- list containing >= 0 TurnitinReport items """ def __init__(self, turnitin_data): + """ + Keyword argument: + turnitin_data -- list of tuples with data on each report; see + TurnitinReport.__init__ for the contents. + """ self.reports = [] for item in turnitin_data: report = TurnitinReport(item) @@ -68,11 +77,26 @@ class TurnitinResult: return str(self.__dict__) class TurnitinReport: - """ Contains data for each Turnitin report. - TurnitinReport.sources : list of dicts with info from each source - TurnitinReport.reportid : Turnitin report ID, taken from plagiabot + """ Contains data for each Turnitin report (one on each potentially + plagiarized revision). + + TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot + TurnitinReport.sources -- list of dicts with information on: + percent -- percent of revision found in source as well + words -- number of words found in both source and revision + url -- url for the possibly-plagiarized source """ def __init__(self, data): + """ + Keyword argument: + data -- tuple containing report data. All values are strings. + data[0] -- turnitin report ID + data[1] -- list of tuples with data on each source in the + report + data[][0] -- percent of revision found in source + data[][1] -- number of words matching the source + data[][2] -- url for the matched source + """ self.reportid = data[0] self.sources = [] From 8161bcec548394aff722f3e571a5b779edd82d8b Mon Sep 17 00:00:00 2001 From: Frances Hocutt Date: Fri, 18 Dec 2015 16:30:12 -0800 Subject: [PATCH 5/8] Fix CSS margin to match other boxes --- static/style.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/style.css b/static/style.css index cb0fa11..5ff70b9 100644 --- a/static/style.css +++ b/static/style.css @@ -65,7 +65,7 @@ div#info-box { div#turnitin-container { padding: 5px 10px; - margin: 15px 10px 10px 5px; + margin: 15px 5px 10px 5px; } div#turnitin-title { From 6cafb14991955805245614fcbcb4a50b5f1f96c1 Mon Sep 17 00:00:00 2001 From: Frances Hocutt Date: Fri, 18 Dec 2015 18:38:13 -0800 Subject: [PATCH 6/8] Fix wrapping issue; start reworking report display --- templates/index.mako | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/templates/index.mako b/templates/index.mako index dafc474..1e4340e 100644 --- a/templates/index.mako +++ b/templates/index.mako @@ -114,9 +114,8 @@ -
- - + + @@ -172,16 +171,18 @@ % if query.turnitin_result.reports:

Turnitin (through EranBot) found revisions that may have been plagiarized. Please review them.

+ + ## TODO: make this prettier/tabular %for report in turnitin_result.reports: - %endfor - +
Turnitin report ${report.reportid} for text added in revision ${loop.index} +## TODO: Rework this to something like: [Turnitin report](link) for [revision at timestamp](diff link). Requires API-result-parsing/TurnitinReport changes. Shouldn't be too bad. Reason: needs to make it clear that Turnitin is looking at individual revisions; current report does not.
    % for source in report.sources:
  • ${source['percent']}% of revision text (${source['words']} words) found at ${source['url']}
  • % endfor -
+
% else:

Turnitin (through EranBot) found no matching sources.

From 4e994f13022e7a0a38d93403a246468964f0ce54 Mon Sep 17 00:00:00 2001 From: Frances Hocutt Date: Tue, 22 Dec 2015 15:56:27 -0800 Subject: [PATCH 7/8] Refactor turnitin.py, incorporate diff link/timestamp * Add a wiki timestamp parser to copyvios/misc.py * Refactor copyvios/turnitin.py for more sensible structure * Update templates/index.mako to incorporate diff link/timestamp and make it clearer that Turnitin is revision-based checking --- copyvios/misc.py | 4 +++ copyvios/turnitin.py | 70 ++++++++++++++++++++++++---------------------------- templates/index.mako | 5 +--- 3 files changed, 37 insertions(+), 42 deletions(-) diff --git a/copyvios/misc.py b/copyvios/misc.py index 045386a..9c4d824 100644 --- a/copyvios/misc.py +++ b/copyvios/misc.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import datetime from os.path import expanduser from flask import g, request @@ -64,6 +65,9 @@ def httpsfix(context, url): url = url[len("http:"):] return url +def parse_wiki_timestamp(timestamp): + return datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S') + def urlstrip(context, url): if url.startswith("http://"): url = url[7:] diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py index 801b312..07a3b8a 100644 --- a/copyvios/turnitin.py +++ b/copyvios/turnitin.py @@ -4,6 +4,8 @@ import re import requests +from .misc import parse_wiki_timestamp + __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT'] TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py' @@ -15,13 +17,9 @@ def search_turnitin(page_title, lang): page_title -- string containing the page title lang -- string containing the page's project language code - Return a TurnitinResult (containing a list of TurnitinReports, with - report ID and source data). + Return a TurnitinResult (contains a list of TurnitinReports). """ - turnitin_data = _parse_plagiabot_result(_make_api_request( - page_title, lang)) - turnitin_result = TurnitinResult(turnitin_data) - return turnitin_result + return TurnitinResult(_make_api_request(page_title, lang)) def _make_api_request(page_title, lang): """ Query the plagiabot API for Turnitin reports for a given page. @@ -37,23 +35,6 @@ def _make_api_request(page_title, lang): parsed_api_result = literal_eval(result.text) return parsed_api_result -def _parse_plagiabot_result(turnitin_api_result): - result_data = [] - for item in turnitin_api_result: - result_data.append(_parse_report(item['report'])) - return result_data - -def _parse_report(report): - # extract report ID - report_id_pattern = re.compile(r'\?rid=(\d*)') - report_id = report_id_pattern.search(report).groups()[0] - - # extract percent match, words, and URL for each source in the report - extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ') - results = extract_info_pattern.findall(report) - - return (report_id, results) - class TurnitinResult: """ Container class for TurnitinReports. Each page may have zero or more reports of plagiarism. The list will have multiple @@ -65,12 +46,12 @@ class TurnitinResult: def __init__(self, turnitin_data): """ Keyword argument: - turnitin_data -- list of tuples with data on each report; see - TurnitinReport.__init__ for the contents. + turnitin_data -- plagiabot API result """ self.reports = [] for item in turnitin_data: - report = TurnitinReport(item) + report = TurnitinReport( + item['diff_timestamp'], item['diff'], item['report']) self.reports.append(report) def __repr__(self): @@ -80,27 +61,28 @@ class TurnitinReport: """ Contains data for each Turnitin report (one on each potentially plagiarized revision). - TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot - TurnitinReport.sources -- list of dicts with information on: + TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot + TurnitinReport.diffid -- diff ID from Wikipedia database + TurnitinReport.time_posted -- datetime of the time the diff posted + TurnitinReport.sources -- list of dicts with information on: percent -- percent of revision found in source as well words -- number of words found in both source and revision url -- url for the possibly-plagiarized source """ - def __init__(self, data): + def __init__(self, timestamp, diffid, report): """ Keyword argument: - data -- tuple containing report data. All values are strings. - data[0] -- turnitin report ID - data[1] -- list of tuples with data on each source in the - report - data[][0] -- percent of revision found in source - data[][1] -- number of words matching the source - data[][2] -- url for the matched source + timestamp -- diff timestamp from Wikipedia database + diffid -- diff ID from Wikipedia database + report -- Turnitin report from the plagiabot database """ - self.reportid = data[0] + self.report_data = self._parse_report(report) + self.reportid = self.report_data[0] + self.diffid = diffid + self.time_posted = parse_wiki_timestamp(timestamp) self.sources = [] - for item in data[1]: + for item in self.report_data[1]: source = {'percent': item[0], 'words': item[1], 'url': item[2]} @@ -108,3 +90,15 @@ class TurnitinReport: def __repr__(self): return str(self.__dict__) + + def _parse_report(self, report_text): + # extract report ID + report_id_pattern = re.compile(r'\?rid=(\d*)') + report_id = report_id_pattern.search(report_text).groups()[0] + + # extract percent match, words, and URL for each source in the report + extract_info_pattern = re.compile( + r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ') + results = extract_info_pattern.findall(report_text) + + return (report_id, results) diff --git a/templates/index.mako b/templates/index.mako index 1e4340e..5522517 100644 --- a/templates/index.mako +++ b/templates/index.mako @@ -172,10 +172,8 @@

Turnitin (through EranBot) found revisions that may have been plagiarized. Please review them.

- ## TODO: make this prettier/tabular %for report in turnitin_result.reports: - %endfor
Turnitin report ${report.reportid} for text added in revision ${loop.index} -## TODO: Rework this to something like: [Turnitin report](link) for [revision at timestamp](diff link). Requires API-result-parsing/TurnitinReport changes. Shouldn't be too bad. Reason: needs to make it clear that Turnitin is looking at individual revisions; current report does not. +
Turnitin report ${report.reportid} for text added at ${report.time_posted}:
    % for source in report.sources:
  • ${source['percent']}% of revision text (${source['words']} words) found at ${source['url']}
  • @@ -183,7 +181,6 @@
- % else:

Turnitin (through EranBot) found no matching sources.

% endif From 9a4dde16138a7ed5ee5a80f44e1351e50f658523 Mon Sep 17 00:00:00 2001 From: Frances Hocutt Date: Wed, 13 Jan 2016 17:09:13 -0800 Subject: [PATCH 8/8] Update Turnitin option label --- templates/index.mako | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/index.mako b/templates/index.mako index 5522517..faf3b90 100644 --- a/templates/index.mako +++ b/templates/index.mako @@ -115,7 +115,7 @@ - +