From efe23002502a2d4faf521e0a0811e7c7e1565cb4 Mon Sep 17 00:00:00 2001
From: Frances Hocutt <fhocutt@wikimedia.org>
Date: Wed, 16 Dec 2015 20:48:36 -0800
Subject: [PATCH 1/8] [WIP] Basic working integration of Turnitin

Add a checkbox to allow searching the EranBot/plagiabot database for
Turnitin results and, if any exist, display them in a form similar to
the on-wiki EranBot reports.

Add a new module (copyvios/turnitin.py) to handle fetching and parsing
the EranBot results.

Bug: https://phabricator.wikimedia.org/T110144

TODO: tweak display HTML/CSS; refactor/clean up turnitin.py;
      improve dev set-up so it doesn't always default to testwiki
      and can test without hardcoding page title
---
 app.py               |  3 ++-
 copyvios/checker.py  |  8 +++++++
 copyvios/turnitin.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 templates/index.mako | 29 ++++++++++++++++++++++
 4 files changed, 107 insertions(+), 1 deletion(-)
 create mode 100644 copyvios/turnitin.py

diff --git a/app.py b/app.py
index 5e80db9..3df644a 100755
--- a/app.py
+++ b/app.py
@@ -103,7 +103,8 @@ def index():
     update_sites()
     query = do_check()
     return render_template(
-        "index.mako", notice=notice, query=query, result=query.result)
+        "index.mako", notice=notice, query=query, result=query.result,
+        turnitin_result=query.turnitin_result)
 
 @app.route("/settings", methods=["GET", "POST"])
 @catch_errors
diff --git a/copyvios/checker.py b/copyvios/checker.py
index bf0b278..dab8526 100644
--- a/copyvios/checker.py
+++ b/copyvios/checker.py
@@ -11,6 +11,7 @@ from earwigbot.wiki.copyvios.result import CopyvioSource, CopyvioCheckResult
 
 from .misc import Query, get_db
 from .sites import get_site
+from .turnitin import search_turnitin
 
 __all__ = ["do_check", "T_POSSIBLE", "T_SUSPECT"]
 
@@ -63,9 +64,16 @@ def _get_results(query, follow=True):
         conn = get_db()
         use_engine = 0 if query.use_engine in ("0", "false") else 1
         use_links = 0 if query.use_links in ("0", "false") else 1
+        use_turnitin = 0 if query.turnitin in ("0", "false") else 1
         if not use_engine and not use_links:
             query.error = "no search method"
             return
+
+        # Handle the turnitin check
+        if use_turnitin:
+            query.turnitin_result = search_turnitin(query.title, query.lang)
+
+        # Handle the copyvio check
         mode = "{0}:{1}:".format(use_engine, use_links)
         if not _coerce_bool(query.nocache):
             query.result = _get_cached_results(
diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py
new file mode 100644
index 0000000..0e99d92
--- /dev/null
+++ b/copyvios/turnitin.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)"""
+
+from ast import literal_eval
+import re
+
+import requests
+
+__all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
+
+TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
+
+def search_turnitin(page_title, lang):
+    """ returns a list of tuples, one per report, each containing report id and data from the report"""
+    turnitin_data = _parse_reports(_make_api_request('The quick brown fox jumps over the lazy dog', lang))
+    turnitin_result = TurnitinResult(turnitin_data)
+    return turnitin_result
+
+def _make_api_request(page_title, lang):
+    stripped_page_title = page_title.replace(' ', '_')
+    api_parameters = {'action': 'suspected_diffs',
+                      'page_title': stripped_page_title,
+                      'lang': lang,
+                      'report': 1}
+
+    result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters)
+    parsed_result = literal_eval(result.text)  # should be ok with encoding, content-type utf-8
+    return parsed_result
+
+def _parse_reports(turnitin_api_result):
+    reports_data = []
+    for item in turnitin_api_result:
+        reports_data.append(_regex_magic(item['report']))
+    return reports_data
+
+def _regex_magic(report):
+    # ~magic~
+    report_id_pattern = re.compile(r'\?rid=(\d*)')
+    report_id = report_id_pattern.search(report).groups()[0]
+
+    extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
+    results = extract_info_pattern.findall(report)
+
+    return (report_id, results)
+
+class TurnitinResult:
+    def __init__(self, turnitin_data):
+        self.reports = []
+        for item in turnitin_data:
+            report = TurnitinReport(item)
+            self.reports.append(report)
+
+    def __repr__(self):
+        return str(self.__dict__)
+
+class TurnitinReport:
+    def __init__(self, data):
+        self.reportid = data[0]
+
+        self.sources = []
+        for item in data[1]:
+            source = {'percent': item[0],
+                      'words': item[1],
+                      'url': item[2]}
+            self.sources.append(source)
+
+    def __repr__(self):
+        return str(self.__dict__)
diff --git a/templates/index.mako b/templates/index.mako
index 4709b3c..2795fea 100644
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -113,6 +113,10 @@
                             <input class="cv-search" type="hidden" name="use_links" value="0" />
                             <input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} />
                             <label for="cv-cb-links">Use&nbsp;links&nbsp;in&nbsp;page</label>
+                            <input class="cv-search" type="hidden" name="use_links" value="0" />
+                            <br>
+                            <input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/>
+                            <label for="cv-cb-turnitin">Find&nbsp;reports&nbsp;through&nbsp;Turnitin</label>
                         </td>
                     </tr>
                     <tr>
@@ -160,6 +164,31 @@
         % endif
         <a href="${request.script_root | h}?lang=${query.lang | h}&amp;project=${query.project | h}&amp;oldid=${query.oldid or query.page.lastrevid | h}&amp;action=${query.action | h}&amp;${"use_engine={0}&use_links={1}".format(int(query.use_engine not in ("0", "false")), int(query.use_links not in ("0", "false"))) if query.action == "search" else "" | h}${"url=" if query.action == "compare" else ""}${query.url if query.action == "compare" else "" | u}">Permalink.</a>
     </div>
+
+    % if query.turnitin:
+        <div id="turnitin-result" class="${'red' if query.turnitin_result else 'green'}-box">
+            <p>Turnitin results (this should be centered like "checked sources")</p>
+            % if query.turnitin_result:
+                Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them. (Does this need some sort of p tag or something?)
+
+                %for report in turnitin_result.reports:
+                <ul>
+                    <li><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report</a>
+                    <ul>
+                    % for source in report.sources:
+                          <li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url']}">${source['url']}</a></li>
+                    % endfor
+                    </ul></li>
+                %endfor
+                </ul>
+                ${turnitin_result}
+
+            % else:
+                Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.
+            % endif
+        </div>
+    % endif
+
     <div id="cv-result" class="${'red' if result.confidence >= T_SUSPECT else 'yellow' if result.confidence >= T_POSSIBLE else 'green'}-box">
         <table id="cv-result-head-table">
             <colgroup>

From f0bbb29621b9f770ef8f0f9cf4c60d351595318e Mon Sep 17 00:00:00 2001
From: Frances Hocutt <fhocutt@wikimedia.org>
Date: Thu, 17 Dec 2015 16:18:11 -0800
Subject: [PATCH 2/8] [WIP] Improve style and turnitin report display

---
 static/style.css     | 11 +++++++++++
 templates/index.mako | 14 +++++++-------
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/static/style.css b/static/style.css
index ecc7dad..cb0fa11 100644
--- a/static/style.css
+++ b/static/style.css
@@ -63,6 +63,17 @@ div#info-box {
     margin: 10px 5px;
 }
 
+div#turnitin-container {
+    padding: 5px 10px;
+    margin: 15px 10px 10px 5px;
+}
+
+div#turnitin-title {
+    margin-bottom: -5px;
+    text-align: center;
+    font-weight: bold;
+}
+
 div#cv-result {
     padding: 5px;
     margin: 10px 5px;
diff --git a/templates/index.mako b/templates/index.mako
index 2795fea..dafc474 100644
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -150,6 +150,7 @@
         </tr>
     </table>
 </form>
+
 % if result:
     <div id="generation-time">
         Results
@@ -166,14 +167,14 @@
     </div>
 
     % if query.turnitin:
-        <div id="turnitin-result" class="${'red' if query.turnitin_result else 'green'}-box">
-            <p>Turnitin results (this should be centered like "checked sources")</p>
-            % if query.turnitin_result:
-                Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them. (Does this need some sort of p tag or something?)
+        <div id="turnitin-container" class="${'red' if query.turnitin_result.reports else 'green'}-box">
+            <div id="turnitin-title">Turnitin Results</div>
+            % if query.turnitin_result.reports:
+                <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>
 
                 %for report in turnitin_result.reports:
                 <ul>
-                    <li><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report</a>
+                    <li><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid}</a>
                     <ul>
                     % for source in report.sources:
                           <li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url']}">${source['url']}</a></li>
@@ -181,10 +182,9 @@
                     </ul></li>
                 %endfor
                 </ul>
-                ${turnitin_result}
 
             % else:
-                Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.
+                <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
             % endif
         </div>
     % endif

From bf0aa22fa9f0a990f27d55798bf8fcac98990b10 Mon Sep 17 00:00:00 2001
From: Frances Hocutt <fhocutt@wikimedia.org>
Date: Thu, 17 Dec 2015 19:02:25 -0800
Subject: [PATCH 3/8] [WIP] improve docstrings and naming, mark TODO

---
 copyvios/turnitin.py | 38 ++++++++++++++++++++++++++++----------
 1 file changed, 28 insertions(+), 10 deletions(-)

diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py
index 0e99d92..a5f46de 100644
--- a/copyvios/turnitin.py
+++ b/copyvios/turnitin.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures woe)"""
+"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures)"""
 
 from ast import literal_eval
 import re
@@ -11,12 +11,17 @@ __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
 TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
 
 def search_turnitin(page_title, lang):
-    """ returns a list of tuples, one per report, each containing report id and data from the report"""
-    turnitin_data = _parse_reports(_make_api_request('The quick brown fox jumps over the lazy dog', lang))
+    """ returns a TurnitinResult, containing a list of TurnitinReport items, each containing report id and a list of dicts with data from the report"""
+    turnitin_data = _parse_plagiabot_result(_make_api_request(
+        'The quick brown fox jumps over the lazy dog', lang))  # FIXME: replace with page_title when the earwigbot dev setup is working properly
     turnitin_result = TurnitinResult(turnitin_data)
     return turnitin_result
 
 def _make_api_request(page_title, lang):
+    """ Query the plagiabot API for Turnitin reports for a given page
+    page_title : string containing title of the page in question
+    lang : string containing language code for the current project
+    """
     stripped_page_title = page_title.replace(' ', '_')
     api_parameters = {'action': 'suspected_diffs',
                       'page_title': stripped_page_title,
@@ -24,16 +29,19 @@ def _make_api_request(page_title, lang):
                       'report': 1}
 
     result = requests.get(TURNITIN_API_ENDPOINT, params=api_parameters)
-    parsed_result = literal_eval(result.text)  # should be ok with encoding, content-type utf-8
-    return parsed_result
+    # use literal_eval to *safely* parse the resulting dict-containing string
+    parsed_api_result = literal_eval(result.text)
+    return parsed_api_result
 
-def _parse_reports(turnitin_api_result):
-    reports_data = []
+def _parse_plagiabot_result(turnitin_api_result):
+    result_data = []
     for item in turnitin_api_result:
-        reports_data.append(_regex_magic(item['report']))
-    return reports_data
+        reports_data.append(_parse_report(item['report']))
+    return result_data
 
-def _regex_magic(report):
+def _parse_report(report):
+    """ Given the "report" bit from the plagiabot API, extract the report ID and the percent/words/url
+    """
     # ~magic~
     report_id_pattern = re.compile(r'\?rid=(\d*)')
     report_id = report_id_pattern.search(report).groups()[0]
@@ -44,6 +52,12 @@ def _regex_magic(report):
     return (report_id, results)
 
 class TurnitinResult:
+    """ Container class for TurnitinReports. Each page may have zero or
+    more reports of plagiarism, if plagiarism has been detected for
+    different revisions.
+
+    TurnitinResult.reports : list containing zero or more TurnitinReports
+    """
     def __init__(self, turnitin_data):
         self.reports = []
         for item in turnitin_data:
@@ -54,6 +68,10 @@ class TurnitinResult:
         return str(self.__dict__)
 
 class TurnitinReport:
+    """ Contains data for each Turnitin report.
+    TurnitinReport.sources : list of dicts with info from each source
+    TurnitinReport.reportid : Turnitin report ID, taken from plagiabot
+    """
     def __init__(self, data):
         self.reportid = data[0]
 

From 1ffa87da0b2a6503f74082bf869ebac7ed8cad56 Mon Sep 17 00:00:00 2001
From: Frances Hocutt <fhocutt@wikimedia.org>
Date: Fri, 18 Dec 2015 16:24:31 -0800
Subject: [PATCH 4/8] Improve turnitin.py docstrings, fix bugs

---
 copyvios/turnitin.py | 58 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py
index a5f46de..801b312 100644
--- a/copyvios/turnitin.py
+++ b/copyvios/turnitin.py
@@ -1,6 +1,4 @@
 # -*- coding: utf-8 -*-
-"""TODO: Docstrings, tests?, documentation of input/output formats (esp. nested data structures)"""
-
 from ast import literal_eval
 import re
 
@@ -11,16 +9,22 @@ __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
 TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
 
 def search_turnitin(page_title, lang):
-    """ returns a TurnitinResult, containing a list of TurnitinReport items, each containing report id and a list of dicts with data from the report"""
+    """ Search the Plagiabot database for Turnitin reports for a page.
+
+    Keyword arguments:
+    page_title -- string containing the page title
+    lang       -- string containing the page's project language code
+
+    Return a TurnitinResult (containing a list of TurnitinReports, with
+    report ID and source data).
+    """
     turnitin_data = _parse_plagiabot_result(_make_api_request(
-        'The quick brown fox jumps over the lazy dog', lang))  # FIXME: replace with page_title when the earwigbot dev setup is working properly
+        page_title, lang))
     turnitin_result = TurnitinResult(turnitin_data)
     return turnitin_result
 
 def _make_api_request(page_title, lang):
-    """ Query the plagiabot API for Turnitin reports for a given page
-    page_title : string containing title of the page in question
-    lang : string containing language code for the current project
+    """ Query the plagiabot API for Turnitin reports for a given page.
     """
     stripped_page_title = page_title.replace(' ', '_')
     api_parameters = {'action': 'suspected_diffs',
@@ -36,16 +40,15 @@ def _make_api_request(page_title, lang):
 def _parse_plagiabot_result(turnitin_api_result):
     result_data = []
     for item in turnitin_api_result:
-        reports_data.append(_parse_report(item['report']))
+        result_data.append(_parse_report(item['report']))
     return result_data
 
 def _parse_report(report):
-    """ Given the "report" bit from the plagiabot API, extract the report ID and the percent/words/url
-    """
-    # ~magic~
+    # extract report ID
     report_id_pattern = re.compile(r'\?rid=(\d*)')
     report_id = report_id_pattern.search(report).groups()[0]
 
+    # extract percent match, words, and URL for each source in the report
     extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
     results = extract_info_pattern.findall(report)
 
@@ -53,12 +56,18 @@ def _parse_report(report):
 
 class TurnitinResult:
     """ Container class for TurnitinReports. Each page may have zero or
-    more reports of plagiarism, if plagiarism has been detected for
-    different revisions.
+    more reports of plagiarism. The list will have multiple
+    TurnitinReports if plagiarism has been detected for more than one
+    revision.
 
-    TurnitinResult.reports : list containing zero or more TurnitinReports
+    TurnitinResult.reports -- list containing >= 0 TurnitinReport items
     """
     def __init__(self, turnitin_data):
+        """
+        Keyword argument:
+        turnitin_data -- list of tuples with data on each report; see
+                         TurnitinReport.__init__ for the contents.
+        """
         self.reports = []
         for item in turnitin_data:
             report = TurnitinReport(item)
@@ -68,11 +77,26 @@ class TurnitinResult:
         return str(self.__dict__)
 
 class TurnitinReport:
-    """ Contains data for each Turnitin report.
-    TurnitinReport.sources : list of dicts with info from each source
-    TurnitinReport.reportid : Turnitin report ID, taken from plagiabot
+    """ Contains data for each Turnitin report (one on each potentially
+    plagiarized revision).
+
+    TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
+    TurnitinReport.sources -- list of dicts with information on:
+        percent -- percent of revision found in source as well
+        words   -- number of words found in both source and revision
+        url     -- url for the possibly-plagiarized source
     """
     def __init__(self, data):
+        """
+        Keyword argument:
+        data -- tuple containing report data. All values are strings.
+            data[0] -- turnitin report ID
+            data[1] -- list of tuples with data on each source in the
+                       report
+               data[<index>][0] -- percent of revision found in source
+               data[<index>][1] -- number of words matching the source
+               data[<index>][2] -- url for the matched source
+        """
         self.reportid = data[0]
 
         self.sources = []

From 8161bcec548394aff722f3e571a5b779edd82d8b Mon Sep 17 00:00:00 2001
From: Frances Hocutt <fhocutt@wikimedia.org>
Date: Fri, 18 Dec 2015 16:30:12 -0800
Subject: [PATCH 5/8] Fix CSS margin to match other boxes

---
 static/style.css | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/static/style.css b/static/style.css
index cb0fa11..5ff70b9 100644
--- a/static/style.css
+++ b/static/style.css
@@ -65,7 +65,7 @@ div#info-box {
 
 div#turnitin-container {
     padding: 5px 10px;
-    margin: 15px 10px 10px 5px;
+    margin: 15px 5px 10px 5px;
 }
 
 div#turnitin-title {

From 6cafb14991955805245614fcbcb4a50b5f1f96c1 Mon Sep 17 00:00:00 2001
From: Frances Hocutt <fhocutt@wikimedia.org>
Date: Fri, 18 Dec 2015 18:38:13 -0800
Subject: [PATCH 6/8] Fix wrapping issue; start reworking report display

---
 templates/index.mako | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/templates/index.mako b/templates/index.mako
index dafc474..1e4340e 100644
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -114,9 +114,8 @@
                             <input id="cv-cb-links" class="cv-search" type="checkbox" name="use_links" value="1" ${'checked="checked"' if (query.use_links != "0") else ""} />
                             <label for="cv-cb-links">Use&nbsp;links&nbsp;in&nbsp;page</label>
                             <input class="cv-search" type="hidden" name="use_links" value="0" />
-                            <br>
-                            <input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/>
-                            <label for="cv-cb-turnitin">Find&nbsp;reports&nbsp;through&nbsp;Turnitin</label>
+                            <span style="white-space:nowrap"><input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/>
+                            <label for="cv-cb-turnitin">Search&nbsp;Turnitin&nbsp;reports</label></span>
                         </td>
                     </tr>
                     <tr>
@@ -172,16 +171,18 @@
             % if query.turnitin_result.reports:
                 <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>
 
+                <table id="turnitin-table"><tbody>
+                ## TODO: make this prettier/tabular
                 %for report in turnitin_result.reports:
-                <ul>
-                    <li><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid}</a>
+                    <tr><td><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid} for text added in revision ${loop.index}</a>
+## TODO: Rework this to something like: [Turnitin report](link) for [revision at timestamp](diff link). Requires API-result-parsing/TurnitinReport changes. Shouldn't be too bad. Reason: needs to make it clear that Turnitin is looking at individual revisions; current report does not.
                     <ul>
                     % for source in report.sources:
                           <li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url']}">${source['url']}</a></li>
                     % endfor
-                    </ul></li>
+                    </ul></td></tr>
                 %endfor
-                </ul>
+                </tbody></table>
 
             % else:
                 <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>

From 4e994f13022e7a0a38d93403a246468964f0ce54 Mon Sep 17 00:00:00 2001
From: Frances Hocutt <fhocutt@wikimedia.org>
Date: Tue, 22 Dec 2015 15:56:27 -0800
Subject: [PATCH 7/8] Refactor turnitin.py, incorporate diff link/timestamp

* Add a wiki timestamp parser to copyvios/misc.py
* Refactor copyvios/turnitin.py for more sensible structure
* Update templates/index.mako to incorporate diff link/timestamp and
  make it clearer that Turnitin is revision-based checking
---
 copyvios/misc.py     |  4 +++
 copyvios/turnitin.py | 70 ++++++++++++++++++++++++----------------------------
 templates/index.mako |  5 +---
 3 files changed, 37 insertions(+), 42 deletions(-)

diff --git a/copyvios/misc.py b/copyvios/misc.py
index 045386a..9c4d824 100644
--- a/copyvios/misc.py
+++ b/copyvios/misc.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8  -*-
 
+import datetime
 from os.path import expanduser
 
 from flask import g, request
@@ -64,6 +65,9 @@ def httpsfix(context, url):
         url = url[len("http:"):]
     return url
 
+def parse_wiki_timestamp(timestamp):
+    return datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S')
+
 def urlstrip(context, url):
     if url.startswith("http://"):
         url = url[7:]
diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py
index 801b312..07a3b8a 100644
--- a/copyvios/turnitin.py
+++ b/copyvios/turnitin.py
@@ -4,6 +4,8 @@ import re
 
 import requests
 
+from .misc import parse_wiki_timestamp
+
 __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
 
 TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
@@ -15,13 +17,9 @@ def search_turnitin(page_title, lang):
     page_title -- string containing the page title
     lang       -- string containing the page's project language code
 
-    Return a TurnitinResult (containing a list of TurnitinReports, with
-    report ID and source data).
+    Return a TurnitinResult (contains a list of TurnitinReports).
     """
-    turnitin_data = _parse_plagiabot_result(_make_api_request(
-        page_title, lang))
-    turnitin_result = TurnitinResult(turnitin_data)
-    return turnitin_result
+    return TurnitinResult(_make_api_request(page_title, lang))
 
 def _make_api_request(page_title, lang):
     """ Query the plagiabot API for Turnitin reports for a given page.
@@ -37,23 +35,6 @@ def _make_api_request(page_title, lang):
     parsed_api_result = literal_eval(result.text)
     return parsed_api_result
 
-def _parse_plagiabot_result(turnitin_api_result):
-    result_data = []
-    for item in turnitin_api_result:
-        result_data.append(_parse_report(item['report']))
-    return result_data
-
-def _parse_report(report):
-    # extract report ID
-    report_id_pattern = re.compile(r'\?rid=(\d*)')
-    report_id = report_id_pattern.search(report).groups()[0]
-
-    # extract percent match, words, and URL for each source in the report
-    extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
-    results = extract_info_pattern.findall(report)
-
-    return (report_id, results)
-
 class TurnitinResult:
     """ Container class for TurnitinReports. Each page may have zero or
     more reports of plagiarism. The list will have multiple
@@ -65,12 +46,12 @@ class TurnitinResult:
     def __init__(self, turnitin_data):
         """
         Keyword argument:
-        turnitin_data -- list of tuples with data on each report; see
-                         TurnitinReport.__init__ for the contents.
+        turnitin_data -- plagiabot API result
         """
         self.reports = []
         for item in turnitin_data:
-            report = TurnitinReport(item)
+            report = TurnitinReport(
+                item['diff_timestamp'], item['diff'], item['report'])
             self.reports.append(report)
 
     def __repr__(self):
@@ -80,27 +61,28 @@ class TurnitinReport:
     """ Contains data for each Turnitin report (one on each potentially
     plagiarized revision).
 
-    TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
-    TurnitinReport.sources -- list of dicts with information on:
+    TurnitinReport.reportid  -- Turnitin report ID, taken from plagiabot
+    TurnitinReport.diffid    -- diff ID from Wikipedia database
+    TurnitinReport.time_posted -- datetime of the time the diff posted
+    TurnitinReport.sources   -- list of dicts with information on:
         percent -- percent of revision found in source as well
         words   -- number of words found in both source and revision
         url     -- url for the possibly-plagiarized source
     """
-    def __init__(self, data):
+    def __init__(self, timestamp, diffid, report):
         """
         Keyword argument:
-        data -- tuple containing report data. All values are strings.
-            data[0] -- turnitin report ID
-            data[1] -- list of tuples with data on each source in the
-                       report
-               data[<index>][0] -- percent of revision found in source
-               data[<index>][1] -- number of words matching the source
-               data[<index>][2] -- url for the matched source
+        timestamp  -- diff timestamp from Wikipedia database
+        diffid     -- diff ID from Wikipedia database
+        report     -- Turnitin report from the plagiabot database
         """
-        self.reportid = data[0]
+        self.report_data = self._parse_report(report)
+        self.reportid = self.report_data[0]
+        self.diffid = diffid
+        self.time_posted = parse_wiki_timestamp(timestamp)
 
         self.sources = []
-        for item in data[1]:
+        for item in self.report_data[1]:
             source = {'percent': item[0],
                       'words': item[1],
                       'url': item[2]}
@@ -108,3 +90,15 @@ class TurnitinReport:
 
     def __repr__(self):
         return str(self.__dict__)
+
+    def _parse_report(self, report_text):
+        # extract report ID
+        report_id_pattern = re.compile(r'\?rid=(\d*)')
+        report_id = report_id_pattern.search(report_text).groups()[0]
+
+        # extract percent match, words, and URL for each source in the report
+        extract_info_pattern = re.compile(
+            r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
+        results = extract_info_pattern.findall(report_text)
+
+        return (report_id, results)
diff --git a/templates/index.mako b/templates/index.mako
index 1e4340e..5522517 100644
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -172,10 +172,8 @@
                 <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found revisions that may have been plagiarized. Please review them.</p>
 
                 <table id="turnitin-table"><tbody>
-                ## TODO: make this prettier/tabular
                 %for report in turnitin_result.reports:
-                    <tr><td><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid} for text added in revision ${loop.index}</a>
-## TODO: Rework this to something like: [Turnitin report](link) for [revision at timestamp](diff link). Requires API-result-parsing/TurnitinReport changes. Shouldn't be too bad. Reason: needs to make it clear that Turnitin is looking at individual revisions; current report does not.
+                    <tr><td id="turnitin-table-cell"><a href="https://tools.wmflabs.org/eranbot/ithenticate.py?rid=${report.reportid}">Turnitin report ${report.reportid}</a> for text added <a href="https://${query.lang}.wikipedia.org/w/index.php?title=${query.title}&diff=${report.diffid}"> at ${report.time_posted}</a>:
                     <ul>
                     % for source in report.sources:
                           <li> ${source['percent']}% of revision text (${source['words']} words) found at <a href="${source['url']}">${source['url']}</a></li>
@@ -183,7 +181,6 @@
                     </ul></td></tr>
                 %endfor
                 </tbody></table>
-
             % else:
                 <p>Turnitin (through <a href="https://en.wikipedia.org/wiki/User:EranBot">EranBot</a>) found no matching sources.</p>
             % endif

From 9a4dde16138a7ed5ee5a80f44e1351e50f658523 Mon Sep 17 00:00:00 2001
From: Frances Hocutt <fhocutt@wikimedia.org>
Date: Wed, 13 Jan 2016 17:09:13 -0800
Subject: [PATCH 8/8] Update Turnitin option label

---
 templates/index.mako | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/index.mako b/templates/index.mako
index 5522517..faf3b90 100644
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -115,7 +115,7 @@
                             <label for="cv-cb-links">Use&nbsp;links&nbsp;in&nbsp;page</label>
                             <input class="cv-search" type="hidden" name="use_links" value="0" />
                             <span style="white-space:nowrap"><input id="cv-cb-turnitin" class="cv-search" type="checkbox" name="turnitin" value="1" ${'checked="checked"' if (query.turnitin != "0") else ""}/>
-                            <label for="cv-cb-turnitin">Search&nbsp;Turnitin&nbsp;reports</label></span>
+                            <label for="cv-cb-turnitin">Use&nbsp;Turnitin&nbsp;database</label></span>
                         </td>
                     </tr>
                     <tr>