From 4e994f13022e7a0a38d93403a246468964f0ce54 Mon Sep 17 00:00:00 2001
From: Frances Hocutt
Date: Tue, 22 Dec 2015 15:56:27 -0800
Subject: [PATCH] Refactor turnitin.py, incorporate diff link/timestamp

* Add a wiki timestamp parser to copyvios/misc.py
* Refactor copyvios/turnitin.py for more sensible structure
* Update templates/index.mako to incorporate diff link/timestamp and make
  it clearer that Turnitin is revision-based checking
---
 copyvios/misc.py     |  4 +++
 copyvios/turnitin.py | 70 ++++++++++++++++++++++++----------------------
 templates/index.mako |  5 +---
 3 files changed, 37 insertions(+), 42 deletions(-)

diff --git a/copyvios/misc.py b/copyvios/misc.py
index 045386a..9c4d824 100644
--- a/copyvios/misc.py
+++ b/copyvios/misc.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+import datetime
 from os.path import expanduser
 
 from flask import g, request
@@ -64,6 +65,9 @@ def httpsfix(context, url):
         url = url[len("http:"):]
     return url
 
+def parse_wiki_timestamp(timestamp):
+    return datetime.datetime.strptime(timestamp, '%Y%m%d%H%M%S')
+
 def urlstrip(context, url):
     if url.startswith("http://"):
         url = url[7:]
diff --git a/copyvios/turnitin.py b/copyvios/turnitin.py
index 801b312..07a3b8a 100644
--- a/copyvios/turnitin.py
+++ b/copyvios/turnitin.py
@@ -4,6 +4,8 @@ import re
 
 import requests
 
+from .misc import parse_wiki_timestamp
+
 __all__ = ['search_turnitin', 'TURNITIN_API_ENDPOINT']
 
 TURNITIN_API_ENDPOINT = 'http://tools.wmflabs.org/eranbot/plagiabot/api.py'
@@ -15,13 +17,9 @@ def search_turnitin(page_title, lang):
     page_title -- string containing the page title
     lang -- string containing the page's project language code
 
-    Return a TurnitinResult (containing a list of TurnitinReports, with
-    report ID and source data).
+    Return a TurnitinResult (contains a list of TurnitinReports).
     """
-    turnitin_data = _parse_plagiabot_result(_make_api_request(
-        page_title, lang))
-    turnitin_result = TurnitinResult(turnitin_data)
-    return turnitin_result
+    return TurnitinResult(_make_api_request(page_title, lang))
 
 def _make_api_request(page_title, lang):
     """ Query the plagiabot API for Turnitin reports for a given page.
@@ -37,23 +35,6 @@ def _make_api_request(page_title, lang):
     parsed_api_result = literal_eval(result.text)
     return parsed_api_result
 
-def _parse_plagiabot_result(turnitin_api_result):
-    result_data = []
-    for item in turnitin_api_result:
-        result_data.append(_parse_report(item['report']))
-    return result_data
-
-def _parse_report(report):
-    # extract report ID
-    report_id_pattern = re.compile(r'\?rid=(\d*)')
-    report_id = report_id_pattern.search(report).groups()[0]
-
-    # extract percent match, words, and URL for each source in the report
-    extract_info_pattern = re.compile(r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
-    results = extract_info_pattern.findall(report)
-
-    return (report_id, results)
-
 class TurnitinResult:
     """ Container class for TurnitinReports. Each page may have
     zero or more reports of plagiarism. The list will have multiple
@@ -65,12 +46,12 @@ class TurnitinResult:
     def __init__(self, turnitin_data):
         """ Keyword argument:
-        turnitin_data -- list of tuples with data on each report; see
-        TurnitinReport.__init__ for the contents.
+        turnitin_data -- plagiabot API result
         """
         self.reports = []
         for item in turnitin_data:
-            report = TurnitinReport(item)
+            report = TurnitinReport(
+                item['diff_timestamp'], item['diff'], item['report'])
             self.reports.append(report)
 
     def __repr__(self):
         return str(self.__dict__)
@@ -80,27 +61,28 @@ class TurnitinReport:
     """ Contains data for each Turnitin report (one on each
     potentially plagiarized revision).
 
-    TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
-    TurnitinReport.sources -- list of dicts with information on:
+    TurnitinReport.reportid -- Turnitin report ID, taken from plagiabot
+    TurnitinReport.diffid -- diff ID from Wikipedia database
+    TurnitinReport.time_posted -- datetime of the time the diff posted
+    TurnitinReport.sources -- list of dicts with information on:
         percent -- percent of revision found in source as well
         words -- number of words found in both source and revision
        url -- url for the possibly-plagiarized source
     """
-    def __init__(self, data):
+    def __init__(self, timestamp, diffid, report):
         """ Keyword argument:
-        data -- tuple containing report data. All values are strings.
-        data[0] -- turnitin report ID
-        data[1] -- list of tuples with data on each source in the
-        report
-        data[][0] -- percent of revision found in source
-        data[][1] -- number of words matching the source
-        data[][2] -- url for the matched source
+        timestamp -- diff timestamp from Wikipedia database
+        diffid -- diff ID from Wikipedia database
+        report -- Turnitin report from the plagiabot database
         """
-        self.reportid = data[0]
+        self.report_data = self._parse_report(report)
+        self.reportid = self.report_data[0]
+        self.diffid = diffid
+        self.time_posted = parse_wiki_timestamp(timestamp)
         self.sources = []
-        for item in data[1]:
+        for item in self.report_data[1]:
             source = {'percent': item[0],
                       'words': item[1],
                       'url': item[2]}
@@ -108,3 +90,15 @@
 
     def __repr__(self):
         return str(self.__dict__)
+
+    def _parse_report(self, report_text):
+        # extract report ID
+        report_id_pattern = re.compile(r'\?rid=(\d*)')
+        report_id = report_id_pattern.search(report_text).groups()[0]
+
+        # extract percent match, words, and URL for each source in the report
+        extract_info_pattern = re.compile(
+            r'\n\* \w\s+(\d*)\% (\d*) words at \[(.*?) ')
+        results = extract_info_pattern.findall(report_text)
+
+        return (report_id, results)
diff --git a/templates/index.mako b/templates/index.mako
index 1e4340e..5522517 100644
--- a/templates/index.mako
+++ b/templates/index.mako
@@ -172,10 +172,8 @@

             Turnitin (through EranBot) found revisions that may have been plagiarized. Please review them.
 
-            ## TODO: make this prettier/tabular
             %for report in turnitin_result.reports:
-                Turnitin report ${report.reportid} for text added in revision ${loop.index}
-## TODO: Rework this to something like: [Turnitin report](link) for [revision at timestamp](diff link). Requires API-result-parsing/TurnitinReport changes. Shouldn't be too bad. Reason: needs to make it clear that Turnitin is looking at individual revisions; current report does not.
+                Turnitin report ${report.reportid} for text added at ${report.time_posted}:
                 % for source in report.sources:
                     ${source['percent']}% of revision text (${source['words']} words) found at ${source['url']}
                 % endfor
@@ -183,7 +181,6 @@
             %endfor
-
         % else:
 
             Turnitin (through EranBot) found no matching sources.
 
         % endif
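
Usage sketch (illustration only, not part of the patch): the refactored turnitin.py above is expected to be driven roughly as follows. The page title and language code are placeholders, and the snippet assumes the plagiabot API returns dicts with 'diff', 'diff_timestamp', and 'report' keys, which is what the new TurnitinResult constructor expects.

    from copyvios.turnitin import search_turnitin

    result = search_turnitin("Example article", "en")  # placeholder title/lang
    for report in result.reports:
        # Each TurnitinReport now exposes the diff ID and a parsed datetime
        # alongside the Turnitin report ID extracted by _parse_report().
        line = "report %s covers diff %s made at %s" % (
            report.reportid, report.diffid, report.time_posted)
        for source in report.sources:
            # Each source is a dict with 'percent', 'words', and 'url' keys.
            line += "\n  %s%% (%s words) at %s" % (
                source['percent'], source['words'], source['url'])
        print(line)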