From 309d4e8fd4e1e5e77d50ea96d11cdb19273f8c70 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@verizon.net>
Date: Tue, 4 Sep 2012 19:05:42 -0400
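Subject: [PATCH] Try a new highlighting engine.

Rework highlight_delta() to mark matching words in a first pass over the
text, using blocks of chain.degree - 1 words instead of the hard-coded
two-word lookbehind/lookahead, and render the paragraphs in a second
pass. _strip_word() now takes the chain so that the START and END
sentinels are returned unchanged.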
---
 toolserver/copyvios/highlighter.py | 73 ++++++++++++++++++--------------------
 1 file changed, 34 insertions(+), 39 deletions(-)
diff --git a/toolserver/copyvios/highlighter.py b/toolserver/copyvios/highlighter.py
index 7010c52..79c3ec9 100644
--- a/toolserver/copyvios/highlighter.py
+++ b/toolserver/copyvios/highlighter.py
@@ -5,66 +5,59 @@ from re import sub, UNICODE
 from markupsafe import escape
 
 def highlight_delta(context, chain, delta):
+    degree = chain.degree - 1
+    highlights = [False] * degree
+    block = [chain.START] * degree
+    for word in chain.text.split() + ([chain.END] * degree):
+        word = _strip_word(chain, word)
+        tblock = tuple(block)
+        if tblock in delta.chain and word in delta.chain[tblock]:
+            highlights[-1 * degree:] = [True] * degree
+            highlights.append(True)
+        else:
+            highlights.append(False)
+        block.pop(0)
+        block.append(word)
+
+    i = degree
+    numwords = len(chain.text.split())
     processed = []
-    dchain = delta.chain
-    prev_prev = prev = chain.START
-    i = 0
-    all_words = chain.text.split()
-    paragraphs = chain.text.split("\n")
-    for paragraph in paragraphs:
-        processed_words = []
-        words = paragraph.split(" ")
-        for i, word in enumerate(words, i):
-            try:
-                next = _strip_word(all_words[i + 1])
-                try:
-                    next_next = _strip_word(all_words[i + 2])
-                except IndexError:
-                    next_next = chain.END
-            except IndexError:
-                next = next_next = chain.END
-            sword = _strip_word(word)
-            middle = (prev, sword) in dchain and next in dchain[(prev, sword)]
-            if middle:
-                before = after = True
-            else:
-                b_block = (prev_prev, prev)
-                a_block = (sword, next)
-                before = b_block in dchain and sword in dchain[b_block]
-                after = a_block in dchain and next_next in dchain[a_block]
-            is_first = i == 0
-            is_last = i + 1 == len(all_words)
-            res = _highlight_word(word, before, after, is_first, is_last)
-            processed_words.append(res)
-            prev_prev = prev
-            prev = sword
-        processed.append(u" ".join(processed_words))
+    for paragraph in chain.text.split("\n"):
+        words = []
+        for i, word in enumerate(paragraph.split(), i):
+            before = highlights[i - 1]
+            after = highlights[i + 1]
+            first = i == degree
+            last = i - degree + 1 == numwords
+            words.append(_highlight_word(word, before, after, first, last))
+        processed.append(u" ".join(words))
         i += 1
+
     return u"<p>" + u"</p>\n<p>".join(processed) + u"</p>"
 
-def _highlight_word(word, before, after, is_first, is_last):
+def _highlight_word(word, before, after, first, last):
     if before and after:
         # Word is in the middle of a highlighted block, so don't change
         # anything unless this is the first word (force block to start) or the
         # last word (force block to end):
         res = unicode(escape(word))
-        if is_first:
+        if first:
             res = u'<span class="cv-hl">' + res
-        if is_last:
+        if last:
             res += u'</span>'
     elif before:
         # Word is the last in a highlighted block, so fade it out and then end
         # the block; force open a block before the word if this is the first
         # word:
         res = _fade_word(word, u"out") + u"</span>"
-        if is_first:
+        if first:
             res = u'<span class="cv-hl">' + res
     elif after:
         # Word is the first in a highlighted block, so start the block and then
         # fade it in; force close the block after the word if this is the last
         # word:
         res = u'<span class="cv-hl">' + _fade_word(word, u"in")
-        if is_last:
+        if last:
             res += u"</span>"
     else:
         # Word is completely outside of a highlighted block, so do nothing:
@@ -84,5 +77,7 @@ def _fade_word(word, dir):
         base = u'<span class="cv-hl-in">{0}</span>{1}'
         return base.format(before, after)
 
-def _strip_word(word):
+def _strip_word(chain, word):
+    if word == chain.START or word == chain.END:
+        return word
     return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)