From 912e0dcce3cb67e92cb59df8ca940518df99e0d8 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 4 Sep 2012 14:15:38 -0400 Subject: [PATCH] Fix highlighter for trigrams; fix HTML escaping. --- toolserver/copyvios/highlighter.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/toolserver/copyvios/highlighter.py b/toolserver/copyvios/highlighter.py index 67efb7f..ba4df87 100644 --- a/toolserver/copyvios/highlighter.py +++ b/toolserver/copyvios/highlighter.py @@ -6,6 +6,7 @@ from markupsafe import escape def highlight_delta(context, chain, delta): processed = [] + dchain = delta.chain prev_prev = prev = chain.START i = 0 all_words = chain.text.split() @@ -15,14 +16,22 @@ def highlight_delta(context, chain, delta): words = paragraph.split(" ") for i, word in enumerate(words, i): try: - next = _strip_word(all_words[i+1]) + next = _strip_word(all_words[i + 1]) + try: + next_next = _strip_word(all_words[i + 2]) + except IndexError: + next_next = chain.END except IndexError: - next = chain.END + next = next_next = chain.END sword = _strip_word(word) - block = (prev_prev, prev) # Block for before - alock = (prev, sword) # Block for after - before = [block in delta.chain and sword in delta.chain[block]] - after = [alock in delta.chain and next in delta.chain[alock]] + middle = (prev, sword) in dchain and next in dchain[(prev, sword)] + if middle: + before = after = True + else: + b_block = (prev_prev, prev) + a_block = (sword, next) + before = b_block in dchain and sword in dchain[b_block] + after = a_block in dchain and next_next in dchain[a_block] is_first = i == 0 is_last = i + 1 == len(all_words) res = _highlight_word(word, before, after, is_first, is_last) @@ -38,7 +47,7 @@ def _highlight_word(word, before, after, is_first, is_last): # Word is in the middle of a highlighted block, so don't change # anything unless this is the first word (force block to start) or the # last word (force block to end): - res = 
escape(word) + res = unicode(escape(word)) if is_first: res = u'' + res if is_last: @@ -59,18 +68,21 @@ def _highlight_word(word, before, after, is_first, is_last): res += u"" else: # Word is completely outside of a highlighted block, so do nothing: - res = escape(word) + res = unicode(escape(word)) return res def _fade_word(word, dir): if len(word) <= 4: - return u'{1}'.format(dir, escape(word)) + word = unicode(escape(word)) + return u'{1}'.format(dir, word) if dir == u"out": + before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:])) base = u'{0}{1}' - return base.format(escape(word[:-4]), escape(word[-4:])) + return base.format(before, after) else: + before, after = unicode(escape(word[:4])), unicode(escape(word[4:])) base = u'{0}{1}' - return base.format(escape(word[:4]), escape(word[4:])) + return base.format(before, after) def _strip_word(word): return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)