Fix highlighter for trigrams; fix HTML escaping.

12 years ago · 912e0dcce3
--- a/toolserver/copyvios/highlighter.py
+++ b/toolserver/copyvios/highlighter.py
@@ -6,6 +6,7 @@ from markupsafe import escape

 def highlight_delta(context, chain, delta):
    processed = []
    dchain = delta.chain
    prev_prev = prev = chain.START
    i = 0
    all_words = chain.text.split()
@@ -15,14 +16,22 @@ def highlight_delta(context, chain, delta):
        words = paragraph.split(" ")
        for i, word in enumerate(words, i):
            try:
                next = _strip_word(all_words[i+1])
                next = _strip_word(all_words[i + 1])
                try:
                    next_next = _strip_word(all_words[i + 2])
                except IndexError:
                    next_next = chain.END
            except IndexError:
                next = chain.END
                next = next_next = chain.END
            sword = _strip_word(word)
            block = (prev_prev, prev)  # Block for before
            alock = (prev, sword)  # Block for after
            before = [block in delta.chain and sword in delta.chain[block]]
            after = [alock in delta.chain and next in delta.chain[alock]]
            middle = (prev, sword) in dchain and next in dchain[(prev, sword)]
            if middle:
                before = after = True
            else:
                b_block = (prev_prev, prev)
                a_block = (sword, next)
                before = b_block in dchain and sword in dchain[b_block]
                after = a_block in dchain and next_next in dchain[a_block]
            is_first = i == 0
            is_last = i + 1 == len(all_words)
            res = _highlight_word(word, before, after, is_first, is_last)
@@ -38,7 +47,7 @@ def _highlight_word(word, before, after, is_first, is_last):
        # Word is in the middle of a highlighted block, so don't change
        # anything unless this is the first word (force block to start) or the
        # last word (force block to end):
        res = escape(word)
        res = unicode(escape(word))
        if is_first:
            res = u'<span class="cv-hl">' + res
        if is_last:
@@ -59,18 +68,21 @@ def _highlight_word(word, before, after, is_first, is_last):
            res += u"</span>"
    else:
        # Word is completely outside of a highlighted block, so do nothing:
        res = escape(word)
        res = unicode(escape(word))
    return res

 def _fade_word(word, dir):
    """Return HTML for *word* with a fade span at one edge.

    *word* is HTML-escaped before being placed in the markup. *dir* selects
    the fade direction: u"out" wraps the last four characters in a
    ``cv-hl-out`` span; any other value wraps the first four characters in
    a ``cv-hl-in`` span. Words of four characters or fewer are wrapped
    whole in a ``cv-hl-<dir>`` span.
    """
    # Each escaped piece is converted to a plain unicode string before it is
    # formatted into the markup.
    # NOTE(review): presumably this keeps markupsafe from re-escaping the
    # surrounding tags when callers concatenate the result -- confirm.
    if len(word) <= 4:
        word = unicode(escape(word))
        return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
    if dir == u"out":
        # Fade out: only the trailing four characters carry the fade class.
        before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:]))
        base = u'{0}<span class="cv-hl-out">{1}</span>'
        return base.format(before, after)
    else:
        # Fade in: only the leading four characters carry the fade class.
        before, after = unicode(escape(word[:4])), unicode(escape(word[4:]))
        base = u'<span class="cv-hl-in">{0}</span>{1}'
        return base.format(before, after)

 def _strip_word(word):
    return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)