A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

94 lines
3.0 KiB

  1. # -*- coding: utf-8 -*-
  2. from re import sub, UNICODE
  3. from markupsafe import escape
  4. def highlight_delta(context, chain, delta):
  5. degree = chain.degree - 1
  6. highlights = [False] * degree
  7. block = [chain.START] * degree
  8. for word in chain.text.split() + ([chain.END] * degree):
  9. word = _strip_word(chain, word)
  10. tblock = tuple(block)
  11. if tblock in delta.chain and word in delta.chain[tblock]:
  12. highlights[-1 * degree:] = [True] * degree
  13. highlights.append(True)
  14. else:
  15. highlights.append(False)
  16. block.pop(0)
  17. block.append(word)
  18. i = degree
  19. numwords = len(chain.text.split())
  20. processed = []
  21. paragraphs = chain.text.split("\n")
  22. while paragraphs:
  23. words = []
  24. for i, word in enumerate(_get_next(paragraphs), i):
  25. if highlights[i]:
  26. before = highlights[i - 1]
  27. after = highlights[i + 1]
  28. first = i == degree
  29. last = i - degree + 1 == numwords
  30. words.append(_highlight_word(word, before, after, first, last))
  31. else:
  32. words.append(unicode(escape(word)))
  33. processed.append(u" ".join(words))
  34. i += 1
  35. return u"<br /><br />".join(processed)
  36. def _get_next(paragraphs):
  37. paragraph = paragraphs.pop(0)
  38. body = paragraph.split()
  39. if len(body) <= 3:
  40. while paragraphs:
  41. next = paragraphs[0].split()
  42. if len(next) <= 3:
  43. body += next
  44. paragraphs.pop(0)
  45. else:
  46. break
  47. return body
  48. def _highlight_word(word, before, after, first, last):
  49. if before and after:
  50. # Word is in the middle of a highlighted block:
  51. res = unicode(escape(word))
  52. if first:
  53. res = u'<span class="cv-hl">' + res
  54. if last:
  55. res += u'</span>'
  56. elif after:
  57. # Word is the first in a highlighted block:
  58. res = u'<span class="cv-hl">' + _fade_word(word, u"in")
  59. if last:
  60. res += u"</span>"
  61. elif before:
  62. # Word is the last in a highlighted block:
  63. res = _fade_word(word, u"out") + u"</span>"
  64. if first:
  65. res = u'<span class="cv-hl">' + res
  66. else:
  67. res = unicode(escape(word))
  68. return res
  69. def _fade_word(word, dir):
  70. if len(word) <= 4:
  71. word = unicode(escape(word))
  72. return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
  73. if dir == u"out":
  74. before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:]))
  75. base = u'{0}<span class="cv-hl-out">{1}</span>'
  76. return base.format(before, after)
  77. else:
  78. before, after = unicode(escape(word[:4])), unicode(escape(word[4:]))
  79. base = u'<span class="cv-hl-in">{0}</span>{1}'
  80. return base.format(before, after)
  81. def _strip_word(chain, word):
  82. if word == chain.START or word == chain.END:
  83. return word
  84. return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)