A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

96 regels
3.0 KiB

  1. # -*- coding: utf-8 -*-
  2. from re import sub, UNICODE
  3. from markupsafe import escape
  4. __all__ = ["highlight_delta"]
  5. def highlight_delta(context, chain, delta):
  6. degree = chain.degree - 1
  7. highlights = [False] * degree
  8. block = [chain.START] * degree
  9. for word in chain.text.split() + ([chain.END] * degree):
  10. word = _strip_word(chain, word)
  11. tblock = tuple(block)
  12. if tblock in delta.chain and word in delta.chain[tblock]:
  13. highlights[-1 * degree:] = [True] * degree
  14. highlights.append(True)
  15. else:
  16. highlights.append(False)
  17. block.pop(0)
  18. block.append(word)
  19. i = degree
  20. numwords = len(chain.text.split())
  21. processed = []
  22. paragraphs = chain.text.split("\n")
  23. while paragraphs:
  24. words = []
  25. for i, word in enumerate(_get_next(paragraphs), i):
  26. if highlights[i]:
  27. before = highlights[i - 1]
  28. after = highlights[i + 1]
  29. first = i == degree
  30. last = i - degree + 1 == numwords
  31. words.append(_highlight_word(word, before, after, first, last))
  32. else:
  33. words.append(unicode(escape(word)))
  34. processed.append(u" ".join(words))
  35. i += 1
  36. return u"<br /><br />".join(processed)
  37. def _get_next(paragraphs):
  38. paragraph = paragraphs.pop(0)
  39. body = paragraph.split()
  40. if len(body) <= 3:
  41. while paragraphs:
  42. next = paragraphs[0].split()
  43. if len(next) <= 3:
  44. body += next
  45. paragraphs.pop(0)
  46. else:
  47. break
  48. return body
  49. def _highlight_word(word, before, after, first, last):
  50. if before and after:
  51. # Word is in the middle of a highlighted block:
  52. res = unicode(escape(word))
  53. if first:
  54. res = u'<span class="cv-hl">' + res
  55. if last:
  56. res += u'</span>'
  57. elif after:
  58. # Word is the first in a highlighted block:
  59. res = u'<span class="cv-hl">' + _fade_word(word, u"in")
  60. if last:
  61. res += u"</span>"
  62. elif before:
  63. # Word is the last in a highlighted block:
  64. res = _fade_word(word, u"out") + u"</span>"
  65. if first:
  66. res = u'<span class="cv-hl">' + res
  67. else:
  68. res = unicode(escape(word))
  69. return res
  70. def _fade_word(word, dir):
  71. if len(word) <= 4:
  72. word = unicode(escape(word))
  73. return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
  74. if dir == u"out":
  75. before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:]))
  76. base = u'{0}<span class="cv-hl-out">{1}</span>'
  77. return base.format(before, after)
  78. else:
  79. before, after = unicode(escape(word[:4])), unicode(escape(word[4:]))
  80. base = u'<span class="cv-hl-in">{0}</span>{1}'
  81. return base.format(before, after)
  82. def _strip_word(chain, word):
  83. if word == chain.START or word == chain.END:
  84. return word
  85. return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)