A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

100 lines
3.1 KiB

  1. # -*- coding: utf-8 -*-
  2. from collections import deque
  3. from re import sub, UNICODE
  4. from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION
  5. from markupsafe import escape
  6. __all__ = ["highlight_delta"]
  7. def highlight_delta(context, chain, delta):
  8. degree = chain.degree - 1
  9. highlights = [False] * degree
  10. block = deque([chain.START] * degree)
  11. if not delta:
  12. delta = EMPTY_INTERSECTION
  13. for word in chain.text.split() + ([chain.END] * degree):
  14. word = _strip_word(chain, word)
  15. block.append(word)
  16. if tuple(block) in delta.chain:
  17. highlights[-1 * degree:] = [True] * degree
  18. highlights.append(True)
  19. else:
  20. highlights.append(False)
  21. block.popleft()
  22. i = degree
  23. numwords = len(chain.text.split())
  24. result = []
  25. paragraphs = deque(chain.text.split("\n"))
  26. while paragraphs:
  27. words = []
  28. for i, word in enumerate(_get_next(paragraphs), i):
  29. if highlights[i]:
  30. before = highlights[i - 1]
  31. after = highlights[i + 1]
  32. first = i == degree
  33. last = i - degree + 1 == numwords
  34. words.append(_highlight_word(word, before, after, first, last))
  35. else:
  36. words.append(unicode(escape(word)))
  37. result.append(u" ".join(words))
  38. i += 1
  39. return u"<br /><br />".join(result)
  40. def _get_next(paragraphs):
  41. body = []
  42. while paragraphs and not body:
  43. body = paragraphs.popleft().split()
  44. if body and len(body) <= 3:
  45. while paragraphs:
  46. next = paragraphs[0].split()
  47. if len(next) <= 3:
  48. body += next
  49. paragraphs.popleft()
  50. else:
  51. break
  52. return body
  53. def _highlight_word(word, before, after, first, last):
  54. if before and after:
  55. # Word is in the middle of a highlighted block:
  56. res = unicode(escape(word))
  57. if first:
  58. res = u'<span class="cv-hl">' + res
  59. if last:
  60. res += u'</span>'
  61. elif after:
  62. # Word is the first in a highlighted block:
  63. res = u'<span class="cv-hl">' + _fade_word(word, u"in")
  64. if last:
  65. res += u"</span>"
  66. elif before:
  67. # Word is the last in a highlighted block:
  68. res = _fade_word(word, u"out") + u"</span>"
  69. if first:
  70. res = u'<span class="cv-hl">' + res
  71. else:
  72. res = unicode(escape(word))
  73. return res
  74. def _fade_word(word, dir):
  75. if len(word) <= 4:
  76. word = unicode(escape(word))
  77. return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
  78. if dir == u"out":
  79. before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:]))
  80. base = u'{0}<span class="cv-hl-out">{1}</span>'
  81. return base.format(before, after)
  82. else:
  83. before, after = unicode(escape(word[:4])), unicode(escape(word[4:]))
  84. base = u'<span class="cv-hl-in">{0}</span>{1}'
  85. return base.format(before, after)
  86. def _strip_word(chain, word):
  87. if word == chain.START or word == chain.END:
  88. return word
  89. return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)