A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

101 regels
3.2 KiB

  1. # -*- coding: utf-8 -*-
  2. from collections import deque
  3. from re import sub, UNICODE
  4. from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION
  5. from markupsafe import escape
  6. __all__ = ["highlight_delta"]
  7. def highlight_delta(context, chain, delta):
  8. degree = chain.degree - 1
  9. highlights = [False] * degree
  10. block = deque([chain.START] * degree)
  11. if not delta:
  12. delta = EMPTY_INTERSECTION
  13. for word in chain.text.split() + ([chain.END] * degree):
  14. word = _strip_word(chain, word)
  15. tblock = tuple(block)
  16. if tblock in delta.chain and word in delta.chain[tblock]:
  17. highlights[-1 * degree:] = [True] * degree
  18. highlights.append(True)
  19. else:
  20. highlights.append(False)
  21. block.popleft()
  22. block.append(word)
  23. i = degree
  24. numwords = len(chain.text.split())
  25. result = []
  26. paragraphs = deque(chain.text.split("\n"))
  27. while paragraphs:
  28. words = []
  29. for i, word in enumerate(_get_next(paragraphs), i):
  30. if highlights[i]:
  31. before = highlights[i - 1]
  32. after = highlights[i + 1]
  33. first = i == degree
  34. last = i - degree + 1 == numwords
  35. words.append(_highlight_word(word, before, after, first, last))
  36. else:
  37. words.append(unicode(escape(word)))
  38. result.append(u" ".join(words))
  39. i += 1
  40. return u"<br /><br />".join(result)
  41. def _get_next(paragraphs):
  42. body = []
  43. while paragraphs and not body:
  44. body = paragraphs.popleft().split()
  45. if body and len(body) <= 3:
  46. while paragraphs:
  47. next = paragraphs[0].split()
  48. if len(next) <= 3:
  49. body += next
  50. paragraphs.popleft()
  51. else:
  52. break
  53. return body
  54. def _highlight_word(word, before, after, first, last):
  55. if before and after:
  56. # Word is in the middle of a highlighted block:
  57. res = unicode(escape(word))
  58. if first:
  59. res = u'<span class="cv-hl">' + res
  60. if last:
  61. res += u'</span>'
  62. elif after:
  63. # Word is the first in a highlighted block:
  64. res = u'<span class="cv-hl">' + _fade_word(word, u"in")
  65. if last:
  66. res += u"</span>"
  67. elif before:
  68. # Word is the last in a highlighted block:
  69. res = _fade_word(word, u"out") + u"</span>"
  70. if first:
  71. res = u'<span class="cv-hl">' + res
  72. else:
  73. res = unicode(escape(word))
  74. return res
  75. def _fade_word(word, dir):
  76. if len(word) <= 4:
  77. word = unicode(escape(word))
  78. return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
  79. if dir == u"out":
  80. before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:]))
  81. base = u'{0}<span class="cv-hl-out">{1}</span>'
  82. return base.format(before, after)
  83. else:
  84. before, after = unicode(escape(word[:4])), unicode(escape(word[4:]))
  85. base = u'<span class="cv-hl-in">{0}</span>{1}'
  86. return base.format(before, after)
  87. def _strip_word(chain, word):
  88. if word == chain.START or word == chain.END:
  89. return word
  90. return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)