A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

пре 10 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
пре 11 година
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. # -*- coding: utf-8 -*-
  2. from collections import deque
  3. from re import sub, UNICODE
  4. from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION
  5. from markupsafe import escape
  6. __all__ = ["highlight_delta"]
  7. def highlight_delta(context, chain, delta):
  8. degree = chain.degree - 1
  9. highlights = [False] * degree
  10. block = deque([chain.START] * degree)
  11. if not delta:
  12. delta = EMPTY_INTERSECTION
  13. for word in chain.text.split() + ([chain.END] * degree):
  14. word = _strip_word(chain, word)
  15. block.append(word)
  16. if tuple(block) in delta.chain:
  17. highlights[-1 * degree:] = [True] * degree
  18. highlights.append(True)
  19. else:
  20. highlights.append(False)
  21. block.popleft()
  22. i = degree
  23. numwords = len(chain.text.split())
  24. result = []
  25. paragraphs = deque(chain.text.split("\n"))
  26. while paragraphs:
  27. words = []
  28. for i, word in enumerate(_get_next(paragraphs), i):
  29. if highlights[i]:
  30. before = highlights[i - 1]
  31. after = highlights[i + 1]
  32. first = i == degree
  33. last = i - degree + 1 == numwords
  34. words.append(_highlight_word(word, before, after, first, last))
  35. else:
  36. words.append(unicode(escape(word)))
  37. result.append(u" ".join(words))
  38. i += 1
  39. return u"<br /><br />".join(result)
  40. def _get_next(paragraphs):
  41. body = []
  42. while paragraphs and not body:
  43. body = paragraphs.popleft().split()
  44. if body and len(body) <= 3:
  45. while paragraphs:
  46. next = paragraphs[0].split()
  47. if len(next) <= 3:
  48. body += next
  49. paragraphs.popleft()
  50. else:
  51. break
  52. return body
  53. def _highlight_word(word, before, after, first, last):
  54. if before and after:
  55. # Word is in the middle of a highlighted block:
  56. res = unicode(escape(word))
  57. if first:
  58. res = u'<span class="cv-hl">' + res
  59. if last:
  60. res += u'</span>'
  61. elif after:
  62. # Word is the first in a highlighted block:
  63. res = u'<span class="cv-hl">' + _fade_word(word, u"in")
  64. if last:
  65. res += u"</span>"
  66. elif before:
  67. # Word is the last in a highlighted block:
  68. res = _fade_word(word, u"out") + u"</span>"
  69. if first:
  70. res = u'<span class="cv-hl">' + res
  71. else:
  72. res = unicode(escape(word))
  73. return res
  74. def _fade_word(word, dir):
  75. if len(word) <= 4:
  76. word = unicode(escape(word))
  77. return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
  78. if dir == u"out":
  79. before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:]))
  80. base = u'{0}<span class="cv-hl-out">{1}</span>'
  81. return base.format(before, after)
  82. else:
  83. before, after = unicode(escape(word[:4])), unicode(escape(word[4:]))
  84. base = u'<span class="cv-hl-in">{0}</span>{1}'
  85. return base.format(before, after)
  86. def _strip_word(chain, word):
  87. if word == chain.START or word == chain.END:
  88. return word
  89. return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)