A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符
 
 
 
 
 

99 行
3.1 KiB

  1. # -*- coding: utf-8 -*-
  2. from re import sub, UNICODE
  3. from earwigbot.wiki.copyvios.markov import EMPTY_INTERSECTION
  4. from markupsafe import escape
  5. __all__ = ["highlight_delta"]
  6. def highlight_delta(context, chain, delta):
  7. degree = chain.degree - 1
  8. highlights = [False] * degree
  9. block = [chain.START] * degree
  10. if not delta:
  11. delta = EMPTY_INTERSECTION
  12. for word in chain.text.split() + ([chain.END] * degree):
  13. word = _strip_word(chain, word)
  14. tblock = tuple(block)
  15. if tblock in delta.chain and word in delta.chain[tblock]:
  16. highlights[-1 * degree:] = [True] * degree
  17. highlights.append(True)
  18. else:
  19. highlights.append(False)
  20. block.pop(0)
  21. block.append(word)
  22. i = degree
  23. numwords = len(chain.text.split())
  24. result = []
  25. paragraphs = chain.text.split("\n")
  26. while paragraphs:
  27. words = []
  28. for i, word in enumerate(_get_next(paragraphs), i):
  29. if highlights[i]:
  30. before = highlights[i - 1]
  31. after = highlights[i + 1]
  32. first = i == degree
  33. last = i - degree + 1 == numwords
  34. words.append(_highlight_word(word, before, after, first, last))
  35. else:
  36. words.append(unicode(escape(word)))
  37. result.append(u" ".join(words))
  38. i += 1
  39. return u"<br /><br />".join(result)
  40. def _get_next(paragraphs):
  41. paragraph = paragraphs.pop(0)
  42. body = paragraph.split()
  43. if len(body) <= 3:
  44. while paragraphs:
  45. next = paragraphs[0].split()
  46. if len(next) <= 3:
  47. body += next
  48. paragraphs.pop(0)
  49. else:
  50. break
  51. return body
  52. def _highlight_word(word, before, after, first, last):
  53. if before and after:
  54. # Word is in the middle of a highlighted block:
  55. res = unicode(escape(word))
  56. if first:
  57. res = u'<span class="cv-hl">' + res
  58. if last:
  59. res += u'</span>'
  60. elif after:
  61. # Word is the first in a highlighted block:
  62. res = u'<span class="cv-hl">' + _fade_word(word, u"in")
  63. if last:
  64. res += u"</span>"
  65. elif before:
  66. # Word is the last in a highlighted block:
  67. res = _fade_word(word, u"out") + u"</span>"
  68. if first:
  69. res = u'<span class="cv-hl">' + res
  70. else:
  71. res = unicode(escape(word))
  72. return res
  73. def _fade_word(word, dir):
  74. if len(word) <= 4:
  75. word = unicode(escape(word))
  76. return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
  77. if dir == u"out":
  78. before, after = unicode(escape(word[:-4])), unicode(escape(word[-4:]))
  79. base = u'{0}<span class="cv-hl-out">{1}</span>'
  80. return base.format(before, after)
  81. else:
  82. before, after = unicode(escape(word[:4])), unicode(escape(word[4:]))
  83. base = u'<span class="cv-hl-in">{0}</span>{1}'
  84. return base.format(before, after)
  85. def _strip_word(chain, word):
  86. if word == chain.START or word == chain.END:
  87. return word
  88. return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)