A copyright violation detector running on Wikimedia Cloud Services https://tools.wmflabs.org/copyvios/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

72 lines
2.6 KiB

  1. # -*- coding: utf-8 -*-
  2. from re import sub, UNICODE
  def highlight_delta(context, chain, delta):
      """Render ``chain.text`` as HTML with words matched by *delta* highlighted.

      The text is split into paragraphs on newlines and into words on single
      spaces. For each word, two lookups are made in ``delta.chain``:

      * "before": the two preceding stripped words form a key whose entry
        contains this word;
      * "after": the preceding stripped word and this word form a key whose
        entry contains the following stripped word.

      The two results, together with first-word/last-word flags, are passed to
      ``_highlight_word`` to decide which ``<span>`` markup surrounds the word.

      Args:
          context: Not used by this function; kept so the signature stays
              compatible with existing callers.
          chain: Object providing ``text`` (the string to render) and the
              ``START`` / ``END`` sentinels used at the text boundaries.
          delta: Object whose ``chain`` attribute maps 2-tuples of words to a
              container of words that may follow them.

      Returns:
          The processed paragraphs joined with ``<br /><br />``.
      """
      # NOTE(review): words are inserted into the markup without HTML
      # escaping -- confirm that chain.text is escaped or trusted upstream.
      processed = []
      prev_prev = prev = chain.START
      i = 0
      # NOTE(review): all_words is split on any whitespace, while the loops
      # below split on "\n" and " ". The running index ``i`` lines up with
      # all_words only if words are separated by exactly one space or one
      # newline -- verify against how chain.text is produced.
      all_words = chain.text.split()
      word_count = len(all_words)
      for paragraph in chain.text.split("\n"):
          processed_words = []
          # Continue numbering from where the previous paragraph stopped.
          for i, word in enumerate(paragraph.split(" "), i):
              try:
                  upcoming = all_words[i + 1]
              except IndexError:
                  # No following word: use the end-of-text sentinel.
                  following = chain.END
              else:
                  following = _strip_word(upcoming)
              sword = _strip_word(word)
              block = (prev_prev, prev)  # Block for before
              alock = (prev, sword)  # Block for after
              # These must be plain booleans. Wrapping them in a list (as the
              # code previously did) makes them always truthy, which marks
              # every word as being inside a highlighted block.
              before = block in delta.chain and sword in delta.chain[block]
              after = alock in delta.chain and following in delta.chain[alock]
              is_first = i == 0
              is_last = i + 1 == word_count
              res = _highlight_word(word, before, after, is_first, is_last)
              processed_words.append(res)
              prev_prev = prev
              prev = sword
          processed.append(u" ".join(processed_words))
          # enumerate() left ``i`` on the last word of this paragraph; step
          # past it so the next paragraph starts on a fresh index.
          i += 1
      return u"<br /><br />".join(processed)
  31. def _highlight_word(word, before, after, is_first, is_last):
  32. if before and after:
  33. # Word is in the middle of a highlighted block, so don't change
  34. # anything unless this is the first word (force block to start) or
  35. # the last word (force block to end):
  36. res = word
  37. if is_first:
  38. res = u'<span class="cv-hl">' + res
  39. if is_last:
  40. res += u'</span>'
  41. elif before:
  42. # Word is the last in a highlighted block, so fade it out and then
  43. # end the block; force open a block before the word if this is the
  44. # first word:
  45. res = _fade_word(word, u"out") + u"</span>"
  46. if is_first:
  47. res = u'<span class="cv-hl">' + res
  48. elif after:
  49. # Word is the first in a highlighted block, so start the block and
  50. # then fade it in; force close the block after the word if this is
  51. # the last word:
  52. res = u'<span class="cv-hl">' + _fade_word(word, u"in")
  53. if is_last:
  54. res += u"</span>"
  55. else:
  56. # Word is completely outside of a highlighted block, so do nothing:
  57. res = word
  58. return res
  59. def _fade_word(word, dir):
  60. if len(word) <= 4:
  61. return u'<span class="cv-hl-{0}">{1}</span>'.format(dir, word)
  62. if dir == u"out":
  63. return u'{0}<span class="cv-hl-out">{1}</span>'.format(word[:-4], word[-4:])
  64. return u'<span class="cv-hl-in">{0}</span>{1}'.format(word[:4], word[4:])
  65. def _strip_word(word):
  66. return sub("[^\w\s-]", "", word.lower(), flags=UNICODE)