From 75058997c2a33fec82e2df6ca652b5579a59482a Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 20 Nov 2015 05:51:44 -0600
Subject: [PATCH] Split copyvio queries a bit differently; maybe better on
 other languages.

---
 earwigbot/wiki/copyvios/parsers.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index d843ad5..546a138 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -61,6 +61,7 @@ class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
     TYPE = "Article"
     TEMPLATE_MERGE_THRESHOLD = 35
+    SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]")
 
     def _merge_templates(self, code):
         """Merge template contents in to wikicode when the values are long."""
@@ -132,6 +133,12 @@ class ArticleTextParser(_BaseTextParser):
         directory (*nltk_dir*) is required to store nltk's punctuation
         database. This is typically located in the bot's working directory.
         """
+        def cut_string(fragment):
+            words = fragment.split()
+            while len(" ".join(words)) > max_query:
+                words.pop()
+            return " ".join(words)
+
         datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
         try:
             tokenizer = nltk.data.load("file:" + datafile)
@@ -141,16 +148,14 @@ class ArticleTextParser(_BaseTextParser):
 
         sentences = []
         for sentence in tokenizer.tokenize(self.clean):
-            if len(sentence) > max_query:
-                words = sentence.split()
-                while len(" ".join(words)) > max_query:
-                    words.pop()
-                sentence = " ".join(words)
-            if len(sentence) < min_query:
-                continue
-            sentences.append(sentence)
-
-        if max_chunks >= len(sentences):
+            if len(sentence) <= max_query:
+                sentences.append(sentence)
+            else:
+                sentences.extend(cut_string(fragment) for fragment in
+                                 self.SPLIT_REGEX.split(sentence))
+
+        sentences = [sen for sen in sentences if len(sen) >= min_query]
+        if len(sentences) <= max_chunks:
             return sentences
 
         chunks = []
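
For reviewers: below is a minimal standalone sketch of the new chunking flow,
for illustration only; it is not part of the patch. SPLIT_REGEX and the body
of cut_string are copied from the diff above, while MAX_QUERY, MIN_QUERY, and
make_queries are hypothetical stand-ins for the method's max_query/min_query
parameters and its enclosing strip/chunk logic.

    import re

    # Copied from the patch: punctuation on which over-long sentences split.
    SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]")

    # Hypothetical limits; in earwigbot these are parameters of the method.
    MAX_QUERY = 60
    MIN_QUERY = 8

    def cut_string(fragment):
        # Drop trailing words until the fragment fits within the query limit.
        words = fragment.split()
        while len(" ".join(words)) > MAX_QUERY:
            words.pop()
        return " ".join(words)

    def make_queries(tokenized_sentences):
        sentences = []
        for sentence in tokenized_sentences:
            if len(sentence) <= MAX_QUERY:
                sentences.append(sentence)
            else:
                # New behavior: split an over-long sentence on punctuation
                # first, then trim each fragment, instead of truncating the
                # whole sentence to the query limit.
                sentences.extend(cut_string(fragment)
                                 for fragment in SPLIT_REGEX.split(sentence))
        # Too-short fragments are now filtered out in one pass at the end.
        return [sen for sen in sentences if len(sen) >= MIN_QUERY]

The practical difference: the old code kept at most one truncated query per
long sentence, while the new code can yield several usable queries from it.
This should help on wikis whose languages the English punkt tokenizer handles
poorly, where "sentences" can run very long.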