From 75058997c2a33fec82e2df6ca652b5579a59482a Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 20 Nov 2015 05:51:44 -0600
Subject: [PATCH] Split copyvio queries a bit differently; maybe better on
 other languages.

---
 earwigbot/wiki/copyvios/parsers.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index d843ad5..546a138 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -61,6 +61,7 @@ class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
     TYPE = "Article"
     TEMPLATE_MERGE_THRESHOLD = 35
+    SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]")
 
     def _merge_templates(self, code):
         """Merge template contents in to wikicode when the values are long."""
@@ -132,6 +133,12 @@ class ArticleTextParser(_BaseTextParser):
         directory (*nltk_dir*) is required to store nltk's punctuation
         database. This is typically located in the bot's working directory.
         """
+        def cut_string(fragment):
+            words = fragment.split()
+            while len(" ".join(words)) > max_query:
+                words.pop()
+            return " ".join(words)
+
         datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
         try:
             tokenizer = nltk.data.load("file:" + datafile)
@@ -141,16 +148,14 @@ class ArticleTextParser(_BaseTextParser):
 
         sentences = []
         for sentence in tokenizer.tokenize(self.clean):
-            if len(sentence) > max_query:
-                words = sentence.split()
-                while len(" ".join(words)) > max_query:
-                    words.pop()
-                sentence = " ".join(words)
-            if len(sentence) < min_query:
-                continue
-            sentences.append(sentence)
-
-        if max_chunks >= len(sentences):
+            if len(sentence) <= max_query:
+                sentences.append(sentence)
+            else:
+                sentences.extend(cut_string(fragment) for fragment in
+                                 self.SPLIT_REGEX.split(sentence))
+
+        sentences = [sen for sen in sentences if len(sen) >= min_query]
+        if len(sentences) <= max_chunks:
             return sentences
 
         chunks = []
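
For reviewers: below is a minimal standalone sketch of the new chunking flow,
for illustration only; it is not part of the patch. SPLIT_REGEX and the body
of cut_string are copied from the diff above, while MAX_QUERY, MIN_QUERY, and
make_queries are hypothetical stand-ins for the method's max_query/min_query
parameters and its enclosing strip/chunk logic.

    import re

    # Copied from the patch: punctuation on which over-long sentences split.
    SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]")

    # Hypothetical limits; in earwigbot these are parameters of the method.
    MAX_QUERY = 60
    MIN_QUERY = 8

    def cut_string(fragment):
        # Drop trailing words until the fragment fits within the query limit.
        words = fragment.split()
        while len(" ".join(words)) > MAX_QUERY:
            words.pop()
        return " ".join(words)

    def make_queries(tokenized_sentences):
        sentences = []
        for sentence in tokenized_sentences:
            if len(sentence) <= MAX_QUERY:
                sentences.append(sentence)
            else:
                # New behavior: split an over-long sentence on punctuation
                # first, then trim each fragment, instead of truncating the
                # whole sentence to the query limit.
                sentences.extend(cut_string(fragment)
                                 for fragment in SPLIT_REGEX.split(sentence))
        # Too-short fragments are now filtered out in one pass at the end.
        return [sen for sen in sentences if len(sen) >= MIN_QUERY]

The practical difference: the old code kept at most one truncated query per
long sentence, while the new code can yield several usable queries from it.
This should help on wikis whose languages the English punkt tokenizer handles
poorly, where "sentences" can run very long.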