@@ -61,6 +61,7 @@ class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
     TYPE = "Article"
     TEMPLATE_MERGE_THRESHOLD = 35
+    SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]")
 
     def _merge_templates(self, code):
         """Merge template contents in to wikicode when the values are long."""
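
A note on the new regex: SPLIT_REGEX breaks an over-long sentence at common punctuation so each fragment can be searched on its own. A minimal standalone sketch of the behavior (plain Python, not part of the patch):

    import re

    SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]")
    print(SPLIT_REGEX.split("One clause, another clause; a third clause."))
    # -> ['One clause', ' another clause', ' a third clause', '']

Hyphens, parentheses, and quotes are not in the character class, so hyphenated terms and quoted phrases stay within a single fragment.
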
@@ -132,6 +133,12 @@ class ArticleTextParser(_BaseTextParser):
         directory (*nltk_dir*) is required to store nltk's punctuation
         database. This is typically located in the bot's working directory.
         """
+        def cut_string(fragment):
+            words = fragment.split()
+            while len(" ".join(words)) > max_query:
+                words.pop()
+            return " ".join(words)
+
         datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
         try:
             tokenizer = nltk.data.load("file:" + datafile)
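
The cut_string helper added here trims whole words from the end of a fragment until the joined string fits within max_query characters. A standalone sketch (max_query is hard-coded for illustration; in the patch it comes from the enclosing method's arguments):

    def cut_string(fragment, max_query=20):
        # Drop trailing words until the joined fragment fits in max_query chars.
        words = fragment.split()
        while len(" ".join(words)) > max_query:
            words.pop()
        return " ".join(words)

    print(cut_string("the quick brown fox jumps over"))
    # -> 'the quick brown fox' (19 characters)
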
@@ -141,16 +148,14 @@ class ArticleTextParser(_BaseTextParser):
 
         sentences = []
         for sentence in tokenizer.tokenize(self.clean):
-            if len(sentence) > max_query:
-                words = sentence.split()
-                while len(" ".join(words)) > max_query:
-                    words.pop()
-                sentence = " ".join(words)
-            if len(sentence) < min_query:
-                continue
-            sentences.append(sentence)
-
-        if max_chunks >= len(sentences):
+            if len(sentence) <= max_query:
+                sentences.append(sentence)
+            else:
+                sentences.extend(cut_string(fragment) for fragment in
+                                 self.SPLIT_REGEX.split(sentence))
+
+        sentences = [sen for sen in sentences if len(sen) >= min_query]
+        if len(sentences) <= max_chunks:
             return sentences
 
         chunks = []
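
Taken together, the revised loop keeps short sentences whole, splits over-long ones on punctuation and trims each fragment with cut_string, then filters everything against min_query in a single pass. An end-to-end sketch (the nltk tokenizer is replaced by a hard-coded list and the limits are made up, so this runs standalone):

    import re

    SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]")
    max_query, min_query = 30, 8

    def cut_string(fragment):
        words = fragment.split()
        while len(" ".join(words)) > max_query:
            words.pop()
        return " ".join(words)

    tokenized = ["A short sentence.",
                 "A very long sentence, with several clauses; far beyond the limit."]
    sentences = []
    for sentence in tokenized:
        if len(sentence) <= max_query:
            sentences.append(sentence)
        else:
            sentences.extend(cut_string(fragment) for fragment in
                             SPLIT_REGEX.split(sentence))

    sentences = [sen for sen in sentences if len(sen) >= min_query]
    print(sentences)
    # -> ['A short sentence.', 'A very long sentence',
    #     'with several clauses', 'far beyond the limit']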