
Split copyvio queries a bit differently; maybe better on other languages.

commit 75058997c2 (tags/v0.3)
Ben Kurtovic, 8 years ago
1 changed file with 15 additions and 10 deletions
earwigbot/wiki/copyvios/parsers.py

@@ -61,6 +61,7 @@ class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
     TYPE = "Article"
     TEMPLATE_MERGE_THRESHOLD = 35
+    SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]")
 
     def _merge_templates(self, code):
         """Merge template contents in to wikicode when the values are long."""
@@ -132,6 +133,12 @@ class ArticleTextParser(_BaseTextParser):
         directory (*nltk_dir*) is required to store nltk's punctuation
         database. This is typically located in the bot's working directory.
         """
+        def cut_string(fragment):
+            words = fragment.split()
+            while len(" ".join(words)) > max_query:
+                words.pop()
+            return " ".join(words)
+
         datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
         try:
             tokenizer = nltk.data.load("file:" + datafile)
@@ -141,16 +148,14 @@ class ArticleTextParser(_BaseTextParser):
 
         sentences = []
         for sentence in tokenizer.tokenize(self.clean):
-            if len(sentence) > max_query:
-                words = sentence.split()
-                while len(" ".join(words)) > max_query:
-                    words.pop()
-                sentence = " ".join(words)
-            if len(sentence) < min_query:
-                continue
-            sentences.append(sentence)
-
-        if max_chunks >= len(sentences):
+            if len(sentence) <= max_query:
+                sentences.append(sentence)
+            else:
+                sentences.extend(cut_string(fragment) for fragment in
+                                 self.SPLIT_REGEX.split(sentence))
+
+        sentences = [sen for sen in sentences if len(sen) >= min_query]
+        if len(sentences) <= max_chunks:
             return sentences
 
         chunks = []
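
For context, the effect of the change: previously, a sentence longer than max_query was trimmed word by word from the end, discarding everything past the cap; now it is first split on punctuation and each fragment is trimmed separately, so more of the sentence survives as distinct search queries. Below is a minimal standalone sketch of that logic, not part of the commit itself; SPLIT_REGEX and cut_string mirror the diff, while the make_queries name and the min_query/max_query defaults are made up for illustration.

import re

# Mirrors SPLIT_REGEX from the diff: split on punctuation that tends to
# separate clauses, so one long sentence yields several shorter fragments.
SPLIT_REGEX = re.compile("[!#$%&*+,./:;<=>?@^`|~{}]")

def make_queries(sentences, min_query=8, max_query=64):
    # min_query/max_query defaults here are hypothetical, for the demo only.
    def cut_string(fragment):
        # Drop trailing words until the fragment fits under max_query.
        words = fragment.split()
        while len(" ".join(words)) > max_query:
            words.pop()
        return " ".join(words)

    results = []
    for sentence in sentences:
        if len(sentence) <= max_query:
            results.append(sentence)
        else:
            # New behavior: split on punctuation first, then word-trim each
            # fragment, instead of truncating the whole sentence once.
            results.extend(cut_string(frag)
                           for frag in SPLIT_REGEX.split(sentence))
    # As in the diff, fragments shorter than min_query are dropped at the end.
    return [sen for sen in results if len(sen) >= min_query]

For example, a 120-character sentence containing a comma now produces two trimmed queries, one per clause, rather than a single query cut off at max_query characters; since the regex relies only on punctuation rather than English word boundaries, this plausibly helps on other languages, as the commit message suggests.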

