
Minor refactor.

tags/v0.3
Ben Kurtovic 8 years ago
parent
commit
eceb4d139a
1 changed file with 30 additions and 24 deletions

earwigbot/wiki/copyvios/parsers.py

@@ -108,6 +108,35 @@ class ArticleTextParser(_BaseTextParser):
nltk.download("punkt", self._args["nltk_dir"]) nltk.download("punkt", self._args["nltk_dir"])
return nltk.data.load(datafile(lang)) return nltk.data.load(datafile(lang))


def _get_sentences(self, min_query, max_query, split_thresh):
"""Split the article text into sentences of a certain length."""
def cut_sentence(words):
div = len(words)
if div == 0:
return []

length = len(" ".join(words))
while length > max_query:
div -= 1
length -= len(words[div]) + 1

result = []
if length >= split_thresh:
result.append(" ".join(words[:div]))
return result + cut_sentence(words[div + 1:])

tokenizer = self._get_tokenizer()
sentences = []
if not hasattr(self, "clean"):
self.strip()

for sentence in tokenizer.tokenize(self.clean):
if len(sentence) <= max_query:
sentences.append(sentence)
else:
sentences.extend(cut_sentence(sentence.split()))
return [sen for sen in sentences if len(sen) >= min_query]

def strip(self): def strip(self):
"""Clean the page's raw text by removing templates and formatting. """Clean the page's raw text by removing templates and formatting.


@@ -165,30 +194,7 @@ class ArticleTextParser(_BaseTextParser):
         database, and should be passed as an argument to the constructor. It is
         typically located in the bot's working directory.
         """
-        def cut_sentence(words):
-            div = len(words)
-            if div == 0:
-                return []
-
-            length = len(" ".join(words))
-            while length > max_query:
-                div -= 1
-                length -= len(words[div]) + 1
-
-            result = []
-            if length >= split_thresh:
-                result.append(" ".join(words[:div]))
-            return result + cut_sentence(words[div + 1:])
-
-        tokenizer = self._get_tokenizer()
-        sentences = []
-        for sentence in tokenizer.tokenize(self.clean):
-            if len(sentence) <= max_query:
-                sentences.append(sentence)
-            else:
-                sentences.extend(cut_sentence(sentence.split()))
-
-        sentences = [sen for sen in sentences if len(sen) >= min_query]
+        sentences = self._get_sentences(min_query, max_query, split_thresh)
         if len(sentences) <= max_chunks:
             return sentences
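Net effect of the refactor: the sentence-splitting logic moves out of the chunking method into the new _get_sentences() helper, and the added hasattr(self, "clean") guard makes the helper call strip() itself when the text has not been cleaned yet, so callers no longer need to do so beforehand. A rough sketch of the resulting call flow; the constructor arguments and parameter values here are assumptions for illustration, not the library's documented API:

    # Assumed construction: the diff shows self._args["nltk_dir"] being
    # used, but the real ArticleTextParser signature and required keys
    # may differ.
    parser = ArticleTextParser(wikitext, args={"nltk_dir": "/path/to/nltk_data"})

    # No explicit parser.strip() needed first: _get_sentences() strips
    # the text itself if self.clean is missing (parameter values are
    # hypothetical).
    sentences = parser._get_sentences(min_query=8, max_query=300, split_thresh=32)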



