From 33aa1d67444ed55414679f8919e7db1588b375de Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 26 Jul 2012 14:16:07 -0400
Subject: [PATCH] Collapse extra newlines to avoid distorting trigrams.

---
 earwigbot/wiki/copyvios/parsers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index d7906e4..fab867d 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -70,7 +70,8 @@ class ArticleTextParser(BaseTextParser):
         The actual stripping is handled by :py:mod:`mwparserfromhell`.
         """
         wikicode = mwparserfromhell.parse(self.text)
-        self.clean = wikicode.strip_code(normalize=True)
+        clean = wikicode.strip_code(normalize=True, collapse=True)
+        self.clean = clean.replace("\n\n", "\n")  # Collapse extra newlines.
         return self.clean
 
     def chunk(self, nltk_dir, max_chunks, max_query=256):