From 509598d7fcf684cffd5693e8f1a2f1e413ceaf02 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Mon, 28 Sep 2015 23:57:31 -0500
Subject: [PATCH] Try merging in templates with parameter values of a certain size (fixes #42)

---
 earwigbot/wiki/copyvios/parsers.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index a676413..49bc4af 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -58,6 +58,28 @@ class _BaseTextParser(object):
 class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
     TYPE = "Article"
+    TEMPLATE_MERGE_THRESHOLD = 35
+
+    def _merge_templates(self, code):
+        """Merge template contents into wikicode when the values are long.
+
+        Each template returned by ``code.filter_templates`` is replaced in
+        place by the space-joined values of its parameters whose length is
+        at least TEMPLATE_MERGE_THRESHOLD (templates inside those values
+        are merged first, recursively). A template with no such parameter
+        is removed from *code* entirely. Mutates *code*; returns nothing.
+        """
+        for template in code.filter_templates(recursive=code.RECURSE_OTHERS):
+            chunks = []
+            for param in template.params:
+                if len(param.value) >= self.TEMPLATE_MERGE_THRESHOLD:
+                    self._merge_templates(param.value)
+                    chunks.append(param.value)
+            if chunks:
+                subst = u" ".join(map(unicode, chunks))
+                code.replace(template, u" " + subst + u" ")
+            else:
+                code.remove(template)
 
     def strip(self):
         """Clean the page's raw text by removing templates and formatting.
@@ -94,6 +116,9 @@ class ArticleTextParser(_BaseTextParser):
         for tag in wikicode.filter_tags(matches=lambda tag: tag.tag == "ref"):
             remove(wikicode, tag)
 
+        # Merge in template contents when the values are long:
+        self._merge_templates(wikicode)
+
         clean = wikicode.strip_code(normalize=True, collapse=True)
         self.clean = re.sub("\n\n+", "\n", clean).strip()
         return self.clean