From f2099df5d5efbc6a5ac6fe0621e3129e4d91a4f1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 27 Jul 2016 18:48:38 -0400 Subject: [PATCH] Minor refactor in HTML parser. --- earwigbot/wiki/copyvios/parsers.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 2f88356..21ccfed 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -232,6 +232,20 @@ class _HTMLParser(_BaseTextParser): "script", "style" ] + def _fail_if_mirror(self, soup): + """Look for obvious signs that the given soup is a wiki mirror. + + If so, raise ParserExclusionError, which is caught in the workers and + causes this source to be excluded. + """ + if "mirror_hints" not in self._args: + return + + func = lambda attr: attr and any( + hint in attr for hint in self._args["mirror_hints"]) + if soup.find_all(href=func) or soup.find_all(src=func): + raise ParserExclusionError() + def parse(self): """Return the actual text contained within an HTML document. @@ -248,12 +262,7 @@ class _HTMLParser(_BaseTextParser): # no scrapable content (possibly JS or magic): return "" - if "mirror_hints" in self._args: - # Look for obvious signs that this is a mirror: - func = lambda attr: attr and any( - hint in attr for hint in self._args["mirror_hints"]) - if soup.find_all(href=func) or soup.find_all(src=func): - raise ParserExclusionError() + self._fail_if_mirror(soup) soup = soup.body is_comment = lambda text: isinstance(text, bs4.element.Comment)