From 147b46f572bef94547ba8f33954026de69592495 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 04:03:39 -0500 Subject: [PATCH] A couple more fixes and cleanup. --- earwigbot/wiki/copyvios/parsers.py | 10 +++++----- earwigbot/wiki/copyvios/workers.py | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 2f9a4a1..502bd4d 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -28,7 +28,7 @@ import mwparserfromhell from earwigbot import importer from earwigbot.exceptions import ParserExclusionError -from earwigbot.copyvios.exclusions import MIRROR_HINTS +from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS bs4 = importer.new("bs4") nltk = importer.new("nltk") @@ -187,7 +187,7 @@ class _HTMLParser(_BaseTextParser): "script", "style" ] - def parse(self, detect_exclusions=False): + def parse(self, **kwargs): """Return the actual text contained within an HTML document. 
Implemented using :py:mod:`BeautifulSoup <bs4>` @@ -203,7 +203,7 @@ class _HTMLParser(_BaseTextParser): # no scrapable content (possibly JS or magic): return "" - if detect_exclusions: + if kwargs["detect_exclusions"]: # Look for obvious signs that this is a mirror: func = lambda attr: attr and any( hint in attr for hint in MIRROR_HINTS) @@ -229,7 +229,7 @@ class _PDFParser(_BaseTextParser): (u"\u2022", u" "), ] - def parse(self, detect_exclusions=False): + def parse(self, **kwargs): """Return extracted text from the PDF.""" output = StringIO() manager = pdfinterp.PDFResourceManager() @@ -255,7 +255,7 @@ class _PlainTextParser(_BaseTextParser): """A parser that can unicode-ify and strip text from a plain text page.""" TYPE = "Text" - def parse(self, detect_exclusions=False): + def parse(self, **kwargs): """Unicode-ify and strip whitespace from the plain text document.""" converted = bs4.UnicodeDammit(self.text).unicode_markup return converted.strip() if converted else "" diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index f35f484..e03765e 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -156,7 +156,8 @@ class _CopyvioWorker(object): except (IOError, struct_error): return None - return handler(content).parse(source.detect_exclusions) + return handler(content).parse( + detect_exclusions=source.detect_exclusions) def _acquire_new_site(self): """Block for a new unassigned site queue."""