From 91846ce4fbc8241d60ad9aa213ea3dbc166f590f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 10:12:28 -0500 Subject: [PATCH] Refactor out mirror hinting logic in source parsers. --- earwigbot/wiki/copyvios/__init__.py | 2 +- earwigbot/wiki/copyvios/exclusions.py | 4 +--- earwigbot/wiki/copyvios/markov.py | 2 +- earwigbot/wiki/copyvios/parsers.py | 14 +++++++------- earwigbot/wiki/copyvios/result.py | 4 ++-- earwigbot/wiki/copyvios/workers.py | 11 +++++------ 6 files changed, 17 insertions(+), 20 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 74dc0eb..23e1d2a 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -118,7 +118,7 @@ class CopyvioMixIn(object): article = MarkovChain(parser.strip()) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - short_circuit=short_circuit, detect_exclusions=True) + short_circuit=short_circuit, parser_args={"mirror_hints": ["wikipedia.org/w/"]}) if self._exclusions_db: self._exclusions_db.sync(self.site.name) exclude = lambda u: self._exclusions_db.check(self.site.name, u) diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 33bb5f8..3c88011 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -28,7 +28,7 @@ from urlparse import urlparse from earwigbot import exceptions -__all__ = ["ExclusionsDB", "MIRROR_HINTS"] +__all__ = ["ExclusionsDB"] DEFAULT_SOURCES = { "all": [ # Applies to all, but located on enwiki @@ -43,8 +43,6 @@ DEFAULT_SOURCES = { ] } -MIRROR_HINTS = ["wikipedia.org/w/"] - class ExclusionsDB(object): """ **EarwigBot: Wiki Toolset: Exclusions Database Manager** diff --git a/earwigbot/wiki/copyvios/markov.py b/earwigbot/wiki/copyvios/markov.py index 057fcc1..cf26317 100644 --- a/earwigbot/wiki/copyvios/markov.py +++ b/earwigbot/wiki/copyvios/markov.py @@ -35,7 +35,7 @@ 
class MarkovChain(object): def __init__(self, text): self.text = text self.chain = defaultdict(lambda: defaultdict(lambda: 0)) - words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split() + words = sub(r"[^\w\s-]", "", text.lower(), flags=UNICODE).split() padding = self.degree - 1 words = ([self.START] * padding) + words + ([self.END] * padding) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 502bd4d..d843ad5 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -28,7 +28,6 @@ import mwparserfromhell from earwigbot import importer from earwigbot.exceptions import ParserExclusionError -from earwigbot.wiki.copyvios.exclusions import MIRROR_HINTS bs4 = importer.new("bs4") nltk = importer.new("nltk") @@ -44,8 +43,9 @@ class _BaseTextParser(object): """Base class for a parser that handles text.""" TYPE = None - def __init__(self, text): + def __init__(self, text, args=None): self.text = text + self._args = args or {} def __repr__(self): """Return the canonical string representation of the text parser.""" @@ -187,7 +187,7 @@ class _HTMLParser(_BaseTextParser): "script", "style" ] - def parse(self, **kwargs): + def parse(self): """Return the actual text contained within an HTML document. 
Implemented using :py:mod:`BeautifulSoup <bs4>` @@ -203,10 +203,10 @@ class _HTMLParser(_BaseTextParser): # no scrapable content (possibly JS or magic): return "" - if kwargs["detect_exclusions"]: + if "mirror_hints" in self._args: # Look for obvious signs that this is a mirror: func = lambda attr: attr and any( - hint in attr for hint in MIRROR_HINTS) + hint in attr for hint in self._args["mirror_hints"]) if soup.find_all(href=func) or soup.find_all(src=func): raise ParserExclusionError() @@ -229,7 +229,7 @@ class _PDFParser(_BaseTextParser): (u"\u2022", u" "), ] - def parse(self, **kwargs): + def parse(self): """Return extracted text from the PDF.""" output = StringIO() manager = pdfinterp.PDFResourceManager() @@ -255,7 +255,7 @@ class _PlainTextParser(_BaseTextParser): """A parser that can unicode-ify and strip text from a plain text page.""" TYPE = "Text" - def parse(self, **kwargs): + def parse(self): """Unicode-ify and strip whitespace from the plain text document.""" converted = bs4.UnicodeDammit(self.text).unicode_markup return converted.strip() if converted else "" diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py index 5a221ca..4d9c43a 100644 --- a/earwigbot/wiki/copyvios/result.py +++ b/earwigbot/wiki/copyvios/result.py @@ -44,12 +44,12 @@ class CopyvioSource(object): """ def __init__(self, workspace, url, headers=None, timeout=5, - detect_exclusions=False): + parser_args=None): self.workspace = workspace self.url = url self.headers = headers self.timeout = timeout - self.detect_exclusions = detect_exclusions + self.parser_args = parser_args self.confidence = 0.0 self.chains = (EMPTY, EMPTY_INTERSECTION) diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index e03765e..437a228 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -156,8 +156,7 @@ class _CopyvioWorker(object): except (IOError, struct_error): return None - return handler(content).parse( - 
detect_exclusions=source.detect_exclusions) + return handler(content, source.parser_args).parse() def _acquire_new_site(self): """Block for a new unassigned site queue.""" @@ -242,7 +241,7 @@ class CopyvioWorkspace(object): def __init__(self, article, min_confidence, max_time, logger, headers, url_timeout=5, num_workers=8, short_circuit=True, - detect_exclusions=False): + parser_args=None): self.sources = [] self.finished = False self.possible_miss = False @@ -255,9 +254,9 @@ class CopyvioWorkspace(object): self._handled_urls = set() self._finish_lock = Lock() self._short_circuit = short_circuit - self._source_args = {"workspace": self, "headers": headers, - "timeout": url_timeout, - "detect_exclusions": detect_exclusions} + self._source_args = { + "workspace": self, "headers": headers, "timeout": url_timeout, + "parser_args": parser_args} if _is_globalized: self._queues = _global_queues