From 03910b6cb5b5711f1dc040c0f24e92f269213bb9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Sep 2015 04:00:25 -0500 Subject: [PATCH] Add mirror detection logic to parsers; fixes. --- earwigbot/wiki/copyvios/__init__.py | 4 ++-- earwigbot/wiki/copyvios/exclusions.py | 4 +++- earwigbot/wiki/copyvios/parsers.py | 22 ++++++++++++++++------ earwigbot/wiki/copyvios/result.py | 5 ++++- earwigbot/wiki/copyvios/workers.py | 8 +++++--- 5 files changed, 30 insertions(+), 13 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 46fbf96..74dc0eb 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -118,7 +118,7 @@ class CopyvioMixIn(object): article = MarkovChain(parser.strip()) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - short_circuit=short_circuit) + short_circuit=short_circuit, detect_exclusions=True) if self._exclusions_db: self._exclusions_db.sync(self.site.name) exclude = lambda u: self._exclusions_db.check(self.site.name, u) @@ -176,7 +176,7 @@ class CopyvioMixIn(object): article = MarkovChain(ArticleTextParser(self.get()).strip()) workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - max_time, 1) + max_time, num_workers=1) workspace.enqueue([url]) workspace.wait() result = workspace.get_result() diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 3c88011..33bb5f8 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -28,7 +28,7 @@ from urlparse import urlparse from earwigbot import exceptions -__all__ = ["ExclusionsDB"] +__all__ = ["ExclusionsDB", "MIRROR_HINTS"] DEFAULT_SOURCES = { "all": [ # Applies to all, but located on enwiki @@ -43,6 +43,8 @@ DEFAULT_SOURCES = { ] } +MIRROR_HINTS = ["wikipedia.org/w/"] + class ExclusionsDB(object): """ **EarwigBot: Wiki Toolset: Exclusions 
Database Manager** diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index dbd103e..2f9a4a1 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -28,6 +28,7 @@ import mwparserfromhell from earwigbot import importer from earwigbot.exceptions import ParserExclusionError +from earwigbot.copyvios.exclusions import MIRROR_HINTS bs4 = importer.new("bs4") nltk = importer.new("nltk") @@ -186,21 +187,30 @@ class _HTMLParser(_BaseTextParser): "script", "style" ] - def parse(self): + def parse(self, detect_exclusions=False): """Return the actual text contained within an HTML document. Implemented using :py:mod:`BeautifulSoup <bs4>` (http://www.crummy.com/software/BeautifulSoup/). """ try: - soup = bs4.BeautifulSoup(self.text, "lxml").body + soup = bs4.BeautifulSoup(self.text, "lxml") except ValueError: - soup = bs4.BeautifulSoup(self.text).body + soup = bs4.BeautifulSoup(self.text) - if not soup: + if not soup.body: # No <body> tag present in HTML -> # no scrapable content (possibly JS or magic): return "" + + if detect_exclusions: + # Look for obvious signs that this is a mirror: + func = lambda attr: attr and any( + hint in attr for hint in MIRROR_HINTS) + if soup.find_all(href=func) or soup.find_all(src=func): + raise ParserExclusionError() + + soup = soup.body is_comment = lambda text: isinstance(text, bs4.element.Comment) for comment in soup.find_all(text=is_comment): comment.extract() @@ -219,7 +229,7 @@ class _PDFParser(_BaseTextParser): (u"\u2022", u" "), ] - def parse(self): + def parse(self, detect_exclusions=False): """Return extracted text from the PDF.""" output = StringIO() manager = pdfinterp.PDFResourceManager() @@ -245,7 +255,7 @@ class _PlainTextParser(_BaseTextParser): """A parser that can unicode-ify and strip text from a plain text page.""" TYPE = "Text" - def parse(self): + def parse(self, detect_exclusions=False): """Unicode-ify and strip whitespace from the plain text document.""" converted 
= bs4.UnicodeDammit(self.text).unicode_markup return converted.strip() if converted else "" diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py index f044c03..5a221ca 100644 --- a/earwigbot/wiki/copyvios/result.py +++ b/earwigbot/wiki/copyvios/result.py @@ -43,11 +43,14 @@ class CopyvioSource(object): - :py:attr:`excluded`: whether this URL was in the exclusions list """ - def __init__(self, workspace, url, headers=None, timeout=5): + def __init__(self, workspace, url, headers=None, timeout=5, + detect_exclusions=False): self.workspace = workspace self.url = url self.headers = headers self.timeout = timeout + self.detect_exclusions = detect_exclusions + self.confidence = 0.0 self.chains = (EMPTY, EMPTY_INTERSECTION) self.skipped = False diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 4ba25bf..f35f484 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -156,7 +156,7 @@ class _CopyvioWorker(object): except (IOError, struct_error): return None - return handler(content).parse() + return handler(content).parse(source.detect_exclusions) def _acquire_new_site(self): """Block for a new unassigned site queue.""" @@ -240,7 +240,8 @@ class CopyvioWorkspace(object): """Manages a single copyvio check distributed across threads.""" def __init__(self, article, min_confidence, max_time, logger, headers, - url_timeout=5, num_workers=8, short_circuit=True): + url_timeout=5, num_workers=8, short_circuit=True, + detect_exclusions=False): self.sources = [] self.finished = False self.possible_miss = False @@ -254,7 +255,8 @@ class CopyvioWorkspace(object): self._finish_lock = Lock() self._short_circuit = short_circuit self._source_args = {"workspace": self, "headers": headers, - "timeout": url_timeout} + "timeout": url_timeout, + "detect_exclusions": detect_exclusions} if _is_globalized: self._queues = _global_queues