From 2b5914b6ae2efb3a7b7296b46f0686decfbd06c6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 Oct 2020 11:33:41 -0400 Subject: [PATCH] Support parser-directed URL redirecting (for Wayback Machine PDFs) --- earwigbot/exceptions.py | 11 +++++++++++ earwigbot/wiki/copyvios/__init__.py | 6 +++--- earwigbot/wiki/copyvios/parsers.py | 16 ++++++++++++---- earwigbot/wiki/copyvios/workers.py | 33 +++++++++++++++++++-------------- 4 files changed, 45 insertions(+), 21 deletions(-) diff --git a/earwigbot/exceptions.py b/earwigbot/exceptions.py index ad34ae0..d452a76 100644 --- a/earwigbot/exceptions.py +++ b/earwigbot/exceptions.py @@ -259,3 +259,14 @@ class ParserExclusionError(CopyvioCheckError): <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be exposed in client code. """ + +class ParserRedirectError(CopyvioCheckError): + """A content parser detected that a redirect should be followed. + + Raised internally by :py:meth:`Page.copyvio_check + <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be + exposed in client code. + """ + def __init__(self, url): + super(ParserRedirectError, self).__init__() + self.url = url diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 3d625fe..a93cc6e 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -134,7 +134,7 @@ class CopyvioMixIn(object): workspace = CopyvioWorkspace( article, min_confidence, max_time, self._logger, self._addheaders, - short_circuit=short_circuit, parser_args=parser_args) + short_circuit=short_circuit, parser_args=parser_args, exclude_check=exclude) if article.size < 20: # Auto-fail very small articles result = workspace.get_result() @@ -142,7 +142,7 @@ class CopyvioMixIn(object): return result if not no_links: - workspace.enqueue(parser.get_links(), exclude) + workspace.enqueue(parser.get_links()) num_queries = 0 if not no_searches: chunks = parser.chunk(max_queries) @@ -152,7 +152,7 @@ class CopyvioMixIn(object): break log = u"[[{0}]] -> querying {1} for {2!r}" 
self._logger.debug(log.format(self.title, searcher.name, chunk)) - workspace.enqueue(searcher.search(chunk), exclude) + workspace.enqueue(searcher.search(chunk)) num_queries += 1 sleep(1) diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 6cf03ef..ed94882 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -23,11 +23,12 @@ from os import path import re from StringIO import StringIO +import urlparse import mwparserfromhell from earwigbot import importer -from earwigbot.exceptions import ParserExclusionError +from earwigbot.exceptions import ParserExclusionError, ParserRedirectError bs4 = importer.new("bs4") nltk = importer.new("nltk") @@ -41,7 +42,8 @@ class _BaseTextParser(object): """Base class for a parser that handles text.""" TYPE = None - def __init__(self, text, args=None): + def __init__(self, url, text, args=None): + self.url = url self.text = text self._args = args or {} @@ -257,12 +259,18 @@ class _HTMLParser(_BaseTextParser): if not soup.body: # No <body> tag present in HTML -> - # no scrapable content (possibly JS or magic): + # no scrapable content (possibly JS or