Support parser-directed URL redirecting (for Wayback Machine PDFs)

tags/v0.4
Ben Kurtovic, 4 years ago
commit 2b5914b6ae
4 changed files with 45 additions and 21 deletions
  1. earwigbot/exceptions.py (+11, -0)
  2. earwigbot/wiki/copyvios/__init__.py (+3, -3)
  3. earwigbot/wiki/copyvios/parsers.py (+12, -4)
  4. earwigbot/wiki/copyvios/workers.py (+19, -14)

earwigbot/exceptions.py (+11, -0)

@@ -259,3 +259,14 @@ class ParserExclusionError(CopyvioCheckError):
     <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be
     exposed in client code.
     """
+
+class ParserRedirectError(CopyvioCheckError):
+    """A content parser detected that a redirect should be followed.
+
+    Raised internally by :py:meth:`Page.copyvio_check
+    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be
+    exposed in client code.
+    """
+    def __init__(self, url):
+        super(ParserRedirectError, self).__init__()
+        self.url = url
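The new exception mirrors ParserExclusionError: it is an internal signal only, and its single payload is the URL the parser wants fetched instead of the page it was given. A minimal illustration of that handshake, assuming an earwigbot checkout that includes this change (the Wayback URL is made up):

from earwigbot.exceptions import ParserRedirectError

def parse_viewer_page():
    # A parser deciding the page is only a viewer shell for the real document:
    raise ParserRedirectError("https://web.archive.org/web/2015/http://example.com/doc.pdf")

try:
    parse_viewer_page()
except ParserRedirectError as exc:
    print exc.url  # the caller decides whether to fetch this URL instead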

earwigbot/wiki/copyvios/__init__.py (+3, -3)

@@ -134,7 +134,7 @@ class CopyvioMixIn(object):
 
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit, parser_args=parser_args)
+            short_circuit=short_circuit, parser_args=parser_args, exclude_check=exclude)
 
         if article.size < 20:  # Auto-fail very small articles
             result = workspace.get_result()
@@ -142,7 +142,7 @@ class CopyvioMixIn(object):
             return result
 
         if not no_links:
-            workspace.enqueue(parser.get_links(), exclude)
+            workspace.enqueue(parser.get_links())
         num_queries = 0
         if not no_searches:
             chunks = parser.chunk(max_queries)
@@ -152,7 +152,7 @@ class CopyvioMixIn(object):
                     break
                 log = u"[[{0}]] -> querying {1} for {2!r}"
                 self._logger.debug(log.format(self.title, searcher.name, chunk))
-                workspace.enqueue(searcher.search(chunk), exclude)
+                workspace.enqueue(searcher.search(chunk))
                 num_queries += 1
                 sleep(1)




earwigbot/wiki/copyvios/parsers.py (+12, -4)

@@ -23,11 +23,12 @@
 from os import path
 import re
 from StringIO import StringIO
+import urlparse
 
 import mwparserfromhell
 
 from earwigbot import importer
-from earwigbot.exceptions import ParserExclusionError
+from earwigbot.exceptions import ParserExclusionError, ParserRedirectError
 
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -41,7 +42,8 @@ class _BaseTextParser(object):
     """Base class for a parser that handles text."""
     TYPE = None
 
-    def __init__(self, text, args=None):
+    def __init__(self, url, text, args=None):
+        self.url = url
         self.text = text
         self._args = args or {}
 
@@ -257,12 +259,18 @@ class _HTMLParser(_BaseTextParser):
 
         if not soup.body:
             # No <body> tag present in HTML ->
-            # no scrapable content (possibly JS or <frame> magic):
+            # no scrapable content (possibly JS or <iframe> magic):
             return ""
 
         self._fail_if_mirror(soup)
         soup = soup.body
 
+        url = urlparse.urlparse(self.url)
+        if url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
+            playback = soup.find(id="playback")
+            if playback and "src" in playback.attrs:
+                raise ParserRedirectError(playback.attrs["src"])
+
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
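The new block only fires for Wayback Machine captures of PDFs, where the HTML served is essentially a viewer around the archived file: the address of the real document sits on the element with id="playback", and the parser hands that address back to the worker via ParserRedirectError. A standalone sketch of the same detection (Python 2 to match the codebase; the URL and HTML fragment are made up):

import urlparse

import bs4

url = "https://web.archive.org/web/20150101000000/http://example.com/paper.pdf"
parsed = urlparse.urlparse(url)

# Same test as the parser: an archive.org capture whose path ends in ".pdf".
if parsed.netloc == "web.archive.org" and parsed.path.endswith(".pdf"):
    html = '<div><iframe id="playback" src="http://example.com/paper.pdf"></iframe></div>'
    soup = bs4.BeautifulSoup(html, "html.parser")
    playback = soup.find(id="playback")
    if playback and "src" in playback.attrs:
        print playback.attrs["src"]  # the URL the worker is told to fetch instead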


earwigbot/wiki/copyvios/workers.py (+19, -14)

@@ -34,7 +34,7 @@ from time import time
 from urllib2 import build_opener, URLError
 
 from earwigbot import importer
-from earwigbot.exceptions import ParserExclusionError
+from earwigbot.exceptions import ParserExclusionError, ParserRedirectError
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import get_parser
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource
@@ -43,6 +43,8 @@ tldextract = importer.new("tldextract")
 
 __all__ = ["globalize", "localize", "CopyvioWorkspace"]
 
+_MAX_REDIRECTS = 3
+
 _is_globalized = False
 _global_queues = None
 _global_workers = []
@@ -111,7 +113,7 @@ class _CopyvioWorker(object):
         self._opener = build_opener()
         self._logger = getLogger("earwigbot.wiki.cvworker." + name)
 
-    def _open_url(self, source):
+    def _open_url(self, source, redirects=0):
         """Open a URL and return its parsed content, or None.
 
         First, we will decompress the content if the headers contain "gzip" as
@@ -137,10 +139,10 @@ class _CopyvioWorker(object):
             return None
 
         content_type = response.headers.get("Content-Type", "text/plain")
-        handler = get_parser(content_type)
-        if not handler:
+        parser_class = get_parser(content_type)
+        if not parser_class:
             return None
-        if size > (15 if handler.TYPE == "PDF" else 2) * 1024 ** 2:
+        if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024 ** 2:
             return None
 
         try:
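Aside from the handler -> parser_class rename, the size guard is unchanged: a source is skipped when its reported size exceeds 2 MiB, or 15 MiB when the content will be handled by the PDF parser. The limits in bytes:

print 2 * 1024 ** 2   # 2097152 bytes (non-PDF limit)
print 15 * 1024 ** 2  # 15728640 bytes (PDF limit)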
@@ -156,7 +158,13 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None
 
-        return handler(content, source.parser_args).parse()
+        parser = parser_class(content, source.parser_args)
+        try:
+            return parser.parse()
+        except ParserRedirectError as exc:
+            if redirects >= _MAX_REDIRECTS:
+                return None
+            return self._open_url(exc.url, redirects=redirects + 1)
 
     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
@@ -248,7 +256,7 @@ class CopyvioWorkspace(object):
 
     def __init__(self, article, min_confidence, max_time, logger, headers,
                  url_timeout=5, num_workers=8, short_circuit=True,
-                 parser_args=None):
+                 parser_args=None, exclude_check=None):
         self.sources = []
         self.finished = False
         self.possible_miss = False
@@ -264,6 +272,7 @@ class CopyvioWorkspace(object):
         self._source_args = {
             "workspace": self, "headers": headers, "timeout": url_timeout,
             "parser_args": parser_args}
+        self._exclude_check = exclude_check
 
         if _is_globalized:
             self._queues = _global_queues
@@ -316,12 +325,8 @@ class CopyvioWorkspace(object):
                 source.skip()
             self.finished = True
 
-    def enqueue(self, urls, exclude_check=None):
-        """Put a list of URLs into the various worker queues.
-
-        *exclude_check* is an optional exclusion function that takes a URL and
-        returns ``True`` if we should skip it and ``False`` otherwise.
-        """
+    def enqueue(self, urls):
+        """Put a list of URLs into the various worker queues."""
         for url in urls:
             with self._queues.lock:
                 if url in self._handled_urls:
@@ -331,7 +336,7 @@ class CopyvioWorkspace(object):
                 source = CopyvioSource(url=url, **self._source_args)
                 self.sources.append(source)
 
-                if exclude_check and exclude_check(url):
+                if self._exclude_check and self._exclude_check(url):
                     self._logger.debug(u"enqueue(): exclude {0}".format(url))
                     source.excluded = True
                     source.skip()
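With this change the exclusion callback is supplied once, as exclude_check= when the CopyvioWorkspace is constructed, instead of with every enqueue() call; as the old docstring put it, the callback simply takes a URL and returns True if that URL should be skipped. In copyvio_check itself the value passed is the existing exclude callable, as the first hunk in copyvios/__init__.py shows. A hypothetical callback of that shape (the domain list is invented for the example):

import urlparse

# Hypothetical exclusion callback: skip URLs whose host is on a known-excluded list.
_EXCLUDED_DOMAINS = {"example.org", "mirror.example.net"}

def exclude_check(url):
    netloc = urlparse.urlparse(url).netloc.lower()
    return netloc in _EXCLUDED_DOMAINS

print exclude_check("http://example.org/page")       # True  -> source is skipped
print exclude_check("http://unrelated.example/abc")  # False -> source is checked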

