
Support parser-directed URL redirecting (for Wayback Machine PDFs)

tags/v0.4
Ben Kurtovic, 4 years ago
commit 2b5914b6ae
4 changed files with 45 additions and 21 deletions
1. earwigbot/exceptions.py  +11  -0
2. earwigbot/wiki/copyvios/__init__.py  +3  -3
3. earwigbot/wiki/copyvios/parsers.py  +12  -4
4. earwigbot/wiki/copyvios/workers.py  +19  -14
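
For orientation before the diffs: a Wayback Machine capture of a PDF is served behind an HTML wrapper page, so scraping the wrapper directly yields nothing useful. With this commit, a content parser can signal that the real content lives at another URL by raising an exception that carries that URL; the worker that fetched the page catches it and retries with the new URL, giving up after a bounded number of hops. The sketch below is illustrative only: it mirrors the names introduced in the diffs (ParserRedirectError, _MAX_REDIRECTS), while fetch() and parse() are made-up stand-ins, not EarwigBot API.

_MAX_REDIRECTS = 3  # mirrors the new module-level constant in workers.py


class ParserRedirectError(Exception):
    """Raised by a parser when the fetched page is only a wrapper and the
    real content lives at another URL."""

    def __init__(self, url):
        super(ParserRedirectError, self).__init__()
        self.url = url


def parse(url, text):
    # Made-up parser: treat any web.archive.org *.pdf URL as a wrapper page
    # and point the caller at a (fake) target URL instead.
    if "web.archive.org" in url and url.endswith(".pdf"):
        raise ParserRedirectError("http://example.org/the-actual-file.pdf")
    return text


def fetch(url, redirects=0):
    # Made-up stand-in for _CopyvioWorker._open_url(): follow parser-directed
    # redirects, but only a bounded number of times.
    text = "<html>wrapper page for %s</html>" % url
    try:
        return parse(url, text)
    except ParserRedirectError as exc:
        if redirects >= _MAX_REDIRECTS:
            return None
        return fetch(exc.url, redirects=redirects + 1)


print(fetch("https://web.archive.org/web/2016/http://example.org/paper.pdf"))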

earwigbot/exceptions.py  +11  -0

@@ -259,3 +259,14 @@ class ParserExclusionError(CopyvioCheckError):
     <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be
     exposed in client code.
     """
+
+class ParserRedirectError(CopyvioCheckError):
+    """A content parser detected that a redirect should be followed.
+
+    Raised internally by :py:meth:`Page.copyvio_check
+    <earwigbot.wiki.copyvios.CopyvioMixIn.copyvio_check>`; should not be
+    exposed in client code.
+    """
+    def __init__(self, url):
+        super(ParserRedirectError, self).__init__()
+        self.url = url

earwigbot/wiki/copyvios/__init__.py  +3  -3

@@ -134,7 +134,7 @@ class CopyvioMixIn(object):
 
         workspace = CopyvioWorkspace(
             article, min_confidence, max_time, self._logger, self._addheaders,
-            short_circuit=short_circuit, parser_args=parser_args)
+            short_circuit=short_circuit, parser_args=parser_args, exclude_check=exclude)
 
         if article.size < 20:  # Auto-fail very small articles
             result = workspace.get_result()
@@ -142,7 +142,7 @@
             return result
 
         if not no_links:
-            workspace.enqueue(parser.get_links(), exclude)
+            workspace.enqueue(parser.get_links())
         num_queries = 0
         if not no_searches:
             chunks = parser.chunk(max_queries)
@@ -152,7 +152,7 @@
                     break
                 log = u"[[{0}]] -> querying {1} for {2!r}"
                 self._logger.debug(log.format(self.title, searcher.name, chunk))
-                workspace.enqueue(searcher.search(chunk), exclude)
+                workspace.enqueue(searcher.search(chunk))
                 num_queries += 1
                 sleep(1)
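
The net effect of these three hunks: the exclusion callback is now supplied once, when the workspace is constructed, instead of being passed to every enqueue() call. A schematic stand-in for that pattern (not the real CopyvioWorkspace, which takes many more arguments; the example callback is made up):

# Schematic stand-in for CopyvioWorkspace; only the exclusion logic is modeled.
class Workspace(object):
    def __init__(self, exclude_check=None):
        self._exclude_check = exclude_check
        self.queued = []
        self.skipped = []

    def enqueue(self, urls):
        for url in urls:
            if self._exclude_check and self._exclude_check(url):
                self.skipped.append(url)  # excluded sources are skipped
            else:
                self.queued.append(url)


def exclude(url):
    # Made-up exclusion rule: skip anything from English Wikipedia itself.
    return url.startswith("https://en.wikipedia.org/")


workspace = Workspace(exclude_check=exclude)  # callback given once, up front
workspace.enqueue(["https://en.wikipedia.org/wiki/Foo", "https://example.org/bar"])
workspace.enqueue(["https://example.org/baz"])  # no need to pass exclude again
print(workspace.queued)   # ['https://example.org/bar', 'https://example.org/baz']
print(workspace.skipped)  # ['https://en.wikipedia.org/wiki/Foo']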



earwigbot/wiki/copyvios/parsers.py  +12  -4

@@ -23,11 +23,12 @@
 from os import path
 import re
 from StringIO import StringIO
+import urlparse
 
 import mwparserfromhell
 
 from earwigbot import importer
-from earwigbot.exceptions import ParserExclusionError
+from earwigbot.exceptions import ParserExclusionError, ParserRedirectError
 
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
@@ -41,7 +42,8 @@ class _BaseTextParser(object):
     """Base class for a parser that handles text."""
     TYPE = None
 
-    def __init__(self, text, args=None):
+    def __init__(self, url, text, args=None):
+        self.url = url
         self.text = text
         self._args = args or {}
 
@@ -257,12 +259,18 @@ class _HTMLParser(_BaseTextParser):
 
         if not soup.body:
             # No <body> tag present in HTML ->
-            # no scrapable content (possibly JS or <frame> magic):
+            # no scrapable content (possibly JS or <iframe> magic):
             return ""
 
         self._fail_if_mirror(soup)
 
         soup = soup.body
+
+        url = urlparse.urlparse(self.url)
+        if url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
+            playback = soup.find(id="playback")
+            if playback and "src" in playback.attrs:
+                raise ParserRedirectError(playback.attrs["src"])
 
         is_comment = lambda text: isinstance(text, bs4.element.Comment)
         for comment in soup.find_all(text=is_comment):
             comment.extract()
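
The new check only fires for web.archive.org URLs whose path ends in .pdf: the wrapper page contains an element with id "playback" whose src attribute points at the actual capture, and that src becomes the redirect target. A rough, self-contained illustration of the same logic (Python 2, matching the code base; requires bs4; the HTML snippet and the maybe_redirect() helper are made up):

import urlparse  # Python 2 module, as used in parsers.py

import bs4


class ParserRedirectError(Exception):
    def __init__(self, url):
        super(ParserRedirectError, self).__init__()
        self.url = url


def maybe_redirect(page_url, html):
    """Raise ParserRedirectError if this looks like a Wayback PDF wrapper."""
    url = urlparse.urlparse(page_url)
    if url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
        soup = bs4.BeautifulSoup(html, "html.parser")
        playback = soup.find(id="playback")
        if playback and "src" in playback.attrs:
            # The embedded element's src points at the actual capture.
            raise ParserRedirectError(playback.attrs["src"])


wrapper = ('<html><body><iframe id="playback" '
           'src="https://web.archive.org/web/2016id_/http://example.org/a.pdf">'
           '</iframe></body></html>')
try:
    maybe_redirect("https://web.archive.org/web/2016/http://example.org/a.pdf",
                   wrapper)
except ParserRedirectError as exc:
    print(exc.url)  # the iframe target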


earwigbot/wiki/copyvios/workers.py  +19  -14

@@ -34,7 +34,7 @@ from time import time
 from urllib2 import build_opener, URLError
 
 from earwigbot import importer
-from earwigbot.exceptions import ParserExclusionError
+from earwigbot.exceptions import ParserExclusionError, ParserRedirectError
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import get_parser
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource
@@ -43,6 +43,8 @@ tldextract = importer.new("tldextract")
 
 __all__ = ["globalize", "localize", "CopyvioWorkspace"]
 
+_MAX_REDIRECTS = 3
+
 _is_globalized = False
 _global_queues = None
 _global_workers = []
@@ -111,7 +113,7 @@ class _CopyvioWorker(object):
         self._opener = build_opener()
         self._logger = getLogger("earwigbot.wiki.cvworker." + name)
 
-    def _open_url(self, source):
+    def _open_url(self, source, redirects=0):
         """Open a URL and return its parsed content, or None.
 
         First, we will decompress the content if the headers contain "gzip" as
@@ -137,10 +139,10 @@ class _CopyvioWorker(object):
             return None
 
         content_type = response.headers.get("Content-Type", "text/plain")
-        handler = get_parser(content_type)
-        if not handler:
+        parser_class = get_parser(content_type)
+        if not parser_class:
             return None
-        if size > (15 if handler.TYPE == "PDF" else 2) * 1024 ** 2:
+        if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024 ** 2:
             return None
 
         try:
@@ -156,7 +158,13 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None
 
-        return handler(content, source.parser_args).parse()
+        parser = parser_class(content, source.parser_args)
+        try:
+            return parser.parse()
+        except ParserRedirectError as exc:
+            if redirects >= _MAX_REDIRECTS:
+                return None
+            return self._open_url(exc.url, redirects=redirects + 1)
 
     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
@@ -248,7 +256,7 @@ class CopyvioWorkspace(object):
 
     def __init__(self, article, min_confidence, max_time, logger, headers,
                  url_timeout=5, num_workers=8, short_circuit=True,
-                 parser_args=None):
+                 parser_args=None, exclude_check=None):
         self.sources = []
         self.finished = False
         self.possible_miss = False
@@ -264,6 +272,7 @@ class CopyvioWorkspace(object):
         self._source_args = {
             "workspace": self, "headers": headers, "timeout": url_timeout,
             "parser_args": parser_args}
+        self._exclude_check = exclude_check
 
         if _is_globalized:
             self._queues = _global_queues
@@ -316,12 +325,8 @@ class CopyvioWorkspace(object):
             source.skip()
         self.finished = True
 
-    def enqueue(self, urls, exclude_check=None):
-        """Put a list of URLs into the various worker queues.
-
-        *exclude_check* is an optional exclusion function that takes a URL and
-        returns ``True`` if we should skip it and ``False`` otherwise.
-        """
+    def enqueue(self, urls):
+        """Put a list of URLs into the various worker queues."""
         for url in urls:
             with self._queues.lock:
                 if url in self._handled_urls:
@@ -331,7 +336,7 @@ class CopyvioWorkspace(object):
                     source = CopyvioSource(url=url, **self._source_args)
                     self.sources.append(source)
 
-                    if exclude_check and exclude_check(url):
+                    if self._exclude_check and self._exclude_check(url):
                         self._logger.debug(u"enqueue(): exclude {0}".format(url))
                         source.excluded = True
                         source.skip()
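
Aside from the redirect handling, the local variable handler is renamed to parser_class, which better reflects that get_parser() returns a class that is only instantiated once the size gate passes: PDFs may be up to 15 MiB, everything else is capped at 2 MiB. A small stand-alone sketch of that gate, with a made-up registry standing in for get_parser():

# Made-up stand-ins; the real classes live in parsers.py and are returned
# by get_parser() based on the Content-Type header.
class _FakeTextParser(object):
    TYPE = "HTML"

class _FakePDFParser(object):
    TYPE = "PDF"

_PARSERS = {"text/html": _FakeTextParser, "application/pdf": _FakePDFParser}

def within_size_limit(content_type, size):
    parser_class = _PARSERS.get(content_type)
    if not parser_class:
        return False
    # PDFs may be up to 15 MiB; everything else is capped at 2 MiB.
    return size <= (15 if parser_class.TYPE == "PDF" else 2) * 1024 ** 2

print(within_size_limit("application/pdf", 10 * 1024 ** 2))  # True
print(within_size_limit("text/html", 10 * 1024 ** 2))        # False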

