From 9ffc3f1bf51000da4a58f8db042a6b865aa7d38f Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 12 Dec 2014 17:04:29 -0600
Subject: [PATCH] Raise file crawl size limit for PDFs.

---
 earwigbot/wiki/copyvios/parsers.py | 5 +++++
 earwigbot/wiki/copyvios/workers.py | 6 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 0cb0a68..8eaf3e1 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -40,6 +40,7 @@ __all__ = ["ArticleTextParser", "get_parser"]
 
 class _BaseTextParser(object):
     """Base class for a parser that handles text."""
+    TYPE = None
 
     def __init__(self, text):
         self.text = text
@@ -56,6 +57,7 @@ class _BaseTextParser(object):
 
 class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
+    TYPE = "Article"
 
     def strip(self):
         """Clean the page's raw text by removing templates and formatting.
@@ -160,6 +162,7 @@ class ArticleTextParser(_BaseTextParser):
 
 class _HTMLParser(_BaseTextParser):
     """A parser that can extract the text from an HTML document."""
+    TYPE = "HTML"
     hidden_tags = [
         "script", "style"
     ]
@@ -191,6 +194,7 @@ class _HTMLParser(_BaseTextParser):
 
 class _PDFParser(_BaseTextParser):
     """A parser that can extract text from a PDF file."""
+    TYPE = "PDF"
     substitutions = [
         (u"\x0c", u"\n"),
         (u"\u2022", u" "),
@@ -220,6 +224,7 @@ class _PDFParser(_BaseTextParser):
 
 class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
+    TYPE = "Text"
 
     def parse(self):
         """Unicode-ify and strip whitespace from the plain text document."""
diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py
index 61f1593..aef04a9 100644
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -134,13 +134,13 @@ class _CopyvioWorker(object):
             size = int(response.headers.get("Content-Length", 0))
         except ValueError:
             return None
-        if size > 1024 ** 2:  # Ignore URLs larger than a megabyte
-            return None
 
         content_type = response.headers.get("Content-Type", "text/plain")
         handler = get_parser(content_type)
         if not handler:
             return None
+        if size > (15 if handler.TYPE == "PDF" else 2) * 1024 ** 2:
+            return None
 
         try:
             content = response.read()
@@ -151,7 +151,7 @@ class _CopyvioWorker(object):
             stream = StringIO(content)
             gzipper = GzipFile(fileobj=stream)
             try:
-                content = gzipper.read(2 * 1024 ** 2)
+                content = gzipper.read()
             except (IOError, struct_error):
                 return None
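
Illustrative sketch, not part of the patch: the new check in workers.py scales
the per-URL crawl cap by parser type, so PDFs may be up to 15 MiB while every
other content type stays at 2 MiB. A minimal standalone approximation of that
gate follows; the names MEBIBYTE and within_crawl_limit are hypothetical and
only mirror the expression added to _CopyvioWorker.

    # Sketch only -- mimics the size gate added in this patch.
    MEBIBYTE = 1024 ** 2

    def within_crawl_limit(handler_type, content_length):
        """Return True if a document of this parser type and byte size
        should still be downloaded and parsed."""
        cap = (15 if handler_type == "PDF" else 2) * MEBIBYTE
        return content_length <= cap

    # A 5 MiB PDF now passes the gate, but a 5 MiB HTML page does not.
    assert within_crawl_limit("PDF", 5 * MEBIBYTE)
    assert not within_crawl_limit("HTML", 5 * MEBIBYTE)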