From 9ffc3f1bf51000da4a58f8db042a6b865aa7d38f Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 12 Dec 2014 17:04:29 -0600
Subject: [PATCH] Raise file crawl size limit for PDFs.

---
 earwigbot/wiki/copyvios/parsers.py | 5 +++++
 earwigbot/wiki/copyvios/workers.py | 6 +++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 0cb0a68..8eaf3e1 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -40,6 +40,7 @@ __all__ = ["ArticleTextParser", "get_parser"]
 
 class _BaseTextParser(object):
     """Base class for a parser that handles text."""
+    TYPE = None
 
     def __init__(self, text):
         self.text = text
@@ -56,6 +57,7 @@ class _BaseTextParser(object):
 
 class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
+    TYPE = "Article"
 
     def strip(self):
         """Clean the page's raw text by removing templates and formatting.
@@ -160,6 +162,7 @@ class ArticleTextParser(_BaseTextParser):
 
 class _HTMLParser(_BaseTextParser):
     """A parser that can extract the text from an HTML document."""
+    TYPE = "HTML"
     hidden_tags = [
         "script", "style"
     ]
@@ -191,6 +194,7 @@ class _HTMLParser(_BaseTextParser):
 
 class _PDFParser(_BaseTextParser):
     """A parser that can extract text from a PDF file."""
+    TYPE = "PDF"
     substitutions = [
         (u"\x0c", u"\n"),
         (u"\u2022", u" "),
@@ -220,6 +224,7 @@ class _PDFParser(_BaseTextParser):
 
 class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
+    TYPE = "Text"
 
     def parse(self):
         """Unicode-ify and strip whitespace from the plain text document."""
diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py
index 61f1593..aef04a9 100644
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -134,13 +134,13 @@ class _CopyvioWorker(object):
             size = int(response.headers.get("Content-Length", 0))
         except ValueError:
             return None
-        if size > 1024 ** 2:  # Ignore URLs larger than a megabyte
-            return None
 
         content_type = response.headers.get("Content-Type", "text/plain")
         handler = get_parser(content_type)
         if not handler:
             return None
+        if size > (15 if handler.TYPE == "PDF" else 2) * 1024 ** 2:
+            return None
 
         try:
             content = response.read()
@@ -151,7 +151,7 @@ class _CopyvioWorker(object):
             stream = StringIO(content)
             gzipper = GzipFile(fileobj=stream)
             try:
-                content = gzipper.read(2 * 1024 ** 2)
+                content = gzipper.read()
             except (IOError, struct_error):
                 return None
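
Illustrative sketch, not part of the patch: the new check in workers.py scales
the per-URL crawl cap by parser type, so PDFs may be up to 15 MiB while every
other content type stays at 2 MiB. A minimal standalone approximation of that
gate follows; the names MEBIBYTE and within_crawl_limit are hypothetical and
only mirror the expression added to _CopyvioWorker.

    # Sketch only -- mimics the size gate added in this patch.
    MEBIBYTE = 1024 ** 2

    def within_crawl_limit(handler_type, content_length):
        """Return True if a document of this parser type and byte size
        should still be downloaded and parsed."""
        cap = (15 if handler_type == "PDF" else 2) * MEBIBYTE
        return content_length <= cap

    # A 5 MiB PDF now passes the gate, but a 5 MiB HTML page does not.
    assert within_crawl_limit("PDF", 5 * MEBIBYTE)
    assert not within_crawl_limit("HTML", 5 * MEBIBYTE)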