Raise file crawl size limit for PDFs.

10 years ago · 9ffc3f1bf5
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -40,6 +40,7 @@ __all__ = ["ArticleTextParser", "get_parser"]

 class _BaseTextParser(object):
    """Base class for a parser that handles text."""
+    TYPE = None

    def __init__(self, text):
        self.text = text
@@ -56,6 +57,7 @@ class _BaseTextParser(object):

 class ArticleTextParser(_BaseTextParser):
    """A parser that can strip and chunk wikicode article text."""
+    TYPE = "Article"

    def strip(self):
        """Clean the page's raw text by removing templates and formatting.
@@ -160,6 +162,7 @@ class ArticleTextParser(_BaseTextParser):

 class _HTMLParser(_BaseTextParser):
    """A parser that can extract the text from an HTML document."""
+    TYPE = "HTML"
    hidden_tags = [
        "script", "style"
    ]
@@ -191,6 +194,7 @@ class _HTMLParser(_BaseTextParser):

 class _PDFParser(_BaseTextParser):
    """A parser that can extract text from a PDF file."""
+    TYPE = "PDF"
    substitutions = [
        (u"\x0c", u"\n"),
        (u"\u2022", u" "),
@@ -220,6 +224,7 @@ class _PDFParser(_BaseTextParser):

 class _PlainTextParser(_BaseTextParser):
    """A parser that can unicode-ify and strip text from a plain text page."""
+    TYPE = "Text"

    def parse(self):
        """Unicode-ify and strip whitespace from the plain text document."""
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -134,13 +134,13 @@ class _CopyvioWorker(object):
            size = int(response.headers.get("Content-Length", 0))
        except ValueError:
            return None
-        if size > 1024 ** 2:  # Ignore URLs larger than a megabyte
-            return None

        content_type = response.headers.get("Content-Type", "text/plain")
        handler = get_parser(content_type)
        if not handler:
            return None
+        if size > (15 if handler.TYPE == "PDF" else 2) * 1024 ** 2:
+            return None

        try:
            content = response.read()
@@ -151,7 +151,7 @@ class _CopyvioWorker(object):
            stream = StringIO(content)
            gzipper = GzipFile(fileobj=stream)
            try:
-                content = gzipper.read(2 * 1024 ** 2)
+                content = gzipper.read()
            except (IOError, struct_error):
                return None