Browse Source

Raise file crawl size limit for PDFs.

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
9ffc3f1bf5
2 changed files with 8 additions and 3 deletions
  1. +5
    -0
      earwigbot/wiki/copyvios/parsers.py
  2. +3
    -3
      earwigbot/wiki/copyvios/workers.py

+ 5
- 0
earwigbot/wiki/copyvios/parsers.py View File

@@ -40,6 +40,7 @@ __all__ = ["ArticleTextParser", "get_parser"]


 class _BaseTextParser(object):
     """Base class for a parser that handles text."""
+    TYPE = None

     def __init__(self, text):
         self.text = text
@@ -56,6 +57,7 @@ class _BaseTextParser(object):


 class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
+    TYPE = "Article"

     def strip(self):
         """Clean the page's raw text by removing templates and formatting.
@@ -160,6 +162,7 @@ class ArticleTextParser(_BaseTextParser):


 class _HTMLParser(_BaseTextParser):
     """A parser that can extract the text from an HTML document."""
+    TYPE = "HTML"
     hidden_tags = [
         "script", "style"
     ]
@@ -191,6 +194,7 @@ class _HTMLParser(_BaseTextParser):


 class _PDFParser(_BaseTextParser):
     """A parser that can extract text from a PDF file."""
+    TYPE = "PDF"
     substitutions = [
         (u"\x0c", u"\n"),
         (u"\u2022", u" "),
@@ -220,6 +224,7 @@ class _PDFParser(_BaseTextParser):


 class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
+    TYPE = "Text"

     def parse(self):
         """Unicode-ify and strip whitespace from the plain text document."""


+ 3
- 3
earwigbot/wiki/copyvios/workers.py View File

@@ -134,13 +134,13 @@ class _CopyvioWorker(object):
             size = int(response.headers.get("Content-Length", 0))
         except ValueError:
             return None
-        if size > 1024 ** 2:  # Ignore URLs larger than a megabyte
-            return None

         content_type = response.headers.get("Content-Type", "text/plain")
         handler = get_parser(content_type)
         if not handler:
             return None
+        if size > (15 if handler.TYPE == "PDF" else 2) * 1024 ** 2:
+            return None


try: try:
content = response.read() content = response.read()
@@ -151,7 +151,7 @@ class _CopyvioWorker(object):
             stream = StringIO(content)
             gzipper = GzipFile(fileobj=stream)
             try:
-                content = gzipper.read(2 * 1024 ** 2)
+                content = gzipper.read()
             except (IOError, struct_error):
                 return None




Loading…
Cancel
Save