瀏覽代碼

Raise file crawl size limit for PDFs.

tags/v0.2
Ben Kurtovic 10 年之前
父節點
當前提交
9ffc3f1bf5
共有 2 個檔案被更改，包括 8 行新增和 3 行刪除
  1. +5
    -0
      earwigbot/wiki/copyvios/parsers.py
  2. +3
    -3
      earwigbot/wiki/copyvios/workers.py

+ 5
- 0
earwigbot/wiki/copyvios/parsers.py 查看文件

@@ -40,6 +40,7 @@ __all__ = ["ArticleTextParser", "get_parser"]

class _BaseTextParser(object):
"""Base class for a parser that handles text."""
TYPE = None

def __init__(self, text):
self.text = text
@@ -56,6 +57,7 @@ class _BaseTextParser(object):

class ArticleTextParser(_BaseTextParser):
"""A parser that can strip and chunk wikicode article text."""
TYPE = "Article"

def strip(self):
"""Clean the page's raw text by removing templates and formatting.
@@ -160,6 +162,7 @@ class ArticleTextParser(_BaseTextParser):

class _HTMLParser(_BaseTextParser):
"""A parser that can extract the text from an HTML document."""
TYPE = "HTML"
hidden_tags = [
"script", "style"
]
@@ -191,6 +194,7 @@ class _HTMLParser(_BaseTextParser):

class _PDFParser(_BaseTextParser):
"""A parser that can extract text from a PDF file."""
TYPE = "PDF"
substitutions = [
(u"\x0c", u"\n"),
(u"\u2022", u" "),
@@ -220,6 +224,7 @@ class _PDFParser(_BaseTextParser):

class _PlainTextParser(_BaseTextParser):
"""A parser that can unicode-ify and strip text from a plain text page."""
TYPE = "Text"

def parse(self):
"""Unicode-ify and strip whitespace from the plain text document."""


+ 3
- 3
earwigbot/wiki/copyvios/workers.py 查看文件

@@ -134,13 +134,13 @@ class _CopyvioWorker(object):
size = int(response.headers.get("Content-Length", 0))
except ValueError:
return None
if size > 1024 ** 2: # Ignore URLs larger than a megabyte
return None

content_type = response.headers.get("Content-Type", "text/plain")
handler = get_parser(content_type)
if not handler:
return None
if size > (15 if handler.TYPE == "PDF" else 2) * 1024 ** 2:
return None

try:
content = response.read()
@@ -151,7 +151,7 @@ class _CopyvioWorker(object):
stream = StringIO(content)
gzipper = GzipFile(fileobj=stream)
try:
content = gzipper.read(2 * 1024 ** 2)
content = gzipper.read()
except (IOError, struct_error):
return None



Loading…
取消
儲存