Browse Source

Reorder some URL opening code; add zip bomb protection (cap response size and gzip decompression size).

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
05010933c7
1 changed file with 21 additions and 10 deletions
  1. +21
    -10
      earwigbot/wiki/copyvios/__init__.py

+ 21
- 10
earwigbot/wiki/copyvios/__init__.py View File

@@ -157,27 +157,38 @@ class _CopyvioWorker(object):
with self._workspace.request_semaphore: with self._workspace.request_semaphore:
try: try:
response = self._opener.open(url, timeout=self._url_timeout) response = self._opener.open(url, timeout=self._url_timeout)
result = response.read()
except (URLError, timeout): except (URLError, timeout):
return None return None


if response.headers.get("Content-Encoding") == "gzip":
stream = StringIO(result)
gzipper = GzipFile(fileobj=stream)
try:
result = gzipper.read()
except IOError:
return None
try:
size = int(response.headers.get("Content-Length", 0))
except ValueError:
return None
if size > 1024 ** 2: # Ignore URLs larger than a megabyte
return None


ctype_full = response.headers.get("Content-Type", "text/plain") ctype_full = response.headers.get("Content-Type", "text/plain")
ctype = ctype_full.split(";", 1)[0] ctype = ctype_full.split(";", 1)[0]
if ctype in ["text/html", "application/xhtml+xml"]: if ctype in ["text/html", "application/xhtml+xml"]:
return HTMLTextParser(result).strip()
handler = lambda res: HTMLTextParser(res).strip()
elif ctype == "text/plain": elif ctype == "text/plain":
return result.strip()
handler = lambda res: res.strip()
else: else:
return None return None


with self._workspace.request_semaphore:
content = response.read()

if response.headers.get("Content-Encoding") == "gzip":
stream = StringIO(content)
gzipper = GzipFile(fileobj=stream)
try:
content = gzipper.read(2 * 1024 ** 2)
except IOError:
return None

return handler(content)

def _run(self): def _run(self):
"""Main entry point for the worker thread. """Main entry point for the worker thread.




Loading…
Cancel
Save