copyvios: Support on-error condition for URL proxying

Ben Kurtovic, 3 years ago
commit 01dcbd4394 (tags/v0.4)
1 changed file with 36 additions and 18 deletions:

earwigbot/wiki/copyvios/workers.py (+36, -18)

@@ -121,36 +121,54 @@ class _CopyvioWorker(object):
         self._opener = build_opener()
         self._logger = getLogger("earwigbot.wiki.cvworker." + name)
 
+    def _try_map_proxy_url(self, url, parsed, extra_headers, is_error=False):
+        if not self._search_config or "proxies" not in self._search_config:
+            return url, False
+        for proxy_info in self._search_config["proxies"]:
+            if parsed.netloc != proxy_info["netloc"]:
+                continue
+            if "onerr" in proxy_info:
+                if proxy_info["onerr"] and not is_error:
+                    continue
+                if not proxy_info["onerr"] and is_error:
+                    continue
+            path = parsed.path
+            if "path" in proxy_info:
+                if not parsed.path.startswith(proxy_info["path"]):
+                    continue
+                path = path[len(proxy_info["path"]):]
+            url = proxy_info["target"] + path
+            if "auth" in proxy_info:
+                extra_headers["Authorization"] = "Basic %s" % (
+                    base64.b64encode(proxy_info["auth"]))
+            return url, True
+        return url, False
+
     def _open_url_raw(self, url, timeout=5, allow_content_types=None):
         """Open a URL, without parsing it.
 
         None will be returned for URLs that cannot be read for whatever reason.
         """
         parsed = urlparse.urlparse(url)
-        extra_headers = {}
-
-        if self._search_config and self._search_config.get("proxies"):
-            for proxy_info in self._search_config["proxies"]:
-                if parsed.netloc != proxy_info["netloc"]:
-                    continue
-                path = parsed.path
-                if "path" in proxy_info:
-                    if not parsed.path.startswith(proxy_info["path"]):
-                        continue
-                    path = path[len(proxy_info["path"]):]
-                url = proxy_info["target"] + path
-                if "auth" in proxy_info:
-                    extra_headers["Authorization"] = "Basic %s" % (
-                        base64.b64encode(proxy_info["auth"]))
-
         if not isinstance(url, unicode):
             url = url.encode("utf8")
+        extra_headers = {}
+        url, _ = self._try_map_proxy_url(url, parsed, extra_headers)
         request = Request(url, headers=extra_headers)
         try:
             response = self._opener.open(request, timeout=timeout)
         except (URLError, HTTPException, socket_error, ValueError):
-            self._logger.exception("Failed to fetch URL: %s", url)
-            return None
+            url, remapped = self._try_map_proxy_url(url, parsed, extra_headers, is_error=True)
+            if not remapped:
+                self._logger.exception("Failed to fetch URL: %s", url)
+                return None
+            self._logger.info("Failed to fetch URL, trying proxy remap: %s", url)
+            request = Request(url, headers=extra_headers)
+            try:
+                response = self._opener.open(request, timeout=timeout)
+            except (URLError, HTTPException, socket_error, ValueError):
+                self._logger.exception("Failed to fetch URL after proxy remap: %s", url)
+                return None
 
         try:
             size = int(response.headers.get("Content-Length", 0))
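For reference, here is a sketch of what a "proxies" entry in the search config could look like with the new on-error condition. The key names (netloc, path, target, auth, onerr) come from _try_map_proxy_url above; the surrounding layout and all values are illustrative assumptions, not earwigbot's documented format.

# Hypothetical "proxies" section of the search config consumed by
# _try_map_proxy_url. Key names match the diff; values are made up.
search_config = {
    "proxies": [
        {
            # Unconditional mapping: rewrite matching URLs on every fetch.
            "netloc": "example.com",
            "path": "/wiki/",
            "target": "https://proxy.example.net/example-wiki/",
            "auth": "user:password",  # sent as a Basic Authorization header
        },
        {
            # With "onerr" set, this entry is skipped on the first attempt
            # and only applied after a direct fetch has failed.
            "netloc": "flaky.example.org",
            "target": "https://proxy.example.net/flaky/",
            "onerr": True,
        },
    ],
}

Under such a config, http://example.com/wiki/Foo would be rewritten to https://proxy.example.net/example-wiki/Foo on every request, while flaky.example.org URLs would only be remapped (is_error=True) after a direct fetch fails.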


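One detail worth noting: the "auth" value is base64-encoded verbatim into a Basic Authorization header, so it presumably must already be in user:password form. The diff does not show the config side, so this is an inference from how HTTP Basic auth works; a quick sketch:

import base64

# Header _try_map_proxy_url would build for an assumed "auth" of "user:password":
print "Basic %s" % base64.b64encode("user:password")
# prints: Basic dXNlcjpwYXNzd29yZA==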