
copyvios: Config-directed URL proxying

tags/v0.4
Ben Kurtovic committed 3 years ago
parent 9d66ebc6b2
3 changed files with 31 additions and 6 deletions
  1. earwigbot/wiki/copyvios/result.py (+2, -1)
  2. earwigbot/wiki/copyvios/search.py (+3, -1)
  3. earwigbot/wiki/copyvios/workers.py (+26, -4)

earwigbot/wiki/copyvios/result.py (+2, -1)

@@ -44,12 +44,13 @@ class CopyvioSource(object):
     """

     def __init__(self, workspace, url, headers=None, timeout=5,
-                 parser_args=None):
+                 parser_args=None, search_config=None):
         self.workspace = workspace
         self.url = url
         self.headers = headers
         self.timeout = timeout
         self.parser_args = parser_args
+        self.search_config = search_config

         self.confidence = 0.0
         self.chains = (EMPTY, EMPTY_INTERSECTION)

earwigbot/wiki/copyvios/search.py (+3, -1)

@@ -61,7 +61,9 @@ class _BaseSearchEngine(object):
             response = self.opener.open(*args)
             result = response.read()
         except (URLError, error) as exc:
-            raise SearchQueryError("{0} Error: {1}".format(self.name, exc))
+            err = SearchQueryError("{0} Error: {1}".format(self.name, exc))
+            err.cause = exc
+            raise err

         if response.headers.get("Content-Encoding") == "gzip":
             stream = StringIO(result)
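
Attaching the original exception as err.cause keeps the underlying error available on Python 2, which lacks raise ... from ... chaining. A sketch of how a caller might use it; the engine and logger names are hypothetical:

    from earwigbot.exceptions import SearchQueryError

    try:
        results = engine.search(query)  # any _BaseSearchEngine subclass
    except SearchQueryError as exc:
        # exc.cause is the original low-level error attached above
        logger.warn("Search failed: %s (cause: %r)",
                    exc, getattr(exc, "cause", None))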


earwigbot/wiki/copyvios/workers.py (+26, -4)

@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

+import base64
 import collections
 from collections import deque
 import functools
@@ -33,7 +34,8 @@ from StringIO import StringIO
 from struct import error as struct_error
 from threading import Lock, Thread
 import time
-from urllib2 import build_opener, URLError
+from urllib2 import build_opener, Request, URLError
+import urlparse

 from earwigbot import importer
 from earwigbot.exceptions import ParserExclusionError, ParserRedirectError
@@ -115,6 +117,7 @@ class _CopyvioWorker(object):

         self._site = None
         self._queue = None
+        self._search_config = None
         self._opener = build_opener()
         self._logger = getLogger("earwigbot.wiki.cvworker." + name)

@@ -123,10 +126,28 @@

         None will be returned for URLs that cannot be read for whatever reason.
         """
+        parsed = urlparse.urlparse(url)
+        extra_headers = {}
+
+        if self._search_config and self._search_config.get("proxies"):
+            for proxy_info in self._search_config["proxies"]:
+                if parsed.netloc != proxy_info["netloc"]:
+                    continue
+                path = parsed.path
+                if "path" in proxy_info:
+                    if not parsed.path.startswith(proxy_info["path"]):
+                        continue
+                    path = path[len(proxy_info["path"]):]
+                url = proxy_info["target"] + path
+                if "auth" in proxy_info:
+                    extra_headers["Authorization"] = "Basic %s" % (
+                        base64.b64encode(proxy_info["auth"]))
+
         if not isinstance(url, unicode):
             url = url.encode("utf8")
+        request = Request(url, headers=extra_headers)
         try:
-            response = self._opener.open(url, timeout=timeout)
+            response = self._opener.open(request, timeout=timeout)
         except (URLError, HTTPException, socket_error, ValueError):
             self._logger.exception("Failed to fetch URL: %s", url)
             return None
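
The proxy rules this loop consumes are plain dicts under the search config's "proxies" key: each needs "netloc" (the host to intercept) and "target" (the rewrite base), plus optional "path" (a prefix that must match and is stripped) and "auth" (credentials sent as HTTP Basic). A hypothetical entry, with every value illustrative:

    search_config = {
        "proxies": [{
            "netloc": "www.example.com",                  # host to intercept
            "path": "/articles/",                         # optional prefix to strip
            "target": "https://proxy.internal/example/",  # rewrite base
            "auth": "user:password",                      # optional Basic credentials
        }]
    }
    # With this rule, http://www.example.com/articles/foo is fetched as
    # https://proxy.internal/example/foo with an Authorization: Basic header.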
@@ -180,6 +201,7 @@ class _CopyvioWorker(object):
         If a URLError was raised while opening the URL or an IOError was raised
         while decompressing, None will be returned.
         """
+        self._search_config = source.search_config
         if source.headers:
             self._opener.addheaders = source.headers

@@ -296,7 +318,7 @@

     def __init__(self, article, min_confidence, max_time, logger, headers,
                  url_timeout=5, num_workers=8, short_circuit=True,
-                 parser_args=None, exclude_check=None):
+                 parser_args=None, exclude_check=None, config=None):
         self.sources = []
         self.finished = False
         self.possible_miss = False
@@ -311,7 +333,7 @@
         self._short_circuit = short_circuit
         self._source_args = {
             "workspace": self, "headers": headers, "timeout": url_timeout,
-            "parser_args": parser_args}
+            "parser_args": parser_args, "search_config": config}
         self._exclude_check = exclude_check

         if _is_globalized:
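
With the new config argument, callers can thread the site's search configuration through the workspace and into every source it creates. A minimal usage sketch, assuming a search_config dict like the one above and otherwise illustrative arguments:

    workspace = CopyvioWorkspace(
        article, min_confidence=0.75, max_time=30, logger=logger,
        headers=headers, config=search_config)
    # Every CopyvioSource the workspace enqueues now carries search_config,
    # so _CopyvioWorker applies any matching proxy rule when fetching it.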

