
Fix issues in previous commit

legacy-python2
Ben Kurtovic, 3 years ago
Parent commit: fe2e7879e4
2 changed files with 18 additions and 8 deletions
1. earwigbot/wiki/copyvios/parsers.py (+7, -6)
2. earwigbot/wiki/copyvios/workers.py (+11, -2)

earwigbot/wiki/copyvios/parsers.py (+7, -6)

@@ -267,12 +267,12 @@ class _HTMLParser(_BaseTextParser):
 
         return "\n".join(soup.stripped_strings)
 
-    def _open(self, url):
+    def _open(self, url, **kwargs):
         """Try to read a URL. Return None if it couldn't be read."""
         opener = self._args.get("open_url")
         if not opener:
             return None
-        result = opener(url)
+        result = opener(url, **kwargs)
         return result.content if result else None
 
     def _load_from_blogspot(self, url):
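
The new **kwargs on _open() let per-call options reach the worker's fetch helper through the open_url callback, which is bound with functools.partial in workers.py (see the second file below). A minimal, runnable sketch of that flow, with _open_url_raw's signature assumed from this diff rather than taken from the repo:

import functools

def _open_url_raw(url, timeout=None, allow_content_types=None):
    # stand-in for the worker's fetch helper; real signature assumed
    return (url, timeout, allow_content_types)

open_url = functools.partial(_open_url_raw, timeout=30)
print(open_url("https://example.com/feed", allow_content_types=["application/json"]))
# -> ('https://example.com/feed', 30, ['application/json'])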
@@ -280,15 +280,16 @@ class _HTMLParser(_BaseTextParser):
         match = re.search(r"'postId': '(\d+)'", self.text)
         if not match:
             return ""
-        post_id = match.groups(1)
-        url = "https://%s/feeds/posts/default/%s" % (url.netloc, post_id)
+        post_id = match.group(1)
+        url = "https://%s/feeds/posts/default/%s?" % (url.netloc, post_id)
         params = {
             "alt": "json",
             "v": "2",
             "dynamicviews": "1",
             "rewriteforssl": "true",
         }
-        raw = self._open(url + urllib.urlencode(params))
+        raw = self._open(url + urllib.urlencode(params),
+                         allow_content_types=["application/json"])
         if raw is None:
             return ""
         try:
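
Two genuine bugs are fixed in this hunk: match.groups(1) returns a tuple of all captured groups (the 1 is only a default for groups that did not participate), so a tuple was being interpolated into the feed URL; and the format string lacked the "?" needed before the urlencode()d query. A quick demonstration of the first (not repo code):

import re

match = re.search(r"'postId': '(\d+)'", "'postId': '12345'")
print(match.groups(1))  # -> ('12345',), a tuple of all groups
print(match.group(1))   # -> '12345', the captured id, as intended
# With groups(1), the URL came out as ".../default/('12345',)".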
@@ -381,4 +382,4 @@ _CONTENT_TYPES = {
 
 def get_parser(content_type):
     """Return the parser most able to handle a given content type, or None."""
-    return _CONTENT_TYPES.get(content_type.split(";", 1)[0])
+    return _CONTENT_TYPES.get(content_type)
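
get_parser() no longer strips MIME parameters itself; that normalization moves to the caller in workers.py (next file), so lookups here now expect a bare content type:

# the normalization now performed by the caller, shown for reference:
assert "text/html; charset=utf-8".split(";", 1)[0] == "text/html"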

earwigbot/wiki/copyvios/workers.py (+11, -2)

@@ -128,6 +128,7 @@ class _CopyvioWorker(object):
         try:
             response = self._opener.open(url, timeout=timeout)
         except (URLError, HTTPException, socket_error, ValueError):
+            self._logger.exception("Failed to fetch URL: %s", url)
             return None
 
         try:
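
logger.exception() logs at ERROR level and appends the traceback of the exception currently being handled, so these previously silent fetch failures now leave a full trace. This is standard-library behavior, demonstrated outside the repo:

import logging

logging.basicConfig()
log = logging.getLogger("worker")
try:
    raise ValueError("connection reset")
except ValueError:
    log.exception("Failed to fetch URL: %s", "https://example.com")
    # the message is logged at ERROR level, followed by the traceback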
@@ -136,9 +137,13 @@ class _CopyvioWorker(object):
             return None
 
         content_type = response.headers.get("Content-Type", "text/plain")
+        content_type = content_type.split(";", 1)[0]
         parser_class = get_parser(content_type)
-        if not parser_class:
+        if not parser_class and (
+                not allow_content_types or content_type not in allow_content_types):
             return None
+        if not parser_class:
+            parser_class = get_parser("text/plain")
         if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024 ** 2:
             return None
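
The effect of the new branch: content types missing from _CONTENT_TYPES are still rejected by default, but a caller may whitelist them via allow_content_types, in which case the plain-text parser serves as a fallback; this is how the Blogspot JSON feed above gets through. A condensed, runnable sketch with a stubbed lookup table:

_CONTENT_TYPES = {"text/html": "HTMLParser", "text/plain": "TextParser"}  # stub

def get_parser(content_type):
    return _CONTENT_TYPES.get(content_type)

def pick_parser(content_type, allow_content_types=None):
    content_type = content_type.split(";", 1)[0]   # normalization moved here
    parser_class = get_parser(content_type)
    if not parser_class and (
            not allow_content_types or content_type not in allow_content_types):
        return None                                # unknown and not whitelisted
    if not parser_class:
        parser_class = get_parser("text/plain")    # whitelisted: plain-text fallback
    return parser_class

assert pick_parser("application/json") is None
assert pick_parser("application/json; charset=utf-8",
                   ["application/json"]) == "TextParser"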

@@ -182,7 +187,7 @@ class _CopyvioWorker(object):
         if result is None:
             return None
 
-        args = source.parser_args.copy()
+        args = source.parser_args.copy() if source.parser_args else {}
         args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout)
         parser = result.parser_class(result.content, url=source.url, args=args)
         try:
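
source.parser_args can evidently be None here, and None.copy() would raise AttributeError; the conditional expression substitutes an empty dict:

parser_args = None
args = parser_args.copy() if parser_args else {}   # {} instead of AttributeError
assert args == {}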
@@ -255,6 +260,10 @@ class _CopyvioWorker(object):
             self._logger.debug("Source excluded by content parser")
             source.skipped = source.excluded = True
             source.finish_work()
+        except Exception:
+            self._logger.exception("Uncaught exception in worker")
+            source.skip()
+            source.finish_work()
         else:
             chain = MarkovChain(text) if text else None
             source.workspace.compare(source, chain)
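
The catch-all guarantees a source always reaches finish_work() even on unexpected errors, so the workspace is never left waiting on a worker that died mid-parse. A simplified, runnable demo of the pattern (names assumed, not repo code):

import logging

logging.basicConfig()
log = logging.getLogger("worker")

class Source(object):
    skipped = finished = False
    def skip(self):
        self.skipped = True
    def finish_work(self):
        self.finished = True

def process(source, parse):
    try:
        text = parse()
    except Exception:
        log.exception("Uncaught exception in worker")
        source.skip()
        source.finish_work()   # always release the source
    else:
        return text            # normal path: hand text to the comparison step

src = Source()
process(src, lambda: 1 / 0)    # the error is logged; the source still finishes
assert src.skipped and src.finished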

