Browse Source

Fix issues in previous commit

tags/v0.4
Ben Kurtovic 3 years ago
parent
commit
fe2e7879e4
2 changed files with 18 additions and 8 deletions
  1. +7
    -6
      earwigbot/wiki/copyvios/parsers.py
  2. +11
    -2
      earwigbot/wiki/copyvios/workers.py

+ 7
- 6
earwigbot/wiki/copyvios/parsers.py View File

@@ -267,12 +267,12 @@ class _HTMLParser(_BaseTextParser):


return "\n".join(soup.stripped_strings) return "\n".join(soup.stripped_strings)


def _open(self, url):
def _open(self, url, **kwargs):
"""Try to read a URL. Return None if it couldn't be read.""" """Try to read a URL. Return None if it couldn't be read."""
opener = self._args.get("open_url") opener = self._args.get("open_url")
if not opener: if not opener:
return None return None
result = opener(url)
result = opener(url, **kwargs)
return result.content if result else None return result.content if result else None


def _load_from_blogspot(self, url): def _load_from_blogspot(self, url):
@@ -280,15 +280,16 @@ class _HTMLParser(_BaseTextParser):
match = re.search(r"'postId': '(\d+)'", self.text) match = re.search(r"'postId': '(\d+)'", self.text)
if not match: if not match:
return "" return ""
post_id = match.groups(1)
url = "https://%s/feeds/posts/default/%s" % (url.netloc, post_id)
post_id = match.group(1)
url = "https://%s/feeds/posts/default/%s?" % (url.netloc, post_id)
params = { params = {
"alt": "json", "alt": "json",
"v": "2", "v": "2",
"dynamicviews": "1", "dynamicviews": "1",
"rewriteforssl": "true", "rewriteforssl": "true",
} }
raw = self._open(url + urllib.urlencode(params))
raw = self._open(url + urllib.urlencode(params),
allow_content_types=["application/json"])
if raw is None: if raw is None:
return "" return ""
try: try:
@@ -381,4 +382,4 @@ _CONTENT_TYPES = {


def get_parser(content_type): def get_parser(content_type):
"""Return the parser most able to handle a given content type, or None.""" """Return the parser most able to handle a given content type, or None."""
return _CONTENT_TYPES.get(content_type.split(";", 1)[0])
return _CONTENT_TYPES.get(content_type)

+ 11
- 2
earwigbot/wiki/copyvios/workers.py View File

@@ -128,6 +128,7 @@ class _CopyvioWorker(object):
try: try:
response = self._opener.open(url, timeout=timeout) response = self._opener.open(url, timeout=timeout)
except (URLError, HTTPException, socket_error, ValueError): except (URLError, HTTPException, socket_error, ValueError):
self._logger.exception("Failed to fetch URL: %s", url)
return None return None


try: try:
@@ -136,9 +137,13 @@ class _CopyvioWorker(object):
return None return None


content_type = response.headers.get("Content-Type", "text/plain") content_type = response.headers.get("Content-Type", "text/plain")
content_type = content_type.split(";", 1)[0]
parser_class = get_parser(content_type) parser_class = get_parser(content_type)
if not parser_class:
if not parser_class and (
not allow_content_types or content_type not in allow_content_types):
return None return None
if not parser_class:
parser_class = get_parser("text/plain")
if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024 ** 2: if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024 ** 2:
return None return None


@@ -182,7 +187,7 @@ class _CopyvioWorker(object):
if result is None: if result is None:
return None return None


args = source.parser_args.copy()
args = source.parser_args.copy() if source.parser_args else {}
args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout) args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout)
parser = result.parser_class(result.content, url=source.url, args=args) parser = result.parser_class(result.content, url=source.url, args=args)
try: try:
@@ -255,6 +260,10 @@ class _CopyvioWorker(object):
self._logger.debug("Source excluded by content parser") self._logger.debug("Source excluded by content parser")
source.skipped = source.excluded = True source.skipped = source.excluded = True
source.finish_work() source.finish_work()
except Exception:
self._logger.exception("Uncaught exception in worker")
source.skip()
source.finish_work()
else: else:
chain = MarkovChain(text) if text else None chain = MarkovChain(text) if text else None
source.workspace.compare(source, chain) source.workspace.compare(source, chain)


Loading…
Cancel
Save