diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 2b76e09..5d694aa 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -267,12 +267,12 @@ class _HTMLParser(_BaseTextParser): return "\n".join(soup.stripped_strings) - def _open(self, url): + def _open(self, url, **kwargs): """Try to read a URL. Return None if it couldn't be read.""" opener = self._args.get("open_url") if not opener: return None - result = opener(url) + result = opener(url, **kwargs) return result.content if result else None def _load_from_blogspot(self, url): @@ -280,15 +280,16 @@ class _HTMLParser(_BaseTextParser): match = re.search(r"'postId': '(\d+)'", self.text) if not match: return "" - post_id = match.groups(1) - url = "https://%s/feeds/posts/default/%s" % (url.netloc, post_id) + post_id = match.group(1) + url = "https://%s/feeds/posts/default/%s?" % (url.netloc, post_id) params = { "alt": "json", "v": "2", "dynamicviews": "1", "rewriteforssl": "true", } - raw = self._open(url + urllib.urlencode(params)) + raw = self._open(url + urllib.urlencode(params), + allow_content_types=["application/json"]) if raw is None: return "" try: @@ -381,4 +382,4 @@ _CONTENT_TYPES = { def get_parser(content_type): """Return the parser most able to handle a given content type, or None.""" - return _CONTENT_TYPES.get(content_type.split(";", 1)[0]) + return _CONTENT_TYPES.get(content_type) diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 365abdc..fb75895 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -128,6 +128,7 @@ class _CopyvioWorker(object): try: response = self._opener.open(url, timeout=timeout) except (URLError, HTTPException, socket_error, ValueError): + self._logger.exception("Failed to fetch URL: %s", url) return None try: @@ -136,9 +137,13 @@ class _CopyvioWorker(object): return None content_type = response.headers.get("Content-Type", "text/plain") + content_type = content_type.split(";", 1)[0] parser_class = get_parser(content_type) - if not parser_class: + if not parser_class and ( + not allow_content_types or content_type not in allow_content_types): return None + if not parser_class: + parser_class = get_parser("text/plain") if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024 ** 2: return None @@ -182,7 +187,7 @@ class _CopyvioWorker(object): if result is None: return None - args = source.parser_args.copy() + args = source.parser_args.copy() if source.parser_args else {} args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout) parser = result.parser_class(result.content, url=source.url, args=args) try: @@ -255,6 +260,10 @@ class _CopyvioWorker(object): self._logger.debug("Source excluded by content parser") source.skipped = source.excluded = True source.finish_work() + except Exception: + self._logger.exception("Uncaught exception in worker") + source.skip() + source.finish_work() else: chain = MarkovChain(text) if text else None source.workspace.compare(source, chain)