
Fix issues in previous commit

legacy-python2
Ben Kurtovic, 3 years ago
Parent commit: fe2e7879e4
2 changed files with 18 additions and 8 deletions
1. earwigbot/wiki/copyvios/parsers.py (+7, -6)
2. earwigbot/wiki/copyvios/workers.py (+11, -2)

earwigbot/wiki/copyvios/parsers.py (+7, -6)

@@ -267,12 +267,12 @@ class _HTMLParser(_BaseTextParser):
 
         return "\n".join(soup.stripped_strings)
 
-    def _open(self, url):
+    def _open(self, url, **kwargs):
         """Try to read a URL. Return None if it couldn't be read."""
         opener = self._args.get("open_url")
         if not opener:
             return None
-        result = opener(url)
+        result = opener(url, **kwargs)
         return result.content if result else None
 
     def _load_from_blogspot(self, url):
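
The new **kwargs on _open() let per-call options reach the worker's fetch helper through the open_url callback, which is bound with functools.partial in workers.py (see the second file below). A minimal, runnable sketch of that flow, with _open_url_raw's signature assumed from this diff rather than taken from the repo:

import functools

def _open_url_raw(url, timeout=None, allow_content_types=None):
    # stand-in for the worker's fetch helper; real signature assumed
    return (url, timeout, allow_content_types)

open_url = functools.partial(_open_url_raw, timeout=30)
print(open_url("https://example.com/feed", allow_content_types=["application/json"]))
# -> ('https://example.com/feed', 30, ['application/json'])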
@@ -280,15 +280,16 @@ class _HTMLParser(_BaseTextParser):
         match = re.search(r"'postId': '(\d+)'", self.text)
         if not match:
             return ""
-        post_id = match.groups(1)
-        url = "https://%s/feeds/posts/default/%s" % (url.netloc, post_id)
+        post_id = match.group(1)
+        url = "https://%s/feeds/posts/default/%s?" % (url.netloc, post_id)
         params = {
             "alt": "json",
             "v": "2",
             "dynamicviews": "1",
             "rewriteforssl": "true",
         }
-        raw = self._open(url + urllib.urlencode(params))
+        raw = self._open(url + urllib.urlencode(params),
+                         allow_content_types=["application/json"])
         if raw is None:
             return ""
         try:
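
Two genuine bugs are fixed in this hunk: match.groups(1) returns a tuple of all captured groups (the 1 is only a default for groups that did not participate), so a tuple was being interpolated into the feed URL; and the format string lacked the "?" needed before the urlencode()d query. A quick demonstration of the first (not repo code):

import re

match = re.search(r"'postId': '(\d+)'", "'postId': '12345'")
print(match.groups(1))  # -> ('12345',), a tuple of all groups
print(match.group(1))   # -> '12345', the captured id, as intended
# With groups(1), the URL came out as ".../default/('12345',)".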
@@ -381,4 +382,4 @@ _CONTENT_TYPES = {
 
 def get_parser(content_type):
     """Return the parser most able to handle a given content type, or None."""
-    return _CONTENT_TYPES.get(content_type.split(";", 1)[0])
+    return _CONTENT_TYPES.get(content_type)
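
get_parser() no longer strips MIME parameters itself; that normalization moves to the caller in workers.py (next file), so lookups here now expect a bare content type:

# the normalization now performed by the caller, shown for reference:
assert "text/html; charset=utf-8".split(";", 1)[0] == "text/html"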

earwigbot/wiki/copyvios/workers.py (+11, -2)

@@ -128,6 +128,7 @@ class _CopyvioWorker(object):
         try:
             response = self._opener.open(url, timeout=timeout)
         except (URLError, HTTPException, socket_error, ValueError):
+            self._logger.exception("Failed to fetch URL: %s", url)
             return None
 
         try:
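
logger.exception() logs at ERROR level and appends the traceback of the exception currently being handled, so these previously silent fetch failures now leave a full trace. This is standard-library behavior, demonstrated outside the repo:

import logging

logging.basicConfig()
log = logging.getLogger("worker")
try:
    raise ValueError("connection reset")
except ValueError:
    log.exception("Failed to fetch URL: %s", "https://example.com")
    # the message is logged at ERROR level, followed by the traceback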
@@ -136,9 +137,13 @@ class _CopyvioWorker(object):
             return None
 
         content_type = response.headers.get("Content-Type", "text/plain")
+        content_type = content_type.split(";", 1)[0]
         parser_class = get_parser(content_type)
-        if not parser_class:
+        if not parser_class and (
+                not allow_content_types or content_type not in allow_content_types):
             return None
+        if not parser_class:
+            parser_class = get_parser("text/plain")
         if size > (15 if parser_class.TYPE == "PDF" else 2) * 1024 ** 2:
             return None
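
The effect of the new branch: content types missing from _CONTENT_TYPES are still rejected by default, but a caller may whitelist them via allow_content_types, in which case the plain-text parser serves as a fallback; this is how the Blogspot JSON feed above gets through. A condensed, runnable sketch with a stubbed lookup table:

_CONTENT_TYPES = {"text/html": "HTMLParser", "text/plain": "TextParser"}  # stub

def get_parser(content_type):
    return _CONTENT_TYPES.get(content_type)

def pick_parser(content_type, allow_content_types=None):
    content_type = content_type.split(";", 1)[0]   # normalization moved here
    parser_class = get_parser(content_type)
    if not parser_class and (
            not allow_content_types or content_type not in allow_content_types):
        return None                                # unknown and not whitelisted
    if not parser_class:
        parser_class = get_parser("text/plain")    # whitelisted: plain-text fallback
    return parser_class

assert pick_parser("application/json") is None
assert pick_parser("application/json; charset=utf-8",
                   ["application/json"]) == "TextParser"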

@@ -182,7 +187,7 @@ class _CopyvioWorker(object):
         if result is None:
             return None
 
-        args = source.parser_args.copy()
+        args = source.parser_args.copy() if source.parser_args else {}
         args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout)
         parser = result.parser_class(result.content, url=source.url, args=args)
         try:
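
source.parser_args can evidently be None here, and None.copy() would raise AttributeError; the conditional expression substitutes an empty dict:

parser_args = None
args = parser_args.copy() if parser_args else {}   # {} instead of AttributeError
assert args == {}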
@@ -255,6 +260,10 @@ class _CopyvioWorker(object):
             self._logger.debug("Source excluded by content parser")
             source.skipped = source.excluded = True
             source.finish_work()
+        except Exception:
+            self._logger.exception("Uncaught exception in worker")
+            source.skip()
+            source.finish_work()
         else:
             chain = MarkovChain(text) if text else None
             source.workspace.compare(source, chain)
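
The catch-all guarantees a source always reaches finish_work() even on unexpected errors, so the workspace is never left waiting on a worker that died mid-parse. A simplified, runnable demo of the pattern (names assumed, not repo code):

import logging

logging.basicConfig()
log = logging.getLogger("worker")

class Source(object):
    skipped = finished = False
    def skip(self):
        self.skipped = True
    def finish_work(self):
        self.finished = True

def process(source, parse):
    try:
        text = parse()
    except Exception:
        log.exception("Uncaught exception in worker")
        source.skip()
        source.finish_work()   # always release the source
    else:
        return text            # normal path: hand text to the comparison step

src = Source()
process(src, lambda: 1 / 0)    # the error is logged; the source still finishes
assert src.skipped and src.finished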

