From 2324a73624c18b142a6e447ddfbaca829487e98c Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@gmail.com>
Date: Sat, 20 Mar 2021 20:21:30 -0400
Subject: [PATCH] copyvios: Refactor some parsing logic and add dynamic Blogger
 support

---
 earwigbot/wiki/copyvios/parsers.py | 87 +++++++++++++++++++++++++++++---------
 earwigbot/wiki/copyvios/workers.py | 66 ++++++++++++++++++++---------
 2 files changed, 114 insertions(+), 39 deletions(-)
diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 1b2dfae..2b76e09 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -20,9 +20,11 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import json
 from os import path
 import re
 from StringIO import StringIO
+import urllib
 import urlparse
 
 import mwparserfromhell
@@ -246,40 +248,87 @@ class _HTMLParser(_BaseTextParser):
         if soup.find_all(href=func) or soup.find_all(src=func):
             raise ParserExclusionError()
 
+    @staticmethod
+    def _get_soup(text):
+        """Parse text into a BeautifulSoup tree, preferring lxml."""
+        try:
+            return bs4.BeautifulSoup(text, "lxml")
+        except ValueError:  # retry without naming a parser explicitly
+            return bs4.BeautifulSoup(text)
+
+    def _clean_soup(self, soup):
+        """Strip comments/hidden tags from a tree in place; return its text."""
+        is_comment = lambda text: isinstance(text, bs4.element.Comment)
+        for comment in soup.find_all(text=is_comment):
+            comment.extract()
+        for tag in self.hidden_tags:
+            for element in soup.find_all(tag):
+                element.extract()
+
+        return "\n".join(soup.stripped_strings)  # one stripped string per line
+
+    def _open(self, url):
+        """Read a URL with the "open_url" parser arg; None if it can't be read."""
+        opener = self._args.get("open_url")
+        if not opener:  # no opener callback was supplied in the parser args
+            return None
+        result = opener(url)
+        return result.content if result else None
+
+    def _load_from_blogspot(self, url):
+        """Load a post's text from the Blogger Dynamic Views JSON feed.
+
+        *url* is the parsed page URL; "" is returned if nothing can be loaded.
+        """
+        match = re.search(r"'postId': '(\d+)'", self.text)
+        if not match:
+            return ""
+        post_id = match.group(1)  # group(), not groups(): we need the string
+        feed = "https://%s/feeds/posts/default/%s" % (url.netloc, post_id)
+        params = {
+            "alt": "json",
+            "v": "2",
+            "dynamicviews": "1",
+            "rewriteforssl": "true",
+        }
+        # urlencode() emits no leading "?", so add the separator ourselves.
+        raw = self._open(feed + "?" + urllib.urlencode(params))
+        if raw is None:
+            return ""
+        try:
+            text = json.loads(raw)["entry"]["content"]["$t"]
+        except (ValueError, KeyError, TypeError):
+            return ""
+        body = self._get_soup(text).body
+        return self._clean_soup(body) if body else ""
+
     def parse(self):
         """Return the actual text contained within an HTML document.
 
         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
-        try:
-            soup = bs4.BeautifulSoup(self.text, "lxml")
-        except ValueError:
-            soup = bs4.BeautifulSoup(self.text)
-
+        url = urlparse.urlparse(self.url) if self.url else None
+        soup = self._get_soup(self.text)
         if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <iframe> magic):
             return ""
 
         self._fail_if_mirror(soup)
-        soup = soup.body
+        body = soup.body
 
-        if self.url:
-            url = urlparse.urlparse(self.url)
-            if url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
-                playback = soup.find(id="playback")
-                if playback and "src" in playback.attrs:
-                    raise ParserRedirectError(playback.attrs["src"])
+        if url and url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
+            playback = body.find(id="playback")
+            if playback and "src" in playback.attrs:
+                raise ParserRedirectError(playback.attrs["src"])
 
-        is_comment = lambda text: isinstance(text, bs4.element.Comment)
-        for comment in soup.find_all(text=is_comment):
-            comment.extract()
-        for tag in self.hidden_tags:
-            for element in soup.find_all(tag):
-                element.extract()
+        content = self._clean_soup(body)
 
-        return "\n".join(soup.stripped_strings)
+        if url and url.netloc.endswith(".blogspot.com") and not content:
+            content = self._load_from_blogspot(url)
+
+        return content
 
 
 class _PDFParser(_BaseTextParser):
diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py
index c48085d..365abdc 100644
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -20,7 +20,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import collections
 from collections import deque
+import functools
 from gzip import GzipFile
 from httplib import HTTPException
 from logging import getLogger
@@ -30,7 +32,7 @@ from socket import error as socket_error
 from StringIO import StringIO
 from struct import error as struct_error
 from threading import Lock, Thread
-from time import time
+import time
 from urllib2 import build_opener, URLError
 
 from earwigbot import importer
@@ -44,11 +46,14 @@ tldextract = importer.new("tldextract")
 __all__ = ["globalize", "localize", "CopyvioWorkspace"]
 
 _MAX_REDIRECTS = 3
+_MAX_RAW_SIZE = 20 * 1024 ** 2
 
 _is_globalized = False
 _global_queues = None
 _global_workers = []
 
+_OpenedURL = collections.namedtuple('_OpenedURL', ['content', 'parser_class'])
+
 def globalize(num_workers=8):
     """Cause all copyvio checks to be done by one global set of workers.
 
@@ -113,23 +118,15 @@ class _CopyvioWorker(object):
         self._opener = build_opener()
         self._logger = getLogger("earwigbot.wiki.cvworker." + name)
 
-    def _open_url(self, source, redirects=0):
-        """Open a URL and return its parsed content, or None.
-
-        First, we will decompress the content if the headers contain "gzip" as
-        its content encoding. Then, we will return the content stripped using
-        an HTML parser if the headers indicate it is HTML, or return the
-        content directly if it is plain text. If we don't understand the
-        content type, we'll return None.
+    def _open_url_raw(self, url, timeout=5, allow_content_types=None):
+        """Open a URL, without parsing it.
 
-        If a URLError was raised while opening the URL or an IOError was raised
-        while decompressing, None will be returned.
+        None will be returned for URLs that cannot be read for whatever reason.
         """
-        if source.headers:
-            self._opener.addheaders = source.headers
-        url = source.url.encode("utf8")
+        if isinstance(url, unicode):  # only unicode needs encoding to bytes
+            url = url.encode("utf8")
         try:
-            response = self._opener.open(url, timeout=source.timeout)
+            response = self._opener.open(url, timeout=timeout)
         except (URLError, HTTPException, socket_error, ValueError):
             return None
 
@@ -146,9 +143,13 @@ class _CopyvioWorker(object):
             return None
 
         try:
-            content = response.read()
+            # Additional safety check for pages using Transfer-Encoding: chunked
+            # where we can't read the Content-Length
+            content = response.read(_MAX_RAW_SIZE + 1)
         except (URLError, socket_error):
             return None
+        if len(content) > _MAX_RAW_SIZE:
+            return None
 
         if response.headers.get("Content-Encoding") == "gzip":
             stream = StringIO(content)
@@ -158,7 +159,32 @@ class _CopyvioWorker(object):
             except (IOError, struct_error):
                 return None
 
-        parser = parser_class(content, url=url, args=source.parser_args)
+        if len(content) > _MAX_RAW_SIZE:
+            return None
+        return _OpenedURL(content, parser_class)
+
+    def _open_url(self, source, redirects=0):
+        """Open a URL and return its parsed content, or None.
+
+        First, we will decompress the content if the headers contain "gzip" as
+        its content encoding. Then, we will return the content stripped using
+        an HTML parser if the headers indicate it is HTML, or return the
+        content directly if it is plain text. If we don't understand the
+        content type, we'll return None.
+
+        If a URLError was raised while opening the URL or an IOError was raised
+        while decompressing, None will be returned.
+        """
+        if source.headers:
+            self._opener.addheaders = source.headers
+
+        result = self._open_url_raw(source.url, timeout=source.timeout)
+        if result is None:
+            return None
+
+        args = dict(source.parser_args or {})  # tolerate parser_args=None
+        args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout)
+        parser = result.parser_class(result.content, url=source.url, args=args)
         try:
             return parser.parse()
         except ParserRedirectError as exc:
@@ -170,7 +196,7 @@ class _CopyvioWorker(object):
     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
         if self._until:
-            timeout = self._until - time()
+            timeout = self._until - time.time()
             if timeout <= 0:
                 raise Empty
         else:
@@ -269,7 +295,7 @@ class CopyvioWorkspace(object):
         self._article = article
         self._logger = logger.getChild("copyvios")
         self._min_confidence = min_confidence
-        self._start_time = time()
+        self._start_time = time.time()
         self._until = (self._start_time + max_time) if max_time > 0 else None
         self._handled_urls = set()
         self._finish_lock = Lock()
@@ -407,5 +433,5 @@ class CopyvioWorkspace(object):
 
         self.sources.sort(cmpfunc)
         return CopyvioCheckResult(self.finished, self.sources, num_queries,
-                                  time() - self._start_time, self._article,
+                                  time.time() - self._start_time, self._article,
                                   self.possible_miss)