
copyvios: Refactor some parsing logic and add dynamic Blogger support

legacy-python2
Ben Kurtovic, 3 years ago
parent commit 2324a73624
2 changed files with 114 additions and 39 deletions:
  1. earwigbot/wiki/copyvios/parsers.py (+68, -19)
  2. earwigbot/wiki/copyvios/workers.py (+46, -20)

earwigbot/wiki/copyvios/parsers.py (+68, -19)

@@ -20,9 +20,11 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import json
 from os import path
 import re
 from StringIO import StringIO
+import urllib
 import urlparse
 
 import mwparserfromhell
@@ -246,40 +248,87 @@ class _HTMLParser(_BaseTextParser):
         if soup.find_all(href=func) or soup.find_all(src=func):
             raise ParserExclusionError()
 
+    @staticmethod
+    def _get_soup(text):
+        """Parse some text using BeautifulSoup."""
+        try:
+            return bs4.BeautifulSoup(text, "lxml")
+        except ValueError:
+            return bs4.BeautifulSoup(text)
+
+    def _clean_soup(self, soup):
+        """Clean a BeautifulSoup tree of invisible tags."""
+        is_comment = lambda text: isinstance(text, bs4.element.Comment)
+        for comment in soup.find_all(text=is_comment):
+            comment.extract()
+        for tag in self.hidden_tags:
+            for element in soup.find_all(tag):
+                element.extract()
+
+        return "\n".join(soup.stripped_strings)
+
+    def _open(self, url):
+        """Try to read a URL. Return None if it couldn't be read."""
+        opener = self._args.get("open_url")
+        if not opener:
+            return None
+        result = opener(url)
+        return result.content if result else None
+
+    def _load_from_blogspot(self, url):
+        """Load dynamic content from Blogger Dynamic Views."""
+        match = re.search(r"'postId': '(\d+)'", self.text)
+        if not match:
+            return ""
+        post_id = match.group(1)
+        url = "https://%s/feeds/posts/default/%s?" % (url.netloc, post_id)
+        params = {
+            "alt": "json",
+            "v": "2",
+            "dynamicviews": "1",
+            "rewriteforssl": "true",
+        }
+        raw = self._open(url + urllib.urlencode(params))
+        if raw is None:
+            return ""
+        try:
+            parsed = json.loads(raw)
+        except ValueError:
+            return ""
+        try:
+            text = parsed["entry"]["content"]["$t"]
+        except KeyError:
+            return ""
+        soup = self._get_soup(text)
+        return self._clean_soup(soup.body)
+
     def parse(self):
         """Return the actual text contained within an HTML document.
 
         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
-        try:
-            soup = bs4.BeautifulSoup(self.text, "lxml")
-        except ValueError:
-            soup = bs4.BeautifulSoup(self.text)
-
+        url = urlparse.urlparse(self.url) if self.url else None
+        soup = self._get_soup(self.text)
         if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or <iframe> magic):
             return ""
 
         self._fail_if_mirror(soup)
-        soup = soup.body
+        body = soup.body
 
-        if self.url:
-            url = urlparse.urlparse(self.url)
-            if url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
-                playback = soup.find(id="playback")
-                if playback and "src" in playback.attrs:
-                    raise ParserRedirectError(playback.attrs["src"])
+        if url and url.netloc == "web.archive.org" and url.path.endswith(".pdf"):
+            playback = body.find(id="playback")
+            if playback and "src" in playback.attrs:
+                raise ParserRedirectError(playback.attrs["src"])
 
-        is_comment = lambda text: isinstance(text, bs4.element.Comment)
-        for comment in soup.find_all(text=is_comment):
-            comment.extract()
-        for tag in self.hidden_tags:
-            for element in soup.find_all(tag):
-                element.extract()
+        content = self._clean_soup(body)
 
-        return "\n".join(soup.stripped_strings)
+        if url and url.netloc.endswith(".blogspot.com") and not content:
+            content = self._load_from_blogspot(url)
+
+        return content
 
 
 class _PDFParser(_BaseTextParser):
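For context, Blogger "Dynamic Views" pages render their posts client-side, so the normal scrape of a .blogspot.com page comes back empty; the new _load_from_blogspot fallback instead pulls the post body from the blog's JSON post feed, keyed on the numeric post ID embedded in the page's inline JavaScript. A rough standalone sketch of that flow (Python 2 to match the branch; the fetch_blogspot_text helper and the direct urllib2.urlopen call are illustrative stand-ins, not code from this commit):

# Illustrative sketch only; mirrors the fallback added to _HTMLParser above.
import json
import re
import urllib
import urllib2

def fetch_blogspot_text(page_html, netloc, timeout=5):
    """Return the post body HTML for a Blogger Dynamic Views page, or ""."""
    # The numeric post ID sits in the page's inline JavaScript settings.
    match = re.search(r"'postId': '(\d+)'", page_html)
    if not match:
        return ""
    params = urllib.urlencode({
        "alt": "json",          # request JSON instead of the default Atom
        "v": "2",
        "dynamicviews": "1",
        "rewriteforssl": "true",
    })
    feed_url = "https://%s/feeds/posts/default/%s?%s" % (netloc, match.group(1), params)
    try:
        raw = urllib2.urlopen(feed_url, timeout=timeout).read()
        # A single-post feed carries the rendered HTML at entry -> content -> $t.
        return json.loads(raw)["entry"]["content"]["$t"]
    except (urllib2.URLError, ValueError, KeyError):
        return ""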


earwigbot/wiki/copyvios/workers.py (+46, -20)

@@ -20,7 +20,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import collections
 from collections import deque
+import functools
 from gzip import GzipFile
 from httplib import HTTPException
 from logging import getLogger
@@ -30,7 +32,7 @@ from socket import error as socket_error
 from StringIO import StringIO
 from struct import error as struct_error
 from threading import Lock, Thread
-from time import time
+import time
 from urllib2 import build_opener, URLError
 
 from earwigbot import importer
@@ -44,11 +46,14 @@ tldextract = importer.new("tldextract")
 __all__ = ["globalize", "localize", "CopyvioWorkspace"]
 
 _MAX_REDIRECTS = 3
+_MAX_RAW_SIZE = 20 * 1024 ** 2
 
 _is_globalized = False
 _global_queues = None
 _global_workers = []
 
+_OpenedURL = collections.namedtuple('_OpenedURL', ['content', 'parser_class'])
+
 def globalize(num_workers=8):
     """Cause all copyvio checks to be done by one global set of workers.
 
@@ -113,23 +118,15 @@ class _CopyvioWorker(object):
         self._opener = build_opener()
         self._logger = getLogger("earwigbot.wiki.cvworker." + name)
 
-    def _open_url(self, source, redirects=0):
-        """Open a URL and return its parsed content, or None.
-
-        First, we will decompress the content if the headers contain "gzip" as
-        its content encoding. Then, we will return the content stripped using
-        an HTML parser if the headers indicate it is HTML, or return the
-        content directly if it is plain text. If we don't understand the
-        content type, we'll return None.
+    def _open_url_raw(self, url, timeout=5, allow_content_types=None):
+        """Open a URL, without parsing it.
 
-        If a URLError was raised while opening the URL or an IOError was raised
-        while decompressing, None will be returned.
+        None will be returned for URLs that cannot be read for whatever reason.
         """
-        if source.headers:
-            self._opener.addheaders = source.headers
-        url = source.url.encode("utf8")
+        if not isinstance(url, unicode):
+            url = url.encode("utf8")
         try:
-            response = self._opener.open(url, timeout=source.timeout)
+            response = self._opener.open(url, timeout=timeout)
         except (URLError, HTTPException, socket_error, ValueError):
             return None
 
@@ -146,9 +143,13 @@ class _CopyvioWorker(object):
             return None
 
         try:
-            content = response.read()
+            # Additional safety check for pages using Transfer-Encoding: chunked
+            # where we can't read the Content-Length
+            content = response.read(_MAX_RAW_SIZE + 1)
        except (URLError, socket_error):
             return None
+        if len(content) > _MAX_RAW_SIZE:
+            return None
 
         if response.headers.get("Content-Encoding") == "gzip":
             stream = StringIO(content)
@@ -158,7 +159,32 @@
             except (IOError, struct_error):
                 return None
 
-        parser = parser_class(content, url=url, args=source.parser_args)
+        if len(content) > _MAX_RAW_SIZE:
+            return None
+        return _OpenedURL(content, parser_class)
+
+    def _open_url(self, source, redirects=0):
+        """Open a URL and return its parsed content, or None.
+
+        First, we will decompress the content if the headers contain "gzip" as
+        its content encoding. Then, we will return the content stripped using
+        an HTML parser if the headers indicate it is HTML, or return the
+        content directly if it is plain text. If we don't understand the
+        content type, we'll return None.
+
+        If a URLError was raised while opening the URL or an IOError was raised
+        while decompressing, None will be returned.
+        """
+        if source.headers:
+            self._opener.addheaders = source.headers
+
+        result = self._open_url_raw(source.url, timeout=source.timeout)
+        if result is None:
+            return None
+
+        args = source.parser_args.copy()
+        args["open_url"] = functools.partial(self._open_url_raw, timeout=source.timeout)
+        parser = result.parser_class(result.content, url=source.url, args=args)
         try:
             return parser.parse()
         except ParserRedirectError as exc:
@@ -170,7 +196,7 @@ class _CopyvioWorker(object):
     def _acquire_new_site(self):
         """Block for a new unassigned site queue."""
         if self._until:
-            timeout = self._until - time()
+            timeout = self._until - time.time()
             if timeout <= 0:
                 raise Empty
             else:
@@ -269,7 +295,7 @@ class CopyvioWorkspace(object):
         self._article = article
         self._logger = logger.getChild("copyvios")
         self._min_confidence = min_confidence
-        self._start_time = time()
+        self._start_time = time.time()
         self._until = (self._start_time + max_time) if max_time > 0 else None
         self._handled_urls = set()
         self._finish_lock = Lock()
@@ -407,5 +433,5 @@ class CopyvioWorkspace(object):
 
         self.sources.sort(cmpfunc)
         return CopyvioCheckResult(self.finished, self.sources, num_queries,
-                                  time() - self._start_time, self._article,
+                                  time.time() - self._start_time, self._article,
                                   self.possible_miss)
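On the workers.py side, the old _open_url is split in two: _open_url_raw does the network fetch (capping reads at _MAX_RAW_SIZE even when a chunked response hides the Content-Length) and returns an _OpenedURL pair, while _open_url keeps the parsing half and now hands the raw fetcher to parsers via the new "open_url" parser arg, which is what _HTMLParser._open calls for the Blogger feed request. A minimal sketch of that hand-off (the fake_open_url_raw stub stands in for the real fetcher and is not from this commit):

# Illustrative sketch only; shows how the open_url callback is threaded through.
import collections
import functools

_OpenedURL = collections.namedtuple("_OpenedURL", ["content", "parser_class"])

def fake_open_url_raw(url, timeout=5):
    # Stand-in for _CopyvioWorker._open_url_raw: fetch the URL, enforce the
    # raw size cap, and pair the bytes with the parser class picked from the
    # response's Content-Type. Canned data here keeps the sketch runnable.
    return _OpenedURL("<html><body>hello</body></html>", None)

# Worker side (_open_url): bind the per-source timeout once, then pass the
# callable to the parser through its args.
parser_args = {"open_url": functools.partial(fake_open_url_raw, timeout=3)}

# Parser side (_HTMLParser._open): use the callback if one was provided.
opener = parser_args.get("open_url")
result = opener("https://example.blogspot.com/feeds/posts/default/123") if opener else None
content = result.content if result else None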
