diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 1b2dfae..2b76e09 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -20,9 +20,11 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import json
 from os import path
 import re
 from StringIO import StringIO
+import urllib
 import urlparse
 
 import mwparserfromhell
@@ -246,40 +248,87 @@ class _HTMLParser(_BaseTextParser):
         if soup.find_all(href=func) or soup.find_all(src=func):
             raise ParserExclusionError()
 
+    @staticmethod
+    def _get_soup(text):
+        """Parse some text using BeautifulSoup."""
+        try:
+            return bs4.BeautifulSoup(text, "lxml")
+        except ValueError:
+            return bs4.BeautifulSoup(text)
+
+    def _clean_soup(self, soup):
+        """Clean a BeautifulSoup tree of invisible tags."""
+        is_comment = lambda text: isinstance(text, bs4.element.Comment)
+        for comment in soup.find_all(text=is_comment):
+            comment.extract()
+        for tag in self.hidden_tags:
+            for element in soup.find_all(tag):
+                element.extract()
+
+        return "\n".join(soup.stripped_strings)
+
+    def _open(self, url):
+        """Try to read a URL. Return None if it couldn't be read."""
+        opener = self._args.get("open_url")
+        if not opener:
+            return None
+        result = opener(url)
+        return result.content if result else None
+
+    def _load_from_blogspot(self, url):
+        """Load dynamic content from Blogger Dynamic Views."""
+        match = re.search(r"'postId': '(\d+)'", self.text)
+        if not match:
+            return ""
+        # FIX: match.groups(1) returns a tuple of all groups (with default 1);
+        # we want the first captured group's string.
+        post_id = match.group(1)
+        # FIX: end the URL with "?" so the urlencode()d query string below is
+        # separated from the path instead of being fused onto the post ID.
+        url = "https://%s/feeds/posts/default/%s?" % (url.netloc, post_id)
+        params = {
+            "alt": "json",
+            "v": "2",
+            "dynamicviews": "1",
+            "rewriteforssl": "true",
+        }
+        raw = self._open(url + urllib.urlencode(params))
+        if raw is None:
+            return ""
+        try:
+            parsed = json.loads(raw)
+        except ValueError:
+            return ""
+        try:
+            text = parsed["entry"]["content"]["$t"]
+        except KeyError:
+            return ""
+        soup = self._get_soup(text)
+        return self._clean_soup(soup.body)
+
     def parse(self):
         """Return the actual text contained within an HTML document.
 
         Implemented using :py:mod:`BeautifulSoup <bs4>`
         (http://www.crummy.com/software/BeautifulSoup/).
         """
-        try:
-            soup = bs4.BeautifulSoup(self.text, "lxml")
-        except ValueError:
-            soup = bs4.BeautifulSoup(self.text)
-
+        url = urlparse.urlparse(self.url) if self.url else None
+        soup = self._get_soup(self.text)
         if not soup.body:
             # No <body> tag present in HTML ->
             # no scrapable content (possibly JS or