From cb870041079843c521a65e74784d41e224ffadd9 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 7 Jul 2012 03:37:15 -0400
Subject: [PATCH] Primitive screen scraper for HTML using BeautifulSoup and LXML.

Obviously this can and should be improved significantly later, but it seems
good enough for now.
---
 earwigbot/wiki/copyvios/parsers.py | 37 ++++++++++++++++++++++++++++++++++++-
 setup.py                           |  2 ++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 565acff..8b9655b 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -20,6 +20,13 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import htmlentitydefs
+
+try:
+    import bs4
+except ImportError:
+    bs4 = None
+
 try:
     import mwparserfromhell
 except ImportError:
@@ -76,6 +83,34 @@ class ArticleTextParser(BaseTextParser):
 
 class HTMLTextParser(BaseTextParser):
     """A parser that can extract the text from an HTML document."""
+    hidden_tags = [
+        "script", "style"
+    ]
 
     def strip(self):
-        return self.text  # TODO: NotImplemented
+        """Return the actual text contained within an HTML document.
+
+        Implemented using :py:mod:`BeautifulSoup <bs4>`
+        (http://www.crummy.com/software/BeautifulSoup/).
+        """
+        try:
+            document = bs4.BeautifulSoup(self.text, "lxml")
+        except ValueError:
+            # The lxml tree builder is unavailable; use bs4's default parser.
+            document = bs4.BeautifulSoup(self.text)
+
+        # Prefer <body>, but fall back to the whole tree when it is absent
+        # (otherwise soup would be None and find_all() would fail).
+        soup = document.body
+        if soup is None:
+            soup = document
+
+        # Comments and the contents of hidden tags are not visible text.
+        is_comment = lambda text: isinstance(text, bs4.element.Comment)
+        for comment in soup.find_all(text=is_comment):
+            comment.extract()
+        for tag in self.hidden_tags:
+            for element in soup.find_all(tag):
+                element.extract()
+
+        return "\n".join(soup.stripped_strings)
diff --git a/setup.py b/setup.py
index 9db6676..3c3c7cd 100644
--- a/setup.py
+++ b/setup.py
@@ -34,6 +34,8 @@ setup(
     entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
     install_requires = ["GitPython >= 0.3.2.RC1",  # Interfacing with git
                         "PyYAML >= 3.10",  # Config parsing
+                        "beautifulsoup4 >= 4.1.1",  # HTML parsing/scraping
+                        "lxml >= 2.3.4",  # Faster parser for BeautifulSoup
                         "mwparserfromhell >= 0.1",  # Wikicode parsing
                         "oursql >= 0.9.3",  # Talking with MediaWiki databases
                         "oauth2 >= 1.5.211",  # Talking with Yahoo BOSS Search