From cb870041079843c521a65e74784d41e224ffadd9 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@verizon.net>
Date: Sat, 7 Jul 2012 03:37:15 -0400
Subject: [PATCH] Primitive screen scraper for HTML using BeautifulSoup and
 LXML.

Obviously this can and should be improved significantly later, but it seems
good enough for now.
---
 earwigbot/wiki/copyvios/parsers.py | 27 ++++++++++++++++++++++++++-
 setup.py                           |  2 ++
 2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 565acff..8b9655b 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -20,6 +20,13 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import htmlentitydefs
+
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    BeautifulSoup = None
+
 try:
     import mwparserfromhell
 except ImportError:
@@ -76,6 +83,24 @@ class ArticleTextParser(BaseTextParser):
 
 class HTMLTextParser(BaseTextParser):
     """A parser that can extract the text from an HTML document."""
+    # Tags whose elements strip() removes outright before collecting text.
+    hidden_tags = ["script", "style"]
 
     def strip(self):
-        return self.text                                                                            # TODO: NotImplemented
+        """Return the visible text contained within an HTML document.
+
+        Comments and :py:attr:`hidden_tags` elements are dropped; returns ""
+        if the document has no body. Uses :py:mod:`BeautifulSoup <bs4>`
+        (http://www.crummy.com/software/BeautifulSoup/).
+        """
+        from bs4.element import Comment  # `bs4` itself is never imported
+        try:
+            soup = BeautifulSoup(self.text, "lxml").body
+        except ValueError:  # lxml tree builder missing; use bs4's default
+            soup = BeautifulSoup(self.text).body
+        if soup is None:  # no <body> was produced, e.g. empty input
+            return ""
+        hidden = soup.find_all(text=lambda text: isinstance(text, Comment))
+        for node in hidden + soup.find_all(self.hidden_tags):
+            node.extract()
+        return "\n".join(soup.stripped_strings)
diff --git a/setup.py b/setup.py
index 9db6676..3c3c7cd 100644
--- a/setup.py
+++ b/setup.py
@@ -34,6 +34,8 @@ setup(
     entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
     install_requires = ["GitPython >= 0.3.2.RC1",  # Interfacing with git
                         "PyYAML >= 3.10",  # Config parsing
+                        "beautifulsoup4 >= 4.1.1",  # HTML parsing/scraping
+                        "lxml >= 2.3.4",  # Faster parser for BeautifulSoup
                         "mwparserfromhell >= 0.1",  # Wikicode parsing
                         "oursql >= 0.9.3",  # Talking with MediaWiki databases
                         "oauth2 >= 1.5.211",  # Talking with Yahoo BOSS Search