Primitive screen scraper for HTML using BeautifulSoup and LXML.

Obviously this can and should be improved significantly later, but it seems good enough for now.
12 years ago · cb87004107
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -20,6 +20,13 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 import htmlentitydefs

 try:
    from bs4 import BeautifulSoup
 except ImportError:
    BeautifulSoup = None

 try:
    import mwparserfromhell
 except ImportError:
@@ -76,6 +83,24 @@ class ArticleTextParser(BaseTextParser):

 class HTMLTextParser(BaseTextParser):
    """A parser that can extract the text from an HTML document."""
    hidden_tags = [
        "script", "style"
    ]

    def strip(self):
        """Return the actual text contained within an HTML document.

        HTML scraping is not enabled yet, so this currently returns the raw
        text unchanged. The BeautifulSoup-based implementation lives in
        :py:meth:`_strip_html`.
        """
        # TODO: NotImplemented -- switch to self._strip_html() once it is ready.
        return self.text

    def _strip_html(self):
        """Return only the visible text of the HTML document in ``self.text``.

        Implemented using :py:mod:`BeautifulSoup <bs4>`
        (http://www.crummy.com/software/BeautifulSoup/).

        Comments and every tag named in :py:attr:`hidden_tags` are removed;
        the remaining strings are whitespace-stripped and joined with
        newlines.
        """
        # Imported locally: the module only imports the BeautifulSoup name, so
        # a bare ``bs4.element.Comment`` reference would raise NameError.
        from bs4.element import Comment

        try:
            document = BeautifulSoup(self.text, "lxml")
        except ValueError:
            # Requested parser is unavailable; let bs4 choose one itself.
            document = BeautifulSoup(self.text)

        # Some parsers do not synthesise a <body> (and empty input has none);
        # fall back to the whole document rather than failing on None.
        root = document.body
        if root is None:
            root = document

        is_comment = lambda text: isinstance(text, Comment)
        for comment in root.find_all(text=is_comment):
            comment.extract()
        # find_all() accepts a list of tag names, so one pass covers them all.
        for element in root.find_all(self.hidden_tags):
            element.extract()

        return "\n".join(root.stripped_strings)
--- a/setup.py
+++ b/setup.py
@@ -34,6 +34,8 @@ setup(
    entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
    install_requires = ["GitPython >= 0.3.2.RC1",  # Interfacing with git
                        "PyYAML >= 3.10",  # Config parsing
                        "beautifulsoup4 >= 4.1.1",  # HTML parsing/scraping
                        "lxml >= 2.3.4",  # Faster parser for BeautifulSoup
                        "mwparserfromhell >= 0.1",  # Wikicode parsing
                        "oursql >= 0.9.3",  # Talking with MediaWiki databases
                        "oauth2 >= 1.5.211",  # Talking with Yahoo BOSS Search