Fix parsing of plain text documents (earwig/copyvios#3)

10 years ago · 5349179088
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -29,7 +29,8 @@ from earwigbot import importer
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")

 __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]
 __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser",
           "PlainTextParser"]

 class BaseTextParser(object):
    """Base class for a parser that handles text."""
@@ -157,7 +158,7 @@ class HTMLTextParser(BaseTextParser):
        "script", "style"
    ]

    def strip(self):
    def parse(self):
        """Return the actual text contained within an HTML document.

        Implemented using :py:mod:`BeautifulSoup <bs4>`
@@ -180,3 +181,11 @@ class HTMLTextParser(BaseTextParser):
                element.extract()

        return "\n".join(soup.stripped_strings)


 class PlainTextParser(BaseTextParser):
    """Parser for plain text pages: converts to unicode and trims the text."""

    def parse(self):
        """Return the document as unicode, without surrounding whitespace.

        The raw page content held in ``self.text`` is passed through
        :py:class:`bs4.UnicodeDammit`; its ``unicode_markup`` result is then
        stripped of leading and trailing whitespace.
        """
        # Let UnicodeDammit work out the encoding and do the conversion.
        converted = bs4.UnicodeDammit(self.text)
        document = converted.unicode_markup
        return document.strip()
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError

 from earwigbot import importer
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import HTMLTextParser
 from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

 tldextract = importer.new("tldextract")
@@ -139,9 +139,9 @@ class _CopyvioWorker(object):
        ctype_full = response.headers.get("Content-Type", "text/plain")
        ctype = ctype_full.split(";", 1)[0]
        if ctype in ["text/html", "application/xhtml+xml"]:
            handler = lambda res: HTMLTextParser(res).strip()
            handler = HTMLTextParser
        elif ctype == "text/plain":
            handler = lambda res: res.strip()
            handler = PlainTextParser
        else:
            return None

@@ -158,7 +158,7 @@ class _CopyvioWorker(object):
            except (IOError, struct_error):
                return None

        return handler(content)
        return handler(content).parse()

    def _acquire_new_site(self):
        """Block for a new unassigned site queue."""
--- a/setup.py
+++ b/setup.py
@@ -40,6 +40,7 @@ extra_deps = {
    ],
    "copyvios": [
        "beautifulsoup4 >= 4.1.1",  # Parsing/scraping HTML
        "cchardet >= 0.3.5",  # Encoding detection for BeautifulSoup
        "lxml >= 2.3.5",  # Faster parser for BeautifulSoup
        "nltk >= 2.0.2",  # Parsing sentences to split article content
        "oauth2 >= 1.5.211",  # Interfacing with Yahoo! BOSS Search