diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 750e917..59b5958 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -29,7 +29,8 @@ from earwigbot import importer bs4 = importer.new("bs4") nltk = importer.new("nltk") -__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"] +__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser", + "PlainTextParser"] class BaseTextParser(object): """Base class for a parser that handles text.""" @@ -157,7 +158,7 @@ class HTMLTextParser(BaseTextParser): "script", "style" ] - def strip(self): + def parse(self): """Return the actual text contained within an HTML document. Implemented using :py:mod:`BeautifulSoup ` @@ -180,3 +181,11 @@ class HTMLTextParser(BaseTextParser): element.extract() return "\n".join(soup.stripped_strings) + + +class PlainTextParser(BaseTextParser): + """A parser that can unicode-ify and strip text from a plain text page.""" + + def parse(self): + """Unicode-ify and strip whitespace from the plain text document.""" + return bs4.UnicodeDammit(self.text).unicode_markup.strip() diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py index 4faed75..ffd5510 100644 --- a/earwigbot/wiki/copyvios/workers.py +++ b/earwigbot/wiki/copyvios/workers.py @@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError from earwigbot import importer from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection -from earwigbot.wiki.copyvios.parsers import HTMLTextParser +from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource tldextract = importer.new("tldextract") @@ -139,9 +139,9 @@ class _CopyvioWorker(object): ctype_full = response.headers.get("Content-Type", "text/plain") ctype = ctype_full.split(";", 1)[0] if ctype in ["text/html", "application/xhtml+xml"]: - handler = lambda res: HTMLTextParser(res).strip() + handler = HTMLTextParser elif ctype == "text/plain": - handler = lambda res: res.strip() + handler = PlainTextParser else: return None @@ -158,7 +158,7 @@ class _CopyvioWorker(object): except (IOError, struct_error): return None - return handler(content) + return handler(content).parse() def _acquire_new_site(self): """Block for a new unassigned site queue.""" diff --git a/setup.py b/setup.py index 5efa620..23be139 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ extra_deps = { ], "copyvios": [ "beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML + "cchardet >= 0.3.5", # Encoding detection for BeautifulSoup "lxml >= 2.3.5", # Faster parser for BeautifulSoup "nltk >= 2.0.2", # Parsing sentences to split article content "oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search