@@ -29,7 +29,8 @@ from earwigbot import importer
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
 
-__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]
+__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser",
+           "PlainTextParser"]
 
 class BaseTextParser(object):
     """Base class for a parser that handles text."""
@@ -157,7 +158,7 @@ class HTMLTextParser(BaseTextParser):
         "script", "style"
     ]
 
-    def strip(self):
+    def parse(self):
         """Return the actual text contained within an HTML document.
 
         Implemented using :py:mod:`BeautifulSoup <bs4>`
@@ -180,3 +181,11 @@ class HTMLTextParser(BaseTextParser):
             element.extract()
 
         return "\n".join(soup.stripped_strings)
+
+
+class PlainTextParser(BaseTextParser):
+    """A parser that can unicode-ify and strip text from a plain text page."""
+
+    def parse(self):
+        """Unicode-ify and strip whitespace from the plain text document."""
+        return bs4.UnicodeDammit(self.text).unicode_markup.strip()
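With this hunk, HTML and plain-text sources share one interface: construct the parser with the raw document text and call parse(). A minimal usage sketch, assuming bs4 (and ideally lxml) is importable; the input strings are made up:

    from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser

    # HTMLTextParser drops hidden tags (script, style, ...) and joins the
    # remaining visible strings with newlines.
    html = "<html><body><p>Some <b>body</b> text.</p><script>skip()</script></body></html>"
    print(HTMLTextParser(html).parse())    # roughly "Some\nbody\ntext."

    # PlainTextParser just unicode-ifies the raw text and strips whitespace.
    plain = "   A plain text page with stray whitespace.   \n"
    print(PlainTextParser(plain).parse())  # "A plain text page with stray whitespace."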
@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError
 
 from earwigbot import importer
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
-from earwigbot.wiki.copyvios.parsers import HTMLTextParser
+from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource
 
 tldextract = importer.new("tldextract")
@@ -139,9 +139,9 @@ class _CopyvioWorker(object):
         ctype_full = response.headers.get("Content-Type", "text/plain")
         ctype = ctype_full.split(";", 1)[0]
         if ctype in ["text/html", "application/xhtml+xml"]:
-            handler = lambda res: HTMLTextParser(res).strip()
+            handler = HTMLTextParser
         elif ctype == "text/plain":
-            handler = lambda res: res.strip()
+            handler = PlainTextParser
         else:
             return None
 
@@ -158,7 +158,7 @@ class _CopyvioWorker(object):
         except (IOError, struct_error):
             return None
 
-        return handler(content)
+        return handler(content).parse()
 
     def _acquire_new_site(self):
        """Block for a new unassigned site queue."""
@@ -40,6 +40,7 @@ extra_deps = {
     ],
     "copyvios": [
         "beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML
+        "cchardet >= 0.3.5", # Encoding detection for BeautifulSoup
         "lxml >= 2.3.5", # Faster parser for BeautifulSoup
         "nltk >= 2.0.2", # Parsing sentences to split article content
         "oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search