@@ -21,6 +21,7 @@ | |||
# SOFTWARE. | |||
from os import path | |||
from StringIO import StringIO | |||
import mwparserfromhell | |||
@@ -28,11 +29,11 @@ from earwigbot import importer | |||
bs4 = importer.new("bs4") | |||
nltk = importer.new("nltk") | |||
PyPDF2 = importer.new("PyPDF2") | |||
__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser", | |||
"PlainTextParser"] | |||
__all__ = ["ArticleTextParser", "get_parser"] | |||
class BaseTextParser(object): | |||
class _BaseTextParser(object): | |||
"""Base class for a parser that handles text.""" | |||
def __init__(self, text): | |||
@@ -48,7 +49,7 @@ class BaseTextParser(object): | |||
return "<{0} of text with size {1}>".format(name, len(self.text)) | |||
class ArticleTextParser(BaseTextParser): | |||
class ArticleTextParser(_BaseTextParser): | |||
"""A parser that can strip and chunk wikicode article text.""" | |||
def strip(self): | |||
@@ -152,7 +153,7 @@ class ArticleTextParser(BaseTextParser): | |||
if link.url.startswith(schemes)] | |||
class HTMLTextParser(BaseTextParser): | |||
class _HTMLParser(_BaseTextParser): | |||
"""A parser that can extract the text from an HTML document.""" | |||
hidden_tags = [ | |||
"script", "style" | |||
@@ -183,9 +184,30 @@ class HTMLTextParser(BaseTextParser): | |||
return "\n".join(soup.stripped_strings) | |||
class PlainTextParser(BaseTextParser): | |||
class _PDFParser(_BaseTextParser):
    """Parser intended to pull the text content out of a PDF document."""

    def parse(self):
        """Return the text extracted from the PDF.

        PDF extraction is not implemented: calling this always raises
        NotImplementedError.
        """
        raise NotImplementedError()
class _PlainTextParser(_BaseTextParser):
    """Parser that converts a plain text page to unicode and strips it."""

    def parse(self):
        """Return the document as unicode with surrounding whitespace removed.

        The raw text is handed to bs4's UnicodeDammit, and the resulting
        unicode markup is stripped before being returned.
        """
        converted = bs4.UnicodeDammit(self.text)
        markup = converted.unicode_markup
        return markup.strip()
# Lookup table from a media type (the part of a Content-Type value before any
# ";" parameters) to the parser class registered for it.
_CONTENT_TYPES = {
    # HTML and XHTML documents
    "text/html": _HTMLParser,
    "application/xhtml+xml": _HTMLParser,
    # PDF documents (standard and legacy "x-" media types)
    "application/pdf": _PDFParser,
    "application/x-pdf": _PDFParser,
    # Plain text
    "text/plain": _PlainTextParser,
}
def get_parser(content_type):
    """Return the parser most able to handle a given content type, or None.

    *content_type* is a raw ``Content-Type`` value such as
    ``"text/html; charset=utf-8"``. Everything from the first ``";"`` onward
    (the parameters) is discarded. The remaining media type is stripped of
    surrounding whitespace and lowercased before the lookup, because media
    types are case-insensitive and may be separated from their parameters by
    optional whitespace.

    Returns the parser class (not an instance) registered in
    ``_CONTENT_TYPES``, or None when the value is empty/None or no parser is
    registered for the media type.
    """
    if not content_type:
        # A missing or empty header cannot match any registered parser.
        return None
    media_type = content_type.split(";", 1)[0].strip().lower()
    return _CONTENT_TYPES.get(media_type)
@@ -72,10 +72,13 @@ class CopyvioSource(object): | |||
self._event2.clear() | |||
self._event1.set() | |||
def finish_work(self, confidence, source_chain, delta_chain): | |||
"""Complete the confidence information inside this source.""" | |||
def update(self, confidence, source_chain, delta_chain):
    """Record the confidence value and the chain pair on this source.

    Stores *confidence* as ``self.confidence`` and the two chains as the
    tuple ``self.chains = (source_chain, delta_chain)``.
    """
    chain_pair = (source_chain, delta_chain)
    self.confidence = confidence
    self.chains = chain_pair
def finish_work(self):
    """Mark this source as finished by setting its ``_event2`` event."""
    finished_event = self._event2
    finished_event.set()
def skip(self): | |||
@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError | |||
from earwigbot import importer | |||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||
from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser | |||
from earwigbot.wiki.copyvios.parsers import get_parser | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource | |||
tldextract = importer.new("tldextract") | |||
@@ -136,13 +136,9 @@ class _CopyvioWorker(object): | |||
if size > 1024 ** 2: # Ignore URLs larger than a megabyte | |||
return None | |||
ctype_full = response.headers.get("Content-Type", "text/plain") | |||
ctype = ctype_full.split(";", 1)[0] | |||
if ctype in ["text/html", "application/xhtml+xml"]: | |||
handler = HTMLTextParser | |||
elif ctype == "text/plain": | |||
handler = PlainTextParser | |||
else: | |||
content_type = response.headers.get("Content-Type", "text/plain") | |||
handler = get_parser(content_type) | |||
if not handler: | |||
return None | |||
try: | |||
@@ -222,7 +218,8 @@ class _CopyvioWorker(object): | |||
self._logger.debug("Exiting: got stop signal") | |||
return | |||
text = self._open_url(source) | |||
source.workspace.compare(source, MarkovChain(text or "")) | |||
chain = MarkovChain(text) if text else None | |||
source.workspace.compare(source, chain) | |||
def start(self): | |||
"""Start the copyvio worker in a new thread.""" | |||
@@ -339,11 +336,16 @@ class CopyvioWorkspace(object): | |||
def compare(self, source, source_chain): | |||
"""Compare a source to the article; call _finish_early if necessary.""" | |||
delta = MarkovChainIntersection(self._article, source_chain) | |||
conf = self._calculate_confidence(delta) | |||
if source_chain: | |||
delta = MarkovChainIntersection(self._article, source_chain) | |||
conf = self._calculate_confidence(delta) | |||
else: | |||
conf = 0.0 | |||
self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf)) | |||
with self._finish_lock: | |||
source.finish_work(conf, source_chain, delta) | |||
if source_chain: | |||
source.update(conf, source_chain, delta) | |||
source.finish_work() | |||
if not self.finished and conf >= self._min_confidence: | |||
if self._short_circuit: | |||
self._finish_early() | |||
@@ -44,6 +44,7 @@ extra_deps = { | |||
"lxml >= 2.3.5", # Faster parser for BeautifulSoup | |||
"nltk >= 2.0.2", # Parsing sentences to split article content | |||
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search | |||
"PyPDF2 >= 1.23", # Extracting text from PDF files | |||
"tldextract >= 1.4", # Getting domains for the multithreaded workers | |||
], | |||
"time": [ | |||