@@ -21,6 +21,7 @@
 # SOFTWARE.

 from os import path
+from StringIO import StringIO

 import mwparserfromhell

@@ -28,11 +29,11 @@ from earwigbot import importer
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
+PyPDF2 = importer.new("PyPDF2")

-__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser",
-           "PlainTextParser"]
+__all__ = ["ArticleTextParser", "get_parser"]


-class BaseTextParser(object):
+class _BaseTextParser(object):
     """Base class for a parser that handles text."""

     def __init__(self, text):
@@ -48,7 +49,7 @@ class BaseTextParser(object):
         return "<{0} of text with size {1}>".format(name, len(self.text))


-class ArticleTextParser(BaseTextParser):
+class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""

     def strip(self):
@@ -152,7 +153,7 @@ class ArticleTextParser(BaseTextParser):
                 if link.url.startswith(schemes)]


-class HTMLTextParser(BaseTextParser):
+class _HTMLParser(_BaseTextParser):
     """A parser that can extract the text from an HTML document."""
     hidden_tags = [
         "script", "style"
@@ -183,9 +184,30 @@ class HTMLTextParser(BaseTextParser):
         return "\n".join(soup.stripped_strings)


-class PlainTextParser(BaseTextParser):
+class _PDFParser(_BaseTextParser):
+    """A parser that can extract text from a PDF file."""
+
+    def parse(self):
+        """Return extracted text from the PDF."""
+        raise NotImplementedError()
+
+
+class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""

     def parse(self):
         """Unicode-ify and strip whitespace from the plain text document."""
         return bs4.UnicodeDammit(self.text).unicode_markup.strip()
+
+
+_CONTENT_TYPES = {
+    "text/html": _HTMLParser,
+    "application/xhtml+xml": _HTMLParser,
+    "application/pdf": _PDFParser,
+    "application/x-pdf": _PDFParser,
+    "text/plain": _PlainTextParser
+}
+
+def get_parser(content_type):
+    """Return the parser most able to handle a given content type, or None."""
+    return _CONTENT_TYPES.get(content_type.split(";", 1)[0])
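Note on the new `_PDFParser`: the content-type table routes both PDF MIME types to it and setup.py below gains the PyPDF2 dependency, but `parse()` is still a stub that raises `NotImplementedError`, so PDF sources are recognized without yet being handled. A minimal sketch of how the stub might be filled in with PyPDF2's 1.x API (`PdfFileReader`, `getNumPages`, `getPage`, `extractText`); the blanket `except` and the empty-string fallback are assumptions about how malformed or encrypted PDFs should degrade, not part of this change:

```python
    def parse(self):
        """Return extracted text from the PDF."""
        try:
            reader = PyPDF2.PdfFileReader(StringIO(self.text))
            pages = [reader.getPage(i).extractText()
                     for i in xrange(reader.getNumPages())]
        except Exception:  # Assumption: treat malformed/encrypted PDFs as empty
            return u""
        return u"\n".join(pages)
```

For the lookup itself, `get_parser` drops any parameters after the `;` in the header, so `get_parser("text/html; charset=utf-8")` returns `_HTMLParser`, while an unrecognized type such as `image/png` yields `None` and the worker skips the URL.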
@@ -72,10 +72,13 @@ class CopyvioSource(object):
         self._event2.clear()
         self._event1.set()

-    def finish_work(self, confidence, source_chain, delta_chain):
-        """Complete the confidence information inside this source."""
+    def update(self, confidence, source_chain, delta_chain):
+        """Fill out the confidence and chain information inside this source."""
         self.confidence = confidence
         self.chains = (source_chain, delta_chain)
+
+    def finish_work(self):
+        """Mark this source as finished."""
         self._event2.set()

     def skip(self):
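The old `finish_work` both recorded the comparison results and signalled completion, which forced callers to supply chains even when a source produced no usable text. Splitting it into `update` and `finish_work` decouples the two; the intended sequence, mirrored by `CopyvioWorkspace.compare` below, is roughly:

```python
if source_chain:  # The fetch produced usable text
    source.update(conf, source_chain, delta)
source.finish_work()  # Always signal completion, so nothing blocks on _event2
```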
@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError

 from earwigbot import importer
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
-from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser
+from earwigbot.wiki.copyvios.parsers import get_parser
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource

 tldextract = importer.new("tldextract")
@@ -136,13 +136,9 @@ class _CopyvioWorker(object):
         if size > 1024 ** 2:  # Ignore URLs larger than a megabyte
             return None

-        ctype_full = response.headers.get("Content-Type", "text/plain")
-        ctype = ctype_full.split(";", 1)[0]
-        if ctype in ["text/html", "application/xhtml+xml"]:
-            handler = HTMLTextParser
-        elif ctype == "text/plain":
-            handler = PlainTextParser
-        else:
+        content_type = response.headers.get("Content-Type", "text/plain")
+        handler = get_parser(content_type)
+        if not handler:
             return None

         try:
@@ -222,7 +218,8 @@ class _CopyvioWorker(object):
                     self._logger.debug("Exiting: got stop signal")
                     return
             text = self._open_url(source)
-            source.workspace.compare(source, MarkovChain(text or ""))
+            chain = MarkovChain(text) if text else None
+            source.workspace.compare(source, chain)

     def start(self):
         """Start the copyvio worker in a new thread."""
@@ -339,11 +336,16 @@ class CopyvioWorkspace(object):
     def compare(self, source, source_chain):
         """Compare a source to the article; call _finish_early if necessary."""
-        delta = MarkovChainIntersection(self._article, source_chain)
-        conf = self._calculate_confidence(delta)
+        if source_chain:
+            delta = MarkovChainIntersection(self._article, source_chain)
+            conf = self._calculate_confidence(delta)
+        else:
+            conf = 0.0
         self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
         with self._finish_lock:
-            source.finish_work(conf, source_chain, delta)
+            if source_chain:
+                source.update(conf, source_chain, delta)
+            source.finish_work()
             if not self.finished and conf >= self._min_confidence:
                 if self._short_circuit:
                     self._finish_early()
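Together with the worker change above, a URL that cannot be downloaded or parsed now reaches `compare` with `source_chain=None`: the source is finished at a confidence of 0.0 and its chains are left untouched, instead of being compared against an empty `MarkovChain` as before. A hypothetical trace of the failure path, restating the two hunks:

```python
text = self._open_url(source)            # None on download/parse failure
chain = MarkovChain(text) if text else None
source.workspace.compare(source, chain)  # conf == 0.0; update() is skipped
```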
@@ -44,6 +44,7 @@ extra_deps = {
         "lxml >= 2.3.5",  # Faster parser for BeautifulSoup
         "nltk >= 2.0.2",  # Parsing sentences to split article content
         "oauth2 >= 1.5.211",  # Interfacing with Yahoo! BOSS Search
+        "PyPDF2 >= 1.23",  # Extracting text from PDF files
         "tldextract >= 1.4",  # Getting domains for the multithreaded workers
     ],
     "time": [