From 30f72df470f4a834179eecf87a03b70a8c00ab55 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 19 Sep 2014 21:20:57 -0500
Subject: [PATCH] Refactor parsers; fix empty document behavior.

---
 earwigbot/wiki/copyvios/parsers.py | 34 ++++++++++++++++++++++++++++------
 earwigbot/wiki/copyvios/result.py  |  7 +++++--
 earwigbot/wiki/copyvios/workers.py | 26 ++++++++++++++------------
 setup.py                           |  1 +
 4 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 59b5958..594caeb 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -21,6 +21,7 @@
 # SOFTWARE.
 
 from os import path
+from StringIO import StringIO
 
 import mwparserfromhell
 
@@ -28,11 +29,11 @@ from earwigbot import importer
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
+PyPDF2 = importer.new("PyPDF2")
 
-__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser",
-           "PlainTextParser"]
+__all__ = ["ArticleTextParser", "get_parser"]
 
-class BaseTextParser(object):
+class _BaseTextParser(object):
     """Base class for a parser that handles text."""
 
     def __init__(self, text):
@@ -48,7 +49,7 @@ class BaseTextParser(object):
         return "<{0} of text with size {1}>".format(name, len(self.text))
 
 
-class ArticleTextParser(BaseTextParser):
+class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
 
     def strip(self):
@@ -152,7 +153,7 @@ class ArticleTextParser(BaseTextParser):
                 if link.url.startswith(schemes)]
 
 
-class HTMLTextParser(BaseTextParser):
+class _HTMLParser(_BaseTextParser):
     """A parser that can extract the text from an HTML document."""
     hidden_tags = [
         "script", "style"
@@ -183,9 +184,30 @@ class HTMLTextParser(BaseTextParser):
         return "\n".join(soup.stripped_strings)
 
 
-class PlainTextParser(BaseTextParser):
+class _PDFParser(_BaseTextParser):
+    """A parser that can extract text from a PDF file."""
+
+    def parse(self):
+        """Return extracted text from the PDF."""
+        raise NotImplementedError()
+
+
+class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
 
     def parse(self):
         """Unicode-ify and strip whitespace from the plain text document."""
         return bs4.UnicodeDammit(self.text).unicode_markup.strip()
+
+
+_CONTENT_TYPES = {
+    "text/html": _HTMLParser,
+    "application/xhtml+xml": _HTMLParser,
+    "application/pdf": _PDFParser,
+    "application/x-pdf": _PDFParser,
+    "text/plain": _PlainTextParser
+}
+
+def get_parser(content_type):
+    """Return the parser most able to handle a given content type, or None."""
+    return _CONTENT_TYPES.get(content_type.split(";", 1)[0])
diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py
index d664965..bbfc566 100644
--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -72,10 +72,13 @@ class CopyvioSource(object):
         self._event2.clear()
         self._event1.set()
 
-    def finish_work(self, confidence, source_chain, delta_chain):
-        """Complete the confidence information inside this source."""
+    def update(self, confidence, source_chain, delta_chain):
+        """Fill out the confidence and chain information inside this source."""
         self.confidence = confidence
         self.chains = (source_chain, delta_chain)
+
+    def finish_work(self):
+        """Mark this source as finished."""
         self._event2.set()
 
     def skip(self):
diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py
index ffd5510..e4ea165 100644
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError
 
 from earwigbot import importer
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
-from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser
+from earwigbot.wiki.copyvios.parsers import get_parser
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource
 
 tldextract = importer.new("tldextract")
@@ -136,13 +136,9 @@ class _CopyvioWorker(object):
         if size > 1024 ** 2:  # Ignore URLs larger than a megabyte
             return None
 
-        ctype_full = response.headers.get("Content-Type", "text/plain")
-        ctype = ctype_full.split(";", 1)[0]
-        if ctype in ["text/html", "application/xhtml+xml"]:
-            handler = HTMLTextParser
-        elif ctype == "text/plain":
-            handler = PlainTextParser
-        else:
+        content_type = response.headers.get("Content-Type", "text/plain")
+        handler = get_parser(content_type)
+        if not handler:
             return None
 
         try:
@@ -222,7 +218,8 @@ class _CopyvioWorker(object):
                 self._logger.debug("Exiting: got stop signal")
                 return
             text = self._open_url(source)
-            source.workspace.compare(source, MarkovChain(text or ""))
+            chain = MarkovChain(text) if text else None
+            source.workspace.compare(source, chain)
 
     def start(self):
         """Start the copyvio worker in a new thread."""
@@ -339,11 +336,16 @@ class CopyvioWorkspace(object):
     def compare(self, source, source_chain):
         """Compare a source to the article; call _finish_early if necessary."""
-        delta = MarkovChainIntersection(self._article, source_chain)
-        conf = self._calculate_confidence(delta)
+        if source_chain:
+            delta = MarkovChainIntersection(self._article, source_chain)
+            conf = self._calculate_confidence(delta)
+        else:
+            conf = 0.0
         self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
 
         with self._finish_lock:
-            source.finish_work(conf, source_chain, delta)
+            if source_chain:
+                source.update(conf, source_chain, delta)
+            source.finish_work()
             if not self.finished and conf >= self._min_confidence:
                 if self._short_circuit:
                     self._finish_early()
diff --git a/setup.py b/setup.py
index 23be139..e881651 100644
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,7 @@ extra_deps = {
         "lxml >= 2.3.5",  # Faster parser for BeautifulSoup
         "nltk >= 2.0.2",  # Parsing sentences to split article content
         "oauth2 >= 1.5.211",  # Interfacing with Yahoo! BOSS Search
+        "PyPDF2 >= 1.23",  # Extracting text from PDF files
         "tldextract >= 1.4",  # Getting domains for the multithreaded workers
     ],
     "time": [
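
Note (not part of the patch): the new get_parser() entry point centralizes the Content-Type dispatch that _CopyvioWorker._open_url() previously did inline. A minimal sketch of how the dispatch behaves; the sample content types and printed output are illustrative only:

    # Illustrative only: get_parser() drops any ";charset=..." parameter and
    # returns a parser class, or None for types the checker cannot handle
    # (workers.py skips the URL when it gets None back).
    from earwigbot.wiki.copyvios.parsers import get_parser

    for ctype in ["text/html; charset=utf-8", "application/pdf", "image/png"]:
        handler = get_parser(ctype)
        print ctype, "->", handler  # _HTMLParser, _PDFParser, None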
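
Note (not part of the patch): PyPDF2 is wired in here (importer hook plus the setup.py requirement), but _PDFParser.parse() is left raising NotImplementedError. One possible way to fill in the stub, assuming the PyPDF2 1.x API (PdfFileReader, getNumPages, getPage, extractText); extract_pdf_text is a hypothetical helper, not a name from the patch:

    # Sketch under the assumptions above, written as a standalone helper.
    from StringIO import StringIO

    import PyPDF2

    def extract_pdf_text(data):
        """Return text extracted from raw PDF bytes, or an empty string."""
        try:
            reader = PyPDF2.PdfFileReader(StringIO(data))
            pages = [reader.getPage(i).extractText()
                     for i in xrange(reader.getNumPages())]
        except Exception:  # malformed or encrypted PDFs raise many error types
            return u""
        return u"\n".join(pages)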