Refactor parsers; fix empty document behavior.

10 years ago · 30f72df470
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -21,6 +21,7 @@
 # SOFTWARE.
 from os import path
 from StringIO import StringIO
 import mwparserfromhell
@@ -28,11 +29,11 @@ from earwigbot import importer
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
 PyPDF2 = importer.new("PyPDF2")
 __all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser",
           "PlainTextParser"]
 __all__ = ["ArticleTextParser", "get_parser"]
 class BaseTextParser(object):
 class _BaseTextParser(object):
    """Base class for a parser that handles text."""
    def __init__(self, text):
@@ -48,7 +49,7 @@ class BaseTextParser(object):
        return "<{0} of text with size {1}>".format(name, len(self.text))
 class ArticleTextParser(BaseTextParser):
 class ArticleTextParser(_BaseTextParser):
    """A parser that can strip and chunk wikicode article text."""
    def strip(self):
@@ -152,7 +153,7 @@ class ArticleTextParser(BaseTextParser):
                if link.url.startswith(schemes)]
 class HTMLTextParser(BaseTextParser):
 class _HTMLParser(_BaseTextParser):
    """A parser that can extract the text from an HTML document."""
    hidden_tags = [
        "script", "style"
@@ -183,9 +184,30 @@ class HTMLTextParser(BaseTextParser):
        return "\n".join(soup.stripped_strings)
 class PlainTextParser(BaseTextParser):
 class _PDFParser(_BaseTextParser):
    """Parser intended to pull the text out of a PDF document.

    Extraction is not implemented yet: parse() always raises
    NotImplementedError.
    """
    def parse(self):
        """Return the text extracted from the PDF (currently unimplemented)."""
        # Placeholder until real PDF text extraction is written.
        raise NotImplementedError
 class _PlainTextParser(_BaseTextParser):
    """Parser for plain text pages: converts to unicode and trims the text."""
    def parse(self):
        """Return the document as unicode with surrounding whitespace removed."""
        # UnicodeDammit works out the encoding and exposes the decoded text.
        decoded = bs4.UnicodeDammit(self.text)
        return decoded.unicode_markup.strip()
 # Lookup table from a media type (the part of a Content-Type value before
 # any ";") to the parser class used for it; consulted by get_parser().
 _CONTENT_TYPES = {
    "text/html": _HTMLParser,
    "application/xhtml+xml": _HTMLParser,
    "application/pdf": _PDFParser,
    "application/x-pdf": _PDFParser,
    "text/plain": _PlainTextParser
 }
 def get_parser(content_type):
    """Return the parser class able to handle a given content type, or None.

    *content_type* is the raw value of a Content-Type header. Everything
    after the first ";" (parameters such as the charset) is ignored. Media
    types are case-insensitive and may be surrounded by whitespace, so the
    type is stripped and lowercased before it is looked up in
    _CONTENT_TYPES. None is returned when no parser is registered for it.
    """
    media_type = content_type.split(";", 1)[0].strip().lower()
    return _CONTENT_TYPES.get(media_type)
--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -72,10 +72,13 @@ class CopyvioSource(object):
        self._event2.clear()
        self._event1.set()
    def finish_work(self, confidence, source_chain, delta_chain):
        """Complete the confidence information inside this source."""
    def update(self, confidence, source_chain, delta_chain):
        """Record the comparison results on this source.

        Stores *confidence* as ``self.confidence`` and the pair
        ``(source_chain, delta_chain)`` as ``self.chains``.
        """
        chain_pair = (source_chain, delta_chain)
        self.confidence = confidence
        self.chains = chain_pair
    def finish_work(self):
        """Flag this source as finished by setting ``self._event2``."""
        done_event = self._event2
        done_event.set()
    def skip(self):
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError
 from earwigbot import importer
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
 from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser
 from earwigbot.wiki.copyvios.parsers import get_parser
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource
 tldextract = importer.new("tldextract")
@@ -136,13 +136,9 @@ class _CopyvioWorker(object):
        if size > 1024 ** 2:  # Ignore URLs larger than a megabyte
            return None
        ctype_full = response.headers.get("Content-Type", "text/plain")
        ctype = ctype_full.split(";", 1)[0]
        if ctype in ["text/html", "application/xhtml+xml"]:
            handler = HTMLTextParser
        elif ctype == "text/plain":
            handler = PlainTextParser
        else:
        content_type = response.headers.get("Content-Type", "text/plain")
        handler = get_parser(content_type)
        if not handler:
            return None
        try:
@@ -222,7 +218,8 @@ class _CopyvioWorker(object):
                self._logger.debug("Exiting: got stop signal")
                return
            text = self._open_url(source)
            source.workspace.compare(source, MarkovChain(text or ""))
            chain = MarkovChain(text) if text else None
            source.workspace.compare(source, chain)
    def start(self):
        """Start the copyvio worker in a new thread."""
@@ -339,11 +336,16 @@ class CopyvioWorkspace(object):
    def compare(self, source, source_chain):
        """Compare a source to the article; call _finish_early if necessary."""
        delta = MarkovChainIntersection(self._article, source_chain)
        conf = self._calculate_confidence(delta)
        if source_chain:
            delta = MarkovChainIntersection(self._article, source_chain)
            conf = self._calculate_confidence(delta)
        else:
            conf = 0.0
        self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
        with self._finish_lock:
            source.finish_work(conf, source_chain, delta)
            if source_chain:
                source.update(conf, source_chain, delta)
            source.finish_work()
            if not self.finished and conf >= self._min_confidence:
                if self._short_circuit:
                    self._finish_early()
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,7 @@ extra_deps = {
        "lxml >= 2.3.5",  # Faster parser for BeautifulSoup
        "nltk >= 2.0.2",  # Parsing sentences to split article content
        "oauth2 >= 1.5.211",  # Interfacing with Yahoo! BOSS Search
        "PyPDF2 >= 1.23",  # Extracting text from PDF files
        "tldextract >= 1.4",  # Getting domains for the multithreaded workers
    ],
    "time": [