From 30f72df470f4a834179eecf87a03b70a8c00ab55 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 19 Sep 2014 21:20:57 -0500
Subject: [PATCH] Refactor parsers; fix empty document behavior.

---
 earwigbot/wiki/copyvios/parsers.py | 34 ++++++++++++++++++++++++++++------
 earwigbot/wiki/copyvios/result.py  |  7 +++++--
 earwigbot/wiki/copyvios/workers.py | 26 ++++++++++++++------------
 setup.py                           |  1 +
 4 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 59b5958..594caeb 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -21,6 +21,7 @@
 # SOFTWARE.
 
 from os import path
+from StringIO import StringIO
 
 import mwparserfromhell
 
@@ -28,11 +29,11 @@ from earwigbot import importer
 bs4 = importer.new("bs4")
 nltk = importer.new("nltk")
+PyPDF2 = importer.new("PyPDF2")
 
-__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser",
-           "PlainTextParser"]
+__all__ = ["ArticleTextParser", "get_parser"]
 
-class BaseTextParser(object):
+class _BaseTextParser(object):
     """Base class for a parser that handles text."""
 
     def __init__(self, text):
@@ -48,7 +49,7 @@ class BaseTextParser(object):
         return "<{0} of text with size {1}>".format(name, len(self.text))
 
 
-class ArticleTextParser(BaseTextParser):
+class ArticleTextParser(_BaseTextParser):
     """A parser that can strip and chunk wikicode article text."""
 
     def strip(self):
@@ -152,7 +153,7 @@ class ArticleTextParser(BaseTextParser):
                 if link.url.startswith(schemes)]
 
 
-class HTMLTextParser(BaseTextParser):
+class _HTMLParser(_BaseTextParser):
     """A parser that can extract the text from an HTML document."""
     hidden_tags = [
         "script", "style"
@@ -183,9 +184,30 @@ class HTMLTextParser(BaseTextParser):
         return "\n".join(soup.stripped_strings)
 
 
-class PlainTextParser(BaseTextParser):
+class _PDFParser(_BaseTextParser):
+    """A parser that can extract text from a PDF file."""
+
+    def parse(self):
+        """Return extracted text from the PDF."""
+        raise NotImplementedError()
+
+
+class _PlainTextParser(_BaseTextParser):
     """A parser that can unicode-ify and strip text from a plain text page."""
 
     def parse(self):
         """Unicode-ify and strip whitespace from the plain text document."""
         return bs4.UnicodeDammit(self.text).unicode_markup.strip()
+
+
+_CONTENT_TYPES = {
+    "text/html": _HTMLParser,
+    "application/xhtml+xml": _HTMLParser,
+    "application/pdf": _PDFParser,
+    "application/x-pdf": _PDFParser,
+    "text/plain": _PlainTextParser
+}
+
+def get_parser(content_type):
+    """Return the parser most able to handle a given content type, or None."""
+    return _CONTENT_TYPES.get(content_type.split(";", 1)[0])
diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py
index d664965..bbfc566 100644
--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -72,10 +72,13 @@ class CopyvioSource(object):
         self._event2.clear()
         self._event1.set()
 
-    def finish_work(self, confidence, source_chain, delta_chain):
-        """Complete the confidence information inside this source."""
+    def update(self, confidence, source_chain, delta_chain):
+        """Fill out the confidence and chain information inside this source."""
         self.confidence = confidence
         self.chains = (source_chain, delta_chain)
+
+    def finish_work(self):
+        """Mark this source as finished."""
         self._event2.set()
 
     def skip(self):
diff --git a/earwigbot/wiki/copyvios/workers.py b/earwigbot/wiki/copyvios/workers.py
index ffd5510..e4ea165 100644
--- a/earwigbot/wiki/copyvios/workers.py
+++ b/earwigbot/wiki/copyvios/workers.py
@@ -34,7 +34,7 @@ from urllib2 import build_opener, URLError
 
 from earwigbot import importer
 from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
-from earwigbot.wiki.copyvios.parsers import HTMLTextParser, PlainTextParser
+from earwigbot.wiki.copyvios.parsers import get_parser
 from earwigbot.wiki.copyvios.result import CopyvioCheckResult, CopyvioSource
 
 tldextract = importer.new("tldextract")
@@ -136,13 +136,9 @@ class _CopyvioWorker(object):
         if size > 1024 ** 2:  # Ignore URLs larger than a megabyte
             return None
 
-        ctype_full = response.headers.get("Content-Type", "text/plain")
-        ctype = ctype_full.split(";", 1)[0]
-        if ctype in ["text/html", "application/xhtml+xml"]:
-            handler = HTMLTextParser
-        elif ctype == "text/plain":
-            handler = PlainTextParser
-        else:
+        content_type = response.headers.get("Content-Type", "text/plain")
+        handler = get_parser(content_type)
+        if not handler:
             return None
 
         try:
@@ -222,7 +218,8 @@ class _CopyvioWorker(object):
                 self._logger.debug("Exiting: got stop signal")
                 return
             text = self._open_url(source)
-            source.workspace.compare(source, MarkovChain(text or ""))
+            chain = MarkovChain(text) if text else None
+            source.workspace.compare(source, chain)
 
     def start(self):
         """Start the copyvio worker in a new thread."""
@@ -339,11 +336,16 @@ class CopyvioWorkspace(object):
     def compare(self, source, source_chain):
         """Compare a source to the article; call _finish_early if necessary."""
-        delta = MarkovChainIntersection(self._article, source_chain)
-        conf = self._calculate_confidence(delta)
+        if source_chain:
+            delta = MarkovChainIntersection(self._article, source_chain)
+            conf = self._calculate_confidence(delta)
+        else:
+            conf = 0.0
         self._logger.debug(u"compare(): {0} -> {1}".format(source.url, conf))
 
         with self._finish_lock:
-            source.finish_work(conf, source_chain, delta)
+            if source_chain:
+                source.update(conf, source_chain, delta)
+            source.finish_work()
             if not self.finished and conf >= self._min_confidence:
                 if self._short_circuit:
                     self._finish_early()
diff --git a/setup.py b/setup.py
index 23be139..e881651 100644
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,7 @@ extra_deps = {
         "lxml >= 2.3.5",  # Faster parser for BeautifulSoup
         "nltk >= 2.0.2",  # Parsing sentences to split article content
         "oauth2 >= 1.5.211",  # Interfacing with Yahoo! BOSS Search
+        "PyPDF2 >= 1.23",  # Extracting text from PDF files
         "tldextract >= 1.4",  # Getting domains for the multithreaded workers
     ],
     "time": [
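
Note (not part of the patch): the new get_parser() entry point centralizes the Content-Type dispatch that _CopyvioWorker._open_url() previously did inline. A minimal sketch of how the dispatch behaves; the sample content types and printed output are illustrative only:

    # Illustrative only: get_parser() drops any ";charset=..." parameter and
    # returns a parser class, or None for types the checker cannot handle
    # (workers.py skips the URL when it gets None back).
    from earwigbot.wiki.copyvios.parsers import get_parser

    for ctype in ["text/html; charset=utf-8", "application/pdf", "image/png"]:
        handler = get_parser(ctype)
        print ctype, "->", handler  # _HTMLParser, _PDFParser, None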
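
Note (not part of the patch): PyPDF2 is wired in here (importer hook plus the setup.py requirement), but _PDFParser.parse() is left raising NotImplementedError. One possible way to fill in the stub, assuming the PyPDF2 1.x API (PdfFileReader, getNumPages, getPage, extractText); extract_pdf_text is a hypothetical helper, not a name from the patch:

    # Sketch under the assumptions above, written as a standalone helper.
    from StringIO import StringIO

    import PyPDF2

    def extract_pdf_text(data):
        """Return text extracted from raw PDF bytes, or an empty string."""
        try:
            reader = PyPDF2.PdfFileReader(StringIO(data))
            pages = [reader.getPage(i).extractText()
                     for i in xrange(reader.getNumPages())]
        except Exception:  # malformed or encrypted PDFs raise many error types
            return u""
        return u"\n".join(pages)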