From 0eadf65a091cfe6f6aca84dc51541dbe28fedca5 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@gmail.com>
Date: Mon, 14 Jul 2014 01:02:38 -0400
Subject: [PATCH] Only accept HTML and plain text for copyvio checks.

---
 earwigbot/wiki/copyvios/__init__.py | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index 5e5dc2f..a4396aa 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -54,12 +54,16 @@ class CopyvioMixIn(object):
         self._opener.addheaders = site._opener.addheaders
 
     def _open_url_ignoring_errors(self, url):
-        """Open a URL using self._opener and return its content, or None.
+        """Open a URL and return its parsed content, or None.
 
-        Will decompress the content if the headers contain "gzip" as its
-        content encoding, and will return None if URLError is raised while
-        opening the URL. IOErrors while gunzipping a compressed response are
-        ignored, and the original content is returned.
+        First, we will decompress the content if the headers specify "gzip" as
+        the content encoding. Then, we will return the content stripped using
+        an HTML parser if the headers indicate it is HTML, or return the
+        content directly if it is plain text. If we don't understand the
+        content type, we'll return None.
+
+        If a URLError was raised while opening the URL or an IOError was raised
+        while decompressing, None will be returned.
         """
         try:
             response = self._opener.open(url.encode("utf8"), timeout=5)
@@ -73,9 +77,16 @@ class CopyvioMixIn(object):
             try:
                 result = gzipper.read()
             except IOError:
-                pass
-
-        return result
+                return None
+
+        ctype_full = response.headers.get("Content-Type", "text/plain")
+        ctype = ctype_full.split(" ", 1)[0]
+        if ctype in ["text/html", "application/xhtml+xml"]:
+            return HTMLTextParser(result).strip()
+        elif ctype == "text/plain":
+            return result.strip()
+        else:
+            return None
 
     def _select_search_engine(self):
         """Return a function that can be called to do web searches.
@@ -108,12 +119,12 @@ class CopyvioMixIn(object):
         The *article* is a Markov chain, whereas the *url* is just a string
         that we'll try to open and read ourselves.
         """
-        html = self._open_url_ignoring_errors(url)
-        if not html:
+        text = self._open_url_ignoring_errors(url)
+        if not text:
             empty = MarkovChain("")
             return 0, (empty, MarkovChainIntersection(empty, empty))
 
-        source = MarkovChain(HTMLTextParser(html).strip())
+        source = MarkovChain(text)
         delta = MarkovChainIntersection(article, source)
         return float(delta.size()) / article.size(), (source, delta)