From 0eadf65a091cfe6f6aca84dc51541dbe28fedca5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 14 Jul 2014 01:02:38 -0400 Subject: [PATCH] Only accept HTML and plain text for copyvio checks. --- earwigbot/wiki/copyvios/__init__.py | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 5e5dc2f..a4396aa 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -54,12 +54,16 @@ class CopyvioMixIn(object): self._opener.addheaders = site._opener.addheaders def _open_url_ignoring_errors(self, url): - """Open a URL using self._opener and return its content, or None. + """Open a URL and return its parsed content, or None. - Will decompress the content if the headers contain "gzip" as its - content encoding, and will return None if URLError is raised while - opening the URL. IOErrors while gunzipping a compressed response are - ignored, and the original content is returned. + First, we will decompress the content if the headers contain "gzip" as + its content encoding. Then, we will return the content stripped using + an HTML parser if the headers indicate it is HTML, or return the + content directly if it is plain text. If we don't understand the + content type, we'll return None. + + If a URLError was raised while opening the URL or an IOError was raised + while decompressing, None will be returned. """ try: response = self._opener.open(url.encode("utf8"), timeout=5) @@ -73,9 +77,16 @@ class CopyvioMixIn(object): try: result = gzipper.read() except IOError: - pass - - return result + return None + + ctype_full = response.headers.get("Content-Type", "text/plain") + ctype = ctype_full.split(" ", 1)[0] + if ctype in ["text/html", "application/xhtml+xml"]: + return HTMLTextParser(result).strip() + elif ctype == "text/plain": + return result.strip() + else: + return None def _select_search_engine(self): """Return a function that can be called to do web searches. @@ -108,12 +119,12 @@ class CopyvioMixIn(object): The *article* is a Markov chain, whereas the *url* is just a string that we'll try to open and read ourselves. """ - html = self._open_url_ignoring_errors(url) - if not html: + text = self._open_url_ignoring_errors(url) + if not text: empty = MarkovChain("") return 0, (empty, MarkovChainIntersection(empty, empty)) - source = MarkovChain(HTMLTextParser(html).strip()) + source = MarkovChain(text) delta = MarkovChainIntersection(article, source) return float(delta.size()) / article.size(), (source, delta)