Browse Source

Only accept HTML and plain text for copyvio checks.

tags/v0.2
Ben Kurtovic 10 years ago
parent
commit
0eadf65a09
1 changed files with 22 additions and 11 deletions
  1. +22
    -11
      earwigbot/wiki/copyvios/__init__.py

+ 22
- 11
earwigbot/wiki/copyvios/__init__.py View File

@@ -54,12 +54,16 @@ class CopyvioMixIn(object):
self._opener.addheaders = site._opener.addheaders self._opener.addheaders = site._opener.addheaders


def _open_url_ignoring_errors(self, url): def _open_url_ignoring_errors(self, url):
"""Open a URL using self._opener and return its content, or None.
"""Open a URL and return its parsed content, or None.


Will decompress the content if the headers contain "gzip" as its
content encoding, and will return None if URLError is raised while
opening the URL. IOErrors while gunzipping a compressed response are
ignored, and the original content is returned.
First, we will decompress the content if the headers contain "gzip" as
its content encoding. Then, we will return the content stripped using
an HTML parser if the headers indicate it is HTML, or return the
content directly if it is plain text. If we don't understand the
content type, we'll return None.

If a URLError was raised while opening the URL or an IOError was raised
while decompressing, None will be returned.
""" """
try: try:
response = self._opener.open(url.encode("utf8"), timeout=5) response = self._opener.open(url.encode("utf8"), timeout=5)
@@ -73,9 +77,16 @@ class CopyvioMixIn(object):
try: try:
result = gzipper.read() result = gzipper.read()
except IOError: except IOError:
pass

return result
return None

ctype_full = response.headers.get("Content-Type", "text/plain")
ctype = ctype_full.split(" ", 1)[0]
if ctype in ["text/html", "application/xhtml+xml"]:
return HTMLTextParser(result).strip()
elif ctype == "text/plain":
return result.strip()
else:
return None


def _select_search_engine(self): def _select_search_engine(self):
"""Return a function that can be called to do web searches. """Return a function that can be called to do web searches.
@@ -108,12 +119,12 @@ class CopyvioMixIn(object):
The *article* is a Markov chain, whereas the *url* is just a string The *article* is a Markov chain, whereas the *url* is just a string
that we'll try to open and read ourselves. that we'll try to open and read ourselves.
""" """
html = self._open_url_ignoring_errors(url)
if not html:
text = self._open_url_ignoring_errors(url)
if not text:
empty = MarkovChain("") empty = MarkovChain("")
return 0, (empty, MarkovChainIntersection(empty, empty)) return 0, (empty, MarkovChainIntersection(empty, empty))


source = MarkovChain(HTMLTextParser(html).strip())
source = MarkovChain(text)
delta = MarkovChainIntersection(article, source) delta = MarkovChainIntersection(article, source)
return float(delta.size()) / article.size(), (source, delta) return float(delta.size()) / article.size(), (source, delta)




Loading…
Cancel
Save