From a4dda89a615071ef68e64ea8454367508bae9eec Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 4 Sep 2012 01:05:50 -0400 Subject: [PATCH] Various fixes for copyvios. - Fix a bug in ExclusionsDB; improve URL regexes. - NLTK's LookupError is actually an IOError. - Fix bug in __repr__ for CopyvioCheckResult. - Rewrite YahooBOSSSearchEngine to actually work with oauth2. - Search engines now take a URL opener in addition to credentials. --- earwigbot/wiki/copyvios/__init__.py | 2 +- earwigbot/wiki/copyvios/exclusions.py | 7 ++--- earwigbot/wiki/copyvios/parsers.py | 5 +++- earwigbot/wiki/copyvios/result.py | 2 +- earwigbot/wiki/copyvios/search.py | 50 +++++++++++++++++++++++------------ 5 files changed, 43 insertions(+), 23 deletions(-) diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 59e0dcb..295685c 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -98,7 +98,7 @@ class CopyvioMixIn(object): except ImportError: e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2" raise exceptions.UnsupportedSearchEngineError(e) - return YahooBOSSSearchEngine(credentials) + return YahooBOSSSearchEngine(credentials, self._opener) raise exceptions.UnknownSearchEngineError(engine) diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py index 3600f97..517a12d 100644 --- a/earwigbot/wiki/copyvios/exclusions.py +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -88,11 +88,12 @@ class ExclusionsDB(object): return urls regexes = [ - "url\s*=\s*(?:https?:)?(?://)?(.*)", - "\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?" + r"url\s*=\s*(?:https?:)?(?://)?(.*)", + r"\*\s*Site:\s*(?:\[|\)?(?:https?:)?(?://)?(.*)(?:\]|\)?" ] for regex in regexes: - [urls.add(url.lower()) for (url,) in re.findall(regex, data, re.I)] + find = re.findall(regex, data, re.I) + [urls.add(url.lower().strip()) for url in find if url.strip()] return urls def _update(self, sitename): diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py index 4f8e981..c5e4325 100644 --- a/earwigbot/wiki/copyvios/parsers.py +++ b/earwigbot/wiki/copyvios/parsers.py @@ -20,6 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +import errno from os import path import mwparserfromhell @@ -83,7 +84,9 @@ class ArticleTextParser(BaseTextParser): datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle") try: tokenizer = nltk.data.load("file:" + datafile) - except LookupError: + except IOError as exc: + if exc.errno != errno.ENOENT: + raise nltk.download("punkt", nltk_dir) tokenizer = nltk.data.load("file:" + datafile) diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py index 0c3e98f..7594a41 100644 --- a/earwigbot/wiki/copyvios/result.py +++ b/earwigbot/wiki/copyvios/result.py @@ -50,7 +50,7 @@ class CopyvioCheckResult(object): def __repr__(self): """Return the canonical string representation of the result.""" - res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" + res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})" return res.format(self.violation, self.confidence, self.url, self.queries) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py index a9afcfb..91db646 100644 --- a/earwigbot/wiki/copyvios/search.py +++ b/earwigbot/wiki/copyvios/search.py @@ -20,8 +20,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. +from gzip import GzipFile from json import loads -from urllib import quote_plus, urlencode +from StringIO import StringIO +from urllib import quote_plus from earwigbot import importer from earwigbot.exceptions import SearchQueryError @@ -34,9 +36,10 @@ class BaseSearchEngine(object): """Base class for a simple search engine interface.""" name = "Base" - def __init__(self, cred): - """Store credentials *cred* for searching later on.""" + def __init__(self, cred, opener): + """Store credentials (*cred*) and *opener* for searching later on.""" self.cred = cred + self.opener = opener def __repr__(self): """Return the canonical string representation of the search engine.""" @@ -65,22 +68,35 @@ class YahooBOSSSearchEngine(BaseSearchEngine): determined by Yahoo). Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors. """ - base_url = "http://yboss.yahooapis.com/ysearch/web" - query = quote_plus(query.join('"', '"')) - params = {"q": query, "type": "html,text", "format": "json"} - url = "{0}?{1}".format(base_url, urlencode(params)) - - consumer = oauth.Consumer(key=self.cred["key"], - secret=self.cred["secret"]) - client = oauth.Client(consumer) - headers, body = client.request(url, "GET") - - if headers["status"] != "200": + key, secret = self.cred["key"], self.cred["secret"] + consumer = oauth.Consumer(key=key, secret=secret) + + url = "http://yboss.yahooapis.com/ysearch/web" + params = { + "oauth_version": oauth.OAUTH_VERSION, + "oauth_nonce": oauth.generate_nonce(), + "oauth_timestamp": oauth.Request.make_timestamp(), + "oauth_consumer_key": consumer.key, + "q": quote_plus('"' + query.encode("utf8") + '"'), + "type": "html,text", + "format": "json", + } + + req = oauth.Request(method="GET", url=url, parameters=params) + req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, None) + response = self.opener.open(req.to_url()) + result = response.read() + + if response.headers.get("Content-Encoding") == "gzip": + stream = StringIO(result) + gzipper = GzipFile(fileobj=stream) + result = gzipper.read() + + if response.getcode() != 200: e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'" - raise SearchQueryError(e.format(headers["status"], body)) - + raise SearchQueryError(e.format(response.getcode(), result)) try: - res = loads(body) + res = loads(result) except ValueError: e = "Yahoo! BOSS Error: JSON could not be decoded" raise SearchQueryError(e)