From a4dda89a615071ef68e64ea8454367508bae9eec Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@verizon.net>
Date: Tue, 4 Sep 2012 01:05:50 -0400
Subject: [PATCH] Various fixes for copyvios.

- Fix a bug in ExclusionsDB; improve URL regexes.
- Catch IOError (ENOENT), not LookupError, when NLTK's punkt data is missing.
- Fix bug in __repr__ for CopyvioCheckResult.
- Rewrite YahooBOSSSearchEngine to actually work with oauth2.
- Search engines now take a URL opener in addition to credentials.
---
 earwigbot/wiki/copyvios/__init__.py   |  2 +-
 earwigbot/wiki/copyvios/exclusions.py |  7 ++---
 earwigbot/wiki/copyvios/parsers.py    |  5 +++-
 earwigbot/wiki/copyvios/result.py     |  2 +-
 earwigbot/wiki/copyvios/search.py     | 50 +++++++++++++++++++++++------------
 5 files changed, 43 insertions(+), 23 deletions(-)
diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index 59e0dcb..295685c 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -98,7 +98,7 @@ class CopyvioMixIn(object):
             except ImportError:
                 e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2"
                 raise exceptions.UnsupportedSearchEngineError(e)
-            return YahooBOSSSearchEngine(credentials)
+            return YahooBOSSSearchEngine(credentials, self._opener)
 
         raise exceptions.UnknownSearchEngineError(engine)
 
diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py
index 3600f97..517a12d 100644
--- a/earwigbot/wiki/copyvios/exclusions.py
+++ b/earwigbot/wiki/copyvios/exclusions.py
@@ -88,11 +88,12 @@ class ExclusionsDB(object):
             return urls
 
         regexes = [
-            "url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
-            "\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
+            r"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
+            r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*)(?:\]|\</nowiki\>)?"
         ]
         for regex in regexes:
-            [urls.add(url.lower()) for (url,) in re.findall(regex, data, re.I)]
+            find = re.findall(regex, data, re.I)
+            [urls.add(url.lower().strip()) for url in find if url.strip()]
         return urls
 
     def _update(self, sitename):
diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 4f8e981..c5e4325 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import errno
 from os import path
 
 import mwparserfromhell
@@ -83,7 +84,9 @@ class ArticleTextParser(BaseTextParser):
         datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
         try:
             tokenizer = nltk.data.load("file:" + datafile)
-        except LookupError:
+        except IOError as exc:
+            if exc.errno != errno.ENOENT:
+                raise
             nltk.download("punkt", nltk_dir)
             tokenizer = nltk.data.load("file:" + datafile)
 
diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py
index 0c3e98f..7594a41 100644
--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -50,7 +50,7 @@ class CopyvioCheckResult(object):
 
     def __repr__(self):
         """Return the canonical string representation of the result."""
-        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
+        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
         return res.format(self.violation, self.confidence, self.url,
                           self.queries)
 
diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py
index a9afcfb..91db646 100644
--- a/earwigbot/wiki/copyvios/search.py
+++ b/earwigbot/wiki/copyvios/search.py
@@ -20,8 +20,10 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+from gzip import GzipFile
 from json import loads
-from urllib import quote_plus, urlencode
+from StringIO import StringIO
+from urllib import quote_plus
 
 from earwigbot import importer
 from earwigbot.exceptions import SearchQueryError
@@ -34,9 +36,10 @@ class BaseSearchEngine(object):
     """Base class for a simple search engine interface."""
     name = "Base"
 
-    def __init__(self, cred):
-        """Store credentials *cred* for searching later on."""
+    def __init__(self, cred, opener):
+        """Store credentials (*cred*) and *opener* for searching later on."""
         self.cred = cred
+        self.opener = opener
 
     def __repr__(self):
         """Return the canonical string representation of the search engine."""
@@ -65,22 +68,35 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
         determined by Yahoo). Raises
         :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
         """
-        base_url = "http://yboss.yahooapis.com/ysearch/web"
-        query = quote_plus(query.join('"', '"'))
-        params = {"q": query, "type": "html,text", "format": "json"}
-        url = "{0}?{1}".format(base_url, urlencode(params))
-
-        consumer = oauth.Consumer(key=self.cred["key"],
-                                  secret=self.cred["secret"])
-        client = oauth.Client(consumer)
-        headers, body = client.request(url, "GET")
-
-        if headers["status"] != "200":
+        key, secret = self.cred["key"], self.cred["secret"]
+        consumer = oauth.Consumer(key=key, secret=secret)
+
+        url = "http://yboss.yahooapis.com/ysearch/web"
+        params = {
+            "oauth_version": oauth.OAUTH_VERSION,
+            "oauth_nonce": oauth.generate_nonce(),
+            "oauth_timestamp": oauth.Request.make_timestamp(),
+            "oauth_consumer_key": consumer.key,
+            "q": quote_plus('"' + query.encode("utf8") + '"'),
+            "type": "html,text",
+            "format": "json",
+        }
+
+        req = oauth.Request(method="GET", url=url, parameters=params)
+        req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, None)
+        response = self.opener.open(req.to_url())
+        result = response.read()
+
+        if response.headers.get("Content-Encoding") == "gzip":
+            stream = StringIO(result)
+            gzipper = GzipFile(fileobj=stream)
+            result = gzipper.read()
+
+        if response.getcode() != 200:
             e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
-            raise SearchQueryError(e.format(headers["status"], body))
-
+            raise SearchQueryError(e.format(response.getcode(), result))
         try:
-            res = loads(body)
+            res = loads(result)
         except ValueError:
             e = "Yahoo! BOSS Error: JSON could not be decoded"
             raise SearchQueryError(e)