diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index 59e0dcb..295685c 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -98,7 +98,7 @@ class CopyvioMixIn(object):
except ImportError:
e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2"
raise exceptions.UnsupportedSearchEngineError(e)
- return YahooBOSSSearchEngine(credentials)
+ return YahooBOSSSearchEngine(credentials, self._opener)
raise exceptions.UnknownSearchEngineError(engine)
diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py
index 3600f97..517a12d 100644
--- a/earwigbot/wiki/copyvios/exclusions.py
+++ b/earwigbot/wiki/copyvios/exclusions.py
@@ -88,11 +88,12 @@ class ExclusionsDB(object):
return urls
regexes = [
- "url\s*=\s*(?:https?:)?(?://)?(.*)",
- "\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
+ r"url\s*=\s*(?:https?:)?(?://)?(.*)",
+ r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*)(?:\]|\</nowiki\>)?"
]
for regex in regexes:
- [urls.add(url.lower()) for (url,) in re.findall(regex, data, re.I)]
+ find = re.findall(regex, data, re.I)
+ [urls.add(url.lower().strip()) for url in find if url.strip()]
return urls
def _update(self, sitename):
diff --git a/earwigbot/wiki/copyvios/parsers.py b/earwigbot/wiki/copyvios/parsers.py
index 4f8e981..c5e4325 100644
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -20,6 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
+import errno
from os import path
import mwparserfromhell
@@ -83,7 +84,9 @@ class ArticleTextParser(BaseTextParser):
datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
try:
tokenizer = nltk.data.load("file:" + datafile)
- except LookupError:
+ except IOError as exc:
+ if exc.errno != errno.ENOENT:
+ raise
nltk.download("punkt", nltk_dir)
tokenizer = nltk.data.load("file:" + datafile)
diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py
index 0c3e98f..7594a41 100644
--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -50,7 +50,7 @@ class CopyvioCheckResult(object):
def __repr__(self):
"""Return the canonical string representation of the result."""
- res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
+ res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
return res.format(self.violation, self.confidence, self.url,
self.queries)
diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py
index a9afcfb..91db646 100644
--- a/earwigbot/wiki/copyvios/search.py
+++ b/earwigbot/wiki/copyvios/search.py
@@ -20,8 +20,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
+from gzip import GzipFile
from json import loads
-from urllib import quote_plus, urlencode
+from StringIO import StringIO
+from urllib import quote_plus
from earwigbot import importer
from earwigbot.exceptions import SearchQueryError
@@ -34,9 +36,10 @@ class BaseSearchEngine(object):
"""Base class for a simple search engine interface."""
name = "Base"
- def __init__(self, cred):
- """Store credentials *cred* for searching later on."""
+ def __init__(self, cred, opener):
+ """Store credentials (*cred*) and *opener* for searching later on."""
self.cred = cred
+ self.opener = opener
def __repr__(self):
"""Return the canonical string representation of the search engine."""
@@ -65,22 +68,35 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
determined by Yahoo). Raises
:py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
"""
- base_url = "http://yboss.yahooapis.com/ysearch/web"
- query = quote_plus(query.join('"', '"'))
- params = {"q": query, "type": "html,text", "format": "json"}
- url = "{0}?{1}".format(base_url, urlencode(params))
-
- consumer = oauth.Consumer(key=self.cred["key"],
- secret=self.cred["secret"])
- client = oauth.Client(consumer)
- headers, body = client.request(url, "GET")
-
- if headers["status"] != "200":
+ key, secret = self.cred["key"], self.cred["secret"]
+ consumer = oauth.Consumer(key=key, secret=secret)
+
+ url = "http://yboss.yahooapis.com/ysearch/web"
+ params = {
+ "oauth_version": oauth.OAUTH_VERSION,
+ "oauth_nonce": oauth.generate_nonce(),
+ "oauth_timestamp": oauth.Request.make_timestamp(),
+ "oauth_consumer_key": consumer.key,
+ "q": quote_plus('"' + query.encode("utf8") + '"'),
+ "type": "html,text",
+ "format": "json",
+ }
+
+ req = oauth.Request(method="GET", url=url, parameters=params)
+ req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, None)
+ response = self.opener.open(req.to_url())
+ result = response.read()
+
+ if response.headers.get("Content-Encoding") == "gzip":
+ stream = StringIO(result)
+ gzipper = GzipFile(fileobj=stream)
+ result = gzipper.read()
+
+ if response.getcode() != 200:
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
- raise SearchQueryError(e.format(headers["status"], body))
-
+ raise SearchQueryError(e.format(response.getcode(), result))
try:
- res = loads(body)
+ res = loads(result)
except ValueError:
e = "Yahoo! BOSS Error: JSON could not be decoded"
raise SearchQueryError(e)