diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index 68b4134..0aaa9b5 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -20,12 +20,9 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from functools import partial from gzip import GzipFile -from json import loads from StringIO import StringIO from time import sleep, time -from urllib import quote_plus, urlencode from urllib2 import build_opener, URLError try: @@ -35,6 +32,7 @@ except ImportError: from earwigbot.wiki.exceptions import * from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection +from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine class CopyvioCheckResult(object): def __init__(self, violation, confidence, url, queries, article, chains): @@ -107,42 +105,9 @@ class CopyvioMixin(object): if not oauth: e = "The package 'oauth2' could not be imported" raise UnsupportedSearchEngineError(e) - searcher = self._yahoo_boss_query - else: - raise UnknownSearchEngineError(engine) - - return partial(searcher, credentials) - - def _yahoo_boss_query(self, cred, query): - """Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials. - - Returns a list of URLs, no more than fifty, ranked by relevance (as - determined by Yahoo). Raises SearchQueryError() on errors. - """ - base_url = "http://yboss.yahooapis.com/ysearch/web" - query = quote_plus(query.join('"', '"')) - params = {"q": query, "style": "raw", "format": "json"} - url = "{0}?{1}".format(base_url, urlencode(params)) + return YahooBOSSSearchEngine(credentials) - consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"]) - client = oauth.Client(consumer) - headers, body = client.request(url, "GET") - - if headers["status"] != "200": - e = "Yahoo! 
BOSS Error: got response code '{0}':\n{1}'" - raise SearchQueryError(e.format(headers["status"], body)) - - try: - res = loads(body) - except ValueError: - e = "Yahoo! BOSS Error: JSON could not be decoded" - raise SearchQueryError(e) - - try: - results = res["bossresponse"]["web"]["results"] - except KeyError: - return [] - return [result["url"] for result in results] + raise UnknownSearchEngineError(engine) def _copyvio_strip_html(self, html): """ @@ -209,7 +174,7 @@ class CopyvioMixin(object): Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, SearchQueryError, ...) on errors. """ - search = self._select_search_engine() + searcher = self._select_search_engine() handled_urls = [] best_confidence = 0 best_match = None @@ -228,7 +193,7 @@ class CopyvioMixin(object): while (chunks and best_confidence < min_confidence and (max_queries < 0 or num_queries < max_queries)): - urls = search(chunks.pop(0)) + urls = searcher.search(chunks.pop(0)) urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py new file mode 100644 index 0000000..59287cc --- /dev/null +++ b/earwigbot/wiki/copyvios/search.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 by Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
class BaseSearchEngine(object):
    """Common interface for search engines used by the copyvio checker."""

    def __init__(self, cred):
        """Store credentials 'cred' for searching later on."""
        self.cred = cred

    def search(self, query):
        """Use this engine to search for 'query'.

        Not implemented in this base class; overridden in subclasses.
        """
        raise NotImplementedError()


class YahooBOSSSearchEngine(BaseSearchEngine):
    """Search engine that queries the Yahoo! BOSS web search API."""

    BASE_URL = "http://yboss.yahooapis.com/ysearch/web"

    @classmethod
    def _build_url(cls, query):
        """Return the full request URL for an exact-phrase search of 'query'.

        'query' may be a byte string or a unicode string; unicode is encoded
        as UTF-8 before being percent-encoded.
        """
        if not isinstance(query, bytes):
            query = query.encode("utf8")
        # Embedded double quotes would end the exact-phrase match early, so
        # drop them before wrapping the whole query in quotes.
        phrase = b'"' + query.replace(b'"', b"") + b'"'
        # urlencode() does the percent-encoding itself; encoding beforehand
        # with quote_plus() would double-encode the query. A list of pairs
        # (rather than a dict) keeps the parameter order deterministic.
        params = [("q", phrase), ("style", "raw"), ("format", "json")]
        return "{0}?{1}".format(cls.BASE_URL, urlencode(params))

    @staticmethod
    def _extract_urls(body):
        """Return the list of result URLs found in the JSON response 'body'.

        Raises SearchQueryError() if 'body' is not valid JSON. A response
        that lacks the expected result structure yields an empty list.
        """
        try:
            res = loads(body)
        except ValueError:
            e = "Yahoo! BOSS Error: JSON could not be decoded"
            raise SearchQueryError(e)

        try:
            results = res["bossresponse"]["web"]["results"]
        except (KeyError, TypeError):
            # No results section (or an unexpected payload shape).
            return []
        return [result["url"] for result in results if "url" in result]

    def search(self, query):
        """Do a Yahoo! BOSS web search for 'query'.

        Returns a list of URLs, no more than fifty, ranked by relevance (as
        determined by Yahoo). Raises SearchQueryError() on errors.
        """
        if oauth is None:
            # The module-level import of oauth2 is optional; fail clearly
            # instead of with an AttributeError on None.
            e = "Yahoo! BOSS Error: the package 'oauth2' could not be imported"
            raise SearchQueryError(e)

        url = self._build_url(query)
        consumer = oauth.Consumer(key=self.cred["key"],
                                  secret=self.cred["secret"])
        client = oauth.Client(consumer)
        headers, body = client.request(url, "GET")

        if headers["status"] != "200":
            e = "Yahoo! BOSS Error: got response code '{0}':\n{1}"
            raise SearchQueryError(e.format(headers["status"], body))

        return self._extract_urls(body)