From 977b587e5ec46374993b1090f36eed94d36cf6ef Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 2 Apr 2016 18:29:55 -0500
Subject: [PATCH] Add support for Bing Search

---
 earwigbot/wiki/copyvios/__init__.py | 30 ++++++++------
 earwigbot/wiki/copyvios/search.py   | 79 ++++++++++++++++++++++++++++++++++---
 2 files changed, 92 insertions(+), 17 deletions(-)

diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py
index 1d960a7..b129941 100644
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -23,15 +23,14 @@
 from time import sleep, time
 from urllib2 import build_opener
 
-from earwigbot import exceptions, importer
+from earwigbot import exceptions
 from earwigbot.wiki.copyvios.markov import MarkovChain
 from earwigbot.wiki.copyvios.parsers import ArticleTextParser
-from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
+from earwigbot.wiki.copyvios.search import (
+    BingSearchEngine, YahooBOSSSearchEngine)
 from earwigbot.wiki.copyvios.workers import (
     globalize, localize, CopyvioWorkspace)
 
-oauth = importer.new("oauth2")
-
 __all__ = ["CopyvioMixIn", "globalize", "localize"]
 
 class CopyvioMixIn(object):
@@ -62,20 +61,29 @@ class CopyvioMixIn(object):
         unknown to us, and UnsupportedSearchEngineError if we are missing a
         required package or module, like oauth2 for "Yahoo! BOSS".
         """
+        engines = {
+            "Bing": BingSearchEngine,
+            "Yahoo! BOSS": YahooBOSSSearchEngine
+        }
+
         engine = self._search_config["engine"]
+        if engine not in engines:
+            raise exceptions.UnknownSearchEngineError(engine)
+
+        klass = engines[engine]
         credentials = self._search_config["credentials"]
+        opener = build_opener()
+        opener.addheaders = list(self._addheaders)  # copy; engines may append
 
-        if engine == "Yahoo! BOSS":
+        for dep in klass.requirements():
             try:
-                oauth.__version__  # Force-load the lazy module
+                __import__(dep).__package__
             except ImportError:
-                e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2"
+                e = "Missing a required dependency ({}) for the {} engine"
+                e = e.format(dep, engine)
                 raise exceptions.UnsupportedSearchEngineError(e)
-            opener = build_opener()
-            opener.addheaders = self._addheaders
-            return YahooBOSSSearchEngine(credentials, opener)
 
-        raise exceptions.UnknownSearchEngineError(engine)
+        return klass(credentials, opener)
 
     def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1,
                       no_searches=False, no_links=False, short_circuit=True):
diff --git a/earwigbot/wiki/copyvios/search.py b/earwigbot/wiki/copyvios/search.py
index a049837..9df20f7 100644
--- a/earwigbot/wiki/copyvios/search.py
+++ b/earwigbot/wiki/copyvios/search.py
@@ -24,7 +24,7 @@ from gzip import GzipFile
 from json import loads
 from socket import error
 from StringIO import StringIO
-from urllib import quote
+from urllib import quote, urlencode
 from urllib2 import URLError
 
 from earwigbot import importer
@@ -32,7 +32,7 @@ from earwigbot.exceptions import SearchQueryError
 
 oauth = importer.new("oauth2")
 
-__all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"]
+__all__ = ["BaseSearchEngine", "BingSearchEngine", "YahooBOSSSearchEngine"]
 
 class BaseSearchEngine(object):
     """Base class for a simple search engine interface."""
@@ -51,6 +51,11 @@ class BaseSearchEngine(object):
         """Return a nice string representation of the search engine."""
         return "<{0}>".format(self.__class__.__name__)
 
+    @staticmethod
+    def requirements():
+        """Return a list of packages required by this search engine."""
+        return []
+
     def search(self, query):
         """Use this engine to search for *query*.
 
@@ -59,6 +64,64 @@ class BaseSearchEngine(object):
         raise NotImplementedError()
 
 
+class BingSearchEngine(BaseSearchEngine):
+    """A search engine interface with Bing Search (via Azure Marketplace)."""
+    name = "Bing"
+
+    def __init__(self, cred, opener):
+        super(BingSearchEngine, self).__init__(cred, opener)
+        # Rebind instead of appending: the list may be shared with the caller.
+        b64 = (self.cred["key"] + ":" + self.cred["key"]).encode("base64")
+        auth = ("Authorization", "Basic " + b64.replace("\n", ""))
+        self.opener.addheaders = self.opener.addheaders + [auth]
+
+    def search(self, query):
+        """Do a Bing web search for *query*.
+
+        Returns a list of URLs, no more than five, ranked by relevance
+        (as determined by Bing).
+        Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
+        """
+        service = "SearchWeb" if self.cred["type"] == "searchweb" else "Search"
+        url = "https://api.datamarket.azure.com/Bing/{0}/Web?".format(service)
+        params = {
+            "$format": "json",
+            "$top": "5",
+            "Query": "'\"" + query.replace('"', "").encode("utf8") + "\"'",
+            "Market": "'en-US'",
+            "Adult": "'Off'",
+            "Options": "'DisableLocationDetection'",
+            "WebFileType": "'HTM+HTML+PDF+TEXT+TXT'",
+            "WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'"
+        }
+
+        try:
+            response = self.opener.open(url + urlencode(params))
+            result = response.read()
+        except (URLError, error) as exc:
+            raise SearchQueryError("Bing Error: " + str(exc))
+
+        if response.headers.get("Content-Encoding") == "gzip":
+            stream = StringIO(result)
+            gzipper = GzipFile(fileobj=stream)
+            result = gzipper.read()
+
+        if response.getcode() != 200:
+            err = "Bing Error: got response code '{0}':\n{1}"
+            raise SearchQueryError(err.format(response.getcode(), result))
+        try:
+            res = loads(result)
+        except ValueError:
+            err = "Bing Error: JSON could not be decoded"
+            raise SearchQueryError(err)
+
+        try:
+            results = res["d"]["results"]
+        except (KeyError, TypeError):
+            return []
+        return [item["Url"] for item in results]
+
+
 class YahooBOSSSearchEngine(BaseSearchEngine):
     """A search engine interface with Yahoo! BOSS."""
     name = "Yahoo! BOSS"
@@ -70,6 +133,10 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
         args = ["=".join((enc(k), enc(v))) for k, v in params.iteritems()]
         return base + "?" + "&".join(args)
 
+    @staticmethod
+    def requirements():
+        return ["oauth2"]
+
     def search(self, query):
         """Do a Yahoo! BOSS web search for *query*.
 
@@ -104,13 +171,13 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
             result = gzipper.read()
 
         if response.getcode() != 200:
-            e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
-            raise SearchQueryError(e.format(response.getcode(), result))
+            err = "Yahoo! BOSS Error: got response code '{0}':\n{1}"
+            raise SearchQueryError(err.format(response.getcode(), result))
         try:
             res = loads(result)
         except ValueError:
-            e = "Yahoo! BOSS Error: JSON could not be decoded"
-            raise SearchQueryError(e)
+            err = "Yahoo! BOSS Error: JSON could not be decoded"
+            raise SearchQueryError(err)
 
         try:
             results = res["bossresponse"]["web"]["results"]