Browse Source

Add support for Bing Search

tags/v0.3
Ben Kurtovic 8 years ago
parent
commit
977b587e5e
2 changed files with 92 additions and 17 deletions
  1. +19
    -11
      earwigbot/wiki/copyvios/__init__.py
  2. +73
    -6
      earwigbot/wiki/copyvios/search.py

+ 19
- 11
earwigbot/wiki/copyvios/__init__.py View File

@@ -23,15 +23,14 @@
from time import sleep, time from time import sleep, time
from urllib2 import build_opener from urllib2 import build_opener


from earwigbot import exceptions, importer
from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import MarkovChain from earwigbot.wiki.copyvios.markov import MarkovChain
from earwigbot.wiki.copyvios.parsers import ArticleTextParser from earwigbot.wiki.copyvios.parsers import ArticleTextParser
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine
from earwigbot.wiki.copyvios.search import (
BingSearchEngine, YahooBOSSSearchEngine)
from earwigbot.wiki.copyvios.workers import ( from earwigbot.wiki.copyvios.workers import (
globalize, localize, CopyvioWorkspace) globalize, localize, CopyvioWorkspace)


oauth = importer.new("oauth2")

__all__ = ["CopyvioMixIn", "globalize", "localize"] __all__ = ["CopyvioMixIn", "globalize", "localize"]


class CopyvioMixIn(object): class CopyvioMixIn(object):
@@ -62,20 +61,29 @@ class CopyvioMixIn(object):
unknown to us, and UnsupportedSearchEngineError if we are missing a unknown to us, and UnsupportedSearchEngineError if we are missing a
required package or module, like oauth2 for "Yahoo! BOSS". required package or module, like oauth2 for "Yahoo! BOSS".
""" """
engines = {
"Bing": BingSearchEngine,
"Yahoo! BOSS": YahooBOSSSearchEngine
}

engine = self._search_config["engine"] engine = self._search_config["engine"]
if engine not in engines:
raise exceptions.UnknownSearchEngineError(engine)

klass = engines[engine]
credentials = self._search_config["credentials"] credentials = self._search_config["credentials"]
opener = build_opener()
opener.addheaders = self._addheaders


if engine == "Yahoo! BOSS":
for dep in klass.requirements():
try: try:
oauth.__version__ # Force-load the lazy module
__import__(dep).__package__
except ImportError: except ImportError:
e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2"
e = "Missing a required dependency ({}) for the {} engine"
e = e.format(dep, engine)
raise exceptions.UnsupportedSearchEngineError(e) raise exceptions.UnsupportedSearchEngineError(e)
opener = build_opener()
opener.addheaders = self._addheaders
return YahooBOSSSearchEngine(credentials, opener)


raise exceptions.UnknownSearchEngineError(engine)
return klass(credentials, opener)


def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1, def copyvio_check(self, min_confidence=0.75, max_queries=15, max_time=-1,
no_searches=False, no_links=False, short_circuit=True): no_searches=False, no_links=False, short_circuit=True):


+ 73
- 6
earwigbot/wiki/copyvios/search.py View File

@@ -24,7 +24,7 @@ from gzip import GzipFile
from json import loads from json import loads
from socket import error from socket import error
from StringIO import StringIO from StringIO import StringIO
from urllib import quote
from urllib import quote, urlencode
from urllib2 import URLError from urllib2 import URLError


from earwigbot import importer from earwigbot import importer
@@ -32,7 +32,7 @@ from earwigbot.exceptions import SearchQueryError


oauth = importer.new("oauth2") oauth = importer.new("oauth2")


__all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"]
__all__ = ["BaseSearchEngine", "BingSearchEngine", "YahooBOSSSearchEngine"]


class BaseSearchEngine(object): class BaseSearchEngine(object):
"""Base class for a simple search engine interface.""" """Base class for a simple search engine interface."""
@@ -51,6 +51,11 @@ class BaseSearchEngine(object):
"""Return a nice string representation of the search engine.""" """Return a nice string representation of the search engine."""
return "<{0}>".format(self.__class__.__name__) return "<{0}>".format(self.__class__.__name__)


@staticmethod
def requirements():
"""Return a list of packages required by this search engine.

The base implementation returns an empty list; subclasses that depend on
extra packages override this to return their names as strings.
"""
return []

def search(self, query): def search(self, query):
"""Use this engine to search for *query*. """Use this engine to search for *query*.


@@ -59,6 +64,64 @@ class BaseSearchEngine(object):
raise NotImplementedError() raise NotImplementedError()




class BingSearchEngine(BaseSearchEngine):
    """A search engine interface with Bing Search (via Azure Marketplace).

    The credentials mapping must provide a ``"key"`` entry, which is sent as
    both the user name and the password of an HTTP Basic ``Authorization``
    header, and a ``"type"`` entry: ``"searchweb"`` selects the ``SearchWeb``
    service and any other value selects ``Search``.
    """
    name = "Bing"

    def __init__(self, cred, opener):
        """Store *cred* and *opener* and add the Bing auth header to *opener*.

        The header list already present on *opener* is copied, not modified
        in place, so a list shared with the caller is left untouched.
        """
        super(BingSearchEngine, self).__init__(cred, opener)

        key = self.cred["key"]
        # The base64 codec inserts newlines into its output; they must be
        # stripped before the value can be used as an HTTP header.
        auth = (key + ":" + key).encode("base64").replace("\n", "")

        # Rebind to a copy instead of appending in place: the caller may have
        # assigned a shared list to opener.addheaders, and appending to it
        # would leak the credential into that list and add a duplicate header
        # every time an engine is constructed.
        headers = list(self.opener.addheaders)
        headers.append(("Authorization", "Basic " + auth))
        self.opener.addheaders = headers

    def search(self, query):
        """Do a Bing web search for *query*.

        Returns a list of URLs, no more than five, ranked by relevance
        (as determined by Bing). An empty list is returned if the decoded
        response does not contain a result list.
        Raises :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
        """
        service = "SearchWeb" if self.cred["type"] == "searchweb" else "Search"
        url = "https://api.datamarket.azure.com/Bing/{0}/Web?".format(service)
        # String-valued parameters are wrapped in single quotes. Double quotes
        # are removed from the query before it is wrapped in a pair of them
        # (presumably to request an exact-phrase match -- confirm against the
        # API documentation).
        params = {
            "$format": "json",
            "$top": "5",
            "Query": "'\"" + query.replace('"', "").encode("utf8") + "\"'",
            "Market": "'en-US'",
            "Adult": "'Off'",
            "Options": "'DisableLocationDetection'",
            "WebFileType": "'HTM+HTML+PDF+TEXT+TXT'",
            "WebSearchOptions": "'DisableHostCollapsing+DisableQueryAlterations'"
        }

        try:
            response = self.opener.open(url + urlencode(params))
            result = response.read()
        except (URLError, error) as exc:
            raise SearchQueryError("Bing Error: " + str(exc))

        # Transparently decompress gzip-encoded response bodies.
        if response.headers.get("Content-Encoding") == "gzip":
            stream = StringIO(result)
            gzipper = GzipFile(fileobj=stream)
            result = gzipper.read()

        if response.getcode() != 200:
            err = "Bing Error: got response code '{0}':\n{1}'"
            raise SearchQueryError(err.format(response.getcode(), result))
        try:
            res = loads(result)
        except ValueError:
            err = "Bing Error: JSON could not be decoded"
            raise SearchQueryError(err)

        try:
            results = res["d"]["results"]
        except (KeyError, TypeError):
            # KeyError: an expected key is missing. TypeError: the payload (or
            # its "d" member) is not a JSON object. Either way, no results.
            return []
        # Use a distinct loop name so the raw response body held in *result*
        # is not shadowed (list comprehension variables leak in Python 2).
        return [item["Url"] for item in results]


class YahooBOSSSearchEngine(BaseSearchEngine): class YahooBOSSSearchEngine(BaseSearchEngine):
"""A search engine interface with Yahoo! BOSS.""" """A search engine interface with Yahoo! BOSS."""
name = "Yahoo! BOSS" name = "Yahoo! BOSS"
@@ -70,6 +133,10 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
args = ["=".join((enc(k), enc(v))) for k, v in params.iteritems()] args = ["=".join((enc(k), enc(v))) for k, v in params.iteritems()]
return base + "?" + "&".join(args) return base + "?" + "&".join(args)


@staticmethod
def requirements():
"""Return a list of packages required by this search engine (oauth2)."""
return ["oauth2"]

def search(self, query): def search(self, query):
"""Do a Yahoo! BOSS web search for *query*. """Do a Yahoo! BOSS web search for *query*.


@@ -104,13 +171,13 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
result = gzipper.read() result = gzipper.read()


if response.getcode() != 200: if response.getcode() != 200:
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
raise SearchQueryError(e.format(response.getcode(), result))
err = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
raise SearchQueryError(err.format(response.getcode(), result))
try: try:
res = loads(result) res = loads(result)
except ValueError: except ValueError:
e = "Yahoo! BOSS Error: JSON could not be decoded"
raise SearchQueryError(e)
err = "Yahoo! BOSS Error: JSON could not be decoded"
raise SearchQueryError(err)


try: try:
results = res["bossresponse"]["web"]["results"] results = res["bossresponse"]["web"]["results"]


Loading…
Cancel
Save