Browse Source

earwigbot.wiki.copyvios.search module split

tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
d4e947b98b
2 changed files with 80 additions and 40 deletions
  1. +5
    -40
      earwigbot/wiki/copyvios/__init__.py
  2. +75
    -0
      earwigbot/wiki/copyvios/search.py

+ 5
- 40
earwigbot/wiki/copyvios/__init__.py View File

@@ -20,12 +20,9 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE. # SOFTWARE.


from functools import partial
from gzip import GzipFile from gzip import GzipFile
from json import loads
from StringIO import StringIO from StringIO import StringIO
from time import sleep, time from time import sleep, time
from urllib import quote_plus, urlencode
from urllib2 import build_opener, URLError from urllib2 import build_opener, URLError


try: try:
@@ -35,6 +32,7 @@ except ImportError:


from earwigbot.wiki.exceptions import * from earwigbot.wiki.exceptions import *
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine


class CopyvioCheckResult(object): class CopyvioCheckResult(object):
def __init__(self, violation, confidence, url, queries, article, chains): def __init__(self, violation, confidence, url, queries, article, chains):
@@ -107,42 +105,9 @@ class CopyvioMixin(object):
if not oauth: if not oauth:
e = "The package 'oauth2' could not be imported" e = "The package 'oauth2' could not be imported"
raise UnsupportedSearchEngineError(e) raise UnsupportedSearchEngineError(e)
searcher = self._yahoo_boss_query
else:
raise UnknownSearchEngineError(engine)

return partial(searcher, credentials)

def _yahoo_boss_query(self, cred, query):
"""Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials.

Returns a list of URLs, no more than fifty, ranked by relevance (as
determined by Yahoo). Raises SearchQueryError() on errors.
"""
base_url = "http://yboss.yahooapis.com/ysearch/web"
query = quote_plus(query.join('"', '"'))
params = {"q": query, "style": "raw", "format": "json"}
url = "{0}?{1}".format(base_url, urlencode(params))
return YahooBOSSSearchEngine(credentials)


consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"])
client = oauth.Client(consumer)
headers, body = client.request(url, "GET")

if headers["status"] != "200":
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
raise SearchQueryError(e.format(headers["status"], body))

try:
res = loads(body)
except ValueError:
e = "Yahoo! BOSS Error: JSON could not be decoded"
raise SearchQueryError(e)

try:
results = res["bossresponse"]["web"]["results"]
except KeyError:
return []
return [result["url"] for result in results]
raise UnknownSearchEngineError(engine)


def _copyvio_strip_html(self, html): def _copyvio_strip_html(self, html):
""" """
@@ -209,7 +174,7 @@ class CopyvioMixin(object):
Raises CopyvioCheckError or subclasses (UnknownSearchEngineError, Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
SearchQueryError, ...) on errors. SearchQueryError, ...) on errors.
""" """
search = self._select_search_engine()
searcher = self._select_search_engine()
handled_urls = [] handled_urls = []
best_confidence = 0 best_confidence = 0
best_match = None best_match = None
@@ -228,7 +193,7 @@ class CopyvioMixin(object):


while (chunks and best_confidence < min_confidence and while (chunks and best_confidence < min_confidence and
(max_queries < 0 or num_queries < max_queries)): (max_queries < 0 or num_queries < max_queries)):
urls = search(chunks.pop(0))
urls = searcher.search(chunks.pop(0))
urls = [url for url in urls if url not in handled_urls] urls = [url for url in urls if url not in handled_urls]
for url in urls: for url in urls:
handled_urls.append(url) handled_urls.append(url)


+ 75
- 0
earwigbot/wiki/copyvios/search.py View File

@@ -0,0 +1,75 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from json import loads
from urllib import quote_plus, urlencode

try:
import oauth2 as oauth
except ImportError:
oauth = None

from earwigbot.wiki.exceptions import SearchQueryError

class BaseSearchEngine(object):
    """Common base for search engine backends.

    An instance holds the credentials it was constructed with in 'self.cred';
    subclasses provide the actual search() implementation.
    """

    def __init__(self, cred):
        """Remember the credentials 'cred' so that search() can use them."""
        self.cred = cred

    def search(self, query):
        """Search for 'query' with this engine.

        The base class has no implementation and always raises
        NotImplementedError; subclasses override this method.
        """
        raise NotImplementedError


class YahooBOSSSearchEngine(BaseSearchEngine):
    """Search engine backend that queries the Yahoo! BOSS web search API.

    Reads the "key" and "secret" entries of 'self.cred' to sign each request
    with OAuth.
    """

    def search(self, query):
        """Do a Yahoo! BOSS web search for 'query'.

        Returns a list of URLs, no more than fifty, ranked by relevance (as
        determined by Yahoo). Returns an empty list if the response contains
        no web results. Raises SearchQueryError() on errors (non-200 response
        status or a body that is not valid JSON).
        """
        base_url = "http://yboss.yahooapis.com/ysearch/web"

        # Surround the query with literal double quotes (presumably to request
        # an exact-phrase match). This must be plain concatenation: str.join()
        # accepts a single iterable, so query.join('"', '"') raises TypeError.
        quoted = quote_plus('"' + query + '"')
        # NOTE(review): 'quoted' is already percent-encoded and urlencode()
        # encodes it again; kept as in the original -- confirm this double
        # encoding is what the BOSS endpoint expects.
        params = {"q": quoted, "style": "raw", "format": "json"}
        url = "{0}?{1}".format(base_url, urlencode(params))

        # 'oauth' is None if the oauth2 package failed to import; callers are
        # expected to have checked for that before constructing this engine.
        consumer = oauth.Consumer(key=self.cred["key"],
                                  secret=self.cred["secret"])
        client = oauth.Client(consumer)
        headers, body = client.request(url, "GET")

        if headers["status"] != "200":
            e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
            raise SearchQueryError(e.format(headers["status"], body))

        try:
            res = loads(body)
        except ValueError:
            e = "Yahoo! BOSS Error: JSON could not be decoded"
            raise SearchQueryError(e)

        # A response lacking the expected nesting is treated as "no results"
        # rather than as an error.
        try:
            results = res["bossresponse"]["web"]["results"]
        except KeyError:
            return []
        return [result["url"] for result in results]

Loading…
Cancel
Save