Parcourir la source

Various fixes for copyvios.

- Fix a bug in ExclusionsDB; improve URL regexes.
- NLTK's LookupError is actually an IOError.
- Fix bug in __repr__ for CopyvioCheckResult.
- Rewrite YahooBOSSSearchEngine to actually work with oauth2.
- Search engines now take a URL opener in addition to credentials.
tags/v0.2
Ben Kurtovic il y a 12 ans
Parent
révision
a4dda89a61
5 fichiers modifiés avec 43 ajouts et 23 suppressions
  1. +1
    -1
      earwigbot/wiki/copyvios/__init__.py
  2. +4
    -3
      earwigbot/wiki/copyvios/exclusions.py
  3. +4
    -1
      earwigbot/wiki/copyvios/parsers.py
  4. +1
    -1
      earwigbot/wiki/copyvios/result.py
  5. +33
    -17
      earwigbot/wiki/copyvios/search.py

+ 1
- 1
earwigbot/wiki/copyvios/__init__.py Voir le fichier

@@ -98,7 +98,7 @@ class CopyvioMixIn(object):
except ImportError:
e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2"
raise exceptions.UnsupportedSearchEngineError(e)
- return YahooBOSSSearchEngine(credentials)
+ return YahooBOSSSearchEngine(credentials, self._opener)

raise exceptions.UnknownSearchEngineError(engine)



+ 4
- 3
earwigbot/wiki/copyvios/exclusions.py Voir le fichier

@@ -88,11 +88,12 @@ class ExclusionsDB(object):
return urls

regexes = [
- "url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
- "\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
+ r"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
+ r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*)(?:\]|\</nowiki\>)?"
]
for regex in regexes:
- [urls.add(url.lower()) for (url,) in re.findall(regex, data, re.I)]
+ find = re.findall(regex, data, re.I)
+ [urls.add(url.lower().strip()) for url in find if url.strip()]
return urls

def _update(self, sitename):


+ 4
- 1
earwigbot/wiki/copyvios/parsers.py Voir le fichier

@@ -20,6 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

+ import errno
from os import path

import mwparserfromhell
@@ -83,7 +84,9 @@ class ArticleTextParser(BaseTextParser):
datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
try:
tokenizer = nltk.data.load("file:" + datafile)
- except LookupError:
+ except IOError as exc:
+ if exc.errno != errno.ENOENT:
+ raise
nltk.download("punkt", nltk_dir)
tokenizer = nltk.data.load("file:" + datafile)



+ 1
- 1
earwigbot/wiki/copyvios/result.py Voir le fichier

@@ -50,7 +50,7 @@ class CopyvioCheckResult(object):

def __repr__(self):
"""Return the canonical string representation of the result."""
- res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
+ res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
return res.format(self.violation, self.confidence, self.url,
self.queries)



+ 33
- 17
earwigbot/wiki/copyvios/search.py Voir le fichier

@@ -20,8 +20,10 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

+ from gzip import GzipFile
from json import loads
- from urllib import quote_plus, urlencode
+ from StringIO import StringIO
+ from urllib import quote_plus

from earwigbot import importer
from earwigbot.exceptions import SearchQueryError
@@ -34,9 +36,10 @@ class BaseSearchEngine(object):
"""Base class for a simple search engine interface."""
name = "Base"

- def __init__(self, cred):
- """Store credentials *cred* for searching later on."""
+ def __init__(self, cred, opener):
+ """Store credentials (*cred*) and *opener* for searching later on."""
self.cred = cred
+ self.opener = opener

def __repr__(self):
"""Return the canonical string representation of the search engine."""
@@ -65,22 +68,35 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
determined by Yahoo). Raises
:py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
"""
- base_url = "http://yboss.yahooapis.com/ysearch/web"
- query = quote_plus(query.join('"', '"'))
- params = {"q": query, "type": "html,text", "format": "json"}
- url = "{0}?{1}".format(base_url, urlencode(params))
-
- consumer = oauth.Consumer(key=self.cred["key"],
- secret=self.cred["secret"])
- client = oauth.Client(consumer)
- headers, body = client.request(url, "GET")
-
- if headers["status"] != "200":
+ key, secret = self.cred["key"], self.cred["secret"]
+ consumer = oauth.Consumer(key=key, secret=secret)
+
+ url = "http://yboss.yahooapis.com/ysearch/web"
+ params = {
+ "oauth_version": oauth.OAUTH_VERSION,
+ "oauth_nonce": oauth.generate_nonce(),
+ "oauth_timestamp": oauth.Request.make_timestamp(),
+ "oauth_consumer_key": consumer.key,
+ "q": quote_plus('"' + query.encode("utf8") + '"'),
+ "type": "html,text",
+ "format": "json",
+ }
+
+ req = oauth.Request(method="GET", url=url, parameters=params)
+ req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, None)
+ response = self.opener.open(req.to_url())
+ result = response.read()
+
+ if response.headers.get("Content-Encoding") == "gzip":
+ stream = StringIO(result)
+ gzipper = GzipFile(fileobj=stream)
+ result = gzipper.read()
+
+ if response.getcode() != 200:
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
- raise SearchQueryError(e.format(headers["status"], body))
-
+ raise SearchQueryError(e.format(response.getcode(), result))
try:
- res = loads(body)
+ res = loads(result)
except ValueError:
e = "Yahoo! BOSS Error: JSON could not be decoded"
raise SearchQueryError(e)


Chargement…
Annuler
Enregistrer