Various fixes for copyvios.

- Fix a bug in ExclusionsDB; improve URL regexes.
- NLTK's LookupError is actually an IOError.
- Fix bug in __repr__ for CopyvioCheckResult.
- Rewrite YahooBOSSSearchEngine to actually work with oauth2.
- Search engines now take a URL opener in addition to credentials.
12 years ago · a4dda89a61
--- a/earwigbot/wiki/copyvios/__init__.py
+++ b/earwigbot/wiki/copyvios/__init__.py
@@ -98,7 +98,7 @@ class CopyvioMixIn(object):
            except ImportError:
                e = "Yahoo! BOSS requires the 'oauth2' package: https://github.com/simplegeo/python-oauth2"
                raise exceptions.UnsupportedSearchEngineError(e)
            return YahooBOSSSearchEngine(credentials)
            return YahooBOSSSearchEngine(credentials, self._opener)

        raise exceptions.UnknownSearchEngineError(engine)

--- a/earwigbot/wiki/copyvios/exclusions.py
+++ b/earwigbot/wiki/copyvios/exclusions.py
@@ -88,11 +88,12 @@ class ExclusionsDB(object):
            return urls

        regexes = [
            "url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
            "\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
            r"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
            r"\*\s*Site:\s*(?:\[|\<nowiki\>)?(?:https?:)?(?://)?(.*)(?:\]|\</nowiki\>)?"
        ]
        for regex in regexes:
            [urls.add(url.lower()) for (url,) in re.findall(regex, data, re.I)]
            find = re.findall(regex, data, re.I)
            [urls.add(url.lower().strip()) for url in find if url.strip()]
        return urls

    def _update(self, sitename):
--- a/earwigbot/wiki/copyvios/parsers.py
+++ b/earwigbot/wiki/copyvios/parsers.py
@@ -20,6 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 import errno
 from os import path

 import mwparserfromhell
@@ -83,7 +84,9 @@ class ArticleTextParser(BaseTextParser):
        datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
        try:
            tokenizer = nltk.data.load("file:" + datafile)
        except LookupError:
        except IOError as exc:
            if exc.errno != errno.ENOENT:
                raise
            nltk.download("punkt", nltk_dir)
            tokenizer = nltk.data.load("file:" + datafile)

--- a/earwigbot/wiki/copyvios/result.py
+++ b/earwigbot/wiki/copyvios/result.py
@@ -50,7 +50,7 @@ class CopyvioCheckResult(object):

    def __repr__(self):
        """Return the canonical string representation of the result."""
        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
        return res.format(self.violation, self.confidence, self.url,
                          self.queries)

--- a/earwigbot/wiki/copyvios/search.py
+++ b/earwigbot/wiki/copyvios/search.py
@@ -20,8 +20,10 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

 from gzip import GzipFile
 from json import loads
 from urllib import quote_plus, urlencode
 from StringIO import StringIO
 from urllib import quote_plus

 from earwigbot import importer
 from earwigbot.exceptions import SearchQueryError
@@ -34,9 +36,10 @@ class BaseSearchEngine(object):
    """Base class for a simple search engine interface."""
    name = "Base"

    def __init__(self, cred):
        """Store credentials *cred* for searching later on."""
    def __init__(self, cred, opener):
        """Store credentials (*cred*) and *opener* for searching later on."""
        self.cred = cred
        self.opener = opener

    def __repr__(self):
        """Return the canonical string representation of the search engine."""
@@ -65,22 +68,35 @@ class YahooBOSSSearchEngine(BaseSearchEngine):
        determined by Yahoo). Raises
        :py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
        """
        base_url = "http://yboss.yahooapis.com/ysearch/web"
        query = quote_plus(query.join('"', '"'))
        params = {"q": query, "type": "html,text", "format": "json"}
        url = "{0}?{1}".format(base_url, urlencode(params))

        consumer = oauth.Consumer(key=self.cred["key"],
                                  secret=self.cred["secret"])
        client = oauth.Client(consumer)
        headers, body = client.request(url, "GET")

        if headers["status"] != "200":
        key, secret = self.cred["key"], self.cred["secret"]
        consumer = oauth.Consumer(key=key, secret=secret)

        url = "http://yboss.yahooapis.com/ysearch/web"
        params = {
            "oauth_version": oauth.OAUTH_VERSION,
            "oauth_nonce": oauth.generate_nonce(),
            "oauth_timestamp": oauth.Request.make_timestamp(),
            "oauth_consumer_key": consumer.key,
            "q": quote_plus('"' + query.encode("utf8") + '"'),
            "type": "html,text",
            "format": "json",
        }

        req = oauth.Request(method="GET", url=url, parameters=params)
        req.sign_request(oauth.SignatureMethod_HMAC_SHA1(), consumer, None)
        response = self.opener.open(req.to_url())
        result = response.read()

        if response.headers.get("Content-Encoding") == "gzip":
            stream = StringIO(result)
            gzipper = GzipFile(fileobj=stream)
            result = gzipper.read()

        if response.getcode() != 200:
            e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
            raise SearchQueryError(e.format(headers["status"], body))

            raise SearchQueryError(e.format(response.getcode(), result))
        try:
            res = loads(body)
            res = loads(result)
        except ValueError:
            e = "Yahoo! BOSS Error: JSON could not be decoded"
            raise SearchQueryError(e)