Browse Source

Moved search engine/credential info into config proper.

- In config.json, search config relocated from
  tasks.afc_copyvios to wiki.
- Site.__init__() takes a `search_config' argument, which is
  auto-supplied from its value in config.json by get_site().
- Page.copyvio_check() doesn't ask for search config
  anymore, meaning doing checks from the command line
  is less painful.
- Added a Page.copyvio_compare() function, which works
  just like copyvio_check() but on a specified URL; this is
  for cache retrieval on the web front-end.
tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
9434a416a1
5 changed files with 59 additions and 28 deletions
  1. +2
    -0
      earwigbot/runner.py
  2. +1
    -10
      earwigbot/tasks/afc_copyvios.py
  3. +48
    -16
      earwigbot/wiki/copyright.py
  4. +3
    -1
      earwigbot/wiki/functions.py
  5. +5
    -1
      earwigbot/wiki/site.py

+ 2
- 0
earwigbot/runner.py View File

@@ -50,6 +50,8 @@ def run():
if is_encrypted:
config._decryption_key = raw_input()
config.decrypt(config.wiki, "password")
config.decrypt(config.wiki, "search", "credentials", "key")
config.decrypt(config.wiki, "search", "credentials", "secret")
config.decrypt(config.irc, "frontend", "nickservPassword")
config.decrypt(config.irc, "watcher", "nickservPassword")



+ 1
- 10
earwigbot/tasks/afc_copyvios.py View File

@@ -37,9 +37,6 @@ class Task(BaseTask):
number = 1

def __init__(self):
config.decrypt(config.tasks, self.name, "search", "credentials", "key")
config.decrypt(config.tasks, self.name, "search", "credentials", "secret")

cfg = config.tasks.get(self.name, {})
self.template = cfg.get("template", "AfC suspected copyvio")
self.ignore_list = cfg.get("ignoreList", [])
@@ -49,11 +46,6 @@ class Task(BaseTask):
default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}"
self.summary = self.make_summary(cfg.get("summary", default_summary))

# Search API data:
search = cfg.get("search", {})
self.engine = search.get("engine")
self.credentials = search.get("credentials", {})

# Connection data for our SQL database:
kwargs = cfg.get("sql", {})
kwargs["read_default_file"] = expanduser("~/.my.cnf")
@@ -91,8 +83,7 @@ class Task(BaseTask):
return

self.logger.info("Checking [[{0}]]".format(title))
result = page.copyvio_check(self.engine, self.credentials,
self.min_confidence, self.max_queries)
result = page.copyvio_check(self.min_confidence, self.max_queries)
url = result.url
confidence = "{0}%".format(round(result.confidence * 100, 2))



+ 48
- 16
earwigbot/wiki/copyright.py View File

@@ -48,12 +48,8 @@ class _CopyvioCheckResult(object):
self.delta_chain = chains[1]

def __repr__(self):
r = ", ".join(("_CopyvioCheckResult(violation={0!r}",
"confidence={1!r}", "url={2!r}", "queries={3|r}",
"article={4|r}", "chains={5!r})"))
return r.format(self.violation, self.confidence, self.url,
self.queries, self.article_chain,
(self.source_chain, self.delta_chain))
r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
return r.format(self.violation, self.confidence, self.url, self.queries)


class _MarkovChain(object):
@@ -100,9 +96,11 @@ class CopyrightMixin(object):
"""
EarwigBot's Wiki Toolset: Copyright Violation Mixin

This is a mixin that provides one public method, copyvio_check(), which
checks the page for copyright violations using a search engine API. The
API keys must be provided to the method as arguments.
This is a mixin that provides two public methods, copyvio_check() and
copyvio_compare(). The former checks the page for copyright violations
using a search engine API, and the latter compares the page against a
specified URL. Credentials for the search engine API are stored in the
site's config.
"""
def __init__(self, site):
self._opener = build_opener()
@@ -132,7 +130,7 @@ class CopyrightMixin(object):

return result

def _select_search_engine(self, engine, credentials):
def _select_search_engine(self):
"""Return a function that can be called to do web searches.

The "function" is a functools.partial object that takes one argument, a
@@ -140,10 +138,12 @@ class CopyrightMixin(object):
logic depends on the 'engine' argument; for example, if 'engine' is
"Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.

Raises UnknownSearchEngineError if 'engine' is not known to us, and
UnsupportedSearchEngineError if we are missing a required package or
module, like oauth2 for "Yahoo! BOSS".
Raises UnknownSearchEngineError if the 'engine' listed in our config is
unknown to us, and UnsupportedSearchEngineError if we are missing a
required package or module, like oauth2 for "Yahoo! BOSS".
"""
engine, credentials = self._site._search_config

if engine == "Yahoo! BOSS":
if not oauth:
e = "The package 'oauth2' could not be imported"
@@ -224,8 +224,8 @@ class CopyrightMixin(object):
delta = _MarkovChainIntersection(article, source)
return float(delta.size()) / article.size(), (source, delta)

def copyvio_check(self, engine, credentials, min_confidence=0.5,
max_queries=-1, interquery_sleep=1, force=False):
def copyvio_check(self, min_confidence=0.5, max_queries=-1,
interquery_sleep=1, force=False):
"""Check the page for copyright violations.

Returns a _CopyvioCheckResult object with four useful attributes:
@@ -250,7 +250,7 @@ class CopyrightMixin(object):
Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
SearchQueryError, ...) on errors.
"""
search = self._select_search_engine(engine, credentials)
search = self._select_search_engine()
handled_urls = []
best_confidence = 0
best_match = None
@@ -290,3 +290,35 @@ class CopyrightMixin(object):
v = False
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries,
article_chain, best_chains)

def copyvio_compare(self, url, min_confidence=0.5, force=False):
    """Check the page like copyvio_check(), but against a specific URL.

    This is essentially a reduced version of copyvio_check() - a copyvio
    comparison is made using Markov chains and the result is returned in a
    _CopyvioCheckResult object - without using a search engine, as the
    suspected "violated" URL is supplied from the start.

    Its primary use is to generate a result when the URL is retrieved from
    a cache, like the one used in EarwigBot's Toolserver site. After a
    search is done, the resulting URL is stored in a cache for 24 hours so
    future checks against that page will not require another set of
    time-and-money-consuming search engine queries. However, the comparison
    itself (which includes the article's and the source's content) cannot
    be stored for data retention reasons, so a fresh comparison is made
    using this function.

    Since no searching is done, neither UnknownSearchEngineError nor
    SearchQueryError will be raised.
    """
    # Fetch the page text (honoring `force` for cache-bypass) and strip
    # wiki markup before building the article's Markov chain:
    content = self.get(force)
    clean = self._copyvio_strip_article(content)
    article_chain = _MarkovChain(clean)

    # Compare the article chain against the content at the given URL;
    # returns a confidence ratio plus the (source, delta) chain pair:
    confidence, chains = self._copyvio_compare_content(article_chain, url)

    # Violation iff confidence meets the threshold; queries is always 0
    # here since no search engine was consulted:
    is_violation = confidence >= min_confidence
    return _CopyvioCheckResult(is_violation, confidence, url, 0,
                               article_chain, chains)

+ 3
- 1
earwigbot/wiki/functions.py View File

@@ -110,6 +110,7 @@ def _get_site_object_from_dict(name, d):
user_agent = config.wiki.get("userAgent")
assert_edit = config.wiki.get("assert")
maxlag = config.wiki.get("maxlag")
search_config = config.wiki.get("search")

if user_agent:
user_agent = user_agent.replace("$1", earwigbot.__version__)
@@ -126,7 +127,8 @@ def _get_site_object_from_dict(name, d):
return Site(name=name, project=project, lang=lang, base_url=base_url,
article_path=article_path, script_path=script_path, sql=sql,
namespaces=namespaces, login=login, cookiejar=cookiejar,
user_agent=user_agent, assert_edit=assert_edit, maxlag=maxlag)
user_agent=user_agent, assert_edit=assert_edit, maxlag=maxlag,
search_config=search_config)

def get_site(name=None, project=None, lang=None):
"""Returns a Site instance based on information from our config file.


+ 5
- 1
earwigbot/wiki/site.py View File

@@ -71,7 +71,8 @@ class Site(object):
def __init__(self, name=None, project=None, lang=None, base_url=None,
article_path=None, script_path=None, sql=None,
namespaces=None, login=(None, None), cookiejar=None,
user_agent=None, assert_edit=None, maxlag=None):
user_agent=None, assert_edit=None, maxlag=None,
search_config=(None, None)):
"""Constructor for new Site instances.

This probably isn't necessary to call yourself unless you're building a
@@ -107,6 +108,9 @@ class Site(object):
self._sql_data = sql
self._sql_conn = None

# Attribute used in copyright violation checks (see CopyrightMixin):
self._search_config = search_config

# Set up cookiejar and URL opener for making API queries:
if cookiejar is not None:
self._cookiejar = cookiejar


Loading…
Cancel
Save