From 9434a416a1003459da033d8bc4db5521858a0197 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 24 Feb 2012 04:24:17 -0500
Subject: [PATCH] Moved search engine/credential info into config proper.

- In config.json, search config relocated from tasks.afc_copyvios to wiki.
- Site.__init__() takes a `search_config' argument, which is auto-supplied
  from its value in config.json by get_site().
- Page.copyvio_check() doesn't ask for search config anymore, meaning doing
  checks from the command line is less painful.
- Added a Page.copyvio_compare() function, which works just like
  copyvio_check() but on a specified URL; this is for cache retrieval on the
  web front-end.
---
 earwigbot/runner.py             |  2 ++
 earwigbot/tasks/afc_copyvios.py | 11 +------
 earwigbot/wiki/copyright.py     | 64 ++++++++++++++++++++++++++++++-----------
 earwigbot/wiki/functions.py     |  4 ++-
 earwigbot/wiki/site.py          |  6 +++-
 5 files changed, 59 insertions(+), 28 deletions(-)

diff --git a/earwigbot/runner.py b/earwigbot/runner.py
index 30c4dff..882b35f 100644
--- a/earwigbot/runner.py
+++ b/earwigbot/runner.py
@@ -50,6 +50,8 @@ def run():
     if is_encrypted:
         config._decryption_key = raw_input()
         config.decrypt(config.wiki, "password")
+        config.decrypt(config.wiki, "search", "credentials", "key")
+        config.decrypt(config.wiki, "search", "credentials", "secret")
         config.decrypt(config.irc, "frontend", "nickservPassword")
         config.decrypt(config.irc, "watcher", "nickservPassword")

diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py
index 26058ec..7c7fd31 100644
--- a/earwigbot/tasks/afc_copyvios.py
+++ b/earwigbot/tasks/afc_copyvios.py
@@ -37,9 +37,6 @@ class Task(BaseTask):
     number = 1

     def __init__(self):
-        config.decrypt(config.tasks, self.name, "search", "credentials", "key")
-        config.decrypt(config.tasks, self.name, "search", "credentials", "secret")
-
         cfg = config.tasks.get(self.name, {})
         self.template = cfg.get("template", "AfC suspected copyvio")
         self.ignore_list = cfg.get("ignoreList", [])
@@ -49,11 +46,6 @@ class Task(BaseTask):
         default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}"
         self.summary = self.make_summary(cfg.get("summary", default_summary))

-        # Search API data:
-        search = cfg.get("search", {})
-        self.engine = search.get("engine")
-        self.credentials = search.get("credentials", {})
-
         # Connection data for our SQL database:
        kwargs = cfg.get("sql", {})
        kwargs["read_default_file"] = expanduser("~/.my.cnf")
@@ -91,8 +83,7 @@ class Task(BaseTask):
             return

         self.logger.info("Checking [[{0}]]".format(title))
-        result = page.copyvio_check(self.engine, self.credentials,
-                                    self.min_confidence, self.max_queries)
+        result = page.copyvio_check(self.min_confidence, self.max_queries)
         url = result.url
         confidence = "{0}%".format(round(result.confidence * 100, 2))

diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyright.py
index 7d3940e..c003ebb 100644
--- a/earwigbot/wiki/copyright.py
+++ b/earwigbot/wiki/copyright.py
@@ -48,12 +48,8 @@ class _CopyvioCheckResult(object):
         self.delta_chain = chains[1]

     def __repr__(self):
-        r = ", ".join(("_CopyvioCheckResult(violation={0!r}",
-                       "confidence={1!r}", "url={2!r}", "queries={3|r}",
-                       "article={4|r}", "chains={5!r})"))
-        return r.format(self.violation, self.confidence, self.url,
-                        self.queries, self.article_chain,
-                        (self.source_chain, self.delta_chain))
+        r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
+        return r.format(self.violation, self.confidence, self.url, self.queries)


 class _MarkovChain(object):
@@ -100,9 +96,11 @@ class CopyrightMixin(object):
     """
     EarwigBot's Wiki Toolset: Copyright Violation Mixin

-    This is a mixin that provides one public method, copyvio_check(), which
-    checks the page for copyright violations using a search engine API. The
-    API keys must be provided to the method as arguments.
+    This is a mixin that provides two public methods, copyvio_check() and
+    copyvio_compare(). The former checks the page for copyright violations
+    using a search engine API, and the latter compares the page against a
+    specified URL. Credentials for the search engine API are stored in the
+    site's config.
     """
     def __init__(self, site):
         self._opener = build_opener()
@@ -132,7 +130,7 @@ class CopyrightMixin(object):

         return result

-    def _select_search_engine(self, engine, credentials):
+    def _select_search_engine(self):
         """Return a function that can be called to do web searches.

         The "function" is a functools.partial object that takes one argument, a
@@ -140,10 +138,12 @@ class CopyrightMixin(object):
         logic depends on the 'engine' argument; for example, if 'engine' is
         "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.

-        Raises UnknownSearchEngineError if 'engine' is not known to us, and
-        UnsupportedSearchEngineError if we are missing a required package or
-        module, like oauth2 for "Yahoo! BOSS".
+        Raises UnknownSearchEngineError if the 'engine' listed in our config is
+        unknown to us, and UnsupportedSearchEngineError if we are missing a
+        required package or module, like oauth2 for "Yahoo! BOSS".
         """
+        engine, credentials = self._site._search_config
+
         if engine == "Yahoo! BOSS":
             if not oauth:
                 e = "The package 'oauth2' could not be imported"
@@ -224,8 +224,8 @@ class CopyrightMixin(object):
         delta = _MarkovChainIntersection(article, source)
         return float(delta.size()) / article.size(), (source, delta)

-    def copyvio_check(self, engine, credentials, min_confidence=0.5,
-                      max_queries=-1, interquery_sleep=1, force=False):
+    def copyvio_check(self, min_confidence=0.5, max_queries=-1,
+                      interquery_sleep=1, force=False):
         """Check the page for copyright violations.

         Returns a _CopyvioCheckResult object with four useful attributes:
@@ -250,7 +250,7 @@ class CopyrightMixin(object):
         Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
         SearchQueryError, ...) on errors.
         """
-        search = self._select_search_engine(engine, credentials)
+        search = self._select_search_engine()
         handled_urls = []
         best_confidence = 0
         best_match = None
@@ -290,3 +290,35 @@ class CopyrightMixin(object):
             v = False
         return _CopyvioCheckResult(v, best_confidence, best_match,
                                    num_queries, article_chain, best_chains)
+
+    def copyvio_compare(self, url, min_confidence=0.5, force=False):
+        """Check the page like copyvio_check(), but against a specific URL.
+
+        This is essentially a reduced version of the above - a copyvio
+        comparison is made using Markov chains and the result is returned in a
+        _CopyvioCheckResult object - without using a search engine, as the
+        suspected "violated" URL is supplied from the start.
+
+        Its primary use is to generate a result when the URL is retrieved from
+        a cache, like the one used in EarwigBot's Toolserver site. After a
+        search is done, the resulting URL is stored in a cache for 24 hours so
+        future checks against that page will not require another set of
+        time-and-money-consuming search engine queries. However, the comparison
+        itself (which includes the article's and the source's content) cannot
+        be stored for data retention reasons, so a fresh comparison is made
+        using this function.
+
+        Since no searching is done, neither UnknownSearchEngineError nor
+        SearchQueryError will be raised.
+        """
+        content = self.get(force)
+        clean = self._copyvio_strip_article(content)
+        article_chain = _MarkovChain(clean)
+        confidence, chains = self._copyvio_compare_content(article_chain, url)
+
+        if confidence >= min_confidence:
+            is_violation = True
+        else:
+            is_violation = False
+        return _CopyvioCheckResult(is_violation, confidence, url, 0,
+                                   article_chain, chains)
diff --git a/earwigbot/wiki/functions.py b/earwigbot/wiki/functions.py
index 6fc0837..a870ee4 100644
--- a/earwigbot/wiki/functions.py
+++ b/earwigbot/wiki/functions.py
@@ -110,6 +110,7 @@ def _get_site_object_from_dict(name, d):
     user_agent = config.wiki.get("userAgent")
     assert_edit = config.wiki.get("assert")
     maxlag = config.wiki.get("maxlag")
+    search_config = config.wiki.get("search")

     if user_agent:
         user_agent = user_agent.replace("$1", earwigbot.__version__)
@@ -126,7 +127,8 @@ def _get_site_object_from_dict(name, d):
     return Site(name=name, project=project, lang=lang, base_url=base_url,
                 article_path=article_path, script_path=script_path, sql=sql,
                 namespaces=namespaces, login=login, cookiejar=cookiejar,
-                user_agent=user_agent, assert_edit=assert_edit, maxlag=maxlag)
+                user_agent=user_agent, assert_edit=assert_edit, maxlag=maxlag,
+                search_config=search_config)

 def get_site(name=None, project=None, lang=None):
     """Returns a Site instance based on information from our config file.
diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py
index 6b6c10b..8719036 100644
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -71,7 +71,8 @@ class Site(object):
     def __init__(self, name=None, project=None, lang=None, base_url=None,
                  article_path=None, script_path=None, sql=None,
                  namespaces=None, login=(None, None), cookiejar=None,
-                 user_agent=None, assert_edit=None, maxlag=None):
+                 user_agent=None, assert_edit=None, maxlag=None,
+                 search_config=(None, None)):
         """Constructor for new Site instances.

         This probably isn't necessary to call yourself unless you're building a
@@ -107,6 +108,9 @@ class Site(object):
         self._sql_data = sql
         self._sql_conn = None

+        # Attribute used in copyright violation checks (see CopyrightMixin):
+        self._search_config = search_config
+
         # Set up cookiejar and URL opener for making API queries:
         if cookiejar is not None:
             self._cookiejar = cookiejar
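
Note (not part of the patch itself): a rough sketch of how the relocated
search configuration and the new API might be exercised from a Python 2
shell after this commit. The "search" keys in the comment mirror those the
patch reads under config.json's "wiki" section (engine, credentials.key,
credentials.secret); the example values, the page title, the get_page()
call, and the assumption that the bot's config has already been loaded and
decrypted (as earwigbot/runner.py does) are illustrative only.

    # Assumed config.json layout after this patch (values illustrative):
    #
    #   "wiki": {
    #       ...,
    #       "search": {
    #           "engine": "Yahoo! BOSS",
    #           "credentials": {"key": "<key>", "secret": "<secret>"}
    #       }
    #   }

    from earwigbot.wiki.functions import get_site

    site = get_site()                # search_config is injected from config.wiki
    page = site.get_page("Example")  # hypothetical page title

    # No engine/credentials arguments are needed anymore:
    result = page.copyvio_check(min_confidence=0.5, max_queries=10)
    print result.violation, result.confidence, result.url

    # Re-check against a cached URL without spending new search queries:
    cached = page.copyvio_compare(result.url, min_confidence=0.5)
    print cached.violation, cached.confidence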