From 9434a416a1003459da033d8bc4db5521858a0197 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 24 Feb 2012 04:24:17 -0500
Subject: [PATCH] Moved search engine/credential info into config proper.

- In config.json, search config relocated from tasks.afc_copyvios to wiki.
- Site.__init__() takes a `search_config' argument, which is auto-supplied
  from its value in config.json by get_site().
- Page.copyvio_check() doesn't ask for search config anymore, meaning doing
  checks from the command line is less painful.
- Added a Page.copyvio_compare() function, which works just like
  copyvio_check() but on a specified URL; this is for cache retrieval on the
  web front-end.
---
 earwigbot/runner.py             |  2 ++
 earwigbot/tasks/afc_copyvios.py | 11 +------
 earwigbot/wiki/copyright.py     | 64 ++++++++++++++++++++++++++++++-----------
 earwigbot/wiki/functions.py     |  4 ++-
 earwigbot/wiki/site.py          |  6 +++-
 5 files changed, 59 insertions(+), 28 deletions(-)

diff --git a/earwigbot/runner.py b/earwigbot/runner.py
index 30c4dff..882b35f 100644
--- a/earwigbot/runner.py
+++ b/earwigbot/runner.py
@@ -50,6 +50,8 @@ def run():
     if is_encrypted:
         config._decryption_key = raw_input()
         config.decrypt(config.wiki, "password")
+        config.decrypt(config.wiki, "search", "credentials", "key")
+        config.decrypt(config.wiki, "search", "credentials", "secret")
         config.decrypt(config.irc, "frontend", "nickservPassword")
         config.decrypt(config.irc, "watcher", "nickservPassword")

diff --git a/earwigbot/tasks/afc_copyvios.py b/earwigbot/tasks/afc_copyvios.py
index 26058ec..7c7fd31 100644
--- a/earwigbot/tasks/afc_copyvios.py
+++ b/earwigbot/tasks/afc_copyvios.py
@@ -37,9 +37,6 @@ class Task(BaseTask):
     number = 1

     def __init__(self):
-        config.decrypt(config.tasks, self.name, "search", "credentials", "key")
-        config.decrypt(config.tasks, self.name, "search", "credentials", "secret")
-
         cfg = config.tasks.get(self.name, {})
         self.template = cfg.get("template", "AfC suspected copyvio")
         self.ignore_list = cfg.get("ignoreList", [])
@@ -49,11 +46,6 @@ class Task(BaseTask):
         default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}"
         self.summary = self.make_summary(cfg.get("summary", default_summary))

-        # Search API data:
-        search = cfg.get("search", {})
-        self.engine = search.get("engine")
-        self.credentials = search.get("credentials", {})
-
         # Connection data for our SQL database:
        kwargs = cfg.get("sql", {})
        kwargs["read_default_file"] = expanduser("~/.my.cnf")
@@ -91,8 +83,7 @@ class Task(BaseTask):
             return

         self.logger.info("Checking [[{0}]]".format(title))
-        result = page.copyvio_check(self.engine, self.credentials,
-                                    self.min_confidence, self.max_queries)
+        result = page.copyvio_check(self.min_confidence, self.max_queries)
         url = result.url
         confidence = "{0}%".format(round(result.confidence * 100, 2))

diff --git a/earwigbot/wiki/copyright.py b/earwigbot/wiki/copyright.py
index 7d3940e..c003ebb 100644
--- a/earwigbot/wiki/copyright.py
+++ b/earwigbot/wiki/copyright.py
@@ -48,12 +48,8 @@ class _CopyvioCheckResult(object):
         self.delta_chain = chains[1]

     def __repr__(self):
-        r = ", ".join(("_CopyvioCheckResult(violation={0!r}",
-                       "confidence={1!r}", "url={2!r}", "queries={3|r}",
-                       "article={4|r}", "chains={5!r})"))
-        return r.format(self.violation, self.confidence, self.url,
-                        self.queries, self.article_chain,
-                        (self.source_chain, self.delta_chain))
+        r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
+        return r.format(self.violation, self.confidence, self.url, self.queries)


 class _MarkovChain(object):
@@ -100,9 +96,11 @@ class CopyrightMixin(object):
     """
     EarwigBot's Wiki Toolset: Copyright Violation Mixin

-    This is a mixin that provides one public method, copyvio_check(), which
-    checks the page for copyright violations using a search engine API. The
-    API keys must be provided to the method as arguments.
+    This is a mixin that provides two public methods, copyvio_check() and
+    copyvio_compare(). The former checks the page for copyright violations
+    using a search engine API, and the latter compares the page against a
+    specified URL. Credentials for the search engine API are stored in the
+    site's config.
     """
     def __init__(self, site):
         self._opener = build_opener()
@@ -132,7 +130,7 @@ class CopyrightMixin(object):

         return result

-    def _select_search_engine(self, engine, credentials):
+    def _select_search_engine(self):
         """Return a function that can be called to do web searches.

         The "function" is a functools.partial object that takes one argument, a
@@ -140,10 +138,12 @@ class CopyrightMixin(object):
         logic depends on the 'engine' argument; for example, if 'engine' is
         "Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.

-        Raises UnknownSearchEngineError if 'engine' is not known to us, and
-        UnsupportedSearchEngineError if we are missing a required package or
-        module, like oauth2 for "Yahoo! BOSS".
+        Raises UnknownSearchEngineError if the 'engine' listed in our config is
+        unknown to us, and UnsupportedSearchEngineError if we are missing a
+        required package or module, like oauth2 for "Yahoo! BOSS".
         """
+        engine, credentials = self._site._search_config
+
         if engine == "Yahoo! BOSS":
             if not oauth:
                 e = "The package 'oauth2' could not be imported"
@@ -224,8 +224,8 @@ class CopyrightMixin(object):
         delta = _MarkovChainIntersection(article, source)
         return float(delta.size()) / article.size(), (source, delta)

-    def copyvio_check(self, engine, credentials, min_confidence=0.5,
-                      max_queries=-1, interquery_sleep=1, force=False):
+    def copyvio_check(self, min_confidence=0.5, max_queries=-1,
+                      interquery_sleep=1, force=False):
         """Check the page for copyright violations.

         Returns a _CopyvioCheckResult object with four useful attributes:
@@ -250,7 +250,7 @@ class CopyrightMixin(object):
         Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
         SearchQueryError, ...) on errors.
         """
-        search = self._select_search_engine(engine, credentials)
+        search = self._select_search_engine()
         handled_urls = []
         best_confidence = 0
         best_match = None
@@ -290,3 +290,35 @@ class CopyrightMixin(object):
             v = False
         return _CopyvioCheckResult(v, best_confidence, best_match,
                                    num_queries, article_chain, best_chains)
+
+    def copyvio_compare(self, url, min_confidence=0.5, force=False):
+        """Check the page like copyvio_check(), but against a specific URL.
+
+        This is essentially a reduced version of the above - a copyvio
+        comparison is made using Markov chains and the result is returned in a
+        _CopyvioCheckResult object - without using a search engine, as the
+        suspected "violated" URL is supplied from the start.
+
+        Its primary use is to generate a result when the URL is retrieved from
+        a cache, like the one used in EarwigBot's Toolserver site. After a
+        search is done, the resulting URL is stored in a cache for 24 hours so
+        future checks against that page will not require another set of
+        time-and-money-consuming search engine queries. However, the comparison
+        itself (which includes the article's and the source's content) cannot
+        be stored for data retention reasons, so a fresh comparison is made
+        using this function.
+
+        Since no searching is done, neither UnknownSearchEngineError nor
+        SearchQueryError will be raised.
+        """
+        content = self.get(force)
+        clean = self._copyvio_strip_article(content)
+        article_chain = _MarkovChain(clean)
+        confidence, chains = self._copyvio_compare_content(article_chain, url)
+
+        if confidence >= min_confidence:
+            is_violation = True
+        else:
+            is_violation = False
+        return _CopyvioCheckResult(is_violation, confidence, url, 0,
+                                   article_chain, chains)
diff --git a/earwigbot/wiki/functions.py b/earwigbot/wiki/functions.py
index 6fc0837..a870ee4 100644
--- a/earwigbot/wiki/functions.py
+++ b/earwigbot/wiki/functions.py
@@ -110,6 +110,7 @@ def _get_site_object_from_dict(name, d):
     user_agent = config.wiki.get("userAgent")
     assert_edit = config.wiki.get("assert")
     maxlag = config.wiki.get("maxlag")
+    search_config = config.wiki.get("search")

     if user_agent:
         user_agent = user_agent.replace("$1", earwigbot.__version__)
@@ -126,7 +127,8 @@ def _get_site_object_from_dict(name, d):
     return Site(name=name, project=project, lang=lang, base_url=base_url,
                 article_path=article_path, script_path=script_path, sql=sql,
                 namespaces=namespaces, login=login, cookiejar=cookiejar,
-                user_agent=user_agent, assert_edit=assert_edit, maxlag=maxlag)
+                user_agent=user_agent, assert_edit=assert_edit, maxlag=maxlag,
+                search_config=search_config)

 def get_site(name=None, project=None, lang=None):
     """Returns a Site instance based on information from our config file.
diff --git a/earwigbot/wiki/site.py b/earwigbot/wiki/site.py
index 6b6c10b..8719036 100644
--- a/earwigbot/wiki/site.py
+++ b/earwigbot/wiki/site.py
@@ -71,7 +71,8 @@ class Site(object):
     def __init__(self, name=None, project=None, lang=None, base_url=None,
                  article_path=None, script_path=None, sql=None,
                  namespaces=None, login=(None, None), cookiejar=None,
-                 user_agent=None, assert_edit=None, maxlag=None):
+                 user_agent=None, assert_edit=None, maxlag=None,
+                 search_config=(None, None)):
         """Constructor for new Site instances.

         This probably isn't necessary to call yourself unless you're building a
@@ -107,6 +108,9 @@ class Site(object):
         self._sql_data = sql
         self._sql_conn = None

+        # Attribute used in copyright violation checks (see CopyrightMixin):
+        self._search_config = search_config
+
         # Set up cookiejar and URL opener for making API queries:
         if cookiejar is not None:
             self._cookiejar = cookiejar
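
Note (not part of the patch itself): a rough sketch of how the relocated
search configuration and the new API might be exercised from a Python 2
shell after this commit. The "search" keys in the comment mirror those the
patch reads under config.json's "wiki" section (engine, credentials.key,
credentials.secret); the example values, the page title, the get_page()
call, and the assumption that the bot's config has already been loaded and
decrypted (as earwigbot/runner.py does) are illustrative only.

    # Assumed config.json layout after this patch (values illustrative):
    #
    #   "wiki": {
    #       ...,
    #       "search": {
    #           "engine": "Yahoo! BOSS",
    #           "credentials": {"key": "<key>", "secret": "<secret>"}
    #       }
    #   }

    from earwigbot.wiki.functions import get_site

    site = get_site()                # search_config is injected from config.wiki
    page = site.get_page("Example")  # hypothetical page title

    # No engine/credentials arguments are needed anymore:
    result = page.copyvio_check(min_confidence=0.5, max_queries=10)
    print result.violation, result.confidence, result.url

    # Re-check against a cached URL without spending new search queries:
    cached = page.copyvio_compare(result.url, min_confidence=0.5)
    print cached.violation, cached.confidence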