Browse Source

Moved search engine/credential info into config proper.

- In config.json, search config relocated from
  tasks.afc_copyvios to wiki.
- Site.__init__() takes a `search_config' argument, which is
  auto-supplied from its value in config.json by get_site().
- Page.copyvio_check() doesn't ask for search config
  anymore, meaning doing checks from the command line
  is less painful.
- Added a Page.copyvio_compare() function, which works
  just like copyvio_check() but on a specified URL; this is
  for cache retrieval on the web front-end.
tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
9434a416a1
5 changed files with 59 additions and 28 deletions
  1. +2
    -0
      earwigbot/runner.py
  2. +1
    -10
      earwigbot/tasks/afc_copyvios.py
  3. +48
    -16
      earwigbot/wiki/copyright.py
  4. +3
    -1
      earwigbot/wiki/functions.py
  5. +5
    -1
      earwigbot/wiki/site.py

+ 2
- 0
earwigbot/runner.py View File

@@ -50,6 +50,8 @@ def run():
if is_encrypted:
config._decryption_key = raw_input()
config.decrypt(config.wiki, "password")
config.decrypt(config.wiki, "search", "credentials", "key")
config.decrypt(config.wiki, "search", "credentials", "secret")
config.decrypt(config.irc, "frontend", "nickservPassword")
config.decrypt(config.irc, "watcher", "nickservPassword")



+ 1
- 10
earwigbot/tasks/afc_copyvios.py View File

@@ -37,9 +37,6 @@ class Task(BaseTask):
number = 1

def __init__(self):
config.decrypt(config.tasks, self.name, "search", "credentials", "key")
config.decrypt(config.tasks, self.name, "search", "credentials", "secret")

cfg = config.tasks.get(self.name, {})
self.template = cfg.get("template", "AfC suspected copyvio")
self.ignore_list = cfg.get("ignoreList", [])
@@ -49,11 +46,6 @@ class Task(BaseTask):
default_summary = "Tagging suspected [[WP:COPYVIO|copyright violation]] of {url}"
self.summary = self.make_summary(cfg.get("summary", default_summary))

# Search API data:
search = cfg.get("search", {})
self.engine = search.get("engine")
self.credentials = search.get("credentials", {})

# Connection data for our SQL database:
kwargs = cfg.get("sql", {})
kwargs["read_default_file"] = expanduser("~/.my.cnf")
@@ -91,8 +83,7 @@ class Task(BaseTask):
return

self.logger.info("Checking [[{0}]]".format(title))
result = page.copyvio_check(self.engine, self.credentials,
self.min_confidence, self.max_queries)
result = page.copyvio_check(self.min_confidence, self.max_queries)
url = result.url
confidence = "{0}%".format(round(result.confidence * 100, 2))



+ 48
- 16
earwigbot/wiki/copyright.py View File

@@ -48,12 +48,8 @@ class _CopyvioCheckResult(object):
self.delta_chain = chains[1]

def __repr__(self):
r = ", ".join(("_CopyvioCheckResult(violation={0!r}",
"confidence={1!r}", "url={2!r}", "queries={3|r}",
"article={4|r}", "chains={5!r})"))
return r.format(self.violation, self.confidence, self.url,
self.queries, self.article_chain,
(self.source_chain, self.delta_chain))
r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
return r.format(self.violation, self.confidence, self.url, self.queries)


class _MarkovChain(object):
@@ -100,9 +96,11 @@ class CopyrightMixin(object):
"""
EarwigBot's Wiki Toolset: Copyright Violation Mixin

This is a mixin that provides one public method, copyvio_check(), which
checks the page for copyright violations using a search engine API. The
API keys must be provided to the method as arguments.
This is a mixin that provides two public methods, copyvio_check() and
copyvio_compare(). The former checks the page for copyright violations
using a search engine API, and the latter compares the page against a
specified URL. Credentials for the search engine API are stored in the
site's config.
"""
def __init__(self, site):
self._opener = build_opener()
@@ -132,7 +130,7 @@ class CopyrightMixin(object):

return result

def _select_search_engine(self, engine, credentials):
def _select_search_engine(self):
"""Return a function that can be called to do web searches.

The "function" is a functools.partial object that takes one argument, a
@@ -140,10 +138,12 @@ class CopyrightMixin(object):
logic depends on the 'engine' argument; for example, if 'engine' is
"Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.

Raises UnknownSearchEngineError if 'engine' is not known to us, and
UnsupportedSearchEngineError if we are missing a required package or
module, like oauth2 for "Yahoo! BOSS".
Raises UnknownSearchEngineError if the 'engine' listed in our config is
unknown to us, and UnsupportedSearchEngineError if we are missing a
required package or module, like oauth2 for "Yahoo! BOSS".
"""
engine, credentials = self._site._search_config

if engine == "Yahoo! BOSS":
if not oauth:
e = "The package 'oauth2' could not be imported"
@@ -224,8 +224,8 @@ class CopyrightMixin(object):
delta = _MarkovChainIntersection(article, source)
return float(delta.size()) / article.size(), (source, delta)

def copyvio_check(self, engine, credentials, min_confidence=0.5,
max_queries=-1, interquery_sleep=1, force=False):
def copyvio_check(self, min_confidence=0.5, max_queries=-1,
interquery_sleep=1, force=False):
"""Check the page for copyright violations.

Returns a _CopyvioCheckResult object with four useful attributes:
@@ -250,7 +250,7 @@ class CopyrightMixin(object):
Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
SearchQueryError, ...) on errors.
"""
search = self._select_search_engine(engine, credentials)
search = self._select_search_engine()
handled_urls = []
best_confidence = 0
best_match = None
@@ -290,3 +290,35 @@ class CopyrightMixin(object):
v = False
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries,
article_chain, best_chains)

def copyvio_compare(self, url, min_confidence=0.5, force=False):
    """Check the page like copyvio_check(), but against a specific URL.

    This is essentially a reduced version of copyvio_check() - a copyvio
    comparison is made using Markov chains and the result is returned in a
    _CopyvioCheckResult object - without using a search engine, as the
    suspected "violated" URL is supplied from the start.

    Its primary use is to generate a result when the URL is retrieved from
    a cache, like the one used in EarwigBot's Toolserver site. After a
    search is done, the resulting URL is stored in a cache for 24 hours so
    future checks against that page will not require another set of
    time-and-money-consuming search engine queries. However, the comparison
    itself (which includes the article's and the source's content) cannot
    be stored for data retention reasons, so a fresh comparison is made
    using this function.

    Since no searching is done, neither UnknownSearchEngineError nor
    SearchQueryError will be raised.
    """
    # Fetch the page text (honoring `force` for cache-bypass) and strip
    # wiki markup before building the article's Markov chain:
    content = self.get(force)
    clean = self._copyvio_strip_article(content)
    article_chain = _MarkovChain(clean)

    # Compare the article chain against the content at the given URL;
    # returns a confidence ratio plus the (source, delta) chain pair:
    confidence, chains = self._copyvio_compare_content(article_chain, url)

    # Violation iff confidence meets the threshold; queries is always 0
    # here since no search engine was consulted:
    is_violation = confidence >= min_confidence
    return _CopyvioCheckResult(is_violation, confidence, url, 0,
                               article_chain, chains)

+ 3
- 1
earwigbot/wiki/functions.py View File

@@ -110,6 +110,7 @@ def _get_site_object_from_dict(name, d):
user_agent = config.wiki.get("userAgent")
assert_edit = config.wiki.get("assert")
maxlag = config.wiki.get("maxlag")
search_config = config.wiki.get("search")

if user_agent:
user_agent = user_agent.replace("$1", earwigbot.__version__)
@@ -126,7 +127,8 @@ def _get_site_object_from_dict(name, d):
return Site(name=name, project=project, lang=lang, base_url=base_url,
article_path=article_path, script_path=script_path, sql=sql,
namespaces=namespaces, login=login, cookiejar=cookiejar,
user_agent=user_agent, assert_edit=assert_edit, maxlag=maxlag)
user_agent=user_agent, assert_edit=assert_edit, maxlag=maxlag,
search_config=search_config)

def get_site(name=None, project=None, lang=None):
"""Returns a Site instance based on information from our config file.


+ 5
- 1
earwigbot/wiki/site.py View File

@@ -71,7 +71,8 @@ class Site(object):
def __init__(self, name=None, project=None, lang=None, base_url=None,
article_path=None, script_path=None, sql=None,
namespaces=None, login=(None, None), cookiejar=None,
user_agent=None, assert_edit=None, maxlag=None):
user_agent=None, assert_edit=None, maxlag=None,
search_config=(None, None)):
"""Constructor for new Site instances.

This probably isn't necessary to call yourself unless you're building a
@@ -107,6 +108,9 @@ class Site(object):
self._sql_data = sql
self._sql_conn = None

# Attribute used in copyright violation checks (see CopyrightMixin):
self._search_config = search_config

# Set up cookiejar and URL opener for making API queries:
if cookiejar is not None:
self._cookiejar = cookiejar


Loading…
Cancel
Save