
Merge branch 'feature/copyvios' into develop

OH MY GOD I'M FINALLY DONE.
tags/v0.1^2
Ben Kurtovic, 12 years ago
commit 4944c120bd
18 changed files with 952 additions and 387 deletions
  1. +47  −0    docs/api/earwigbot.wiki.copyvios.rst
  2. +7   −6    docs/api/earwigbot.wiki.rst
  3. +1   −1    docs/api/modules.rst
  4. +4   −3    docs/toolset.rst
  5. +5   −9    earwigbot/commands/link.py
  6. +14  −14   earwigbot/tasks/afc_copyvios.py
  7. +0   −324  earwigbot/wiki/copyright.py
  8. +229 −0    earwigbot/wiki/copyvios/__init__.py
  9. +164 −0    earwigbot/wiki/copyvios/exclusions.py
 10. +87  −0    earwigbot/wiki/copyvios/markov.py
 11. +148 −0    earwigbot/wiki/copyvios/parsers.py
 12. +60  −0    earwigbot/wiki/copyvios/result.py
 13. +94  −0    earwigbot/wiki/copyvios/search.py
 14. +13  −3    earwigbot/wiki/page.py
 15. +9   −8    earwigbot/wiki/site.py
 16. +41  −9    earwigbot/wiki/sitesdb.py
 17. +9   −1    earwigbot/wiki/user.py
 18. +20  −9    setup.py

+47 −0  docs/api/earwigbot.wiki.copyvios.rst

@@ -0,0 +1,47 @@
copyvios Package
================

:mod:`copyvios` Package
-----------------------

.. automodule:: earwigbot.wiki.copyvios
:members:
:undoc-members:

:mod:`exclusions` Module
------------------------

.. automodule:: earwigbot.wiki.copyvios.exclusions
:members:
:undoc-members:

:mod:`markov` Module
--------------------

.. automodule:: earwigbot.wiki.copyvios.markov
:members:
:undoc-members:
:show-inheritance:

:mod:`parsers` Module
---------------------

.. automodule:: earwigbot.wiki.copyvios.parsers
:members:
:undoc-members:
:show-inheritance:

:mod:`result` Module
--------------------

.. automodule:: earwigbot.wiki.copyvios.result
:members:
:undoc-members:

:mod:`search` Module
--------------------

.. automodule:: earwigbot.wiki.copyvios.search
:members:
:undoc-members:
:show-inheritance:

+7 −6  docs/api/earwigbot.wiki.rst

@@ -22,13 +22,6 @@ wiki Package
:members:
:undoc-members:

:mod:`copyright` Module
-----------------------

.. automodule:: earwigbot.wiki.copyright
:members:
:undoc-members:

:mod:`page` Module
------------------

@@ -57,3 +50,10 @@ wiki Package
.. automodule:: earwigbot.wiki.user
:members:
:undoc-members:

Subpackages
-----------

.. toctree::

earwigbot.wiki.copyvios

+1 −1  docs/api/modules.rst

@@ -2,6 +2,6 @@ earwigbot
=========

.. toctree::
:maxdepth: 4
:maxdepth: 6

earwigbot

+4 −3  docs/toolset.rst

@@ -47,9 +47,10 @@ wikis, you can usually use code like this::
site = bot.wiki.add_site(project=project, lang=lang)

This works because EarwigBot assumes that the URL for the site is
``"//{lang}.{project}.org"`` and the API is at ``/w/api.php``; this might
change if you're dealing with non-WMF wikis, where the code might look
something more like::
``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL
connection info (if any) is stored as ``config.wiki["sql"]``. This might change
if you're dealing with non-WMF wikis, where the code might look something more
like::

project, lang = "mywiki", "it"
try:


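The docs' non-WMF example is cut off by the hunk boundary above. Purely as an illustration of the pattern it describes (every name and value below is hypothetical, not recovered from the original), the fallback might look like:

    from earwigbot.exceptions import SiteNotFoundError

    project, lang = "mywiki", "it"
    try:
        site = bot.wiki.get_site(project=project, lang=lang)
    except SiteNotFoundError:
        # Non-WMF wikis need explicit URLs (and, optionally, SQL kwargs):
        site = bot.wiki.add_site(project=project, lang=lang,
                                 base_url="//it.mywiki.example.org",
                                 script_path="/w")
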
+5 −9  earwigbot/commands/link.py

@@ -30,6 +30,7 @@ class Link(Command):
name = "link"

def process(self, data):
self.site = self.bot.wiki.get_site()
msg = data.msg

if re.search("(\[\[(.*?)\]\])|(\{\{(.*?)\}\})", msg):
@@ -41,8 +42,8 @@ class Link(Command):
if not data.args:
self.reply(data, "what do you want me to link to?")
return
pagename = ' '.join(data.args)
link = self.parse_link(pagename)
pagename = " ".join(data.args)
link = self.site.get_page(pagename).url
self.reply(data, link)

def parse_line(self, line):
@@ -56,8 +57,7 @@ class Link(Command):
if links:
# re.findall() returns a list of tuples, but we only want the 2nd
# item in each tuple:
links = [i[1] for i in links]
results = map(self.parse_link, links)
results = [self.site.get_page(name[1]).url for name in links]

# Find all {{templates}}
templates = re.findall("(\{\{(.*?)(\||\}\}))", line)
@@ -67,10 +67,6 @@ class Link(Command):

return results

def parse_link(self, pagename):
link = quote(pagename.replace(" ", "_"), safe="/:")
return "".join(("http://enwp.org/", link))

def parse_template(self, pagename):
pagename = "".join(("Template:", pagename))
return self.parse_link(pagename)
return self.site.get_page(pagename).url

+14 −14  earwigbot/tasks/afc_copyvios.py

@@ -23,6 +23,7 @@
from hashlib import sha256
from os.path import expanduser
from threading import Lock
from urllib import quote

import oursql

@@ -70,35 +71,36 @@ class AFCCopyvios(Task):
"""Detect copyvios in 'page' and add a note if any are found."""
title = page.title
if title in self.ignore_list:
msg = "Skipping page in ignore list: [[{0}]]"
msg = u"Skipping page in ignore list: [[{0}]]"
self.logger.info(msg.format(title))
return

pageid = page.pageid
if self.has_been_processed(pageid):
msg = "Skipping check on already processed page [[{0}]]"
msg = u"Skipping check on already processed page [[{0}]]"
self.logger.info(msg.format(title))
return

self.logger.info("Checking [[{0}]]".format(title))
self.logger.info(u"Checking [[{0}]]".format(title))
result = page.copyvio_check(self.min_confidence, self.max_queries)
url = result.url
confidence = "{0}%".format(round(result.confidence * 100, 2))

if result.violation:
safeurl = quote(url.encode("utf8"), safe="/:").decode("utf8")
content = page.get()
template = "\{\{{0}|url={1}|confidence={2}\}\}\n"
template = template.format(self.template, url, confidence)
template = u"\{\{{0}|url={1}|confidence={2}\}\}\n"
template = template.format(self.template, safeurl, confidence)
newtext = template + content
if "{url}" in self.summary:
page.edit(newtext, self.summary.format(url=url))
else:
page.edit(newtext, self.summary)
msg = "Found violation: [[{0}]] -> {1} ({2} confidence)"
self.logger.warn(msg.format(title, url, confidence))
msg = u"Found violation: [[{0}]] -> {1} ({2} confidence)"
self.logger.info(msg.format(title, url, confidence))
else:
msg = "No violations detected (best: {1} at {2} confidence)"
self.logger.debug(msg.format(url, confidence))
msg = u"No violations detected in [[{0}]] (best: {1} at {2} confidence)"
self.logger.info(msg.format(title, url, confidence))

self.log_processed(pageid)
if self.cache_results:
@@ -110,9 +112,7 @@ class AFCCopyvios(Task):
with self.conn.cursor() as cursor:
cursor.execute(query, (pageid,))
results = cursor.fetchall()
if results:
return True
return False
return True if results else False

def log_processed(self, pageid):
"""Adds pageid to our database of processed pages.
@@ -138,8 +138,8 @@ class AFCCopyvios(Task):
be) retained for one day; this task does not remove old entries (that
is handled by the Toolserver component).

This will only be called if "cache_results" == True in the task's
config, which is False by default.
This will only be called if ``cache_results == True`` in the task's
config, which is ``False`` by default.
"""
pageid = page.pageid
hash = sha256(page.get()).hexdigest()

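One detail worth isolating: the task now percent-encodes the source URL before substituting it into the wikitext template. A standalone sketch of that round-trip (the URL itself is made up):

    from urllib import quote

    url = u"http://example.org/caf\u00e9"  # hypothetical non-ASCII URL
    safeurl = quote(url.encode("utf8"), safe="/:").decode("utf8")
    # safeurl is now u"http://example.org/caf%C3%A9", safe to embed in wikitext
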

+0 −324  earwigbot/wiki/copyright.py

@@ -1,324 +0,0 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 by Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import defaultdict
from functools import partial
from gzip import GzipFile
from json import loads
from re import sub, UNICODE
from StringIO import StringIO
from time import sleep, time
from urllib import quote_plus, urlencode
from urllib2 import build_opener, URLError

try:
import oauth2 as oauth
except ImportError:
oauth = None

from earwigbot.exceptions import *

class _CopyvioCheckResult(object):
def __init__(self, violation, confidence, url, queries, article, chains):
self.violation = violation
self.confidence = confidence
self.url = url
self.queries = queries
self.article_chain = article
self.source_chain = chains[0]
self.delta_chain = chains[1]

def __repr__(self):
r = "_CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
return r.format(self.violation, self.confidence, self.url, self.queries)


class _MarkovChain(object):
START = -1
END = -2

def __init__(self, text):
self.text = text
self.chain = defaultdict(lambda: defaultdict(lambda: 0))
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()
prev = self.START
for word in words:
self.chain[prev][word] += 1
prev = word
try: # This won't work if the source text is completely blank
self.chain[word][self.END] += 1
except KeyError:
pass

def size(self):
count = 0
for node in self.chain.itervalues():
for hits in node.itervalues():
count += hits
return count


class _MarkovChainIntersection(_MarkovChain):
def __init__(self, mc1, mc2):
self.chain = defaultdict(lambda: defaultdict(lambda: 0))
c1 = mc1.chain
c2 = mc2.chain

for word, nodes1 in c1.iteritems():
if word in c2:
nodes2 = c2[word]
for node, count1 in nodes1.iteritems():
if node in nodes2:
count2 = nodes2[node]
self.chain[word][node] = min(count1, count2)


class CopyrightMixIn(object):
"""
EarwigBot's Wiki Toolset: Copyright Violation Mixin

This is a mixin that provides two public methods, copyvio_check() and
copyvio_compare(). The former checks the page for copyright violations
using a search engine API, and the latter compares the page against a
specified URL. Credentials for the search engine API are stored in the
site's config.
"""
def __init__(self, site):
self._opener = build_opener()
self._opener.addheaders = site._opener.addheaders

def _open_url_ignoring_errors(self, url):
"""Open a URL using self._opener and return its content, or None.

Will decompress the content if the headers contain "gzip" as its
content encoding, and will return None if URLError is raised while
opening the URL. IOErrors while gunzipping a compressed response are
ignored, and the original content is returned.
"""
try:
response = self._opener.open(url)
except URLError:
return None
result = response.read()

if response.headers.get("Content-Encoding") == "gzip":
stream = StringIO(result)
gzipper = GzipFile(fileobj=stream)
try:
result = gzipper.read()
except IOError:
pass

return result

def _select_search_engine(self):
"""Return a function that can be called to do web searches.

The "function" is a functools.partial object that takes one argument, a
query, and returns a list of URLs, ranked by importance. The underlying
logic depends on the 'engine' argument; for example, if 'engine' is
"Yahoo! BOSS", we'll use self._yahoo_boss_query for querying.

Raises UnknownSearchEngineError if the 'engine' listed in our config is
unknown to us, and UnsupportedSearchEngineError if we are missing a
required package or module, like oauth2 for "Yahoo! BOSS".
"""
engine, credentials = self._site._search_config

if engine == "Yahoo! BOSS":
if not oauth:
e = "The package 'oauth2' could not be imported"
raise UnsupportedSearchEngineError(e)
searcher = self._yahoo_boss_query
else:
raise UnknownSearchEngineError(engine)

return partial(searcher, credentials)

def _yahoo_boss_query(self, cred, query):
"""Do a Yahoo! BOSS web search for 'query' using 'cred' as credentials.

Returns a list of URLs, no more than fifty, ranked by relevance (as
determined by Yahoo). Raises SearchQueryError() on errors.
"""
base_url = "http://yboss.yahooapis.com/ysearch/web"
query = quote_plus(query.join('"', '"'))
params = {"q": query, "style": "raw", "format": "json"}
url = "{0}?{1}".format(base_url, urlencode(params))

consumer = oauth.Consumer(key=cred["key"], secret=cred["secret"])
client = oauth.Client(consumer)
headers, body = client.request(url, "GET")

if headers["status"] != "200":
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
raise SearchQueryError(e.format(headers["status"], body))

try:
res = loads(body)
except ValueError:
e = "Yahoo! BOSS Error: JSON could not be decoded"
raise SearchQueryError(e)

try:
results = res["bossresponse"]["web"]["results"]
except KeyError:
return []
return [result["url"] for result in results]

def _copyvio_strip_html(self, html):
"""
STUB
"""
return html

def _copyvio_strip_article(self, content):
"""Clean the page's raw text by removing templates and formatting.

Returns the page's text with all HTML and wikicode formatting removed,
including templates, tables, references, and the Bibliography/
References/Sources/See also section(s). It retains punctuation
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
quotes) and original capitalization, but not brackets (square and
angular), abnormal spacing, nor anything else. HTML entities are
replaced by their unicode equivalents.

STUB
"""
return content

def _copyvio_chunk_article(self, content, max_chunks):
"""
STUB
"""
return [content]

def _copyvio_compare_content(self, article, url):
"""
DOCSTRING NEEDED
"""
html = self._open_url_ignoring_errors(url)
if not html:
return 0

source = _MarkovChain(self._copyvio_strip_html(html))
delta = _MarkovChainIntersection(article, source)
return float(delta.size()) / article.size(), (source, delta)

def copyvio_check(self, min_confidence=0.5, max_queries=-1,
interquery_sleep=1, force=False):
"""Check the page for copyright violations.

Returns a _CopyvioCheckResult object with four useful attributes:
"violation", "confidence", "url", and "queries". "confidence" is a
number between 0 and 1; if it is less than "min_confidence", we could
not find any indication of a violation (so "violation" will be False
and "url" may or may not be None), otherwise it indicates the relative
faith in our results, "violation" will be True, and "url" will be the
place the article is suspected of being copied from. "queries" is the
number of queries used to determine the results.

"max_queries" is self-explanatory; we will never make more than this
number of queries in a given check. If it's less than 0, we will not
limit our number of queries.

"interquery_sleep" is the minimum amount of time we will sleep between
search engine queries, in seconds.

"force" is simply passed to page.get() - it has the same behavior there
as it does here.

Raises CopyvioCheckError or subclasses (UnknownSearchEngineError,
SearchQueryError, ...) on errors.
"""
search = self._select_search_engine()
handled_urls = []
best_confidence = 0
best_match = None
num_queries = 0
empty = _MarkovChain("")
best_chains = (empty, _MarkovChainIntersection(empty, empty))
content = self.get(force)
clean = self._copyvio_strip_article(content)
chunks = self._copyvio_chunk_article(clean, max_queries)
article_chain = _MarkovChain(clean)
last_query = time()

if article_chain.size() < 20: # Auto-fail very small articles
return _CopyvioCheckResult(False, best_confidence, best_match,
num_queries, article_chain, best_chains)

while (chunks and best_confidence < min_confidence and
(max_queries < 0 or num_queries < max_queries)):
urls = search(chunks.pop(0))
urls = [url for url in urls if url not in handled_urls]
for url in urls:
handled_urls.append(url)
conf, chains = self._copyvio_compare_content(article_chain, url)
if conf > best_confidence:
best_confidence = conf
best_match = url
best_chains = chains
num_queries += 1
diff = time() - last_query
if diff < interquery_sleep:
sleep(interquery_sleep - diff)
last_query = time()

if best_confidence >= min_confidence: # violation?
v = True
else:
v = False
return _CopyvioCheckResult(v, best_confidence, best_match, num_queries,
article_chain, best_chains)

def copyvio_compare(self, url, min_confidence=0.5, force=False):
"""Check the page like copyvio_check(), but against a specific URL.

This is essentially a reduced version of the above - a copyvio
comparison is made using Markov chains and the result is returned in a
_CopyvioCheckResult object - without using a search engine, as the
suspected "violated" URL is supplied from the start.

Its primary use is to generate a result when the URL is retrieved from
a cache, like the one used in EarwigBot's Toolserver site. After a
search is done, the resulting URL is stored in a cache for 24 hours so
future checks against that page will not require another set of
time-and-money-consuming search engine queries. However, the comparison
itself (which includes the article's and the source's content) cannot
be stored for data retention reasons, so a fresh comparison is made
using this function.

Since no searching is done, neither UnknownSearchEngineError nor
SearchQueryError will be raised.
"""
content = self.get(force)
clean = self._copyvio_strip_article(content)
article_chain = _MarkovChain(clean)
confidence, chains = self._copyvio_compare_content(article_chain, url)

if confidence >= min_confidence:
is_violation = True
else:
is_violation = False
return _CopyvioCheckResult(is_violation, confidence, url, 0,
article_chain, chains)

+229 −0  earwigbot/wiki/copyvios/__init__.py

@@ -0,0 +1,229 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from gzip import GzipFile
from StringIO import StringIO
from time import sleep, time
from urllib2 import build_opener, URLError

try:
import oauth2 as oauth
except ImportError:
oauth = None

from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine

__all__ = ["CopyvioMixIn"]

class CopyvioMixIn(object):
"""
**EarwigBot: Wiki Toolset: Copyright Violation MixIn**

This is a mixin that provides two public methods, :py:meth:`copyvio_check`
and :py:meth:`copyvio_compare`. The former checks the page for copyright
violations using a search engine API, and the latter compares the page
against a given URL. Credentials for the search engine API are stored in
the :py:class:`~earwigbot.wiki.site.Site`'s config.
"""

def __init__(self, site):
self._search_config = site._search_config
self._exclusions_db = self._search_config["exclusions_db"]
self._opener = build_opener()
self._opener.addheaders = site._opener.addheaders

def _open_url_ignoring_errors(self, url):
"""Open a URL using self._opener and return its content, or None.

Will decompress the content if the response headers list "gzip" as the
content encoding, and will return None if URLError is raised while
opening the URL. IOErrors while gunzipping a compressed response are
ignored, and the original content is returned.
"""
try:
response = self._opener.open(url)
except URLError:
return None
result = response.read()

if response.headers.get("Content-Encoding") == "gzip":
stream = StringIO(result)
gzipper = GzipFile(fileobj=stream)
try:
result = gzipper.read()
except IOError:
pass

return result

def _select_search_engine(self):
"""Return a function that can be called to do web searches.

The function takes one argument, a search query, and returns a list of
URLs, ranked by importance. The underlying logic depends on the
*engine* argument within our config; for example, if *engine* is
"Yahoo! BOSS", we'll use YahooBOSSSearchEngine for querying.

Raises UnknownSearchEngineError if the 'engine' listed in our config is
unknown to us, and UnsupportedSearchEngineError if we are missing a
required package or module, like oauth2 for "Yahoo! BOSS".
"""
engine = self._search_config["engine"]
credentials = self._search_config["credentials"]

if engine == "Yahoo! BOSS":
if not oauth:
e = "The package 'oauth2' could not be imported"
raise exceptions.UnsupportedSearchEngineError(e)
return YahooBOSSSearchEngine(credentials)

raise exceptions.UnknownSearchEngineError(engine)

def _copyvio_compare_content(self, article, url):
"""Return a number comparing an article and a URL.

The *article* is a Markov chain, whereas the *url* is just a string
that we'll try to open and read ourselves.
"""
html = self._open_url_ignoring_errors(url)
if not html:
return 0

source = MarkovChain(HTMLTextParser(html).strip())
delta = MarkovChainIntersection(article, source)
return float(delta.size()) / article.size(), (source, delta)

def copyvio_check(self, min_confidence=0.5, max_queries=-1,
interquery_sleep=1):
"""Check the page for copyright violations.

Returns a
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object
with information on the results of the check.

*max_queries* is self-explanatory; we will never make more than this
number of queries in a given check. If it's lower than 0, we will not
limit the number of queries.

*interquery_sleep* is the minimum amount of time we will sleep between
search engine queries, in seconds.

Raises :py:exc:`~earwigbot.exceptions.CopyvioCheckError` or subclasses
(:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError`,
:py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
"""
searcher = self._select_search_engine()
self._exclusions_db.sync(self.site.name)
handled_urls = []
best_confidence = 0
best_match = None
num_queries = 0
empty = MarkovChain("")
best_chains = (empty, MarkovChainIntersection(empty, empty))
parser = ArticleTextParser(self.get())
clean = parser.strip()
chunks = parser.chunk(self._search_config["nltk_dir"], max_queries)
article_chain = MarkovChain(clean)
last_query = time()

if article_chain.size() < 20: # Auto-fail very small articles
return CopyvioCheckResult(False, best_confidence, best_match,
num_queries, article_chain, best_chains)

while (chunks and best_confidence < min_confidence and
(max_queries < 0 or num_queries < max_queries)):
chunk = chunks.pop(0)
log = u"[[{0}]] -> querying {1} for {2!r}"
self._logger.debug(log.format(self.title, searcher.name, chunk))
urls = searcher.search(chunk)
urls = [url for url in urls if url not in handled_urls]
for url in urls:
handled_urls.append(url)
if self._exclusions_db.check(self.site.name, url):
continue
conf, chains = self._copyvio_compare_content(article_chain, url)
if conf > best_confidence:
best_confidence = conf
best_match = url
best_chains = chains
num_queries += 1
diff = time() - last_query
if diff < interquery_sleep:
sleep(interquery_sleep - diff)
last_query = time()

if best_confidence >= min_confidence:
is_violation = True
log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2}; using {3} queries)"
self._logger.debug(log.format(self.title, best_confidence,
best_match, num_queries))
else:
is_violation = False
log = u"No violation for [[{0}]] (confidence: {1}; using {2} queries)"
self._logger.debug(log.format(self.title, best_confidence,
num_queries))

return CopyvioCheckResult(is_violation, best_confidence, best_match,
num_queries, article_chain, best_chains)

def copyvio_compare(self, url, min_confidence=0.5):
"""Check the page like :py:meth:`copyvio_check` against a specific URL.

This is essentially a reduced version of the above - a copyvio
comparison is made using Markov chains and the result is returned in a
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object -
but without using a search engine, since the suspected "violated" URL
is supplied from the start.

Its primary use is to generate a result when the URL is retrieved from
a cache, like the one used in EarwigBot's Toolserver site. After a
search is done, the resulting URL is stored in a cache for 24 hours so
future checks against that page will not require another set of
time-and-money-consuming search engine queries. However, the comparison
itself (which includes the article's and the source's content) cannot
be stored for data retention reasons, so a fresh comparison is made
using this function.

Since no searching is done, neither
:py:exc:`~earwigbot.exceptions.UnknownSearchEngineError` nor
:py:exc:`~earwigbot.exceptions.SearchQueryError` will be raised.
"""
content = self.get()
clean = ArticleTextParser(content).strip()
article_chain = MarkovChain(clean)
confidence, chains = self._copyvio_compare_content(article_chain, url)

if confidence >= min_confidence:
is_violation = True
log = u"Violation detected for [[{0}]] (confidence: {1}; URL: {2})"
self._logger.debug(log.format(self.title, confidence, url))
else:
is_violation = False
log = u"No violation for [[{0}]] (confidence: {1}; URL: {2})"
self._logger.debug(log.format(self.title, confidence, url))

return CopyvioCheckResult(is_violation, confidence, url, 0,
article_chain, chains)

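End to end, the new mixin is reached through a Page object. A minimal sketch, assuming a running bot whose config defines a search engine (the page title is illustrative):

    site = bot.wiki.get_site()
    page = site.get_page(u"Some draft article")  # hypothetical title
    result = page.copyvio_check(min_confidence=0.5, max_queries=10)
    if result.violation:
        print u"Possible copyvio of {0} ({1:.2%})".format(result.url,
                                                          result.confidence)
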
+164 −0  earwigbot/wiki/copyvios/exclusions.py

@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re
import sqlite3 as sqlite
from threading import Lock
from time import time

from earwigbot import exceptions

__all__ = ["ExclusionsDB"]

default_sources = {
"enwiki": [
"Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def",
"Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl",
"Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr",
"Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz"
]
}

class ExclusionsDB(object):
"""
**EarwigBot: Wiki Toolset: Exclusions Database Manager**

Controls the :file:`.exclusions.db` file, which stores URLs excluded from
copyright violation checks on account of being known mirrors, for example.
"""

def __init__(self, sitesdb, dbfile, logger):
self._sitesdb = sitesdb
self._dbfile = dbfile
self._logger = logger
self._db_access_lock = Lock()

def __repr__(self):
"""Return the canonical string representation of the ExclusionsDB."""
res = "ExclusionsDB(sitesdb={0!r}, dbfile={1!r}, logger={2!r})"
return res.format(self._sitesdb, self._dbfile, self._logger)

def __str__(self):
"""Return a nice string representation of the ExclusionsDB."""
return "<ExclusionsDB at {0}>".format(self._dbfile)

def _create(self):
"""Initialize the exclusions database with its necessary tables."""
script = """
CREATE TABLE sources (source_sitename, source_page);
CREATE TABLE updates (update_sitename, update_time);
CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
"""
query = "INSERT INTO sources VALUES (?, ?);"
sources = []
for sitename, pages in default_sources.iteritems():
    sources.extend((sitename, page) for page in pages)

with sqlite.connect(self._dbfile) as conn:
conn.executescript(script)
conn.executemany(query, sources)

def _load_source(self, site, source):
"""Load from a specific source and return a set of URLs."""
urls = set()
try:
data = site.get_page(source).get()
except exceptions.PageNotFoundError:
return urls

regexes = [
"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
]
for regex in regexes:
    for url in re.findall(regex, data, re.I):
        urls.add(url.lower())
return urls

def _update(self, sitename):
"""Update the database from listed sources in the index."""
query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;"
query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?"
query4 = "INSERT INTO exclusions VALUES (?, ?);"
query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;"
query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;"
query7 = "INSERT INTO updates VALUES (?, ?);"

site = self._sitesdb.get_site(sitename)
with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
urls = set()
for (source,) in conn.execute(query1, (sitename,)):
urls |= self._load_source(site, source)
for (url,) in conn.execute(query2, (sitename,)):
if url in urls:
urls.remove(url)
else:
conn.execute(query3, (sitename, url))
conn.executemany(query4, [(sitename, url) for url in urls])
if conn.execute(query5, (sitename,)).fetchone():
conn.execute(query6, (time(), sitename))
else:
conn.execute(query7, (sitename, time()))

def _get_last_update(self, sitename):
"""Return the UNIX timestamp of the last time the db was updated."""
query = "SELECT update_time FROM updates WHERE update_sitename = ?;"
with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
try:
result = conn.execute(query, (sitename,)).fetchone()
except sqlite.OperationalError:
self._create()
return 0
return result[0] if result else 0

def sync(self, sitename):
"""Update the database if it hasn't been updated in the past month.

This only updates the exclusions database for the *sitename* site.
"""
max_staleness = 60 * 60 * 24 * 30
time_since_update = int(time() - self._get_last_update(sitename))
if time_since_update > max_staleness:
log = u"Updating stale database: {0} (last updated {1} seconds ago)"
self._logger.info(log.format(sitename, time_since_update))
self._update(sitename)
else:
log = u"Database for {0} is still fresh (last updated {1} seconds ago)"
self._logger.debug(log.format(sitename, time_since_update))

def check(self, sitename, url):
"""Check whether a given URL is in the exclusions database.

Return ``True`` if the URL is in the database, or ``False`` otherwise.
"""
normalized = re.sub("https?://", "", url.lower())
query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
for row in conn.execute(query, (sitename,)):
if normalized.startswith(row[0]):
log = u"Exclusion detected in {0} for {1}"
self._logger.debug(log.format(sitename, url))
return True

log = u"No exclusions in {0} for {1}".format(sitename, url)
self._logger.debug(log)
return False

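The matching rule in check() boils down to a scheme-stripped, lower-cased prefix test. The comparison in isolation (sample URLs invented):

    import re

    def is_excluded(url, exclusion_urls):
        # Mirrors ExclusionsDB.check(): drop the scheme, lowercase, and
        # treat each stored exclusion as a prefix.
        normalized = re.sub("https?://", "", url.lower())
        return any(normalized.startswith(excl) for excl in exclusion_urls)

    print is_excluded("HTTP://Mirror.example.com/wiki/Foo",
                      ["mirror.example.com/"])  # True
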
+87 −0  earwigbot/wiki/copyvios/markov.py

@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from collections import defaultdict
from re import sub, UNICODE

__all__ = ["MarkovChain", "MarkovChainIntersection"]

class MarkovChain(object):
"""Implements a basic ngram Markov chain of words."""
START = -1
END = -2
degree = 3 # 2 for bigrams, 3 for trigrams, etc.

def __init__(self, text):
self.text = text
self.chain = defaultdict(lambda: defaultdict(lambda: 0))
words = sub("[^\w\s-]", "", text.lower(), flags=UNICODE).split()

padding = self.degree - 1
words = ([self.START] * padding) + words + ([self.END] * padding)
for i in range(len(words) - self.degree + 1):
last = i + self.degree - 1
self.chain[tuple(words[i:last])][words[last]] += 1

def __repr__(self):
"""Return the canonical string representation of the MarkovChain."""
return "MarkovChain(text={0!r})".format(self.text)

def __str__(self):
"""Return a nice string representation of the MarkovChain."""
return "<MarkovChain of size {0}>".format(self.size())

def size(self):
"""Return the size of the Markov chain: the total number of nodes."""
count = 0
for node in self.chain.itervalues():
for hits in node.itervalues():
count += hits
return count


class MarkovChainIntersection(MarkovChain):
"""Implements the intersection of two chains (i.e., their shared nodes)."""

def __init__(self, mc1, mc2):
self.chain = defaultdict(lambda: defaultdict(lambda: 0))
self.mc1, self.mc2 = mc1, mc2
c1 = mc1.chain
c2 = mc2.chain

for word, nodes1 in c1.iteritems():
if word in c2:
nodes2 = c2[word]
for node, count1 in nodes1.iteritems():
if node in nodes2:
count2 = nodes2[node]
self.chain[word][node] = min(count1, count2)

def __repr__(self):
"""Return the canonical string representation of the intersection."""
res = "MarkovChainIntersection(mc1={0!r}, mc2={1!r})"
return res.format(self.mc1, self.mc2)

def __str__(self):
"""Return a nice string representation of the intersection."""
res = "<MarkovChainIntersection of size {0} ({1} ^ {2})>"
return res.format(self.size(), self.mc1, self.mc2)

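The confidence score used elsewhere in this commit is delta.size() / article.size(). A quick demonstration on two short overlapping strings (relying on the tuple-keyed chain construction above):

    article = MarkovChain(u"one two three four five")
    source = MarkovChain(u"three four five six seven")
    delta = MarkovChainIntersection(article, source)
    # Shared trigram weight over the article's total weight:
    print float(delta.size()) / article.size()
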
+148 −0  earwigbot/wiki/copyvios/parsers.py

@@ -0,0 +1,148 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from os import path

try:
    from bs4 import BeautifulSoup
    from bs4.element import Comment
except ImportError:
    BeautifulSoup = Comment = None

try:
import mwparserfromhell
except ImportError:
mwparserfromhell = None

try:
import nltk
except ImportError:
nltk = None

__all__ = ["BaseTextParser", "ArticleTextParser", "HTMLTextParser"]

class BaseTextParser(object):
"""Base class for a parser that handles text."""

def __init__(self, text):
self.text = text

def __repr__(self):
"""Return the canonical string representation of the text parser."""
return "{0}(text={1!r})".format(self.__class__.__name__, self.text)

def __str__(self):
"""Return a nice string representation of the text parser."""
name = self.__class__.__name__
return "<{0} of text with size {1}>".format(name, len(text))


class ArticleTextParser(BaseTextParser):
"""A parser that can strip and chunk wikicode article text."""

def strip(self):
"""Clean the page's raw text by removing templates and formatting.

Return the page's text with all HTML and wikicode formatting removed,
including templates, tables, and references. It retains punctuation
(spacing, paragraphs, periods, commas, (semi)-colons, parentheses,
quotes), original capitalization, and so forth. HTML entities are
replaced by their unicode equivalents.

The actual stripping is handled by :py:mod:`mwparserfromhell`.
"""
wikicode = mwparserfromhell.parse(self.text)
self.clean = wikicode.strip_code(normalize=True)
return self.clean

def chunk(self, nltk_dir, max_chunks, max_query=256):
"""Convert the clean article text into a list of web-searchable chunks.

No more than *max_chunks* chunks will be returned. Each chunk will only
be a sentence or two long at most (no more than *max_query* characters).
The idea is to return a sample of the article text rather than the
whole, so we'll pick and choose from parts of it, especially if the
article is large and *max_chunks* is low, so we don't end up searching
for just the first paragraph.

This is implemented using :py:mod:`nltk` (http://nltk.org/). A base
directory (*nltk_dir*) is required to store nltk's punctuation
database. This is typically located in the bot's working directory.
"""
datafile = path.join(nltk_dir, "tokenizers", "punkt", "english.pickle")
try:
tokenizer = nltk.data.load("file:" + datafile)
except LookupError:
nltk.download("punkt", nltk_dir)
tokenizer = nltk.data.load("file:" + datafile)

sentences = []
for sentence in tokenizer.tokenize(self.clean):
if len(sentence) > max_query:
words = sentence.split()
while len(" ".join(words)) > max_query:
words.pop()
sentence = " ".join(words)
sentences.append(sentence)

if max_chunks >= len(sentences):
return sentences

chunks = []
while len(chunks) < max_chunks:
if len(chunks) % 5 == 0:
chunk = sentences.pop(0) # Pop from beginning
elif len(chunks) % 5 == 1:
chunk = sentences.pop() # Pop from end
elif len(chunks) % 5 == 2:
chunk = sentences.pop(len(sentences) / 2) # Pop from Q2
elif len(chunks) % 5 == 3:
chunk = sentences.pop(len(sentences) / 4) # Pop from Q1
else:
chunk = sentences.pop(3 * len(sentences) / 4) # Pop from Q3
chunks.append(chunk)

return chunks


class HTMLTextParser(BaseTextParser):
"""A parser that can extract the text from an HTML document."""
hidden_tags = [
"script", "style"
]

def strip(self):
"""Return the actual text contained within an HTML document.

Implemented using :py:mod:`BeautifulSoup <bs4>`
(http://www.crummy.com/software/BeautifulSoup/).
"""
try:
soup = BeautifulSoup(self.text, "lxml").body
except ValueError:
soup = BeautifulSoup(self.text).body

is_comment = lambda text: isinstance(text, Comment)
for comment in soup.find_all(text=is_comment):
    comment.extract()
for tag in self.hidden_tags:
    for element in soup.find_all(tag):
        element.extract()

return "\n".join(soup.stripped_strings)

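The modulo-5 rotation in ArticleTextParser.chunk() samples from the ends and quartiles of the sentence list rather than reading straight through. The selection order in isolation (a pure-Python sketch, no nltk required):

    def pick_chunks(sentences, max_chunks):
        # Same rotation as chunk(): start, end, middle, first quartile,
        # third quartile, then repeat.
        chunks = []
        while len(chunks) < max_chunks and sentences:
            turn = len(chunks) % 5
            if turn == 0:
                chunks.append(sentences.pop(0))
            elif turn == 1:
                chunks.append(sentences.pop())
            elif turn == 2:
                chunks.append(sentences.pop(len(sentences) / 2))
            elif turn == 3:
                chunks.append(sentences.pop(len(sentences) / 4))
            else:
                chunks.append(sentences.pop(3 * len(sentences) / 4))
        return chunks

    print pick_chunks([str(i) for i in range(10)], 5)
    # ['0', '9', '5', '2', '7']
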
+60 −0  earwigbot/wiki/copyvios/result.py

@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

__all__ = ["CopyvioCheckResult"]

class CopyvioCheckResult(object):
"""
**EarwigBot: Wiki Toolset: Copyvio Check Result**

A class holding information about the results of a copyvio check.

*Attributes:*

- :py:attr:`violation`: ``True`` if this is a violation, else ``False``
- :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
- :py:attr:`url`: the URL of the violated page
- :py:attr:`queries`: the number of queries used to reach a result
- :py:attr:`article_chain`: the MarkovChain of the article text
- :py:attr:`source_chain`: the MarkovChain of the violated page text
- :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
"""

def __init__(self, violation, confidence, url, queries, article, chains):
self.violation = violation
self.confidence = confidence
self.url = url
self.queries = queries
self.article_chain = article
self.source_chain = chains[0]
self.delta_chain = chains[1]

def __repr__(self):
"""Return the canonical string representation of the result."""
res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
return res.format(self.violation, self.confidence, self.url,
self.queries)

def __str__(self):
"""Return a nice string representation of the result."""
res = "<CopyvioCheckResult ({0} with {1} conf)>"
return res.format(self.violation, self.confidence)

+94 −0  earwigbot/wiki/copyvios/search.py

@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from json import loads
from urllib import quote_plus, urlencode

try:
import oauth2 as oauth
except ImportError:
oauth = None

from earwigbot.exceptions import SearchQueryError

__all__ = ["BaseSearchEngine", "YahooBOSSSearchEngine"]

class BaseSearchEngine(object):
"""Base class for a simple search engine interface."""
name = "Base"

def __init__(self, cred):
"""Store credentials *cred* for searching later on."""
self.cred = cred

def __repr__(self):
"""Return the canonical string representation of the search engine."""
return "{0}()".format(self.__class__.__name__)

def __str__(self):
"""Return a nice string representation of the search engine."""
return "<{0}>".format(self.__class__.__name__)

def search(self, query):
"""Use this engine to search for *query*.

Not implemented in this base class; overridden in subclasses.
"""
raise NotImplementedError()


class YahooBOSSSearchEngine(BaseSearchEngine):
"""A search engine interface with Yahoo! BOSS."""
name = "Yahoo! BOSS"

def search(self, query):
"""Do a Yahoo! BOSS web search for *query*.

Returns a list of URLs, no more than fifty, ranked by relevance (as
determined by Yahoo). Raises
:py:exc:`~earwigbot.exceptions.SearchQueryError` on errors.
"""
base_url = "http://yboss.yahooapis.com/ysearch/web"
query = quote_plus('"' + query + '"')
params = {"q": query, "type": "html,text", "format": "json"}
url = "{0}?{1}".format(base_url, urlencode(params))

consumer = oauth.Consumer(key=self.cred["key"],
secret=self.cred["secret"])
client = oauth.Client(consumer)
headers, body = client.request(url, "GET")

if headers["status"] != "200":
e = "Yahoo! BOSS Error: got response code '{0}':\n{1}'"
raise SearchQueryError(e.format(headers["status"], body))

try:
res = loads(body)
except ValueError:
e = "Yahoo! BOSS Error: JSON could not be decoded"
raise SearchQueryError(e)

try:
results = res["bossresponse"]["web"]["results"]
except KeyError:
return []
return [result["url"] for result in results]

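BaseSearchEngine makes the searcher pluggable; a hypothetical offline stub (not part of this commit) shows the expected interface:

    class DummySearchEngine(BaseSearchEngine):
        # A canned engine for offline testing.
        name = "Dummy"

        def search(self, query):
            # Real engines return up to fifty URLs ranked by relevance.
            return ["http://example.com/hit?q=" + query.replace(" ", "+")]

    engine = DummySearchEngine(cred=None)
    print engine.search("some article text")
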
+13 −3  earwigbot/wiki/page.py

@@ -21,6 +21,7 @@
# SOFTWARE.

from hashlib import md5
from logging import getLogger, NullHandler
import re
from time import gmtime, strftime
from urllib import quote
@@ -31,11 +32,11 @@ except ImportError:
mwparserfromhell = None

from earwigbot import exceptions
from earwigbot.wiki.copyright import CopyrightMixIn
from earwigbot.wiki.copyvios import CopyvioMixIn

__all__ = ["Page"]

class Page(CopyrightMixIn):
class Page(CopyvioMixIn):
"""
**EarwigBot: Wiki Toolset: Page**

@@ -81,7 +82,8 @@ class Page(CopyrightMixIn):
PAGE_MISSING = 2
PAGE_EXISTS = 3

def __init__(self, site, title, follow_redirects=False, pageid=None):
def __init__(self, site, title, follow_redirects=False, pageid=None,
logger=None):
"""Constructor for new Page instances.

Takes four arguments: a Site object, the Page's title (or pagename),
@@ -100,6 +102,14 @@ class Page(CopyrightMixIn):
self._follow_redirects = self._keep_following = follow_redirects
self._pageid = pageid

# Set up our internal logger:
if logger:
self._logger = logger
else: # Just set up a null logger to eat up our messages:
self._logger = getLogger("earwigbot.wiki")
self._logger.addHandler(NullHandler())

# Attributes to be loaded through the API:
self._exists = self.PAGE_UNKNOWN
self._is_redirect = None
self._lastrevid = None

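The new logger parameter keeps standalone pages quiet by default. A sketch of both modes ('site' stands in for a real Site object):

    import logging

    logging.basicConfig(level=logging.DEBUG)
    noisy = Page(site, u"Example", logger=logging.getLogger("earwigbot.wiki"))
    quiet = Page(site, u"Example")  # falls back to a NullHandler: no output
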

+9 −8  earwigbot/wiki/site.py

@@ -92,7 +92,7 @@ class Site(object):
namespaces=None, login=(None, None), cookiejar=None,
user_agent=None, use_https=False, assert_edit=None,
maxlag=None, wait_between_queries=3, logger=None,
search_config=(None, None)):
search_config=None):
"""Constructor for new Site instances.

This probably isn't necessary to call yourself unless you're building a
@@ -560,10 +560,10 @@ class Site(object):
return [self.SERVICE_API]
sqllag = self._sql_info_cache["replag"]

if sqllag > 180:
if sqllag > 300:
if not self._maxlag:
return [self.SERVICE_API, self.SERVICE_SQL]
if now - self._api_info_cache["lastcheck"] > 120:
if now - self._api_info_cache["lastcheck"] > 300:
self._api_info_cache["lastcheck"] = now
try:
self._api_info_cache["maxlag"] = apilag = self.get_maxlag()
@@ -571,7 +571,7 @@ class Site(object):
self._api_info_cache["maxlag"] = apilag = 0
else:
apilag = self._api_info_cache["maxlag"]
if sqllag / (180.0 / self._maxlag) < apilag:
if apilag > self._maxlag:
return [self.SERVICE_SQL, self.SERVICE_API]
return [self.SERVICE_API, self.SERVICE_SQL]

@@ -789,8 +789,9 @@ class Site(object):
prefix = title.split(":", 1)[0]
if prefix != title: # Avoid a page that is simply "Category"
if prefix in prefixes:
return Category(self, title, follow_redirects, pageid)
return Page(self, title, follow_redirects, pageid)
return Category(self, title, follow_redirects, pageid,
self._logger)
return Page(self, title, follow_redirects, pageid, self._logger)

def get_category(self, catname, follow_redirects=False, pageid=None):
"""Return a :py:class:`Category` object for the given category name.
@@ -802,7 +803,7 @@ class Site(object):
catname = self._unicodeify(catname)
prefix = self.namespace_id_to_name(constants.NS_CATEGORY)
pagename = u':'.join((prefix, catname))
return Category(self, pagename, follow_redirects, pageid)
return Category(self, pagename, follow_redirects, pageid, self._logger)

def get_user(self, username=None):
"""Return a :py:class:`User` object for the given username.
@@ -815,7 +816,7 @@ class Site(object):
username = self._unicodeify(username)
else:
username = self._get_username()
return User(self, username)
return User(self, username, self._logger)

def delegate(self, services, args=None, kwargs=None):
"""Delegate a task to either the API or SQL depending on conditions.


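The retuned thresholds change when SQL is preferred over the API. Distilled to just the branch visible in the hunk (the low-replag branch falls outside the hunk and is not modeled here):

    def preferred_order(sqllag, apilag, maxlag):
        # Replag above 300s still prefers the API, unless the API's own
        # lag exceeds maxlag (no maxlag configured means API first).
        if sqllag > 300:
            if maxlag and apilag > maxlag:
                return ["SQL", "API"]
            return ["API", "SQL"]
        raise NotImplementedError("low-replag branch not shown in the hunk")

    print preferred_order(sqllag=600, apilag=15, maxlag=10)  # ['SQL', 'API']
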
+41 −9  earwigbot/wiki/sitesdb.py

@@ -29,6 +29,7 @@ import sqlite3 as sqlite

from earwigbot import __version__
from earwigbot.exceptions import SiteNotFoundError
from earwigbot.wiki.copyvios.exclusions import ExclusionsDB
from earwigbot.wiki.site import Site

__all__ = ["SitesDB"]
@@ -58,11 +59,16 @@ class SitesDB(object):
"""Set up the manager with an attribute for the base Bot object."""
self.config = bot.config
self._logger = bot.logger.getChild("wiki")

self._sites = {} # Internal site cache
self._sitesdb = path.join(bot.config.root_dir, "sites.db")
self._cookie_file = path.join(bot.config.root_dir, ".cookies")
self._cookiejar = None

excl_db = path.join(bot.config.root_dir, "exclusions.db")
excl_logger = self._logger.getChild("exclusionsdb")
self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger)

def __repr__(self):
"""Return the canonical string representation of the SitesDB."""
res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})"
@@ -192,6 +198,17 @@ class SitesDB(object):
user_agent = user_agent.replace("$1", __version__)
user_agent = user_agent.replace("$2", python_version())

if search_config:
nltk_dir = path.join(self.config.root_dir, ".nltk")
search_config["nltk_dir"] = nltk_dir
search_config["exclusions_db"] = self._exclusions_db

if not sql:
sql = config.wiki.get("sql", {})
for key, value in sql.iteritems():
if "$1" in value:
sql[key] = value.replace("$1", name)

return Site(name=name, project=project, lang=lang, base_url=base_url,
article_path=article_path, script_path=script_path,
sql=sql, namespaces=namespaces, login=login,
@@ -332,13 +349,12 @@ class SitesDB(object):
the script path (meaning the API is located at
``"{base_url}{script_path}/api.php"`` ->
``"//{lang}.{project}.org/w/api.php"``), so this is the default. If
your wiki is different, provide the script_path as an argument. The
only other argument to :py:class:`~earwigbot.wiki.site.Site` that we
can't get from config files or by querying the wiki itself is SQL
connection info, so provide a dict of kwargs as *sql* and Site will
pass it to :py:func:`oursql.connect(**sql) <oursql.connect>`, allowing
you to make queries with :py:meth:`site.sql_query
<earwigbot.wiki.site.Site.sql_query>`.
your wiki is different, provide the script_path as an argument. SQL
connection settings are guessed automatically using config's template
value. If this is wrong or not specified, provide a dict of kwargs as
*sql* and Site will pass it to :py:func:`oursql.connect(**sql)
<oursql.connect>`, allowing you to make queries with
:py:meth:`site.sql_query <earwigbot.wiki.site.Site.sql_query>`.

Returns ``True`` if the site was added successfully or ``False`` if the
site is already in our sitesdb (this can be done purposefully to update
@@ -359,15 +375,31 @@ class SitesDB(object):
use_https = config.wiki.get("useHTTPS", False)
assert_edit = config.wiki.get("assert")
maxlag = config.wiki.get("maxlag")
wait_between_queries = config.wiki.get("waitTime", 5)
wait_between_queries = config.wiki.get("waitTime", 3)
logger = self._logger.getChild(name)
search_config = config.wiki.get("search")

if user_agent:
user_agent = user_agent.replace("$1", __version__)
user_agent = user_agent.replace("$2", python_version())

if search_config:
nltk_dir = path.join(self.config.root_dir, ".nltk")
search_config["nltk_dir"] = nltk_dir
search_config["exclusions_db"] = self._exclusions_db

if not sql:
sql = config.wiki.get("sql", {})
for key, value in sql.iteritems():
if "$1" in value:
sql[key] = value.replace("$1", name)

# Create a Site object to log in and load the other attributes:
site = Site(base_url=base_url, script_path=script_path, sql=sql,
login=login, cookiejar=cookiejar, user_agent=user_agent,
use_https=use_https, assert_edit=assert_edit,
maxlag=maxlag, wait_between_queries=wait_between_queries,
search_config=search_config)
logger=logger, search_config=search_config)

self._add_site_to_sitesdb(site)
self._sites[site.name] = site


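The "$1" substitution lets one SQL template in config serve every site. What the new loop does to a hypothetical template value:

    sql = {"host": "$1.web.db.example.org", "db": "$1_p"}  # hypothetical
    name = "enwiki"
    for key, value in sql.iteritems():
        if "$1" in value:
            sql[key] = value.replace("$1", name)
    # sql is now {"host": "enwiki.web.db.example.org", "db": "enwiki_p"}
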
+9 −1  earwigbot/wiki/user.py

@@ -20,6 +20,7 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from logging import getLogger, NullHandler
from time import gmtime, strptime

from earwigbot.exceptions import UserNotFoundError
@@ -60,7 +61,7 @@ class User(object):
talkpage
"""

def __init__(self, site, name):
def __init__(self, site, name, logger=None):
"""Constructor for new User instances.

Takes two arguments, a Site object (necessary for doing API queries),
@@ -76,6 +77,13 @@ class User(object):
self._site = site
self._name = name

# Set up our internal logger:
if logger:
self._logger = logger
else: # Just set up a null logger to eat up our messages:
self._logger = getLogger("earwigbot.wiki")
self._logger.addHandler(NullHandler())

def __repr__(self):
"""Return the canonical string representation of the User."""
return "User(name={0!r}, site={1!r})".format(self._name, self._site)


+20 −9  setup.py

@@ -25,6 +25,25 @@ from setuptools import setup, find_packages

from earwigbot import __version__

# Not all of these dependencies are required, particularly the copyvio-specific
# ones (bs4, lxml, nltk, and oauth2) or the command-specific ones (GitPython,
# pytz). The bot should run fine without them, but will raise an exception if
# you try to detect copyvios or run a command that requires one.

dependencies = [
"GitPython >= 0.3.2.RC1", # Interfacing with git for !git and __version__
"PyYAML >= 3.10", # Parsing config files
"beautifulsoup4 >= 4.1.1", # Parsing/scraping HTML for copyvios
"lxml >= 2.3.4", # Faster parser for BeautifulSoup
"mwparserfromhell >= 0.1", # Parsing wikicode for manipulation
"nltk >= 2.0.2", # Parsing sentences to split article content for copyvios
"oursql >= 0.9.3", # Interfacing with MediaWiki databases
"oauth2 >= 1.5.211", # Interfacing with Yahoo! BOSS Search for copyvios
"py-bcrypt >= 0.2", # Hashing the bot key in the config file
"pycrypto >= 2.5", # Storing bot passwords and keys in the config file
"pytz >= 2012c", # Handling timezones for the !time IRC command
]

with open("README.rst") as fp:
long_docs = fp.read()

@@ -32,15 +51,7 @@ setup(
name = "earwigbot",
packages = find_packages(exclude=("tests",)),
entry_points = {"console_scripts": ["earwigbot = earwigbot.util:main"]},
install_requires = ["GitPython >= 0.3.2.RC1", # Interfacing with git
"PyYAML >= 3.10", # Config parsing
"mwparserfromhell >= 0.1", # Wikicode parsing
"oursql >= 0.9.3", # Talking with MediaWiki databases
"oauth2 >= 1.5.211", # Talking with Yahoo BOSS Search
"py-bcrypt >= 0.2", # Password hashing in config
"pycrypto >= 2.5", # Storing bot passwords and keys
"pytz >= 2012c", # Timezone handling
],
install_requires = dependencies,
test_suite = "tests",
version = __version__,
author = "Ben Kurtovic",


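The "not all required" caveat above works because the new modules guard their optional imports; the general shape of that idiom:

    # Optional packages import to None; the feature fails only when used.
    try:
        import oauth2 as oauth
    except ImportError:
        oauth = None

    def _require_oauth():
        if not oauth:
            # The real code raises UnsupportedSearchEngineError here.
            raise ImportError("The package 'oauth2' could not be imported")
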