Browse Source

More work on copyvios, including an exclusions database (#5)

* Added exclusions module with a fully implemented ExclusionsDB that can pull
  from multiple sources for different sites.
* Moved CopyvioCheckResult to its own module, to be imported by __init__.
* Some other related changes.
tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
a074da853b
6 changed files with 252 additions and 47 deletions
  1. +14
    -0
      docs/api/earwigbot.wiki.copyvios.rst
  2. +3
    -3
      docs/toolset.rst
  3. +12
    -44
      earwigbot/wiki/copyvios/__init__.py
  4. +155
    -0
      earwigbot/wiki/copyvios/exclusions.py
  5. +60
    -0
      earwigbot/wiki/copyvios/result.py
  6. +8
    -0
      earwigbot/wiki/sitesdb.py

+ 14
- 0
docs/api/earwigbot.wiki.copyvios.rst View File

@@ -8,6 +8,13 @@ copyvios Package
:members:
:undoc-members:

:mod:`exclusions` Module
------------------------

.. automodule:: earwigbot.wiki.copyvios.exclusions
:members:
:undoc-members:

:mod:`markov` Module
--------------------

@@ -24,6 +31,13 @@ copyvios Package
:undoc-members:
:show-inheritance:

:mod:`result` Module
--------------------

.. automodule:: earwigbot.wiki.copyvios.result
:members:
:undoc-members:

:mod:`search` Module
--------------------



+ 3
- 3
docs/toolset.rst View File

@@ -48,9 +48,9 @@ wikis, you can usually use code like this::

This works because EarwigBot assumes that the URL for the site is
``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL
connection info (if any) are stored as ``config.wiki["sql"]``. This might
change if you're dealing with non-WMF wikis, where the code might look
something more like::
connection info (if any) is stored as ``config.wiki["sql"]``. This might change
if you're dealing with non-WMF wikis, where the code might look something more
like::

project, lang = "mywiki", "it"
try:


+ 12
- 44
earwigbot/wiki/copyvios/__init__.py View File

@@ -33,47 +33,10 @@ except ImportError:
from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine

__all__ = ["CopyvioCheckResult", "CopyvioMixIn"]

class CopyvioCheckResult(object):
    """
    **EarwigBot: Wiki Toolset: Copyvio Check Result**

    A class holding information about the results of a copyvio check.

    *Attributes:*

    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
    - :py:attr:`url`: the URL of the violated page
    - :py:attr:`queries`: the number of queries used to reach a result
    - :py:attr:`article_chain`: the MarkovChain of the article text
    - :py:attr:`source_chain`: the MarkovChain of the violated page text
    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
    """

    def __init__(self, violation, confidence, url, queries, article, chains):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries
        self.article_chain = article
        # *chains* is a two-item sequence: (source chain, delta chain).
        self.source_chain = chains[0]
        self.delta_chain = chains[1]

    def __repr__(self):
        """Return the canonical string representation of the result."""
        # BUG FIX: the conversion spec was "{3|r}" — "|" is not a valid
        # conversion character, so repr() raised ValueError. It must be "!r".
        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
        return res.format(self.violation, self.confidence, self.url,
                          self.queries)

    def __str__(self):
        """Return a nice string representation of the result."""
        res = "<CopyvioCheckResult ({0} with {1} conf)>"
        return res.format(self.violation, self.confidence)

__all__ = ["CopyvioMixIn"]

class CopyvioMixIn(object):
"""
@@ -88,6 +51,7 @@ class CopyvioMixIn(object):

def __init__(self, site):
self._search_config = site._search_config
self._exclusions_db = self._search_config["exclusions_db"]
self._opener = build_opener()
self._opener.addheaders = site._opener.addheaders

@@ -156,8 +120,9 @@ class CopyvioMixIn(object):
interquery_sleep=1):
"""Check the page for copyright violations.

Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult`
object with information on the results of the check.
Returns a
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object
with information on the results of the check.

*max_queries* is self-explanatory; we will never make more than this
number of queries in a given check. If it's lower than 0, we will not
@@ -171,6 +136,7 @@ class CopyvioMixIn(object):
:py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
"""
searcher = self._select_search_engine()
self._exclusions_db.sync(self.site.name)
handled_urls = []
best_confidence = 0
best_match = None
@@ -193,6 +159,8 @@ class CopyvioMixIn(object):
urls = [url for url in urls if url not in handled_urls]
for url in urls:
handled_urls.append(url)
if self._exclusions_db.check(self.site.name, url):
continue
conf, chains = self._copyvio_compare_content(article_chain, url)
if conf > best_confidence:
best_confidence = conf
@@ -216,9 +184,9 @@ class CopyvioMixIn(object):

This is essentially a reduced version of the above - a copyvio
comparison is made using Markov chains and the result is returned in a
:py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but
without using a search engine, since the suspected "violated" URL is
supplied from the start.
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object -
but without using a search engine, since the suspected "violated" URL
is supplied from the start.

Its primary use is to generate a result when the URL is retrieved from
a cache, like the one used in EarwigBot's Toolserver site. After a


+ 155
- 0
earwigbot/wiki/copyvios/exclusions.py View File

@@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re
import sqlite3 as sqlite
from threading import Lock
from time import time

from earwigbot import exceptions

__all__ = ["ExclusionsDB"]

default_sources = {
    "enwiki": [
        "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def",
        "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl",
        "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr",
        "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz"
    ]
}

class ExclusionsDB(object):
    """
    **EarwigBot: Wiki Toolset: Exclusions Database Manager**

    Controls the :file:`.exclusions.db` file, which stores URLs excluded from
    copyright violation checks on account of being known mirrors, for example.
    """

    def __init__(self, sitesdb, dbfile, logger):
        # *sitesdb* is the SitesDB used to resolve site names to Site objects;
        # *dbfile* is the path of the SQLite database file; *logger* receives
        # sync/check diagnostics.
        self._sitesdb = sitesdb
        self._dbfile = dbfile
        self._logger = logger
        self._db_access_lock = Lock()

    def _create(self):
        """Initialize the exclusions database with its necessary tables."""
        script = """
            CREATE TABLE sources (source_sitename, source_page);
            CREATE TABLE updates (update_sitename, update_time);
            CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
        """
        query = "INSERT INTO sources VALUES (?, ?);"
        # Seed the sources table with the built-in per-site source pages.
        # (Plain loop instead of a side-effect list comprehension; .items()
        # works on both Python 2 and 3, unlike the original .iteritems().)
        sources = []
        for sitename, pages in default_sources.items():
            for page in pages:
                sources.append((sitename, page))

        with sqlite.connect(self._dbfile) as conn:
            conn.executescript(script)
            conn.executemany(query, sources)

    def _load_source(self, site, source):
        """Load from a specific source and return a set of URLs.

        *source* is a page title on *site*; a missing page yields an empty
        set. URLs are extracted from "url = <nowiki>...</nowiki>" template
        params and "* Site: [http://...]" list entries, lowercased, with any
        scheme and leading "//" stripped by the regex groups.
        """
        urls = set()
        try:
            data = site.get_page(source).get()
        except exceptions.PageNotFoundError:
            return urls

        # Raw strings so the \s, \* escapes are passed to re verbatim.
        regexes = [
            r"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
            r"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
        ]
        for regex in regexes:
            for (url,) in re.findall(regex, data, re.I):
                urls.add(url.lower())
        return urls

    def _update(self, sitename):
        """Update the database from listed sources in the index.

        Re-scrapes every source page for *sitename*, removes exclusions that
        have disappeared, inserts new ones, and records the update time.
        """
        query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;"
        query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?"
        query4 = "INSERT INTO exclusions VALUES (?, ?);"
        query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;"
        query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;"
        query7 = "INSERT INTO updates VALUES (?, ?);"

        site = self._sitesdb.get_site(sitename)
        # NOTE(review): the lock is acquired after the connection is opened,
        # matching the original ordering — confirm that is intentional.
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            urls = set()
            for (source,) in conn.execute(query1, (sitename,)):
                urls |= self._load_source(site, source)
            for (url,) in conn.execute(query2, (sitename,)):
                if url in urls:
                    urls.remove(url)  # already stored; don't re-insert
                else:
                    conn.execute(query3, (sitename, url))  # stale entry
            conn.executemany(query4, [(sitename, url) for url in urls])
            # BUG FIX: the original passed the undefined name "name" here,
            # raising NameError on every update of an already-tracked site.
            if conn.execute(query5, (sitename,)).fetchone():
                conn.execute(query6, (time(), sitename))
            else:
                conn.execute(query7, (sitename, time()))

    def _get_last_update(self, sitename):
        """Return the UNIX timestamp of the last time the db was updated.

        Returns 0 (and lazily creates the database) if the database does not
        exist yet, or if *sitename* has never been updated.
        """
        query = "SELECT update_time FROM updates WHERE update_sitename = ?;"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            try:
                result = conn.execute(query, (sitename,)).fetchone()
            except sqlite.OperationalError:
                # Missing table means the db was never initialized.
                self._create()
                return 0
            return result[0] if result else 0

    def sync(self, sitename):
        """Update the database if it hasn't been updated in the past month.

        This only updates the exclusions database for the *sitename* site.
        """
        max_staleness = 60 * 60 * 24 * 30
        # BUG FIX: the original called self._get_last_update() with no
        # argument, raising TypeError; the site name must be passed through.
        time_since_update = int(time() - self._get_last_update(sitename))
        if time_since_update > max_staleness:
            log = "Updating stale database: {0} (last updated {1} seconds ago)"
            self._logger.info(log.format(sitename, time_since_update))
            self._update(sitename)
        else:
            log = "Database for {0} is still fresh (last updated {1} seconds ago)"
            self._logger.debug(log.format(sitename, time_since_update))

    def check(self, sitename, url):
        """Check whether a given URL is in the exclusions database.

        Return ``True`` if the URL is in the database, or ``False`` otherwise.
        Matching is prefix-based on the scheme-stripped, lowercased URL.
        """
        normalized = re.sub(r"https?://", "", url.lower())
        query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            for row in conn.execute(query, (sitename,)):
                if normalized.startswith(row[0]):
                    log = "Exclusion detected in {0} for {1}"
                    self._logger.debug(log.format(sitename, url))
                    return True

        log = "No exclusions in {0} for {1}".format(sitename, url)
        self._logger.debug(log)
        return False

+ 60
- 0
earwigbot/wiki/copyvios/result.py View File

@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

__all__ = ["CopyvioCheckResult"]

class CopyvioCheckResult(object):
    """
    **EarwigBot: Wiki Toolset: Copyvio Check Result**

    A class holding information about the results of a copyvio check.

    *Attributes:*

    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
    - :py:attr:`url`: the URL of the violated page
    - :py:attr:`queries`: the number of queries used to reach a result
    - :py:attr:`article_chain`: the MarkovChain of the article text
    - :py:attr:`source_chain`: the MarkovChain of the violated page text
    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
    """

    def __init__(self, violation, confidence, url, queries, article, chains):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries
        self.article_chain = article
        # *chains* is a two-item sequence: (source chain, delta chain).
        self.source_chain = chains[0]
        self.delta_chain = chains[1]

    def __repr__(self):
        """Return the canonical string representation of the result."""
        # BUG FIX: the conversion spec was "{3|r}" — "|" is not a valid
        # conversion character, so repr() raised ValueError. It must be "!r".
        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
        return res.format(self.violation, self.confidence, self.url,
                          self.queries)

    def __str__(self):
        """Return a nice string representation of the result."""
        res = "<CopyvioCheckResult ({0} with {1} conf)>"
        return res.format(self.violation, self.confidence)

+ 8
- 0
earwigbot/wiki/sitesdb.py View File

@@ -29,6 +29,7 @@ import sqlite3 as sqlite

from earwigbot import __version__
from earwigbot.exceptions import SiteNotFoundError
from earwigbot.wiki.copyvios.exclusions import ExclusionsDB
from earwigbot.wiki.site import Site

__all__ = ["SitesDB"]
@@ -58,11 +59,16 @@ class SitesDB(object):
"""Set up the manager with an attribute for the base Bot object."""
self.config = bot.config
self._logger = bot.logger.getChild("wiki")

self._sites = {} # Internal site cache
self._sitesdb = path.join(bot.config.root_dir, "sites.db")
self._cookie_file = path.join(bot.config.root_dir, ".cookies")
self._cookiejar = None

excl_db = path.join(bot.config.root_dir, "exclusions.db")
excl_logger = self._logger.getChild("exclusionsdb")
self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger)

def __repr__(self):
"""Return the canonical string representation of the SitesDB."""
res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})"
@@ -195,6 +201,7 @@ class SitesDB(object):
if search_config:
nltk_dir = path.join(self.config.root_dir, ".nltk")
search_config["nltk_dir"] = nltk_dir
search_config["exclusions_db"] = self._exclusions_db

if not sql:
sql = config.wiki.get("sql", {})
@@ -379,6 +386,7 @@ class SitesDB(object):
if search_config:
nltk_dir = path.join(self.config.root_dir, ".nltk")
search_config["nltk_dir"] = nltk_dir
search_config["exclusions_db"] = self._exclusions_db

if not sql:
sql = config.wiki.get("sql", {})


Loading…
Cancel
Save