Browse Source

More work on copyvios, including an exclusions database (#5)

* Added exclusions module with a fully implemented ExclusionsDB that can pull
  from multiple sources for different sites.
* Moved CopyvioCheckResult to its own module, to be imported by __init__.
* Some other related changes.
tags/v0.1^2
Ben Kurtovic 12 years ago
parent
commit
a074da853b
6 changed files with 252 additions and 47 deletions
  1. +14
    -0
      docs/api/earwigbot.wiki.copyvios.rst
  2. +3
    -3
      docs/toolset.rst
  3. +12
    -44
      earwigbot/wiki/copyvios/__init__.py
  4. +155
    -0
      earwigbot/wiki/copyvios/exclusions.py
  5. +60
    -0
      earwigbot/wiki/copyvios/result.py
  6. +8
    -0
      earwigbot/wiki/sitesdb.py

+ 14
- 0
docs/api/earwigbot.wiki.copyvios.rst View File

@@ -8,6 +8,13 @@ copyvios Package
:members: :members:
:undoc-members: :undoc-members:


:mod:`exclusions` Module
------------------------

.. automodule:: earwigbot.wiki.copyvios.exclusions
:members:
:undoc-members:

:mod:`markov` Module :mod:`markov` Module
-------------------- --------------------


@@ -24,6 +31,13 @@ copyvios Package
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:


:mod:`result` Module
--------------------

.. automodule:: earwigbot.wiki.copyvios.result
:members:
:undoc-members:

:mod:`search` Module :mod:`search` Module
-------------------- --------------------




+ 3
- 3
docs/toolset.rst View File

@@ -48,9 +48,9 @@ wikis, you can usually use code like this::


This works because EarwigBot assumes that the URL for the site is This works because EarwigBot assumes that the URL for the site is
``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL ``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL
connection info (if any) are stored as ``config.wiki["sql"]``. This might
change if you're dealing with non-WMF wikis, where the code might look
something more like::
connection info (if any) is stored as ``config.wiki["sql"]``. This might change
if you're dealing with non-WMF wikis, where the code might look something more
like::


project, lang = "mywiki", "it" project, lang = "mywiki", "it"
try: try:


+ 12
- 44
earwigbot/wiki/copyvios/__init__.py View File

@@ -33,47 +33,10 @@ except ImportError:
from earwigbot import exceptions from earwigbot import exceptions
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser
from earwigbot.wiki.copyvios.result import CopyvioCheckResult
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine


__all__ = ["CopyvioCheckResult", "CopyvioMixIn"]

class CopyvioCheckResult(object):
"""
**EarwigBot: Wiki Toolset: Copyvio Check Result**

A class holding information about the results of a copyvio check.

*Attributes:*

- :py:attr:`violation`: ``True`` if this is a violation, else ``False``
- :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
- :py:attr:`url`: the URL of the violated page
- :py:attr:`queries`: the number of queries used to reach a result
- :py:attr:`article_chain`: the MarkovChain of the article text
- :py:attr:`source_chain`: the MarkovChain of the violated page text
- :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
"""

def __init__(self, violation, confidence, url, queries, article, chains):
self.violation = violation
self.confidence = confidence
self.url = url
self.queries = queries
self.article_chain = article
self.source_chain = chains[0]
self.delta_chain = chains[1]

def __repr__(self):
"""Return the canonical string representation of the result."""
res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})"
return res.format(self.violation, self.confidence, self.url,
self.queries)

def __str__(self):
"""Return a nice string representation of the result."""
res = "<CopyvioCheckResult ({0} with {1} conf)>"
return res.format(self.violation, self.confidence)

__all__ = ["CopyvioMixIn"]


class CopyvioMixIn(object): class CopyvioMixIn(object):
""" """
@@ -88,6 +51,7 @@ class CopyvioMixIn(object):


def __init__(self, site): def __init__(self, site):
self._search_config = site._search_config self._search_config = site._search_config
self._exclusions_db = self._search_config["exclusions_db"]
self._opener = build_opener() self._opener = build_opener()
self._opener.addheaders = site._opener.addheaders self._opener.addheaders = site._opener.addheaders


@@ -156,8 +120,9 @@ class CopyvioMixIn(object):
interquery_sleep=1): interquery_sleep=1):
"""Check the page for copyright violations. """Check the page for copyright violations.


Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult`
object with information on the results of the check.
Returns a
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object
with information on the results of the check.


*max_queries* is self-explanatory; we will never make more than this *max_queries* is self-explanatory; we will never make more than this
number of queries in a given check. If it's lower than 0, we will not number of queries in a given check. If it's lower than 0, we will not
@@ -171,6 +136,7 @@ class CopyvioMixIn(object):
:py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors.
""" """
searcher = self._select_search_engine() searcher = self._select_search_engine()
self._exclusions_db.sync(self.site.name)
handled_urls = [] handled_urls = []
best_confidence = 0 best_confidence = 0
best_match = None best_match = None
@@ -193,6 +159,8 @@ class CopyvioMixIn(object):
urls = [url for url in urls if url not in handled_urls] urls = [url for url in urls if url not in handled_urls]
for url in urls: for url in urls:
handled_urls.append(url) handled_urls.append(url)
if self._exclusions_db.check(self.site.name, url):
continue
conf, chains = self._copyvio_compare_content(article_chain, url) conf, chains = self._copyvio_compare_content(article_chain, url)
if conf > best_confidence: if conf > best_confidence:
best_confidence = conf best_confidence = conf
@@ -216,9 +184,9 @@ class CopyvioMixIn(object):


This is essentially a reduced version of the above - a copyvio This is essentially a reduced version of the above - a copyvio
comparison is made using Markov chains and the result is returned in a comparison is made using Markov chains and the result is returned in a
:py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but
without using a search engine, since the suspected "violated" URL is
supplied from the start.
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object -
but without using a search engine, since the suspected "violated" URL
is supplied from the start.


Its primary use is to generate a result when the URL is retrieved from Its primary use is to generate a result when the URL is retrieved from
a cache, like the one used in EarwigBot's Toolserver site. After a a cache, like the one used in EarwigBot's Toolserver site. After a


+ 155
- 0
earwigbot/wiki/copyvios/exclusions.py View File

@@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import re
import sqlite3 as sqlite
from threading import Lock
from time import time

from earwigbot import exceptions

__all__ = ["ExclusionsDB"]

# Per-site lists of on-wiki index pages that catalog known mirrors/forks of
# site content.  These seed the `sources` table when the exclusions database
# is first created (see ExclusionsDB._create); each listed page is later
# scraped for URLs to exclude from copyvio checks.
default_sources = {
    "enwiki": [
        "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def",
        "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl",
        "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr",
        "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz"
    ]
}

class ExclusionsDB(object):
    """
    **EarwigBot: Wiki Toolset: Exclusions Database Manager**

    Controls the :file:`.exclusions.db` file, which stores URLs excluded from
    copyright violation checks on account of being known mirrors, for example.
    """

    def __init__(self, sitesdb, dbfile, logger):
        # sitesdb: SitesDB instance used to resolve site names to Site objects
        # dbfile: filesystem path of the SQLite database file
        # logger: logging.Logger used for sync/check status messages
        self._sitesdb = sitesdb
        self._dbfile = dbfile
        self._logger = logger
        # Serializes all database access across threads; sqlite connections
        # are created per-call, so this lock is the only shared state.
        self._db_access_lock = Lock()

    def _create(self):
        """Initialize the exclusions database with its necessary tables.

        Seeds the ``sources`` table from :data:`default_sources`.
        """
        script = """
        CREATE TABLE sources (source_sitename, source_page);
        CREATE TABLE updates (update_sitename, update_time);
        CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
        """
        query = "INSERT INTO sources VALUES (?, ?);"
        sources = []
        # iteritems() replaced with items(): behaves identically here and
        # keeps the module importable on Python 3.
        for sitename, pages in default_sources.items():
            for page in pages:
                sources.append((sitename, page))

        with sqlite.connect(self._dbfile) as conn:
            conn.executescript(script)
            conn.executemany(query, sources)

    def _load_source(self, site, source):
        """Load from a specific source page and return a set of URLs.

        Returns an empty set if the source page does not exist.  URLs are
        lowercased; scheme and leading ``//`` are stripped by the regexes.
        """
        urls = set()
        try:
            data = site.get_page(source).get()
        except exceptions.PageNotFoundError:
            return urls

        regexes = [
            r"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
            r"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
        ]
        for regex in regexes:
            # BUG FIX: re.findall() with a single capture group yields plain
            # strings, not 1-tuples; the old ``for (url,) in ...`` raised
            # ValueError on the first match.
            for url in re.findall(regex, data, re.I):
                urls.add(url.lower())
        return urls

    def _update(self, sitename):
        """Update the database from listed sources in the index."""
        query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;"
        query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?"
        query4 = "INSERT INTO exclusions VALUES (?, ?);"
        query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;"
        query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;"
        query7 = "INSERT INTO updates VALUES (?, ?);"

        site = self._sitesdb.get_site(sitename)
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            urls = set()
            for (source,) in conn.execute(query1, (sitename,)):
                urls |= self._load_source(site, source)
            # Reconcile the stored exclusions with the freshly loaded set:
            # keep rows still present, delete stale ones, insert the rest.
            for (url,) in conn.execute(query2, (sitename,)):
                if url in urls:
                    urls.remove(url)
                else:
                    conn.execute(query3, (sitename, url))
            conn.executemany(query4, [(sitename, url) for url in urls])
            # BUG FIX: was ``(name,)`` — an undefined variable that raised
            # NameError whenever an existing site's timestamp was checked.
            if conn.execute(query5, (sitename,)).fetchone():
                conn.execute(query6, (time(), sitename))
            else:
                conn.execute(query7, (sitename, time()))

    def _get_last_update(self, sitename):
        """Return the UNIX timestamp of the last time the db was updated.

        Returns 0 if the site has never been updated; creates the database
        tables (and returns 0) if they don't exist yet.
        """
        query = "SELECT update_time FROM updates WHERE update_sitename = ?;"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            try:
                result = conn.execute(query, (sitename,)).fetchone()
            except sqlite.OperationalError:
                # Missing table means a fresh database; build the schema.
                self._create()
                return 0
        return result[0] if result else 0

    def sync(self, sitename):
        """Update the database if it hasn't been updated in the past month.

        This only updates the exclusions database for the *sitename* site.
        """
        max_staleness = 60 * 60 * 24 * 30
        # BUG FIX: _get_last_update() requires the sitename argument; the old
        # call omitted it and raised TypeError on every sync.
        time_since_update = int(time() - self._get_last_update(sitename))
        if time_since_update > max_staleness:
            log = "Updating stale database: {0} (last updated {1} seconds ago)"
            self._logger.info(log.format(sitename, time_since_update))
            self._update(sitename)
        else:
            log = "Database for {0} is still fresh (last updated {1} seconds ago)"
            self._logger.debug(log.format(sitename, time_since_update))

    def check(self, sitename, url):
        """Check whether a given URL is in the exclusions database.

        Return ``True`` if the URL is in the database, or ``False`` otherwise.
        Matching is prefix-based against the lowercased, scheme-stripped URL.
        """
        normalized = re.sub(r"https?://", "", url.lower())
        query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            for row in conn.execute(query, (sitename,)):
                if normalized.startswith(row[0]):
                    log = "Exclusion detected in {0} for {1}"
                    self._logger.debug(log.format(sitename, url))
                    return True

        log = "No exclusions in {0} for {1}".format(sitename, url)
        self._logger.debug(log)
        return False

+ 60
- 0
earwigbot/wiki/copyvios/result.py View File

@@ -0,0 +1,60 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

__all__ = ["CopyvioCheckResult"]

class CopyvioCheckResult(object):
    """
    **EarwigBot: Wiki Toolset: Copyvio Check Result**

    A class holding information about the results of a copyvio check.

    *Attributes:*

    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
    - :py:attr:`url`: the URL of the violated page
    - :py:attr:`queries`: the number of queries used to reach a result
    - :py:attr:`article_chain`: the MarkovChain of the article text
    - :py:attr:`source_chain`: the MarkovChain of the violated page text
    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
    """

    def __init__(self, violation, confidence, url, queries, article, chains):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries
        self.article_chain = article
        # *chains* is a (source_chain, delta_chain) pair.
        self.source_chain = chains[0]
        self.delta_chain = chains[1]

    def __repr__(self):
        """Return the canonical string representation of the result."""
        # BUG FIX: the queries placeholder was "{3|r}" (pipe instead of "!"),
        # an invalid format spec that made repr() raise ValueError.
        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
        return res.format(self.violation, self.confidence, self.url,
                          self.queries)

    def __str__(self):
        """Return a nice string representation of the result."""
        res = "<CopyvioCheckResult ({0} with {1} conf)>"
        return res.format(self.violation, self.confidence)

+ 8
- 0
earwigbot/wiki/sitesdb.py View File

@@ -29,6 +29,7 @@ import sqlite3 as sqlite


from earwigbot import __version__ from earwigbot import __version__
from earwigbot.exceptions import SiteNotFoundError from earwigbot.exceptions import SiteNotFoundError
from earwigbot.wiki.copyvios.exclusions import ExclusionsDB
from earwigbot.wiki.site import Site from earwigbot.wiki.site import Site


__all__ = ["SitesDB"] __all__ = ["SitesDB"]
@@ -58,11 +59,16 @@ class SitesDB(object):
"""Set up the manager with an attribute for the base Bot object.""" """Set up the manager with an attribute for the base Bot object."""
self.config = bot.config self.config = bot.config
self._logger = bot.logger.getChild("wiki") self._logger = bot.logger.getChild("wiki")

self._sites = {} # Internal site cache self._sites = {} # Internal site cache
self._sitesdb = path.join(bot.config.root_dir, "sites.db") self._sitesdb = path.join(bot.config.root_dir, "sites.db")
self._cookie_file = path.join(bot.config.root_dir, ".cookies") self._cookie_file = path.join(bot.config.root_dir, ".cookies")
self._cookiejar = None self._cookiejar = None


excl_db = path.join(bot.config.root_dir, "exclusions.db")
excl_logger = self._logger.getChild("exclusionsdb")
self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger)

def __repr__(self): def __repr__(self):
"""Return the canonical string representation of the SitesDB.""" """Return the canonical string representation of the SitesDB."""
res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})" res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})"
@@ -195,6 +201,7 @@ class SitesDB(object):
if search_config: if search_config:
nltk_dir = path.join(self.config.root_dir, ".nltk") nltk_dir = path.join(self.config.root_dir, ".nltk")
search_config["nltk_dir"] = nltk_dir search_config["nltk_dir"] = nltk_dir
search_config["exclusions_db"] = self._exclusions_db


if not sql: if not sql:
sql = config.wiki.get("sql", {}) sql = config.wiki.get("sql", {})
@@ -379,6 +386,7 @@ class SitesDB(object):
if search_config: if search_config:
nltk_dir = path.join(self.config.root_dir, ".nltk") nltk_dir = path.join(self.config.root_dir, ".nltk")
search_config["nltk_dir"] = nltk_dir search_config["nltk_dir"] = nltk_dir
search_config["exclusions_db"] = self._exclusions_db


if not sql: if not sql:
sql = config.wiki.get("sql", {}) sql = config.wiki.get("sql", {})


Loading…
Cancel
Save