* Added exclusions module with a fully implemented ExclusionsDB that can pull from multiple sources for different sites. * Moved CopyvioCheckResult to its own module, to be imported by __init__. * Some other related changes. (tags/v0.1^2)
@@ -8,6 +8,13 @@ copyvios Package | |||
:members: | |||
:undoc-members: | |||
:mod:`exclusions` Module | |||
------------------------ | |||
.. automodule:: earwigbot.wiki.copyvios.exclusions | |||
:members: | |||
:undoc-members: | |||
:mod:`markov` Module | |||
-------------------- | |||
@@ -24,6 +31,13 @@ copyvios Package | |||
:undoc-members: | |||
:show-inheritance: | |||
:mod:`result` Module | |||
-------------------- | |||
.. automodule:: earwigbot.wiki.copyvios.result | |||
:members: | |||
:undoc-members: | |||
:mod:`search` Module | |||
-------------------- | |||
@@ -48,9 +48,9 @@ wikis, you can usually use code like this:: | |||
This works because EarwigBot assumes that the URL for the site is | |||
``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL | |||
connection info (if any) are stored as ``config.wiki["sql"]``. This might | |||
change if you're dealing with non-WMF wikis, where the code might look | |||
something more like:: | |||
connection info (if any) is stored as ``config.wiki["sql"]``. This might change | |||
if you're dealing with non-WMF wikis, where the code might look something more | |||
like:: | |||
project, lang = "mywiki", "it" | |||
try: | |||
@@ -33,47 +33,10 @@ except ImportError: | |||
from earwigbot import exceptions | |||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | |||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser | |||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult | |||
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine | |||
__all__ = ["CopyvioCheckResult", "CopyvioMixIn"] | |||
class CopyvioCheckResult(object):
    """
    **EarwigBot: Wiki Toolset: Copyvio Check Result**

    A class holding information about the results of a copyvio check.

    *Attributes:*

    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
    - :py:attr:`url`: the URL of the violated page
    - :py:attr:`queries`: the number of queries used to reach a result
    - :py:attr:`article_chain`: the MarkovChain of the article text
    - :py:attr:`source_chain`: the MarkovChain of the violated page text
    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
    """

    def __init__(self, violation, confidence, url, queries, article, chains):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries
        self.article_chain = article
        # *chains* is a two-item sequence: (source chain, delta intersection)
        self.source_chain = chains[0]
        self.delta_chain = chains[1]

    def __repr__(self):
        """Return the canonical string representation of the result."""
        # BUG FIX: the last conversion spec was "{3|r}" (pipe instead of "!"),
        # which makes str.format() raise ValueError whenever repr() is called.
        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
        return res.format(self.violation, self.confidence, self.url,
                          self.queries)

    def __str__(self):
        """Return a nice string representation of the result."""
        res = "<CopyvioCheckResult ({0} with {1} conf)>"
        return res.format(self.violation, self.confidence)
__all__ = ["CopyvioMixIn"] | |||
class CopyvioMixIn(object): | |||
""" | |||
@@ -88,6 +51,7 @@ class CopyvioMixIn(object): | |||
def __init__(self, site): | |||
self._search_config = site._search_config | |||
self._exclusions_db = self._search_config["exclusions_db"] | |||
self._opener = build_opener() | |||
self._opener.addheaders = site._opener.addheaders | |||
@@ -156,8 +120,9 @@ class CopyvioMixIn(object): | |||
interquery_sleep=1): | |||
"""Check the page for copyright violations. | |||
Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` | |||
object with information on the results of the check. | |||
Returns a | |||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object | |||
with information on the results of the check. | |||
*max_queries* is self-explanatory; we will never make more than this | |||
number of queries in a given check. If it's lower than 0, we will not | |||
@@ -171,6 +136,7 @@ class CopyvioMixIn(object): | |||
:py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. | |||
""" | |||
searcher = self._select_search_engine() | |||
self._exclusions_db.sync(self.site.name) | |||
handled_urls = [] | |||
best_confidence = 0 | |||
best_match = None | |||
@@ -193,6 +159,8 @@ class CopyvioMixIn(object): | |||
urls = [url for url in urls if url not in handled_urls] | |||
for url in urls: | |||
handled_urls.append(url) | |||
if self._exclusions_db.check(self.site.name, url): | |||
continue | |||
conf, chains = self._copyvio_compare_content(article_chain, url) | |||
if conf > best_confidence: | |||
best_confidence = conf | |||
@@ -216,9 +184,9 @@ class CopyvioMixIn(object): | |||
This is essentially a reduced version of the above - a copyvio | |||
comparison is made using Markov chains and the result is returned in a | |||
:py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but | |||
without using a search engine, since the suspected "violated" URL is | |||
supplied from the start. | |||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object - | |||
but without using a search engine, since the suspected "violated" URL | |||
is supplied from the start. | |||
Its primary use is to generate a result when the URL is retrieved from | |||
a cache, like the one used in EarwigBot's Toolserver site. After a | |||
@@ -0,0 +1,155 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
import re | |||
import sqlite3 as sqlite | |||
from threading import Lock | |||
from time import time | |||
from earwigbot import exceptions | |||
__all__ = ["ExclusionsDB"] | |||
# Wiki pages to harvest exclusion URLs from, keyed by sitename.  These seed
# the "sources" table when the database is first created (see _create());
# _load_source() later scans each page's text for mirror/fork URLs.
default_sources = {
    "enwiki": [
        "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def",
        "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl",
        "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr",
        "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz"
    ]
}
class ExclusionsDB(object):
    """
    **EarwigBot: Wiki Toolset: Exclusions Database Manager**

    Controls the :file:`.exclusions.db` file, which stores URLs excluded from
    copyright violation checks on account of being known mirrors, for example.
    """

    def __init__(self, sitesdb, dbfile, logger):
        """Store the sites database, SQLite file path, and logger to use."""
        self._sitesdb = sitesdb
        self._dbfile = dbfile
        self._logger = logger
        self._db_access_lock = Lock()

    def _create(self):
        """Initialize the exclusions database with its necessary tables."""
        script = """
            CREATE TABLE sources (source_sitename, source_page);
            CREATE TABLE updates (update_sitename, update_time);
            CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
        """
        query = "INSERT INTO sources VALUES (?, ?);"
        sources = []
        # .items() instead of the Py2-only .iteritems(); plain loop instead of
        # a side-effect list comprehension.
        for sitename, pages in default_sources.items():
            for page in pages:
                sources.append((sitename, page))
        with sqlite.connect(self._dbfile) as conn:
            conn.executescript(script)
            conn.executemany(query, sources)

    def _load_source(self, site, source):
        """Load from a specific source page and return a set of URLs.

        Returns an empty set if the page does not exist.  URLs are lowercased
        and stripped of their scheme and leading slashes by the regexes.
        """
        urls = set()
        try:
            data = site.get_page(source).get()
        except exceptions.PageNotFoundError:
            return urls

        regexes = [
            r"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
            r"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
        ]
        for regex in regexes:
            # BUG FIX: with a single capture group, re.findall() returns plain
            # strings; the old `for (url,) in ...` unpacking raised ValueError
            # for any match longer than one character.
            for url in re.findall(regex, data, re.I):
                urls.add(url.lower())
        return urls

    def _update(self, sitename):
        """Update the database from listed sources in the index.

        Reconciles the stored exclusions for *sitename* with the URLs found
        on its source pages, then records the update time.
        """
        query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;"
        query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?"
        query4 = "INSERT INTO exclusions VALUES (?, ?);"
        query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;"
        query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;"
        query7 = "INSERT INTO updates VALUES (?, ?);"

        site = self._sitesdb.get_site(sitename)
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            urls = set()
            for (source,) in conn.execute(query1, (sitename,)):
                urls |= self._load_source(site, source)
            # fetchall() first: we delete from the same table we are reading,
            # so don't iterate a live cursor over it.
            for (url,) in conn.execute(query2, (sitename,)).fetchall():
                if url in urls:
                    urls.remove(url)  # already stored; don't re-insert
                else:
                    conn.execute(query3, (sitename, url))  # stale entry
            conn.executemany(query4, [(sitename, url) for url in urls])
            # BUG FIX: was `(name,)` — *name* is undefined in this scope.
            if conn.execute(query5, (sitename,)).fetchone():
                conn.execute(query6, (time(), sitename))
            else:
                conn.execute(query7, (sitename, time()))

    def _get_last_update(self, sitename):
        """Return the UNIX timestamp of the last time the db was updated.

        Returns 0 if the site has never been updated; creates the database
        tables (and returns 0) if they don't exist yet.
        """
        query = "SELECT update_time FROM updates WHERE update_sitename = ?;"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            try:
                result = conn.execute(query, (sitename,)).fetchone()
            except sqlite.OperationalError:
                self._create()
                return 0
            return result[0] if result else 0

    def sync(self, sitename):
        """Update the database if it hasn't been updated in the past month.

        This only updates the exclusions database for the *sitename* site.
        """
        max_staleness = 60 * 60 * 24 * 30
        # BUG FIX: _get_last_update() requires the site name; it was being
        # called with no arguments, raising TypeError on every sync.
        time_since_update = int(time() - self._get_last_update(sitename))
        if time_since_update > max_staleness:
            log = "Updating stale database: {0} (last updated {1} seconds ago)"
            self._logger.info(log.format(sitename, time_since_update))
            self._update(sitename)
        else:
            log = "Database for {0} is still fresh (last updated {1} seconds ago)"
            self._logger.debug(log.format(sitename, time_since_update))

    def check(self, sitename, url):
        """Check whether a given URL is in the exclusions database.

        Return ``True`` if the URL is in the database, or ``False`` otherwise.
        The comparison is prefix-based against the scheme-stripped, lowercased
        URL.
        """
        normalized = re.sub(r"https?://", "", url.lower())
        query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            for (excluded,) in conn.execute(query, (sitename,)):
                if normalized.startswith(excluded):
                    log = "Exclusion detected in {0} for {1}"
                    self._logger.debug(log.format(sitename, url))
                    return True

        log = "No exclusions in {0} for {1}".format(sitename, url)
        self._logger.debug(log)
        return False
@@ -0,0 +1,60 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
__all__ = ["CopyvioCheckResult"] | |||
class CopyvioCheckResult(object):
    """
    **EarwigBot: Wiki Toolset: Copyvio Check Result**

    A class holding information about the results of a copyvio check.

    *Attributes:*

    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
    - :py:attr:`url`: the URL of the violated page
    - :py:attr:`queries`: the number of queries used to reach a result
    - :py:attr:`article_chain`: the MarkovChain of the article text
    - :py:attr:`source_chain`: the MarkovChain of the violated page text
    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
    """

    def __init__(self, violation, confidence, url, queries, article, chains):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries
        self.article_chain = article
        # *chains* is a two-item sequence: (source chain, delta intersection)
        self.source_chain = chains[0]
        self.delta_chain = chains[1]

    def __repr__(self):
        """Return the canonical string representation of the result."""
        # BUG FIX: "{3|r}" used a pipe instead of "!", so str.format() raised
        # ValueError ("Invalid format specifier") on every repr() call.
        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
        return res.format(self.violation, self.confidence, self.url,
                          self.queries)

    def __str__(self):
        """Return a nice string representation of the result."""
        res = "<CopyvioCheckResult ({0} with {1} conf)>"
        return res.format(self.violation, self.confidence)
@@ -29,6 +29,7 @@ import sqlite3 as sqlite | |||
from earwigbot import __version__ | |||
from earwigbot.exceptions import SiteNotFoundError | |||
from earwigbot.wiki.copyvios.exclusions import ExclusionsDB | |||
from earwigbot.wiki.site import Site | |||
__all__ = ["SitesDB"] | |||
@@ -58,11 +59,16 @@ class SitesDB(object): | |||
"""Set up the manager with an attribute for the base Bot object.""" | |||
self.config = bot.config | |||
self._logger = bot.logger.getChild("wiki") | |||
self._sites = {} # Internal site cache | |||
self._sitesdb = path.join(bot.config.root_dir, "sites.db") | |||
self._cookie_file = path.join(bot.config.root_dir, ".cookies") | |||
self._cookiejar = None | |||
excl_db = path.join(bot.config.root_dir, "exclusions.db") | |||
excl_logger = self._logger.getChild("exclusionsdb") | |||
self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger) | |||
def __repr__(self): | |||
"""Return the canonical string representation of the SitesDB.""" | |||
res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})" | |||
@@ -195,6 +201,7 @@ class SitesDB(object): | |||
if search_config: | |||
nltk_dir = path.join(self.config.root_dir, ".nltk") | |||
search_config["nltk_dir"] = nltk_dir | |||
search_config["exclusions_db"] = self._exclusions_db | |||
if not sql: | |||
sql = config.wiki.get("sql", {}) | |||
@@ -379,6 +386,7 @@ class SitesDB(object): | |||
if search_config: | |||
nltk_dir = path.join(self.config.root_dir, ".nltk") | |||
search_config["nltk_dir"] = nltk_dir | |||
search_config["exclusions_db"] = self._exclusions_db | |||
if not sql: | |||
sql = config.wiki.get("sql", {}) | |||