* Added exclusions module with a fully implemented ExclusionsDB that can pull from multiple sources for different sites. * Moved CopyvioCheckResult to its own module, to be imported by __init__. * Some other related changes.
tags/v0.1^2
@@ -8,6 +8,13 @@ copyvios Package | |||||
:members: | :members: | ||||
:undoc-members: | :undoc-members: | ||||
:mod:`exclusions` Module | |||||
------------------------ | |||||
.. automodule:: earwigbot.wiki.copyvios.exclusions | |||||
:members: | |||||
:undoc-members: | |||||
:mod:`markov` Module | :mod:`markov` Module | ||||
-------------------- | -------------------- | ||||
@@ -24,6 +31,13 @@ copyvios Package | |||||
:undoc-members: | :undoc-members: | ||||
:show-inheritance: | :show-inheritance: | ||||
:mod:`result` Module | |||||
-------------------- | |||||
.. automodule:: earwigbot.wiki.copyvios.result | |||||
:members: | |||||
:undoc-members: | |||||
:mod:`search` Module | :mod:`search` Module | ||||
-------------------- | -------------------- | ||||
@@ -48,9 +48,9 @@ wikis, you can usually use code like this:: | |||||
This works because EarwigBot assumes that the URL for the site is | This works because EarwigBot assumes that the URL for the site is | ||||
``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL | ``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL | ||||
connection info (if any) are stored as ``config.wiki["sql"]``. This might | |||||
change if you're dealing with non-WMF wikis, where the code might look | |||||
something more like:: | |||||
connection info (if any) is stored as ``config.wiki["sql"]``. This might change | |||||
if you're dealing with non-WMF wikis, where the code might look something more | |||||
like:: | |||||
project, lang = "mywiki", "it" | project, lang = "mywiki", "it" | ||||
try: | try: | ||||
@@ -33,47 +33,10 @@ except ImportError: | |||||
from earwigbot import exceptions | from earwigbot import exceptions | ||||
from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection | ||||
from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser | from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser | ||||
from earwigbot.wiki.copyvios.result import CopyvioCheckResult | |||||
from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine | from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine | ||||
__all__ = ["CopyvioCheckResult", "CopyvioMixIn"] | |||||
class CopyvioCheckResult(object):
    """
    **EarwigBot: Wiki Toolset: Copyvio Check Result**

    A class holding information about the results of a copyvio check.

    *Attributes:*

    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
    - :py:attr:`url`: the URL of the violated page
    - :py:attr:`queries`: the number of queries used to reach a result
    - :py:attr:`article_chain`: the MarkovChain of the article text
    - :py:attr:`source_chain`: the MarkovChain of the violated page text
    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
    """

    def __init__(self, violation, confidence, url, queries, article, chains):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries
        self.article_chain = article
        # *chains* is a two-item sequence: (source chain, delta chain).
        self.source_chain = chains[0]
        self.delta_chain = chains[1]

    def __repr__(self):
        """Return the canonical string representation of the result."""
        # BUG FIX: conversion spec was "{3|r}" (a '|' typo), which raises at
        # format time; the repr conversion is spelled "{3!r}".
        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
        return res.format(self.violation, self.confidence, self.url,
                          self.queries)

    def __str__(self):
        """Return a nice string representation of the result."""
        res = "<CopyvioCheckResult ({0} with {1} conf)>"
        return res.format(self.violation, self.confidence)
__all__ = ["CopyvioMixIn"] | |||||
class CopyvioMixIn(object): | class CopyvioMixIn(object): | ||||
""" | """ | ||||
@@ -88,6 +51,7 @@ class CopyvioMixIn(object): | |||||
def __init__(self, site): | def __init__(self, site): | ||||
self._search_config = site._search_config | self._search_config = site._search_config | ||||
self._exclusions_db = self._search_config["exclusions_db"] | |||||
self._opener = build_opener() | self._opener = build_opener() | ||||
self._opener.addheaders = site._opener.addheaders | self._opener.addheaders = site._opener.addheaders | ||||
@@ -156,8 +120,9 @@ class CopyvioMixIn(object): | |||||
interquery_sleep=1): | interquery_sleep=1): | ||||
"""Check the page for copyright violations. | """Check the page for copyright violations. | ||||
Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` | |||||
object with information on the results of the check. | |||||
Returns a | |||||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object | |||||
with information on the results of the check. | |||||
*max_queries* is self-explanatory; we will never make more than this | *max_queries* is self-explanatory; we will never make more than this | ||||
number of queries in a given check. If it's lower than 0, we will not | number of queries in a given check. If it's lower than 0, we will not | ||||
@@ -171,6 +136,7 @@ class CopyvioMixIn(object): | |||||
:py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. | :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. | ||||
""" | """ | ||||
searcher = self._select_search_engine() | searcher = self._select_search_engine() | ||||
self._exclusions_db.sync(self.site.name) | |||||
handled_urls = [] | handled_urls = [] | ||||
best_confidence = 0 | best_confidence = 0 | ||||
best_match = None | best_match = None | ||||
@@ -193,6 +159,8 @@ class CopyvioMixIn(object): | |||||
urls = [url for url in urls if url not in handled_urls] | urls = [url for url in urls if url not in handled_urls] | ||||
for url in urls: | for url in urls: | ||||
handled_urls.append(url) | handled_urls.append(url) | ||||
if self._exclusions_db.check(self.site.name, url): | |||||
continue | |||||
conf, chains = self._copyvio_compare_content(article_chain, url) | conf, chains = self._copyvio_compare_content(article_chain, url) | ||||
if conf > best_confidence: | if conf > best_confidence: | ||||
best_confidence = conf | best_confidence = conf | ||||
@@ -216,9 +184,9 @@ class CopyvioMixIn(object): | |||||
This is essentially a reduced version of the above - a copyvio | This is essentially a reduced version of the above - a copyvio | ||||
comparison is made using Markov chains and the result is returned in a | comparison is made using Markov chains and the result is returned in a | ||||
:py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but | |||||
without using a search engine, since the suspected "violated" URL is | |||||
supplied from the start. | |||||
:py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object - | |||||
but without using a search engine, since the suspected "violated" URL | |||||
is supplied from the start. | |||||
Its primary use is to generate a result when the URL is retrieved from | Its primary use is to generate a result when the URL is retrieved from | ||||
a cache, like the one used in EarwigBot's Toolserver site. After a | a cache, like the one used in EarwigBot's Toolserver site. After a | ||||
@@ -0,0 +1,155 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
import re | |||||
import sqlite3 as sqlite | |||||
from threading import Lock | |||||
from time import time | |||||
from earwigbot import exceptions | |||||
__all__ = ["ExclusionsDB"] | |||||
default_sources = { | |||||
"enwiki": [ | |||||
"Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def", | |||||
"Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl", | |||||
"Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr", | |||||
"Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz" | |||||
] | |||||
} | |||||
class ExclusionsDB(object):
    """
    **EarwigBot: Wiki Toolset: Exclusions Database Manager**

    Controls the :file:`.exclusions.db` file, which stores URLs excluded from
    copyright violation checks on account of being known mirrors, for example.
    """

    def __init__(self, sitesdb, dbfile, logger):
        self._sitesdb = sitesdb  # SitesDB used to resolve sitenames to Sites
        self._dbfile = dbfile    # Path to the sqlite database file
        self._logger = logger
        # Serializes all sqlite access across threads.
        self._db_access_lock = Lock()

    def _create(self):
        """Initialize the exclusions database with its necessary tables."""
        script = """
            CREATE TABLE sources (source_sitename, source_page);
            CREATE TABLE updates (update_sitename, update_time);
            CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
        """
        query = "INSERT INTO sources VALUES (?, ?);"
        # Flatten {sitename: [page, ...]} into (sitename, page) rows.
        # (.items() replaces Py2-only .iteritems(); equivalent here, and the
        # side-effect list comprehension is replaced with a real loop.)
        sources = [(sitename, page)
                   for sitename, pages in default_sources.items()
                   for page in pages]
        with sqlite.connect(self._dbfile) as conn:
            conn.executescript(script)
            conn.executemany(query, sources)

    def _load_source(self, site, source):
        """Load from a specific source page and return a set of URLs.

        URLs are lowercased; the regexes strip the protocol and leading
        slashes. Returns an empty set if the page does not exist.
        """
        urls = set()
        try:
            data = site.get_page(source).get()
        except exceptions.PageNotFoundError:
            return urls
        # Each pattern has exactly ONE capture group, so re.findall returns a
        # list of plain strings — BUG FIX: the original unpacked
        # `for (url,) in ...`, which raises ValueError for any match longer
        # than one character. Patterns are also now raw strings.
        regexes = [
            r"url\s*=\s*<nowiki>(?:https?:)?(?://)?(.*)</nowiki>",
            r"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
        ]
        for regex in regexes:
            for url in re.findall(regex, data, re.I):
                urls.add(url.lower())
        return urls

    def _update(self, sitename):
        """Update the database from listed sources in the index.

        Loads every source page registered for *sitename*, inserts new URLs,
        deletes URLs no longer listed, and records the update timestamp.
        """
        query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;"
        query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?"
        query4 = "INSERT INTO exclusions VALUES (?, ?);"
        query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;"
        query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;"
        query7 = "INSERT INTO updates VALUES (?, ?);"
        site = self._sitesdb.get_site(sitename)
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            urls = set()
            for (source,) in conn.execute(query1, (sitename,)):
                urls |= self._load_source(site, source)
            for (url,) in conn.execute(query2, (sitename,)):
                if url in urls:
                    urls.remove(url)  # Already stored; don't re-insert.
                else:
                    conn.execute(query3, (sitename, url))  # No longer listed.
            conn.executemany(query4, [(sitename, url) for url in urls])
            # BUG FIX: original queried with undefined variable `name`
            # (NameError); the parameter is `sitename`.
            if conn.execute(query5, (sitename,)).fetchone():
                conn.execute(query6, (time(), sitename))
            else:
                conn.execute(query7, (sitename, time()))

    def _get_last_update(self, sitename):
        """Return the UNIX timestamp of the last time the db was updated."""
        query = "SELECT update_time FROM updates WHERE update_sitename = ?;"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            try:
                result = conn.execute(query, (sitename,)).fetchone()
            except sqlite.OperationalError:
                # Tables don't exist yet: build the schema and report "never
                # updated" so the caller triggers a full sync.
                self._create()
                return 0
        return result[0] if result else 0

    def sync(self, sitename):
        """Update the database if it hasn't been updated in the past month.

        This only updates the exclusions database for the *sitename* site.
        """
        max_staleness = 60 * 60 * 24 * 30  # One month, in seconds
        # BUG FIX: _get_last_update() requires the sitename argument; the
        # original called it with no arguments (TypeError).
        time_since_update = int(time() - self._get_last_update(sitename))
        if time_since_update > max_staleness:
            log = "Updating stale database: {0} (last updated {1} seconds ago)"
            self._logger.info(log.format(sitename, time_since_update))
            self._update(sitename)
        else:
            log = "Database for {0} is still fresh (last updated {1} seconds ago)"
            self._logger.debug(log.format(sitename, time_since_update))

    def check(self, sitename, url):
        """Check whether a given URL is in the exclusions database.

        Return ``True`` if the URL is in the database, or ``False`` otherwise.
        A URL matches when, after lowercasing and stripping "http(s)://", it
        starts with a stored exclusion entry.
        """
        normalized = re.sub("https?://", "", url.lower())
        query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
            for (excluded,) in conn.execute(query, (sitename,)):
                if normalized.startswith(excluded):
                    log = "Exclusion detected in {0} for {1}"
                    self._logger.debug(log.format(sitename, url))
                    return True
        log = "No exclusions in {0} for {1}".format(sitename, url)
        self._logger.debug(log)
        return False
@@ -0,0 +1,60 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2009-2012 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
__all__ = ["CopyvioCheckResult"] | |||||
class CopyvioCheckResult(object):
    """
    **EarwigBot: Wiki Toolset: Copyvio Check Result**

    A class holding information about the results of a copyvio check.

    *Attributes:*

    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
    - :py:attr:`url`: the URL of the violated page
    - :py:attr:`queries`: the number of queries used to reach a result
    - :py:attr:`article_chain`: the MarkovChain of the article text
    - :py:attr:`source_chain`: the MarkovChain of the violated page text
    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
    """

    def __init__(self, violation, confidence, url, queries, article, chains):
        self.violation = violation
        self.confidence = confidence
        self.url = url
        self.queries = queries
        self.article_chain = article
        # *chains* is a two-item sequence: (source chain, delta chain).
        self.source_chain = chains[0]
        self.delta_chain = chains[1]

    def __repr__(self):
        """Return the canonical string representation of the result."""
        # BUG FIX: conversion spec was "{3|r}" (a '|' typo), which raises at
        # format time; the repr conversion is spelled "{3!r}".
        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
        return res.format(self.violation, self.confidence, self.url,
                          self.queries)

    def __str__(self):
        """Return a nice string representation of the result."""
        res = "<CopyvioCheckResult ({0} with {1} conf)>"
        return res.format(self.violation, self.confidence)
@@ -29,6 +29,7 @@ import sqlite3 as sqlite | |||||
from earwigbot import __version__ | from earwigbot import __version__ | ||||
from earwigbot.exceptions import SiteNotFoundError | from earwigbot.exceptions import SiteNotFoundError | ||||
from earwigbot.wiki.copyvios.exclusions import ExclusionsDB | |||||
from earwigbot.wiki.site import Site | from earwigbot.wiki.site import Site | ||||
__all__ = ["SitesDB"] | __all__ = ["SitesDB"] | ||||
@@ -58,11 +59,16 @@ class SitesDB(object): | |||||
"""Set up the manager with an attribute for the base Bot object.""" | """Set up the manager with an attribute for the base Bot object.""" | ||||
self.config = bot.config | self.config = bot.config | ||||
self._logger = bot.logger.getChild("wiki") | self._logger = bot.logger.getChild("wiki") | ||||
self._sites = {} # Internal site cache | self._sites = {} # Internal site cache | ||||
self._sitesdb = path.join(bot.config.root_dir, "sites.db") | self._sitesdb = path.join(bot.config.root_dir, "sites.db") | ||||
self._cookie_file = path.join(bot.config.root_dir, ".cookies") | self._cookie_file = path.join(bot.config.root_dir, ".cookies") | ||||
self._cookiejar = None | self._cookiejar = None | ||||
excl_db = path.join(bot.config.root_dir, "exclusions.db") | |||||
excl_logger = self._logger.getChild("exclusionsdb") | |||||
self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger) | |||||
def __repr__(self): | def __repr__(self): | ||||
"""Return the canonical string representation of the SitesDB.""" | """Return the canonical string representation of the SitesDB.""" | ||||
res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})" | res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})" | ||||
@@ -195,6 +201,7 @@ class SitesDB(object): | |||||
if search_config: | if search_config: | ||||
nltk_dir = path.join(self.config.root_dir, ".nltk") | nltk_dir = path.join(self.config.root_dir, ".nltk") | ||||
search_config["nltk_dir"] = nltk_dir | search_config["nltk_dir"] = nltk_dir | ||||
search_config["exclusions_db"] = self._exclusions_db | |||||
if not sql: | if not sql: | ||||
sql = config.wiki.get("sql", {}) | sql = config.wiki.get("sql", {}) | ||||
@@ -379,6 +386,7 @@ class SitesDB(object): | |||||
if search_config: | if search_config: | ||||
nltk_dir = path.join(self.config.root_dir, ".nltk") | nltk_dir = path.join(self.config.root_dir, ".nltk") | ||||
search_config["nltk_dir"] = nltk_dir | search_config["nltk_dir"] = nltk_dir | ||||
search_config["exclusions_db"] = self._exclusions_db | |||||
if not sql: | if not sql: | ||||
sql = config.wiki.get("sql", {}) | sql = config.wiki.get("sql", {}) | ||||