From a074da853bd8956803b9f0061e12d4ca1d32cff0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jul 2012 14:44:15 -0400 Subject: [PATCH] More work on copyvios, including an exclusions database (#5) * Added exclusions module with a fully implemented ExclusionsDB that can pull from multiple sources for different sites. * Moved CopyvioCheckResult to its own module, to be imported by __init__. * Some other related changes. --- docs/api/earwigbot.wiki.copyvios.rst | 14 +++ docs/toolset.rst | 6 +- earwigbot/wiki/copyvios/__init__.py | 56 +++--------- earwigbot/wiki/copyvios/exclusions.py | 155 ++++++++++++++++++++++++++++++++++ earwigbot/wiki/copyvios/result.py | 60 +++++++++++++ earwigbot/wiki/sitesdb.py | 8 ++ 6 files changed, 252 insertions(+), 47 deletions(-) create mode 100644 earwigbot/wiki/copyvios/exclusions.py create mode 100644 earwigbot/wiki/copyvios/result.py diff --git a/docs/api/earwigbot.wiki.copyvios.rst b/docs/api/earwigbot.wiki.copyvios.rst index 7dbcf39..abddf7a 100644 --- a/docs/api/earwigbot.wiki.copyvios.rst +++ b/docs/api/earwigbot.wiki.copyvios.rst @@ -8,6 +8,13 @@ copyvios Package :members: :undoc-members: +:mod:`exclusions` Module +------------------------ + +.. automodule:: earwigbot.wiki.copyvios.exclusions + :members: + :undoc-members: + :mod:`markov` Module -------------------- @@ -24,6 +31,13 @@ copyvios Package :undoc-members: :show-inheritance: +:mod:`result` Module +-------------------- + +.. automodule:: earwigbot.wiki.copyvios.result + :members: + :undoc-members: + :mod:`search` Module -------------------- diff --git a/docs/toolset.rst b/docs/toolset.rst index fcdfc6d..e2258c8 100644 --- a/docs/toolset.rst +++ b/docs/toolset.rst @@ -48,9 +48,9 @@ wikis, you can usually use code like this:: This works because EarwigBot assumes that the URL for the site is ``"//{lang}.{project}.org"``, the API is at ``/w/api.php``, and the SQL -connection info (if any) are stored as ``config.wiki["sql"]``. 
This might -change if you're dealing with non-WMF wikis, where the code might look -something more like:: +connection info (if any) is stored as ``config.wiki["sql"]``. This might change +if you're dealing with non-WMF wikis, where the code might look something more +like:: project, lang = "mywiki", "it" try: diff --git a/earwigbot/wiki/copyvios/__init__.py b/earwigbot/wiki/copyvios/__init__.py index cf2ddde..0f29403 100644 --- a/earwigbot/wiki/copyvios/__init__.py +++ b/earwigbot/wiki/copyvios/__init__.py @@ -33,47 +33,10 @@ except ImportError: from earwigbot import exceptions from earwigbot.wiki.copyvios.markov import MarkovChain, MarkovChainIntersection from earwigbot.wiki.copyvios.parsers import ArticleTextParser, HTMLTextParser +from earwigbot.wiki.copyvios.result import CopyvioCheckResult from earwigbot.wiki.copyvios.search import YahooBOSSSearchEngine -__all__ = ["CopyvioCheckResult", "CopyvioMixIn"] - -class CopyvioCheckResult(object): - """ - **EarwigBot: Wiki Toolset: Copyvio Check Result** - - A class holding information about the results of a copyvio check. 
- - *Attributes:* - - - :py:attr:`violation`: ``True`` if this is a violation, else ``False`` - - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy - - :py:attr:`url`: the URL of the violated page - - :py:attr:`queries`: the number of queries used to reach a result - - :py:attr:`article_chain`: the MarkovChain of the article text - - :py:attr:`source_chain`: the MarkovChain of the violated page text - - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two - """ - - def __init__(self, violation, confidence, url, queries, article, chains): - self.violation = violation - self.confidence = confidence - self.url = url - self.queries = queries - self.article_chain = article - self.source_chain = chains[0] - self.delta_chain = chains[1] - - def __repr__(self): - """Return the canonical string representation of the result.""" - res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3|r})" - return res.format(self.violation, self.confidence, self.url, - self.queries) - - def __str__(self): - """Return a nice string representation of the result.""" - res = "" - return res.format(self.violation, self.confidence) - +__all__ = ["CopyvioMixIn"] class CopyvioMixIn(object): """ @@ -88,6 +51,7 @@ class CopyvioMixIn(object): def __init__(self, site): self._search_config = site._search_config + self._exclusions_db = self._search_config["exclusions_db"] self._opener = build_opener() self._opener.addheaders = site._opener.addheaders @@ -156,8 +120,9 @@ class CopyvioMixIn(object): interquery_sleep=1): """Check the page for copyright violations. - Returns a :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` - object with information on the results of the check. + Returns a + :py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object + with information on the results of the check. *max_queries* is self-explanatory; we will never make more than this number of queries in a given check. 
If it's lower than 0, we will not @@ -171,6 +136,7 @@ class CopyvioMixIn(object): :py:exc:`~earwigbot.exceptions.SearchQueryError`, ...) on errors. """ searcher = self._select_search_engine() + self._exclusions_db.sync(self.site.name) handled_urls = [] best_confidence = 0 best_match = None @@ -193,6 +159,8 @@ class CopyvioMixIn(object): urls = [url for url in urls if url not in handled_urls] for url in urls: handled_urls.append(url) + if self._exclusions_db.check(self.site.name, url): + continue conf, chains = self._copyvio_compare_content(article_chain, url) if conf > best_confidence: best_confidence = conf @@ -216,9 +184,9 @@ class CopyvioMixIn(object): This is essentially a reduced version of the above - a copyivo comparison is made using Markov chains and the result is returned in a - :py:class:`~earwigbot.wiki.copyvios.CopyvioCheckResult` object - but - without using a search engine, since the suspected "violated" URL is - supplied from the start. + :py:class:`~earwigbot.wiki.copyvios.result.CopyvioCheckResult` object - + but without using a search engine, since the suspected "violated" URL + is supplied from the start. Its primary use is to generate a result when the URL is retrieved from a cache, like the one used in EarwigBot's Toolserver site. 
After a diff --git a/earwigbot/wiki/copyvios/exclusions.py b/earwigbot/wiki/copyvios/exclusions.py new file mode 100644 index 0000000..fdbaa39 --- /dev/null +++ b/earwigbot/wiki/copyvios/exclusions.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2009-2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+
+import re
+import sqlite3 as sqlite
+from threading import Lock
+from time import time
+
+from earwigbot import exceptions
+
+__all__ = ["ExclusionsDB"]
+
+default_sources = {
+    "enwiki": [
+        "Wikipedia:Mirrors and forks/Abc", "Wikipedia:Mirrors and forks/Def",
+        "Wikipedia:Mirrors and forks/Ghi", "Wikipedia:Mirrors and forks/Jkl",
+        "Wikipedia:Mirrors and forks/Mno", "Wikipedia:Mirrors and forks/Pqr",
+        "Wikipedia:Mirrors and forks/Stu", "Wikipedia:Mirrors and forks/Vwxyz"
+    ]
+}
+
+class ExclusionsDB(object):
+    """
+    **EarwigBot: Wiki Toolset: Exclusions Database Manager**
+
+    Controls the :file:`exclusions.db` file, which stores URLs excluded from
+    copyright violation checks on account of being known mirrors, for example.
+    """
+
+    def __init__(self, sitesdb, dbfile, logger):
+        self._sitesdb = sitesdb
+        self._dbfile = dbfile
+        self._logger = logger
+        self._db_access_lock = Lock()
+
+    def _create(self):
+        """Initialize the exclusions database with its necessary tables."""
+        script = """
+            CREATE TABLE sources (source_sitename, source_page);
+            CREATE TABLE updates (update_sitename, update_time);
+            CREATE TABLE exclusions (exclusion_sitename, exclusion_url);
+        """
+        query = "INSERT INTO sources VALUES (?, ?);"
+        sources = []
+        for sitename, pages in default_sources.items():
+            sources.extend((sitename, page) for page in pages)
+
+        with sqlite.connect(self._dbfile) as conn:
+            conn.executescript(script)
+            conn.executemany(query, sources)
+
+    def _load_source(self, site, source):
+        """Load from a specific source and return a set of non-empty URLs."""
+        urls = set()
+        try:
+            data = site.get_page(source).get()
+        except exceptions.PageNotFoundError:
+            return urls
+
+        regexes = [
+            r"url\s*=\s*(?:https?:)?(?://)?(.*)",
+            r"\*\s*Site:\s*\[?(?:https?:)?(?://)?(.*)\]?"
+        ]
+        for regex in regexes:  # one group => findall() yields plain strings
+            urls.update(url.lower() for url in re.findall(regex, data, re.I) if url)
+        return urls
+
+    def _update(self, sitename):
+        """Update the database from listed sources in the index."""
+        query1 = "SELECT source_page FROM sources WHERE source_sitename = ?;"
+        query2 = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
+        query3 = "DELETE FROM exclusions WHERE exclusion_sitename = ? AND exclusion_url = ?"
+        query4 = "INSERT INTO exclusions VALUES (?, ?);"
+        query5 = "SELECT 1 FROM updates WHERE update_sitename = ?;"
+        query6 = "UPDATE updates SET update_time = ? WHERE update_sitename = ?;"
+        query7 = "INSERT INTO updates VALUES (?, ?);"
+
+        site = self._sitesdb.get_site(sitename)
+        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
+            urls = set()
+            for (source,) in conn.execute(query1, (sitename,)):
+                urls |= self._load_source(site, source)
+            for (url,) in conn.execute(query2, (sitename,)):
+                if url in urls:
+                    urls.remove(url)  # already stored; don't re-insert it
+                else:
+                    conn.execute(query3, (sitename, url))  # no longer listed
+            conn.executemany(query4, [(sitename, url) for url in urls])
+            if conn.execute(query5, (sitename,)).fetchone():
+                conn.execute(query6, (time(), sitename))
+            else:
+                conn.execute(query7, (sitename, time()))
+
+    def _get_last_update(self, sitename):
+        """Return the UNIX timestamp of the last time the db was updated."""
+        query = "SELECT update_time FROM updates WHERE update_sitename = ?;"
+        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
+            try:
+                result = conn.execute(query, (sitename,)).fetchone()
+            except sqlite.OperationalError:  # tables missing: first run
+                self._create()
+                return 0
+        return result[0] if result else 0
+
+    def sync(self, sitename):
+        """Update the database if it hasn't been updated in the past month.
+
+        This only updates the exclusions database for the *sitename* site.
+        """
+        max_staleness = 60 * 60 * 24 * 30
+        time_since_update = int(time() - self._get_last_update(sitename))
+        if time_since_update > max_staleness:
+            log = "Updating stale database: {0} (last updated {1} seconds ago)"
+            self._logger.info(log.format(sitename, time_since_update))
+            self._update(sitename)
+        else:
+            log = "Database for {0} is still fresh (last updated {1} seconds ago)"
+            self._logger.debug(log.format(sitename, time_since_update))
+
+    def check(self, sitename, url):
+        """Check whether a given URL is in the exclusions database.
+
+        Return ``True`` if the URL is in the database, or ``False`` otherwise.
+        """
+        normalized = re.sub("https?://", "", url.lower())
+        query = "SELECT exclusion_url FROM exclusions WHERE exclusion_sitename = ?"
+        with sqlite.connect(self._dbfile) as conn, self._db_access_lock:
+            for row in conn.execute(query, (sitename,)):
+                if normalized.startswith(row[0]):
+                    log = "Exclusion detected in {0} for {1}"
+                    self._logger.debug(log.format(sitename, url))
+                    return True
+
+        log = "No exclusions in {0} for {1}".format(sitename, url)
+        self._logger.debug(log)
+        return False
diff --git a/earwigbot/wiki/copyvios/result.py b/earwigbot/wiki/copyvios/result.py
new file mode 100644
index 0000000..0c3e98f
--- /dev/null
+++ b/earwigbot/wiki/copyvios/result.py
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2009-2012 Ben Kurtovic
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+__all__ = ["CopyvioCheckResult"]
+
+class CopyvioCheckResult(object):
+    """
+    **EarwigBot: Wiki Toolset: Copyvio Check Result**
+
+    A class holding information about the results of a copyvio check.
+
+    *Attributes:*
+
+    - :py:attr:`violation`: ``True`` if this is a violation, else ``False``
+    - :py:attr:`confidence`: a float between 0 and 1 indicating accuracy
+    - :py:attr:`url`: the URL of the violated page
+    - :py:attr:`queries`: the number of queries used to reach a result
+    - :py:attr:`article_chain`: the MarkovChain of the article text
+    - :py:attr:`source_chain`: the MarkovChain of the violated page text
+    - :py:attr:`delta_chain`: the MarkovChainIntersection comparing the two
+    """
+
+    def __init__(self, violation, confidence, url, queries, article, chains):
+        self.violation = violation
+        self.confidence = confidence
+        self.url = url
+        self.queries = queries
+        self.article_chain = article
+        self.source_chain = chains[0]  # chains is (source_chain, delta_chain)
+        self.delta_chain = chains[1]
+
+    def __repr__(self):
+        """Return the canonical string representation of the result."""
+        res = "CopyvioCheckResult(violation={0!r}, confidence={1!r}, url={2!r}, queries={3!r})"
+        return res.format(self.violation, self.confidence, self.url,
+                          self.queries)
+
+    def __str__(self):
+        """Return a nice string representation of the result."""
+        res = "<CopyvioCheckResult ({0} with {1} conf)>"
+        return res.format(self.violation, self.confidence)
diff --git a/earwigbot/wiki/sitesdb.py b/earwigbot/wiki/sitesdb.py
index cdff1fe..9d2c828 100644
--- a/earwigbot/wiki/sitesdb.py
+++ 
b/earwigbot/wiki/sitesdb.py @@ -29,6 +29,7 @@ import sqlite3 as sqlite from earwigbot import __version__ from earwigbot.exceptions import SiteNotFoundError +from earwigbot.wiki.copyvios.exclusions import ExclusionsDB from earwigbot.wiki.site import Site __all__ = ["SitesDB"] @@ -58,11 +59,16 @@ class SitesDB(object): """Set up the manager with an attribute for the base Bot object.""" self.config = bot.config self._logger = bot.logger.getChild("wiki") + self._sites = {} # Internal site cache self._sitesdb = path.join(bot.config.root_dir, "sites.db") self._cookie_file = path.join(bot.config.root_dir, ".cookies") self._cookiejar = None + excl_db = path.join(bot.config.root_dir, "exclusions.db") + excl_logger = self._logger.getChild("exclusionsdb") + self._exclusions_db = ExclusionsDB(self, excl_db, excl_logger) + def __repr__(self): """Return the canonical string representation of the SitesDB.""" res = "SitesDB(config={0!r}, sitesdb={1!r}, cookie_file={2!r})" @@ -195,6 +201,7 @@ class SitesDB(object): if search_config: nltk_dir = path.join(self.config.root_dir, ".nltk") search_config["nltk_dir"] = nltk_dir + search_config["exclusions_db"] = self._exclusions_db if not sql: sql = config.wiki.get("sql", {}) @@ -379,6 +386,7 @@ class SitesDB(object): if search_config: nltk_dir = path.join(self.config.root_dir, ".nltk") search_config["nltk_dir"] = nltk_dir + search_config["exclusions_db"] = self._exclusions_db if not sql: sql = config.wiki.get("sql", {})