diff --git a/.gitignore b/.gitignore
index 6a014f5..7e00121 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 .sass-cache
 .DS_Store
+.my.cnf
 
 # github premade rules
 *.py[cod]
diff --git a/app.py b/app.py
index c4083c9..2e3b0c8 100644
--- a/app.py
+++ b/app.py
@@ -5,6 +5,7 @@ Module to contain all the project's Flask server plumbing.
 from flask import Flask
 from flask import render_template, session
 
+from bitshift.database import Database
 from bitshift.query import parse_query
 
 app = Flask(__name__)
@@ -12,7 +13,9 @@ app.config.from_object("bitshift.config")
 
 app_env = app.jinja_env
 app_env.line_statement_prefix = "="
-app_env.globals.update(assets = assets)
+app_env.globals.update(assets=assets)
+
+database = Database()
 
 @app.route("/")
 def index():
@@ -20,8 +23,8 @@ def index():
 
 @app.route("/search/<query>")
 def search(query):
-    ## tree = parse_query(query)
-    ## database.search(tree)
+    tree = parse_query(query)
+    database.search(tree)
     pass
 
 if __name__ == "__main__":
diff --git a/bitshift/database.py b/bitshift/database.py
deleted file mode 100644
index b8995ee..0000000
--- a/bitshift/database.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""
-Module with classes and functions to handle communication with the MySQL
-database backend, which manages the search index.
-"""
-
-import oursql
-
-class Database(object):
-    """Represents the MySQL database."""
-
-    def __init__(self):
-        pass
-
-    def _connect(self):
-        pass
-
-    def _create(self):
-        pass
diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py
new file mode 100644
index 0000000..75f39da
--- /dev/null
+++ b/bitshift/database/__init__.py
@@ -0,0 +1,153 @@
+"""
+Subpackage with classes and functions to handle communication with the MySQL
+database backend, which manages the search index.
+"""
+
+import os
+
+import mmh3
+import oursql
+
+from .migration import VERSION, MIGRATIONS
+
+__all__ = ["Database"]
+
+class Database(object):
+    """Represents the MySQL database."""
+
+    def __init__(self, migrate=False):
+        self._conn = self._connect()
+        self._check_version(migrate)
+
+    def _connect(self):
+        """Establish a connection to the database."""
+        root = os.path.dirname(os.path.abspath(__file__))
+        default_file = os.path.join(root, ".my.cnf")
+        return oursql.connect(db="bitshift", read_default_file=default_file,
+                              autoping=True, autoreconnect=True)
+
+    def _migrate(self, cursor, current):
+        """Migrate the database to the latest schema version."""
+        for version in xrange(current, VERSION):
+            print "Migrating to %d..." % (version + 1)
+            for query in MIGRATIONS[version - 1]:
+                cursor.execute(query)
+            cursor.execute("UPDATE version SET version = ?", (version + 1,))
+
+    def _check_version(self, migrate):
+        """Check the database schema version and respond accordingly.
+
+        If the schema is out of date, migrate if *migrate* is True, else raise
+        an exception.
+        """
+        with self._conn.cursor() as cursor:
+            cursor.execute("SELECT version FROM version")
+            version = cursor.fetchone()[0]
+            if version < VERSION:
+                if migrate:
+                    self._migrate(cursor, version)
+                else:
+                    err = "Database schema out of date. " \
+                          "Run `python -m bitshift.database.migration`."
+                    raise RuntimeError(err)
+
+    def _get_codelets_from_ids(self, cursor, ids):
+        """Return a list of Codelet objects given a list of codelet IDs."""
+        raise NotImplementedError() ## TODO
+
+    def _decompose_url(self, cursor, url):
+        """Break up a URL into an origin (with a URL base) and a suffix."""
+        query = """SELECT origin_id, SUBSTR(?, LENGTH(origin_url_base))
+                   FROM origins
+                   WHERE origin_url_base IS NOT NULL
+                   AND ? LIKE CONCAT(origin_url_base, "%")"""
+
+        cursor.execute(query, (url, url))
+        result = cursor.fetchone()
+        return result if result else (1, url)
+
+    def _insert_symbols(self, cursor, code_id, sym_type, symbols):
+        """Insert a list of symbols of a given type into the database."""
+        sym_types = ["functions", "classes", "variables"]
+        query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)"
+        query2 = """INSERT INTO symbol_locations VALUES
+                    (DEFAULT, ?, ?, ?, ?, ?, ?)"""
+
+        for (name, decls, uses) in symbols:
+            cursor.execute(query1, (code_id, sym_types.index(sym_type), name))
+            sym_id = cursor.lastrowid
+            params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] +
+                      [tuple([sym_id, 1] + list(loc)) for loc in uses])
+            cursor.executemany(query2, params)
+
+    def close(self):
+        """Disconnect from the database."""
+        self._conn.close()
+
+    def search(self, query, page=1):
+        """
+        Search the database for a query and return the *n*\ th page of results.
+
+        :param query: The query to search for.
+        :type query: :py:class:`~.query.tree.Tree`
+        :param page: The result page to display.
+        :type page: int
+
+        :return: The total number of results, and the *n*\ th page of results.
+        :rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s)
+        """
+        query1 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp
+                    FROM cache
+                    INNER JOIN cache_data ON cache_id = cdata_cache
+                    WHERE cache_id = ?"""
+        query2 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)"
+        query3 = "INSERT INTO cache_data VALUES (?, ?)"
+
+        cache_id = mmh3.hash64(str(page) + ":" + query.serialize())[0]
+
+        with self._conn.cursor() as cursor:
+            cursor.execute(query1, (cache_id,))
+            results = cursor.fetchall()
+            if results: # Cache hit
+                num_results = results[0][1] * (10 ** results[0][2])
+                ids = [res[0] for res in results]
+            else: # Cache miss
+                ## TODO: build and execute search query
+                results = cursor.fetchall()
+                ids = NotImplemented ## TODO: extract ids from results
+                num_results = NotImplemented ## TODO: num if results else 0
+                num_exp = max(len(str(num_results)) - 3, 0)
+                num_results = int(round(num_results, -num_exp))
+                num_mnt = num_results / (10 ** num_exp)
+                cursor.execute(query2, (cache_id, num_mnt, num_exp))
+                cursor.executemany(query3, [(cache_id, c_id) for c_id in ids])
+            return (num_results, self._get_codelets_from_ids(cursor, ids))
+
+    def insert(self, codelet):
+        """
+        Insert a codelet into the database.
+
+        :param codelet: The codelet to insert.
+        :type codelet: :py:class:`.Codelet`
+        """
+        query1 = """INSERT INTO code VALUES (?, ?, ?)
+                    ON DUPLICATE KEY UPDATE code_id=code_id"""
+        query2 = """INSERT INTO codelets VALUES
+                    (DEFAULT, ?, ?, ?, ?, ?, ?, ?)"""
+        query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)"
+
+        hash_key = str(codelet.language) + ":" + codelet.code.encode("utf8")
+        code_id = mmh3.hash64(hash_key)[0]
+
+        with self._conn.cursor() as cursor:
+            cursor.execute(query1, (code_id, codelet.language, codelet.code))
+            if cursor.rowcount == 1:
+                for sym_type, symbols in codelet.symbols.iteritems():
+                    self._insert_symbols(cursor, code_id, sym_type, symbols)
+            origin, url = self._decompose_url(cursor, codelet.url)
+            cursor.execute(query2, (codelet.name, code_id, origin, url,
+                                    codelet.rank, codelet.date_created,
+                                    codelet.date_modified))
+            codelet_id = cursor.lastrowid
+            authors = [(codelet_id, a[0], a[1]) for a in codelet.authors]
+            cursor.executemany(query3, authors)
diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py
new file mode 100644
index 0000000..24f744a
--- /dev/null
+++ b/bitshift/database/migration.py
@@ -0,0 +1,97 @@
+"""
+Contains information about database schema versions, and SQL queries to update
+between them.
+"""
+
+VERSION = 6
+
+MIGRATIONS = [
+    # 1 -> 2
+    [
+        """ALTER TABLE `codelets`
+           DROP FOREIGN KEY `codelets_ibfk_1`""",
+        """ALTER TABLE `code`
+           DROP KEY `code_hash`,
+           DROP COLUMN `code_hash`,
+           MODIFY COLUMN `code_id` BIGINT NOT NULL""",
+        """ALTER TABLE `codelets`
+           MODIFY COLUMN `codelet_code_id` BIGINT NOT NULL,
+           ADD KEY (`codelet_lang`),
+           ADD CONSTRAINT `codelets_ibfk_1` FOREIGN KEY (`codelet_code_id`)
+               REFERENCES `code` (`code_id`)
+               ON DELETE RESTRICT ON UPDATE CASCADE""",
+        """ALTER TABLE `symbols`
+           ADD COLUMN `symbol_end_row` INT UNSIGNED NOT NULL,
+           ADD COLUMN `symbol_end_col` INT UNSIGNED NOT NULL"""
+    ],
+    # 2 -> 3
+    [
+        """ALTER TABLE `symbols`
+           DROP FOREIGN KEY `symbols_ibfk_1`,
+           CHANGE COLUMN `symbol_codelet` `symbol_code` BIGINT NOT NULL,
+           ADD CONSTRAINT `symbols_ibfk_1` FOREIGN KEY (`symbol_code`)
+               REFERENCES `code` (`code_id`)
+               ON DELETE CASCADE ON UPDATE CASCADE"""
+    ],
+    # 3 -> 4
+    [
+        """ALTER TABLE `symbols`
+           DROP COLUMN `symbol_row`,
+           DROP COLUMN `symbol_col`,
+           DROP COLUMN `symbol_end_row`,
+           DROP COLUMN `symbol_end_col`""",
+        """CREATE TABLE `symbol_locations` (
+           `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
+           `sloc_symbol` BIGINT UNSIGNED NOT NULL,
+           `sloc_type` TINYINT UNSIGNED NOT NULL,
+           `sloc_row` INT UNSIGNED NOT NULL,
+           `sloc_col` INT UNSIGNED NOT NULL,
+           `sloc_end_row` INT UNSIGNED NOT NULL,
+           `sloc_end_col` INT UNSIGNED NOT NULL,
+           PRIMARY KEY (`sloc_id`),
+           FOREIGN KEY (`sloc_symbol`)
+               REFERENCES `symbols` (`symbol_id`)
+               ON DELETE CASCADE ON UPDATE CASCADE
+           ) ENGINE=InnoDB"""
+    ],
+    # 4 -> 5
+    [
+        """ALTER TABLE `origins`
+           MODIFY COLUMN `origin_name` VARCHAR(64) DEFAULT NULL,
+           MODIFY COLUMN `origin_url` VARCHAR(512) DEFAULT NULL,
+           MODIFY COLUMN `origin_url_base` VARCHAR(512) DEFAULT NULL"""
+    ],
+    # 5 -> 6
+    [
+        """ALTER TABLE `code`
+           ADD COLUMN `code_lang` SMALLINT UNSIGNED DEFAULT NULL
+               AFTER `code_id`,
+           ADD KEY (`code_lang`)""",
+        """ALTER TABLE `codelets`
+           DROP KEY `codelet_lang`,
+           DROP COLUMN `codelet_lang`""",
+        """ALTER TABLE `cache_data`
+           DROP FOREIGN KEY `cache_data_ibfk_1`""",
+        """ALTER TABLE `cache`
+           MODIFY COLUMN `cache_id` BIGINT NOT NULL,
+           DROP COLUMN `cache_hash`,
+           DROP COLUMN `cache_last_used`,
+           MODIFY COLUMN `cache_count_mnt` SMALLINT UNSIGNED NOT NULL""",
+        """ALTER TABLE `cache_data`
+           MODIFY COLUMN `cdata_cache` BIGINT NOT NULL,
+           ADD PRIMARY KEY (`cdata_cache`, `cdata_codelet`),
+           ADD CONSTRAINT `cache_data_ibfk_1` FOREIGN KEY (`cdata_codelet`)
+               REFERENCES `codelets` (`codelet_id`)
+               ON DELETE CASCADE ON UPDATE CASCADE""",
+        """CREATE EVENT `flush_cache`
+           ON SCHEDULE EVERY 1 HOUR
+           DO
+               DELETE FROM `cache`
+               WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);"""
+    ]
+]
+
+if __name__ == "__main__":
+    from . import Database
+
+    Database(migrate=True).close()
diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql
new file mode 100644
index 0000000..8634416
--- /dev/null
+++ b/bitshift/database/schema.sql
@@ -0,0 +1,114 @@
+-- Schema version 6
+
+CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci;
+USE `bitshift`;
+
+CREATE TABLE `version` (
+    `version` INT UNSIGNED NOT NULL
+) ENGINE=InnoDB;
+INSERT INTO `version` VALUES (6);
+
+CREATE TABLE `origins` (
+    `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT,
+    `origin_name` VARCHAR(64) DEFAULT NULL,
+    `origin_url` VARCHAR(512) DEFAULT NULL,
+    `origin_url_base` VARCHAR(512) DEFAULT NULL,
+    `origin_image` BLOB DEFAULT NULL,
+    PRIMARY KEY (`origin_id`)
+) ENGINE=InnoDB;
+INSERT INTO `origins` VALUES (1, NULL, NULL, NULL, NULL);
+
+CREATE TABLE `code` (
+    `code_id` BIGINT NOT NULL,
+    `code_lang` SMALLINT UNSIGNED DEFAULT NULL,
+    `code_code` MEDIUMTEXT NOT NULL,
+    PRIMARY KEY (`code_id`),
+    KEY (`code_lang`),
+    FULLTEXT KEY (`code_code`)
+) ENGINE=InnoDB;
+
+CREATE TABLE `codelets` (
+    `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
+    `codelet_name` VARCHAR(300) NOT NULL,
+    `codelet_code_id` BIGINT NOT NULL,
+    `codelet_origin` TINYINT UNSIGNED NOT NULL,
+    `codelet_url` VARCHAR(512) NOT NULL,
+    `codelet_rank` FLOAT NOT NULL,
+    `codelet_date_created` DATETIME DEFAULT NULL,
+    `codelet_date_modified` DATETIME DEFAULT NULL,
+    PRIMARY KEY (`codelet_id`),
+    FULLTEXT KEY (`codelet_name`),
+    KEY (`codelet_rank`),
+    KEY (`codelet_date_created`),
+    KEY (`codelet_date_modified`),
+    FOREIGN KEY (`codelet_code_id`)
+        REFERENCES `code` (`code_id`)
+        ON DELETE RESTRICT ON UPDATE CASCADE,
+    FOREIGN KEY (`codelet_origin`)
+        REFERENCES `origins` (`origin_id`)
+        ON DELETE RESTRICT ON UPDATE CASCADE
+) ENGINE=InnoDB;
+
+CREATE TABLE `authors` (
+    `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
+    `author_codelet` BIGINT UNSIGNED NOT NULL,
+    `author_name` VARCHAR(128) NOT NULL,
+    `author_url` VARCHAR(512) DEFAULT NULL,
+    PRIMARY KEY (`author_id`),
+    FULLTEXT KEY (`author_name`),
+    FOREIGN KEY (`author_codelet`)
+        REFERENCES `codelets` (`codelet_id`)
+        ON DELETE CASCADE ON UPDATE CASCADE
+) ENGINE=InnoDB;
+
+CREATE TABLE `symbols` (
+    `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
+    `symbol_code` BIGINT NOT NULL,
+    `symbol_type` TINYINT UNSIGNED NOT NULL,
+    `symbol_name` VARCHAR(512) NOT NULL,
+    PRIMARY KEY (`symbol_id`),
+    KEY (`symbol_type`, `symbol_name`(32)),
+    FOREIGN KEY (`symbol_code`)
+        REFERENCES `code` (`code_id`)
+        ON DELETE CASCADE ON UPDATE CASCADE
+) ENGINE=InnoDB;
+
+CREATE TABLE `symbol_locations` (
+    `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
+    `sloc_symbol` BIGINT UNSIGNED NOT NULL,
+    `sloc_type` TINYINT UNSIGNED NOT NULL,
+    `sloc_row` INT UNSIGNED NOT NULL,
+    `sloc_col` INT UNSIGNED NOT NULL,
+    `sloc_end_row` INT UNSIGNED NOT NULL,
+    `sloc_end_col` INT UNSIGNED NOT NULL,
+    PRIMARY KEY (`sloc_id`),
+    FOREIGN KEY (`sloc_symbol`)
+        REFERENCES `symbols` (`symbol_id`)
+        ON DELETE CASCADE ON UPDATE CASCADE
+) ENGINE=InnoDB;
+
+CREATE TABLE `cache` (
+    `cache_id` BIGINT NOT NULL,
+    `cache_count_mnt` SMALLINT UNSIGNED NOT NULL,
+    `cache_count_exp` TINYINT UNSIGNED NOT NULL,
+    `cache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
+    PRIMARY KEY (`cache_id`)
+) ENGINE=InnoDB;
+
+CREATE TABLE `cache_data` (
+    `cdata_cache` BIGINT NOT NULL,
+    `cdata_codelet` BIGINT UNSIGNED NOT NULL,
+    PRIMARY KEY (`cdata_cache`, `cdata_codelet`),
+    FOREIGN KEY (`cdata_cache`)
+        REFERENCES `cache` (`cache_id`)
+        ON DELETE CASCADE ON UPDATE CASCADE,
+    FOREIGN KEY (`cdata_codelet`)
+        REFERENCES `codelets` (`codelet_id`)
+        ON DELETE CASCADE ON UPDATE CASCADE
+) ENGINE=InnoDB;
+
+CREATE EVENT `flush_cache`
+    ON SCHEDULE EVERY 1 HOUR
+    DO
+        DELETE FROM `cache`
+        WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);
diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py
index 7d6e0d5..6971c04 100644
--- a/bitshift/query/__init__.py
+++ b/bitshift/query/__init__.py
@@ -6,4 +6,6 @@ __all__ = ["parse_query"]
 
 def parse_query(query):
     # gets a string, returns a Tree
+    # TODO: note: resultant Trees should be normalized so that "foo OR bar"
+    # and "bar OR foo" result in equivalent trees
     pass
diff --git a/setup.py b/setup.py
index 1faa5b9..97441b7 100644
--- a/setup.py
+++ b/setup.py
@@ -4,8 +4,9 @@ setup(
     name = "bitshift",
     version = "0.1",
     packages = find_packages(),
-    install_requires = ["Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0",
-                        "BeautifulSoup>=3.2.1"],
+    install_requires = [
+        "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0",
+        "BeautifulSoup>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
     author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
     license = "MIT",
     url = "https://github.com/earwig/bitshift"