diff --git a/bitshift/codelet.py b/bitshift/codelet.py index acaa52b..92debf4 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -18,12 +18,14 @@ class Codelet(object): code was last modified. :ivar rank: (float) A quanitification of the source code's quality, as per available ratings (stars, forks, upvotes, etc.). - :ivar symbols: (dict) Dictionary containing dictionaries of functions, classes, - variable definitions, etc. + :ivar symbols: (dict) Dictionary containing dictionaries of functions, + classes, variable definitions, etc. + :ivar origin: (tuple) 3-tuple of (site_name, site_url, image_blob), as + added by the database. """ def __init__(self, name, code, filename, language, authors, code_url, - date_created, date_modified, rank): + date_created, date_modified, rank, symbols=None, origin=None): """ Create a Codelet instance. @@ -36,6 +38,8 @@ class Codelet(object): :param date_created: see :attr:`self.date_created` :param date_modified: see :attr:`self.date_modified` :param rank: see :attr:`self.rank` + :param symbols: see :attr:`self.symbols` + :param origin: see :attr:`self.origin` :type name: see :attr:`self.name` :type code: see :attr:`self.code` @@ -46,6 +50,8 @@ class Codelet(object): :type date_created: see :attr:`self.date_created` :type date_modified: see :attr:`self.date_modified` :type rank: see :attr:`self.rank` + :type symbols: see :attr:`self.symbols` + :type origin: see :attr:`self.origin` """ self.name = name @@ -57,3 +63,5 @@ class Codelet(object): self.date_created = date_created self.date_modified = date_modified self.rank = rank + self.symbols = symbols or {} + self.origin = origin or (None, None, None) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 07f46c7..e4fa430 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -9,6 +9,9 @@ import mmh3 import oursql from .migration import VERSION, MIGRATIONS +from ..codelet import Codelet +from ..query.nodes import (String, Regex, Text, Language, Author, Date, Symbol, + BinaryOp, UnaryOp) __all__ = ["Database"] @@ -51,23 +54,71 @@ class Database(object): "Run `python -m bitshift.database.migration`." raise RuntimeError(err) - def _search_with_query(self, cursor, query): - """Convert a query tree into SQL SELECTs, execute, and return results. + def _search_with_query(self, cursor, tree, page): + """Execute an SQL query based on a query tree, and return results. The returned data is a 2-tuple of (list of codelet IDs, estimated number of total results). """ - raise NotImplementedError() ## TODO - - results = cursor.fetchall() - ids = NotImplemented ## TODO: extract ids from results - num_results = NotImplemented ## TODO: num if results else 0 - + query, args = tree.build_query(page) + cursor.execute(query, args) + ids = [id for id, _ in cursor.fetchall()] + num_results = 0 # TODO: NotImplemented return ids, num_results + def _get_authors_for_codelet(self, cursor, codelet_id): + """Return a list of authors for a given codelet.""" + query = """SELECT author_name, author_url + FROM authors + WHERE author_codelet = ?""" + + cursor.execute(query, (codelet_id,)) + return cursor.fetchall() + + def _get_symbols_for_code(self, cursor, code_id): + """Return a list of symbols for a given codelet.""" + query = """SELECT symbol_type, symbol_name, sloc_type, sloc_row, + sloc_col, sloc_end_row, sloc_end_col + FROM symbols + INNER JOIN symbol_locations ON sloc_symbol = symbol_id + WHERE symbol_code = ?""" + + symbols = {type_: {} for type_ in Symbol.TYPES_INV} + cursor.execute(query, (code_id,)) + for type_, name, loc_type, row, col, erow, ecol in cursor.fetchall(): + sdict = symbols[Symbol.TYPES_INV[type_]] + if name not in sdict: + sdict[name] = ((), ()) + sdict[name][loc_type].append((row, col, erow, ecol)) + for type_, sdict in symbols.items(): + symbols[type_] = [(n, d, u) for n, (d, u) in sdict.iteritems()] + return symbols + def _get_codelets_from_ids(self, cursor, ids): """Return a list of Codelet objects given a list of codelet IDs.""" - raise NotImplementedError() ## TODO + query = """SELECT * + FROM codelets + INNER JOIN code ON codelet_code_id = code_id + INNER JOIN origins ON codelet_origin = origin_id + WHERE codelet_id = ?""" + + with self._conn.cursor(oursql.DictCursor) as dict_cursor: + dict_cursor.executemany(query, [(id,) for id in ids]) + for row in dict_cursor.fetchone(): + codelet_id = row["codelet_id"] + if row["origin_url_base"]: + url = row["codelet_url"] + else: + url = row["origin_url_base"] + row["codelet_url"] + origin = (row["origin_name"], row["origin_url"], + row["origin_image"]) + authors = self._get_authors_for_codelet(cursor, codelet_id) + symbols = self._get_symbols_for_code(cursor, row["code_id"]) + yield Codelet( + row["codelet_name"], row["code_code"], None, + row["code_lang"], authors, url, + row["codelet_date_created"], row["codelet_date_modified"], + row["codelet_rank"], symbols, origin) def _decompose_url(self, cursor, url): """Break up a URL into an origin (with a URL base) and a suffix.""" @@ -82,13 +133,12 @@ class Database(object): def _insert_symbols(self, cursor, code_id, sym_type, symbols): """Insert a list of symbols of a given type into the database.""" - sym_types = ["functions", "classes", "variables"] query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)" query2 = """INSERT INTO symbol_locations VALUES (DEFAULT, ?, ?, ?, ?, ?, ?)""" for (name, decls, uses) in symbols: - cursor.execute(query1, (code_id, sym_types.index(sym_type), name)) + cursor.execute(query1, (code_id, Symbol.TYPES_INV[sym_type], name)) sym_id = cursor.lastrowid params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] + [tuple([sym_id, 1] + list(loc)) for loc in uses]) @@ -132,7 +182,8 @@ class Database(object): num_mnt = num_results / (10 ** num_exp) cursor.execute(query2, (cache_id, num_mnt, num_exp)) cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) - return (num_results, self._get_codelets_from_ids(cursor, ids)) + codelet_gen = self._get_codelets_from_ids(cursor, ids) + return (num_results, list(codelet_gen)) def insert(self, codelet): """ diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 20cabd5..41d01cf 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -116,6 +116,10 @@ class _QueryParser(object): def _parse_term(self, term): """Parse a query term into a tree node and return it.""" + try: + term = term.decode("unicode_escape") + except UnicodeDecodeError: + raise QueryParseException('Invalid query term: "%s"' % term) if ":" in term and not term[0] == ":": prefix, arg = term.split(":", 1) invert = prefix.lower() == "not" @@ -135,7 +139,7 @@ class _QueryParser(object): Returns a 2-tuple of (first_marker_found, marker_index). """ - def _is_escaped(query, index): + def is_escaped(query, index): """Return whether a query marker is backslash-escaped.""" return (index > 0 and query[index - 1] == "\\" and (index < 2 or query[index - 2] != "\\")) @@ -143,7 +147,7 @@ class _QueryParser(object): best_marker, best_index = None, maxsize for marker in markers: index = query.find(marker) - if _is_escaped(query, index): + if is_escaped(query, index): _, new_index = self._scan_query(query[index + 1:], marker) index += new_index + 1 if index >= 0 and index < best_index: @@ -209,6 +213,9 @@ class _QueryParser(object): def parse_binary_op(op): """Parse a binary operator in a nested query list.""" index = nest.index(op) + if index == 0 or index == len(nest) - 1: + err = "Invalid query: '%s' given without argument." + raise QueryParseException(err % BinaryOp.OPS[op]) left = self._parse_nest(nest[:index]) right = self._parse_nest(nest[index + 1:]) return BinaryOp(left, op, right) @@ -222,6 +229,9 @@ class _QueryParser(object): return parse_binary_op(BinaryOp.AND) elif UnaryOp.NOT in nest: index = nest.index(UnaryOp.NOT) + if index == len(nest) - 1: + err = "Invalid query: '%s' given without argument." + raise QueryParseException(err % UnaryOp.OPS[UnaryOp.NOT]) right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:])) if index > 0: left = self._parse_nest(nest[:index]) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index b959118..5d157b5 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -15,6 +15,15 @@ class _Node(object): """Return a string sort key for the node.""" return "" + def parameterize(self, tables): + """Parameterize the node. + + Returns a 4-tuple of (conditional string, parameter list, rank list, + should-we-rank boolean). If the rank list is empty, then it is assumed + to contain the conditional string. + """ + return "", [], [], False + class _Literal(object): """Represents a literal component of a search query, present at the leaves. @@ -75,6 +84,20 @@ class Text(_Node): def sortkey(self): return self.text.sortkey() + def parameterize(self, tables): + tables |= {"code", "symbols"} + if isinstance(self.text, Regex): + ranks = ["(codelet_name REGEXP ?)", "(symbol_name REGEXP ?)", + "(code_code REGEXP ?)"] + text = self.text.regex + else: + ranks = ["(MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE))", + "(MATCH(code_code) AGAINST (? IN BOOLEAN MODE))", + "(symbol_name = ?)"] + text = self.text.string + cond = "(" + " OR ".join(ranks) + ")" + return cond, [text] * 3, ranks, True + class Language(_Node): """Represents a language node. @@ -94,6 +117,10 @@ class Language(_Node): def sortkey(self): return LANGS[self.lang] + def parameterize(self, tables): + tables |= {"code"} + return "(code_lang = ?)", [self.lang], [], False + class Author(_Node): """Represents a author node. @@ -113,6 +140,13 @@ class Author(_Node): def sortkey(self): return self.name.sortkey() + def parameterize(self, tables): + tables |= {"authors"} + if isinstance(self.name, Regex): + return "(author_name REGEXP ?)", [self.name.regex], [], False + cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))" + return cond, [self.name.string], [], True + class Date(_Node): """Represents a date node. @@ -144,38 +178,59 @@ class Date(_Node): def sortkey(self): return self.date.strftime("%Y%m%d%H%M%S") + def parameterize(self, tables): + column = {self.CREATE: "codelet_date_created", + self.MODIFY: "codelet_date_modified"}[self.type] + op = {self.BEFORE: "<=", self.AFTER: ">="}[self.relation] + return "(" + column + " " + op + " ?)", [self.date], [], False + class Symbol(_Node): """Represents a symbol node. Searches in symbol_type and symbol_name. """ - ALL = 0 - FUNCTION = 1 - CLASS = 2 - VARIABLE = 3 + ALL = -1 + FUNCTION = 0 + CLASS = 1 + VARIABLE = 2 + TYPES = {FUNCTION: "FUNCTION", CLASS: "CLASS", VARIABLE: "VARIABLE"} + TYPES_INV = ["functions", "classes", "variables"] def __init__(self, type_, name): """ :type type_: int (``ALL``, ``FUNCTION``, ``CLASS``, etc.) - :type name: :py:class:`.Literal` + :type name: :py:class:`._Literal` """ self.type = type_ self.name = name def __repr__(self): - types = {self.ALL: "ALL", self.FUNCTION: "FUNCTION", - self.CLASS: "CLASS", self.VARIABLE: "VARIABLE"} - return "Symbol({0}, {1})".format(types[self.type], self.name) + type_ = self.TYPES.get(self.type, "ALL") + return "Symbol({0}, {1})".format(type_, self.name) def sortkey(self): return self.name.sortkey() + def parameterize(self, tables): + tables |= {"code", "symbols"} + if isinstance(self.name, Regex): + cond, name = "symbol_name REGEXP ?", self.name.regex + else: + cond, name = "symbol_name = ?", self.name.string + if self.type == self.ALL: + types = ", ".join(str(type_) for type_ in self.TYPES) + cond += " AND symbol_type IN (%s)" % types + if self.type != self.ALL: + cond += " AND symbol_type = %d" % self.type + return "(" + cond + ")", [name], [], False + class BinaryOp(_Node): """Represents a relationship between two nodes: ``and``, ``or``.""" AND = object() OR = object() + OPS = {AND: "AND", OR: "OR"} def __init__(self, left, op, right): self.left = left @@ -183,25 +238,39 @@ class BinaryOp(_Node): self.right = right def __repr__(self): - ops = {self.AND: "AND", self.OR: "OR"} tmpl = "BinaryOp({0}, {1}, {2})" - return tmpl.format(self.left, ops[self.op], self.right) + return tmpl.format(self.left, self.OPS[self.op], self.right) def sortkey(self): return self.left.sortkey() + self.right.sortkey() + def parameterize(self, tables): + lcond, largs, lranks, need_lranks = self.left.parameterize(tables) + rcond, rargs, rranks, need_rranks = self.right.parameterize(tables) + lranks, rranks = lranks or [lcond], rranks or [rcond] + op = self.OPS[self.op] + cond = "(" + lcond + " " + op + " " + rcond + ")" + need_ranks = need_lranks or need_rranks or self.op == self.OR + return cond, largs + rargs, lranks + rranks, need_ranks + class UnaryOp(_Node): """Represents a transformation applied to one node: ``not``.""" NOT = object() + OPS = {NOT: "NOT"} def __init__(self, op, node): self.op = op self.node = node def __repr__(self): - ops = {self.NOT: "NOT"} - return "UnaryOp({0}, {1})".format(ops[self.op], self.node) + return "UnaryOp({0}, {1})".format(self.OPS[self.op], self.node) def sortkey(self): return self.node.sortkey() + + def parameterize(self, tables): + cond, args, ranks, need_ranks = self.node.parameterize(tables) + new_cond = "(" + self.OPS[self.op] + " " + cond + ")" + ranks = ranks or [cond] + return new_cond, args, ranks, need_ranks diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index 543889e..5da3f02 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -1,5 +1,12 @@ __all__ = ["Tree"] +QUERY_TEMPLATE = """SELECT codelet_id, (codelet_rank%s) AS score +FROM codelets %s +WHERE %s +GROUP BY codelet_id +ORDER BY score DESC +LIMIT %d OFFSET %d""".replace("\n", " ") + class Tree(object): """Represents a query tree.""" @@ -9,6 +16,11 @@ class Tree(object): def __repr__(self): return "Tree({0})".format(self._root) + @property + def root(self): + """The root node of the tree.""" + return self._root + def sortkey(self): """Return a string sort key for the query tree.""" return self._root.sortkey() @@ -20,3 +32,38 @@ class Tree(object): :rtype: str """ return repr(self) + + def build_query(self, page=1, page_size=10): + """Convert the query tree into a parameterized SQL SELECT statement. + + :param page: The page number to get results for. + :type page: int + :param page_size: The number of results per page. + :type page_size: int + + :return: SQL query data. + :rtype: 2-tuple of (SQL statement string, query parameter tuple) + """ + def get_table_joins(tables): + data = [ + ("code", "codelet_code_id", "code_id"), + ("authors", "author_codelet", "codelet_id"), + ("symbols", "symbol_code", "code_id") + ] + tmpl = "INNER JOIN %s ON %s = %s" + for args in data: + if args[0] in tables: + yield tmpl % args + + tables = set() + cond, arglist, ranks, need_ranks = self._root.parameterize(tables) + ranks = ranks or [cond] + if need_ranks: + score = " + ((%s) / %d)" % (" + ".join(ranks), len(ranks)) + else: + score = "" + joins = " ".join(get_table_joins(tables)) + offset = (page - 1) * page_size + + query = QUERY_TEMPLATE % (score, joins, cond, page_size, offset) + return query, tuple(arglist * 2 if need_ranks else arglist)