From 4fa8b9f444f17e840f134dc553f80225aa5faa55 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 16 May 2014 14:18:37 -0400 Subject: [PATCH 01/14] Improve exception behavior; cleanup. --- bitshift/query/__init__.py | 6 ++++++ bitshift/query/nodes.py | 8 ++++---- bitshift/query/tree.py | 5 +++++ 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 20cabd5..474f434 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -209,6 +209,9 @@ class _QueryParser(object): def parse_binary_op(op): """Parse a binary operator in a nested query list.""" index = nest.index(op) + if index == 0 or index == len(nest) - 1: + err = "Invalid query: '%s' given without argument." + raise QueryParseException(err % BinaryOp.OPS[op]) left = self._parse_nest(nest[:index]) right = self._parse_nest(nest[index + 1:]) return BinaryOp(left, op, right) @@ -222,6 +225,9 @@ class _QueryParser(object): return parse_binary_op(BinaryOp.AND) elif UnaryOp.NOT in nest: index = nest.index(UnaryOp.NOT) + if index == len(nest) - 1: + err = "Invalid query: '%s' given without argument." + raise QueryParseException(err % UnaryOp.OPS[op]) right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:])) if index > 0: left = self._parse_nest(nest[:index]) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index b959118..f54c4e3 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -176,6 +176,7 @@ class BinaryOp(_Node): """Represents a relationship between two nodes: ``and``, ``or``.""" AND = object() OR = object() + OPS = {AND: "AND", OR: "OR"} def __init__(self, left, op, right): self.left = left @@ -183,9 +184,8 @@ class BinaryOp(_Node): self.right = right def __repr__(self): - ops = {self.AND: "AND", self.OR: "OR"} tmpl = "BinaryOp({0}, {1}, {2})" - return tmpl.format(self.left, ops[self.op], self.right) + return tmpl.format(self.left, self.OPS[self.op], self.right) def sortkey(self): return self.left.sortkey() + self.right.sortkey() @@ -194,14 +194,14 @@ class BinaryOp(_Node): class UnaryOp(_Node): """Represents a transformation applied to one node: ``not``.""" NOT = object() + OPS = {NOT: "NOT"} def __init__(self, op, node): self.op = op self.node = node def __repr__(self): - ops = {self.NOT: "NOT"} - return "UnaryOp({0}, {1})".format(ops[self.op], self.node) + return "UnaryOp({0}, {1})".format(self.OPS[self.op], self.node) def sortkey(self): return self.node.sortkey() diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index 543889e..a13d8f3 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -9,6 +9,11 @@ class Tree(object): def __repr__(self): return "Tree({0})".format(self._root) + @property + def root(self): + """The root node of the tree.""" + return self._root + def sortkey(self): """Return a string sort key for the query tree.""" return self._root.sortkey() From cd27777f83464154f890c0688f81558150138a0f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 16 May 2014 14:37:43 -0400 Subject: [PATCH 02/14] Start exploding query trees. --- bitshift/database/__init__.py | 61 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 07f46c7..08d7f08 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -9,6 +9,8 @@ import mmh3 import oursql from .migration import VERSION, MIGRATIONS +from ..query.nodes import (String, Regex, Text, Language, Author, Date, Symbol, + BinaryOp, UnaryOp) __all__ = ["Database"] @@ -51,23 +53,64 @@ class Database(object): "Run `python -m bitshift.database.migration`." raise RuntimeError(err) - def _search_with_query(self, cursor, query): - """Convert a query tree into SQL SELECTs, execute, and return results. + def _explode_query_tree(self, tree): + """Convert a query tree into components of an SQL SELECT statement.""" + def _parse_node(node): + if isinstance(node, Text): + tables |= {"code", "symbols"} + # (FTS: codelet_name, =: symbol_name, FTS: code_code) vs. node.text (_Literal) + pass + elif isinstance(node, Language): + tables |= {"code"} + return "(code_lang = ?)", [node.lang] + elif isinstance(node, Author): + tables |= {"authors"} + # (FTS: author_name) vs. node.name (_Literal) + pass + elif isinstance(node, Date): + # read node.type, node.relation + # (>=/<=: codelet_date_created / codelet_date_modified) vs. node.date (datetime.datetime) + pass + elif isinstance(node, Symbol): + tables |= {"symbols"} + # (symbol_type, symbol_name) vs. (node.type, node.name) + pass + elif isinstance(node, BinaryOp): + left_cond, left_args = _parse_node(node.left) + right_cond, right_args = _parse_node(node.right) + op = node.OPS[node.op] + cond = "(" + left_cond + " " + op + " " + right_cond + ")" + return cond, left_args + right_args + elif isinstance(node, UnaryOp): + cond, args = _parse_node(node.node) + return "(" + node.OPS[node.op] + " " + cond + ")", args + + tables = set() + conditional, arglist = _parse_node(tree.root) + # joins = " ".join(tables) + + return conditional, joins, tuple(arglist) + + def _search_with_query(self, cursor, query, page): + """Execute an SQL query based on a query tree, and return results. The returned data is a 2-tuple of (list of codelet IDs, estimated number of total results). """ - raise NotImplementedError() ## TODO - - results = cursor.fetchall() - ids = NotImplemented ## TODO: extract ids from results - num_results = NotImplemented ## TODO: num if results else 0 - + conditional, joins, args = self._explode_query_tree(query) + base = "SELECT codelet_id FROM codelets %s WHERE %s LIMIT 10" + qstring = base % (joins, conditional) + if page > 1: + qstring += " OFFSET %d" % ((page - 1) * 10) + + cursor.execute(qstring, args) + ids = [id for id, in cursor.fetchall()] + num_results = 0 # TODO: NotImplemented return ids, num_results def _get_codelets_from_ids(self, cursor, ids): """Return a list of Codelet objects given a list of codelet IDs.""" - raise NotImplementedError() ## TODO + raise NotImplementedError() # TODO def _decompose_url(self, cursor, url): """Break up a URL into an origin (with a URL base) and a suffix.""" From 1111afc198aa7b3d605cbab63ae1d3405b8a14bb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 16 May 2014 14:45:36 -0400 Subject: [PATCH 03/14] Explode dates and symbols. --- bitshift/database/__init__.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 08d7f08..40fdc63 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -68,13 +68,18 @@ class Database(object): # (FTS: author_name) vs. node.name (_Literal) pass elif isinstance(node, Date): - # read node.type, node.relation - # (>=/<=: codelet_date_created / codelet_date_modified) vs. node.date (datetime.datetime) - pass + column = {node.CREATE: "codelet_date_created", + node.MODIFY: "codelet_date_modified"}[node.type] + op = {node.BEFORE: "<=", node.AFTER: ">="}[node.relation] + return "(" + column + " " + op + " ?)", [node.date] elif isinstance(node, Symbol): tables |= {"symbols"} - # (symbol_type, symbol_name) vs. (node.type, node.name) - pass + if node.type == node.ALL: + # OR all of the types of symbol_types + pass + else: + cond = "(symbol_type = ? AND symbol_name = ?)" + return cond, [node.type, node.name] elif isinstance(node, BinaryOp): left_cond, left_args = _parse_node(node.left) right_cond, right_args = _parse_node(node.right) From d3f6f226f1996a102943962077deb6af7817370b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 17 May 2014 20:33:08 -0400 Subject: [PATCH 04/14] Finish exploding symbols. --- bitshift/database/__init__.py | 12 ++++++------ bitshift/query/nodes.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 40fdc63..241ad66 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -74,12 +74,12 @@ class Database(object): return "(" + column + " " + op + " ?)", [node.date] elif isinstance(node, Symbol): tables |= {"symbols"} - if node.type == node.ALL: - # OR all of the types of symbol_types - pass - else: - cond = "(symbol_type = ? AND symbol_name = ?)" - return cond, [node.type, node.name] + cond_base = "(symbol_type = ? AND symbol_name = ?)" + if node.type != node.ALL: + return cond_base, [node.type, node.name] + cond = "(" + " OR ".join([cond_base] * len(node.TYPES)) + ")" + args = zip(node.TYPES.keys(), [node.name] * len(node.TYPES)) + return cond, [arg for tup in args for arg in tup] elif isinstance(node, BinaryOp): left_cond, left_args = _parse_node(node.left) right_cond, right_args = _parse_node(node.right) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index f54c4e3..3021ddf 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -154,6 +154,8 @@ class Symbol(_Node): FUNCTION = 1 CLASS = 2 VARIABLE = 3 + TYPES = {ALL: "ALL", FUNCTION: "FUNCTION", CLASS: "CLASS", + VARIABLE: "VARIABLE"} def __init__(self, type_, name): """ @@ -164,9 +166,7 @@ class Symbol(_Node): self.name = name def __repr__(self): - types = {self.ALL: "ALL", self.FUNCTION: "FUNCTION", - self.CLASS: "CLASS", self.VARIABLE: "VARIABLE"} - return "Symbol({0}, {1})".format(types[self.type], self.name) + return "Symbol({0}, {1})".format(self.TYPES[self.type], self.name) def sortkey(self): return self.name.sortkey() From 08ce46faeb63eea5a82dcf8252db8a5922018138 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 18 May 2014 20:55:14 -0400 Subject: [PATCH 05/14] Implement FTS for authors. --- bitshift/database/__init__.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 241ad66..4e5b8b8 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -65,8 +65,10 @@ class Database(object): return "(code_lang = ?)", [node.lang] elif isinstance(node, Author): tables |= {"authors"} - # (FTS: author_name) vs. node.name (_Literal) - pass + if isinstance(node.name, Regex): + return "(author_name REGEXP ?)", [node.name.regex] + cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))" + return cond, [node.name.string] elif isinstance(node, Date): column = {node.CREATE: "codelet_date_created", node.MODIFY: "codelet_date_modified"}[node.type] @@ -103,7 +105,10 @@ class Database(object): number of total results). """ conditional, joins, args = self._explode_query_tree(query) - base = "SELECT codelet_id FROM codelets %s WHERE %s LIMIT 10" + base = """SELECT codelet_id + FROM codelets %s + WHERE %s + ORDER BY codelet_rank LIMIT 10""" qstring = base % (joins, conditional) if page > 1: qstring += " OFFSET %d" % ((page - 1) * 10) From 8fbfd4c45c621fdcefc0dc1a225e7d36c8818bbc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 18 May 2014 23:11:23 -0400 Subject: [PATCH 06/14] Fix _parse_node; _parse_term quote handling; should probably refactor. --- bitshift/database/__init__.py | 25 ++++++++++++------------- bitshift/query/__init__.py | 4 ++++ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 4e5b8b8..51cf83d 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -55,45 +55,44 @@ class Database(object): def _explode_query_tree(self, tree): """Convert a query tree into components of an SQL SELECT statement.""" - def _parse_node(node): + def _parse_node(node, tables): if isinstance(node, Text): tables |= {"code", "symbols"} # (FTS: codelet_name, =: symbol_name, FTS: code_code) vs. node.text (_Literal) pass elif isinstance(node, Language): tables |= {"code"} - return "(code_lang = ?)", [node.lang] + return "(code_lang = ?)", tables, [node.lang] elif isinstance(node, Author): tables |= {"authors"} if isinstance(node.name, Regex): return "(author_name REGEXP ?)", [node.name.regex] cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))" - return cond, [node.name.string] + return cond, tables, [node.name.string] elif isinstance(node, Date): column = {node.CREATE: "codelet_date_created", node.MODIFY: "codelet_date_modified"}[node.type] op = {node.BEFORE: "<=", node.AFTER: ">="}[node.relation] - return "(" + column + " " + op + " ?)", [node.date] + return "(" + column + " " + op + " ?)", tables, [node.date] elif isinstance(node, Symbol): tables |= {"symbols"} cond_base = "(symbol_type = ? AND symbol_name = ?)" if node.type != node.ALL: - return cond_base, [node.type, node.name] + return cond_base, tables, [node.type, node.name] cond = "(" + " OR ".join([cond_base] * len(node.TYPES)) + ")" args = zip(node.TYPES.keys(), [node.name] * len(node.TYPES)) - return cond, [arg for tup in args for arg in tup] + return cond, tables, [arg for tup in args for arg in tup] elif isinstance(node, BinaryOp): - left_cond, left_args = _parse_node(node.left) - right_cond, right_args = _parse_node(node.right) + left_cond, tbls, left_args = _parse_node(node.left, tables) + right_cond, tables, right_args = _parse_node(node.right, tbls) op = node.OPS[node.op] cond = "(" + left_cond + " " + op + " " + right_cond + ")" - return cond, left_args + right_args + return cond, tables, left_args + right_args elif isinstance(node, UnaryOp): - cond, args = _parse_node(node.node) - return "(" + node.OPS[node.op] + " " + cond + ")", args + cond, tables, args = _parse_node(node.node, tables) + return "(" + node.OPS[node.op] + " " + cond + ")", tables, args - tables = set() - conditional, arglist = _parse_node(tree.root) + conditional, tables, arglist = _parse_node(tree.root, set()) # joins = " ".join(tables) return conditional, joins, tuple(arglist) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 474f434..6e5d63b 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -116,6 +116,10 @@ class _QueryParser(object): def _parse_term(self, term): """Parse a query term into a tree node and return it.""" + try: + term = term.decode("unicode_escape") + except UnicodeDecodeError: + raise QueryParseException('Invalid query term: "%s"' % term) if ":" in term and not term[0] == ":": prefix, arg = term.split(":", 1) invert = prefix.lower() == "not" From f49ad1d2816799dc51cf71c55175a17236adec1e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 May 2014 11:00:14 -0400 Subject: [PATCH 07/14] Move parameterization functions to Node classes. --- bitshift/database/__init__.py | 47 ++--------------------------------------- bitshift/query/__init__.py | 4 ++-- bitshift/query/nodes.py | 49 +++++++++++++++++++++++++++++++++++++++++++ bitshift/query/tree.py | 9 ++++++++ 4 files changed, 62 insertions(+), 47 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 51cf83d..7a794f6 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -53,61 +53,18 @@ class Database(object): "Run `python -m bitshift.database.migration`." raise RuntimeError(err) - def _explode_query_tree(self, tree): - """Convert a query tree into components of an SQL SELECT statement.""" - def _parse_node(node, tables): - if isinstance(node, Text): - tables |= {"code", "symbols"} - # (FTS: codelet_name, =: symbol_name, FTS: code_code) vs. node.text (_Literal) - pass - elif isinstance(node, Language): - tables |= {"code"} - return "(code_lang = ?)", tables, [node.lang] - elif isinstance(node, Author): - tables |= {"authors"} - if isinstance(node.name, Regex): - return "(author_name REGEXP ?)", [node.name.regex] - cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))" - return cond, tables, [node.name.string] - elif isinstance(node, Date): - column = {node.CREATE: "codelet_date_created", - node.MODIFY: "codelet_date_modified"}[node.type] - op = {node.BEFORE: "<=", node.AFTER: ">="}[node.relation] - return "(" + column + " " + op + " ?)", tables, [node.date] - elif isinstance(node, Symbol): - tables |= {"symbols"} - cond_base = "(symbol_type = ? AND symbol_name = ?)" - if node.type != node.ALL: - return cond_base, tables, [node.type, node.name] - cond = "(" + " OR ".join([cond_base] * len(node.TYPES)) + ")" - args = zip(node.TYPES.keys(), [node.name] * len(node.TYPES)) - return cond, tables, [arg for tup in args for arg in tup] - elif isinstance(node, BinaryOp): - left_cond, tbls, left_args = _parse_node(node.left, tables) - right_cond, tables, right_args = _parse_node(node.right, tbls) - op = node.OPS[node.op] - cond = "(" + left_cond + " " + op + " " + right_cond + ")" - return cond, tables, left_args + right_args - elif isinstance(node, UnaryOp): - cond, tables, args = _parse_node(node.node, tables) - return "(" + node.OPS[node.op] + " " + cond + ")", tables, args - - conditional, tables, arglist = _parse_node(tree.root, set()) - # joins = " ".join(tables) - - return conditional, joins, tuple(arglist) - def _search_with_query(self, cursor, query, page): """Execute an SQL query based on a query tree, and return results. The returned data is a 2-tuple of (list of codelet IDs, estimated number of total results). """ - conditional, joins, args = self._explode_query_tree(query) base = """SELECT codelet_id FROM codelets %s WHERE %s ORDER BY codelet_rank LIMIT 10""" + conditional, tables, args = query.parameterize() + joins = " ".join(tables) qstring = base % (joins, conditional) if page > 1: qstring += " OFFSET %d" % ((page - 1) * 10) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 6e5d63b..dab6fe0 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -139,7 +139,7 @@ class _QueryParser(object): Returns a 2-tuple of (first_marker_found, marker_index). """ - def _is_escaped(query, index): + def is_escaped(query, index): """Return whether a query marker is backslash-escaped.""" return (index > 0 and query[index - 1] == "\\" and (index < 2 or query[index - 2] != "\\")) @@ -147,7 +147,7 @@ class _QueryParser(object): best_marker, best_index = None, maxsize for marker in markers: index = query.find(marker) - if _is_escaped(query, index): + if is_escaped(query, index): _, new_index = self._scan_query(query[index + 1:], marker) index += new_index + 1 if index >= 0 and index < best_index: diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 3021ddf..2797012 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -15,6 +15,13 @@ class _Node(object): """Return a string sort key for the node.""" return "" + def parameterize(self, tables): + """Parameterize the node. + + Returns a 3-tuple of (query conditional string, table set, param list). + """ + return "", tables, [] + class _Literal(object): """Represents a literal component of a search query, present at the leaves. @@ -75,6 +82,11 @@ class Text(_Node): def sortkey(self): return self.text.sortkey() + def parameterize(self, tables): + tables |= {"code", "symbols"} + # (FTS: codelet_name, =: symbol_name, FTS: code_code) vs. node.text (_Literal) + pass + class Language(_Node): """Represents a language node. @@ -94,6 +106,10 @@ class Language(_Node): def sortkey(self): return LANGS[self.lang] + def parameterize(self, tables): + tables |= {"code"} + return "(code_lang = ?)", tables, [self.lang] + class Author(_Node): """Represents a author node. @@ -113,6 +129,13 @@ class Author(_Node): def sortkey(self): return self.name.sortkey() + def parameterize(self, tables): + tables |= {"authors"} + if isinstance(self.name, Regex): + return "(author_name REGEXP ?)", [self.name.regex] + cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))" + return cond, tables, [self.name.string] + class Date(_Node): """Represents a date node. @@ -144,6 +167,12 @@ class Date(_Node): def sortkey(self): return self.date.strftime("%Y%m%d%H%M%S") + def parameterize(self, tables): + column = {self.CREATE: "codelet_date_created", + self.MODIFY: "codelet_date_modified"}[self.type] + op = {self.BEFORE: "<=", self.AFTER: ">="}[self.relation] + return "(" + column + " " + op + " ?)", tables, [self.date] + class Symbol(_Node): """Represents a symbol node. @@ -171,6 +200,15 @@ class Symbol(_Node): def sortkey(self): return self.name.sortkey() + def parameterize(self, tables): + tables |= {"symbols"} + cond_base = "(symbol_type = ? AND symbol_name = ?)" + if self.type != self.ALL: + return cond_base, tables, [self.type, self.name] + cond = "(" + " OR ".join([cond_base] * len(self.TYPES)) + ")" + args = zip(self.TYPES.keys(), [self.name] * len(self.TYPES)) + return cond, tables, [arg for tup in args for arg in tup] + class BinaryOp(_Node): """Represents a relationship between two nodes: ``and``, ``or``.""" @@ -190,6 +228,13 @@ class BinaryOp(_Node): def sortkey(self): return self.left.sortkey() + self.right.sortkey() + def parameterize(self, tables): + left_cond, tables, left_args = self.left.parameterize(tables) + right_cond, tables, right_args = self.right.parameterize(tables) + op = self.OPS[self.op] + cond = "(" + left_cond + " " + op + " " + right_cond + ")" + return cond, tables, left_args + right_args + class UnaryOp(_Node): """Represents a transformation applied to one node: ``not``.""" @@ -205,3 +250,7 @@ class UnaryOp(_Node): def sortkey(self): return self.node.sortkey() + + def parameterize(self, tables): + cond, tables, args = self.node.parameterize(tables) + return "(" + self.OPS[self.op] + " " + cond + ")", tables, args diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index a13d8f3..8566365 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -25,3 +25,12 @@ class Tree(object): :rtype: str """ return repr(self) + + def parameterize(self): + """Parameterize the query tree for an SQL SELECT statement. + + :return: SQL query data. + :rtype: 3-tuple of (query conditional string, table set, param tuple) + """ + conditional, tables, arglist = self._root.parameterize(set()) + return conditional, tables, tuple(arglist) From 91256b13840cc1e58859752e98f4bd020ec9327b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 May 2014 13:02:54 -0400 Subject: [PATCH 08/14] Finish parameterize() for Text. --- bitshift/database/__init__.py | 2 +- bitshift/query/nodes.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 7a794f6..4e2d2e8 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -62,7 +62,7 @@ class Database(object): base = """SELECT codelet_id FROM codelets %s WHERE %s - ORDER BY codelet_rank LIMIT 10""" + GROUP BY codelet_id ORDER BY codelet_rank DESC LIMIT 10""" conditional, tables, args = query.parameterize() joins = " ".join(tables) qstring = base % (joins, conditional) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 2797012..9f01093 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -84,8 +84,15 @@ class Text(_Node): def parameterize(self, tables): tables |= {"code", "symbols"} - # (FTS: codelet_name, =: symbol_name, FTS: code_code) vs. node.text (_Literal) - pass + if isinstance(self.text, Regex): + cols = ["codelet_name", "symbol_name", "code_code"] + cond = "((" + " REGEXP ?) OR (".join(cols) + " REGEXP ?))" + return cond, [self.text.regex] * 3 + conds = ["MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE)", + "MATCH(code_code) AGAINST (? IN BOOLEAN MODE)", + "symbol_name = ?"] + cond = "((" + ") OR (".join(conds) + "))" + return cond, tables, [self.text.string] * 3 class Language(_Node): From fbc5eebad44d412490c459513dd109aef9b029ec Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 May 2014 10:57:22 -0400 Subject: [PATCH 09/14] Refactor query production; fixes. --- bitshift/database/__init__.py | 18 ++++--------- bitshift/query/nodes.py | 59 ++++++++++++++++++++++++------------------- bitshift/query/tree.py | 31 +++++++++++++++++++---- 3 files changed, 64 insertions(+), 44 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 4e2d2e8..07db422 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -53,29 +53,21 @@ class Database(object): "Run `python -m bitshift.database.migration`." raise RuntimeError(err) - def _search_with_query(self, cursor, query, page): + def _search_with_query(self, cursor, tree, page): """Execute an SQL query based on a query tree, and return results. The returned data is a 2-tuple of (list of codelet IDs, estimated number of total results). """ - base = """SELECT codelet_id - FROM codelets %s - WHERE %s - GROUP BY codelet_id ORDER BY codelet_rank DESC LIMIT 10""" - conditional, tables, args = query.parameterize() - joins = " ".join(tables) - qstring = base % (joins, conditional) - if page > 1: - qstring += " OFFSET %d" % ((page - 1) * 10) - - cursor.execute(qstring, args) - ids = [id for id, in cursor.fetchall()] + query, args = tree.build_query(page) + cursor.execute(query, args) + ids = [id for id, _ in cursor.fetchall()] num_results = 0 # TODO: NotImplemented return ids, num_results def _get_codelets_from_ids(self, cursor, ids): """Return a list of Codelet objects given a list of codelet IDs.""" + # TODO: remember that codelets need an origin field raise NotImplementedError() # TODO def _decompose_url(self, cursor, url): diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 9f01093..68bf504 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -18,9 +18,11 @@ class _Node(object): def parameterize(self, tables): """Parameterize the node. - Returns a 3-tuple of (query conditional string, table set, param list). + Returns a 3-tuple of (conditional string, rank list, parameter list). + If the rank list is empty, then it is assumed to contain the + conditional string. """ - return "", tables, [] + return "", [], [] class _Literal(object): @@ -85,14 +87,16 @@ class Text(_Node): def parameterize(self, tables): tables |= {"code", "symbols"} if isinstance(self.text, Regex): - cols = ["codelet_name", "symbol_name", "code_code"] - cond = "((" + " REGEXP ?) OR (".join(cols) + " REGEXP ?))" - return cond, [self.text.regex] * 3 - conds = ["MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE)", - "MATCH(code_code) AGAINST (? IN BOOLEAN MODE)", - "symbol_name = ?"] - cond = "((" + ") OR (".join(conds) + "))" - return cond, tables, [self.text.string] * 3 + ranks = ["(codelet_name REGEXP ?)", "(symbol_name REGEXP ?)", + "(code_code REGEXP ?)"] + cond = "(" + " OR ".join(ranks) + ")" + return cond, ranks, [self.text.regex] * 3 + else: + ranks = ["(MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE))", + "(MATCH(code_code) AGAINST (? IN BOOLEAN MODE))", + "(symbol_name = ?)"] + cond = "(" + " OR ".join(ranks) + ")" + return cond, ranks, [self.text.string] * 3 class Language(_Node): @@ -115,7 +119,7 @@ class Language(_Node): def parameterize(self, tables): tables |= {"code"} - return "(code_lang = ?)", tables, [self.lang] + return "(code_lang = ?)", [], [self.lang] class Author(_Node): @@ -139,9 +143,9 @@ class Author(_Node): def parameterize(self, tables): tables |= {"authors"} if isinstance(self.name, Regex): - return "(author_name REGEXP ?)", [self.name.regex] + return "(author_name REGEXP ?)", [], [self.name.regex] cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))" - return cond, tables, [self.name.string] + return cond, [], [self.name.string] class Date(_Node): @@ -178,7 +182,7 @@ class Date(_Node): column = {self.CREATE: "codelet_date_created", self.MODIFY: "codelet_date_modified"}[self.type] op = {self.BEFORE: "<=", self.AFTER: ">="}[self.relation] - return "(" + column + " " + op + " ?)", tables, [self.date] + return "(" + column + " " + op + " ?)", [], [self.date] class Symbol(_Node): @@ -190,8 +194,7 @@ class Symbol(_Node): FUNCTION = 1 CLASS = 2 VARIABLE = 3 - TYPES = {ALL: "ALL", FUNCTION: "FUNCTION", CLASS: "CLASS", - VARIABLE: "VARIABLE"} + TYPES = {FUNCTION: "FUNCTION", CLASS: "CLASS", VARIABLE: "VARIABLE"} def __init__(self, type_, name): """ @@ -202,7 +205,8 @@ class Symbol(_Node): self.name = name def __repr__(self): - return "Symbol({0}, {1})".format(self.TYPES[self.type], self.name) + type_ = self.TYPES.get(self.type, "ALL") + return "Symbol({0}, {1})".format(type_, self.name) def sortkey(self): return self.name.sortkey() @@ -211,10 +215,11 @@ class Symbol(_Node): tables |= {"symbols"} cond_base = "(symbol_type = ? AND symbol_name = ?)" if self.type != self.ALL: - return cond_base, tables, [self.type, self.name] - cond = "(" + " OR ".join([cond_base] * len(self.TYPES)) + ")" + return cond_base, [], [self.type, self.name] + ranks = [cond_base] * len(self.TYPES) + cond = "(" + " OR ".join(ranks) + ")" args = zip(self.TYPES.keys(), [self.name] * len(self.TYPES)) - return cond, tables, [arg for tup in args for arg in tup] + return cond, ranks, [arg for tup in args for arg in tup] class BinaryOp(_Node): @@ -236,11 +241,12 @@ class BinaryOp(_Node): return self.left.sortkey() + self.right.sortkey() def parameterize(self, tables): - left_cond, tables, left_args = self.left.parameterize(tables) - right_cond, tables, right_args = self.right.parameterize(tables) + lcond, lranks, largs = self.left.parameterize(tables) + rcond, rranks, rargs = self.right.parameterize(tables) + lranks, rranks = lranks or [lcond], rranks or [rcond] op = self.OPS[self.op] - cond = "(" + left_cond + " " + op + " " + right_cond + ")" - return cond, tables, left_args + right_args + cond = "(" + lcond + " " + op + " " + rcond + ")" + return cond, lranks + rranks, largs + rargs class UnaryOp(_Node): @@ -259,5 +265,6 @@ class UnaryOp(_Node): return self.node.sortkey() def parameterize(self, tables): - cond, tables, args = self.node.parameterize(tables) - return "(" + self.OPS[self.op] + " " + cond + ")", tables, args + cond, ranks, args = self.node.parameterize(tables) + ranks = ranks or [cond] + return "(" + self.OPS[self.op] + " " + cond + ")", ranks, args diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index 8566365..cc03f72 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -1,5 +1,12 @@ __all__ = ["Tree"] +QUERY_TEMPLATE = """SELECT codelet_id, (codelet_rank + %s) AS score +FROM codelets %s +WHERE %s +GROUP BY codelet_id +ORDER BY score DESC +LIMIT %d OFFSET %d""".replace("\n", " ") + class Tree(object): """Represents a query tree.""" @@ -26,11 +33,25 @@ class Tree(object): """ return repr(self) - def parameterize(self): - """Parameterize the query tree for an SQL SELECT statement. + def build_query(self, page=1, page_size=10, pretty=False): + """Convert the query tree into a parameterized SQL SELECT statement. + + :param page: The page number to get results for. + :type page: int + :param page_size: The number of results per page. + :type page_size: int + :param pretty: Whether to pretty-print the SQL query or not. + :type pretty: bool :return: SQL query data. - :rtype: 3-tuple of (query conditional string, table set, param tuple) + :rtype: 2-tuple of (SQL statement string, query parameter tuple) """ - conditional, tables, arglist = self._root.parameterize(set()) - return conditional, tables, tuple(arglist) + tables = set() + cond, ranks, arglist = self._root.parameterize(tables) + ranks = ranks or [cond] + score = "((%s) / %d)" % (" + ".join(ranks), len(ranks)) + joins = " ".join(tables) # TODO + offset = (page - 1) * page_size + + query = QUERY_TEMPLATE % (score, joins, cond, page_size, offset) + return query, tuple(arglist * 2) From ba8bde7689cb34461ca5c778f4a5135ec75c7e6d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 May 2014 13:06:27 -0400 Subject: [PATCH 10/14] Cleanup, fixes, additions. --- bitshift/query/nodes.py | 27 +++++++++++++++------------ bitshift/query/tree.py | 12 +++++++++++- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 68bf504..342f8ec 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -89,14 +89,13 @@ class Text(_Node): if isinstance(self.text, Regex): ranks = ["(codelet_name REGEXP ?)", "(symbol_name REGEXP ?)", "(code_code REGEXP ?)"] - cond = "(" + " OR ".join(ranks) + ")" - return cond, ranks, [self.text.regex] * 3 + text = self.text.regex else: ranks = ["(MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE))", "(MATCH(code_code) AGAINST (? IN BOOLEAN MODE))", "(symbol_name = ?)"] - cond = "(" + " OR ".join(ranks) + ")" - return cond, ranks, [self.text.string] * 3 + text = self.text.string + return cond, ranks, [text] * 3 class Language(_Node): @@ -199,7 +198,7 @@ class Symbol(_Node): def __init__(self, type_, name): """ :type type_: int (``ALL``, ``FUNCTION``, ``CLASS``, etc.) - :type name: :py:class:`.Literal` + :type name: :py:class:`._Literal` """ self.type = type_ self.name = name @@ -212,14 +211,18 @@ class Symbol(_Node): return self.name.sortkey() def parameterize(self, tables): - tables |= {"symbols"} - cond_base = "(symbol_type = ? AND symbol_name = ?)" + tables |= {"code", "symbols"} + if isinstance(self.name, Regex): + cond_base = "(symbol_type = ? AND symbol_name REGEXP ?)" + name = self.name.regex + else: + cond_base = "(symbol_type = ? AND symbol_name = ?)" + name = self.name.string if self.type != self.ALL: - return cond_base, [], [self.type, self.name] - ranks = [cond_base] * len(self.TYPES) - cond = "(" + " OR ".join(ranks) + ")" - args = zip(self.TYPES.keys(), [self.name] * len(self.TYPES)) - return cond, ranks, [arg for tup in args for arg in tup] + return cond_base, [], [self.type, name] + cond = "(" + " OR ".join([cond_base] * len(self.TYPES)) + ")" + args = zip(self.TYPES.keys(), [name] * len(self.TYPES)) + return cond, [], [arg for tup in args for arg in tup] class BinaryOp(_Node): diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index cc03f72..86392be 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -46,12 +46,22 @@ class Tree(object): :return: SQL query data. :rtype: 2-tuple of (SQL statement string, query parameter tuple) """ + def get_table_join(table): + tables = { + "code": ("codelet_code_id", "code_id"), + "authors": ("author_codelet", "codelet_id"), + "symbols": ("symbol_code", "code_id") + } + tmpl = "INNER JOIN %s ON %s = %s" + return tmpl % (table, tables[table][0], tables[table][1]) + tables = set() cond, ranks, arglist = self._root.parameterize(tables) ranks = ranks or [cond] score = "((%s) / %d)" % (" + ".join(ranks), len(ranks)) - joins = " ".join(tables) # TODO + joins = " ".join(get_table_join(table) for table in tables) offset = (page - 1) * page_size + ## TODO: handle pretty query = QUERY_TEMPLATE % (score, joins, cond, page_size, offset) return query, tuple(arglist * 2) From 7309ca55329679cf6b51d71694af61bbc634746b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 May 2014 16:32:08 -0400 Subject: [PATCH 11/14] Some fixes and adjustments; note. --- bitshift/query/nodes.py | 15 +++++++-------- bitshift/query/tree.py | 20 ++++++++++++-------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 342f8ec..905f65b 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -213,16 +213,15 @@ class Symbol(_Node): def parameterize(self, tables): tables |= {"code", "symbols"} if isinstance(self.name, Regex): - cond_base = "(symbol_type = ? AND symbol_name REGEXP ?)" - name = self.name.regex + cond, name = "symbol_name REGEXP ?", self.name.regex else: - cond_base = "(symbol_type = ? AND symbol_name = ?)" - name = self.name.string + cond, name = "symbol_name = ?", self.name.string + if self.type == self.ALL: + types = ", ".join(str(type_) for type_ in self.TYPES) + cond += " AND symbol_type IN (%s)" % types if self.type != self.ALL: - return cond_base, [], [self.type, name] - cond = "(" + " OR ".join([cond_base] * len(self.TYPES)) + ")" - args = zip(self.TYPES.keys(), [name] * len(self.TYPES)) - return cond, [], [arg for tup in args for arg in tup] + cond += " AND symbol_type = %d" % self.type + return "(" + cond + ")", [], [name] class BinaryOp(_Node): diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index 86392be..8989c31 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -46,20 +46,24 @@ class Tree(object): :return: SQL query data. :rtype: 2-tuple of (SQL statement string, query parameter tuple) """ - def get_table_join(table): - tables = { - "code": ("codelet_code_id", "code_id"), - "authors": ("author_codelet", "codelet_id"), - "symbols": ("symbol_code", "code_id") - } + def get_table_joins(tables): + data = [ + ("code", "codelet_code_id", "code_id"), + ("authors", "author_codelet", "codelet_id"), + ("symbols", "symbol_code", "code_id") + ] tmpl = "INNER JOIN %s ON %s = %s" - return tmpl % (table, tables[table][0], tables[table][1]) + for args in data: + if table in tables: + yield tmpl % args tables = set() cond, ranks, arglist = self._root.parameterize(tables) ranks = ranks or [cond] + # TODO: if the only rank is a single thing and it's a boolean value + # (i.e. not a match statement), get rid of it. score = "((%s) / %d)" % (" + ".join(ranks), len(ranks)) - joins = " ".join(get_table_join(table) for table in tables) + joins = " ".join(get_table_joins(tables)) offset = (page - 1) * page_size ## TODO: handle pretty From 827ea090852b2ca4d496e489e8f1a27fe94470e8 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 May 2014 12:39:58 -0400 Subject: [PATCH 12/14] Only use dynamic ranks if the conditional is complex. --- bitshift/query/nodes.py | 33 ++++++++++++++++++--------------- bitshift/query/tree.py | 15 ++++++++------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 905f65b..5a6f62c 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -18,11 +18,11 @@ class _Node(object): def parameterize(self, tables): """Parameterize the node. - Returns a 3-tuple of (conditional string, rank list, parameter list). - If the rank list is empty, then it is assumed to contain the - conditional string. + Returns a 4-tuple of (conditional string, parameter list, rank list, + should-we-rank boolean). If the rank list is empty, then it is assumed + to contain the conditional string. """ - return "", [], [] + return "", [], [], False class _Literal(object): @@ -95,7 +95,8 @@ class Text(_Node): "(MATCH(code_code) AGAINST (? IN BOOLEAN MODE))", "(symbol_name = ?)"] text = self.text.string - return cond, ranks, [text] * 3 + cond = "(" + " OR ".join(ranks) + ")" + return cond, [text] * 3, ranks, True class Language(_Node): @@ -118,7 +119,7 @@ class Language(_Node): def parameterize(self, tables): tables |= {"code"} - return "(code_lang = ?)", [], [self.lang] + return "(code_lang = ?)", [self.lang], [], False class Author(_Node): @@ -142,9 +143,9 @@ class Author(_Node): def parameterize(self, tables): tables |= {"authors"} if isinstance(self.name, Regex): - return "(author_name REGEXP ?)", [], [self.name.regex] + return "(author_name REGEXP ?)", [self.name.regex], [], False cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))" - return cond, [], [self.name.string] + return cond, [self.name.string], [], True class Date(_Node): @@ -181,7 +182,7 @@ class Date(_Node): column = {self.CREATE: "codelet_date_created", self.MODIFY: "codelet_date_modified"}[self.type] op = {self.BEFORE: "<=", self.AFTER: ">="}[self.relation] - return "(" + column + " " + op + " ?)", [], [self.date] + return "(" + column + " " + op + " ?)", [self.date], [], False class Symbol(_Node): @@ -221,7 +222,7 @@ class Symbol(_Node): cond += " AND symbol_type IN (%s)" % types if self.type != self.ALL: cond += " AND symbol_type = %d" % self.type - return "(" + cond + ")", [], [name] + return "(" + cond + ")", [name], [], False class BinaryOp(_Node): @@ -243,12 +244,13 @@ class BinaryOp(_Node): return self.left.sortkey() + self.right.sortkey() def parameterize(self, tables): - lcond, lranks, largs = self.left.parameterize(tables) - rcond, rranks, rargs = self.right.parameterize(tables) + lcond, largs, lranks, need_lranks = self.left.parameterize(tables) + rcond, rargs, rranks, need_rranks = self.right.parameterize(tables) lranks, rranks = lranks or [lcond], rranks or [rcond] op = self.OPS[self.op] cond = "(" + lcond + " " + op + " " + rcond + ")" - return cond, lranks + rranks, largs + rargs + need_ranks = need_lranks or need_rranks or self.op == self.OR + return cond, largs + rargs, lranks + rranks, need_ranks class UnaryOp(_Node): @@ -267,6 +269,7 @@ class UnaryOp(_Node): return self.node.sortkey() def parameterize(self, tables): - cond, ranks, args = self.node.parameterize(tables) + cond, args, ranks, need_ranks = self.node.parameterize(tables) + new_cond = "(" + self.OPS[self.op] + " " + cond + ")" ranks = ranks or [cond] - return "(" + self.OPS[self.op] + " " + cond + ")", ranks, args + return new_cond, args, ranks, need_ranks diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index 8989c31..4e8ed87 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -1,6 +1,6 @@ __all__ = ["Tree"] -QUERY_TEMPLATE = """SELECT codelet_id, (codelet_rank + %s) AS score +QUERY_TEMPLATE = """SELECT codelet_id, (codelet_rank%s) AS score FROM codelets %s WHERE %s GROUP BY codelet_id @@ -54,18 +54,19 @@ class Tree(object): ] tmpl = "INNER JOIN %s ON %s = %s" for args in data: - if table in tables: + if args[0] in tables: yield tmpl % args tables = set() - cond, ranks, arglist = self._root.parameterize(tables) + cond, arglist, ranks, need_ranks = self._root.parameterize(tables) ranks = ranks or [cond] - # TODO: if the only rank is a single thing and it's a boolean value - # (i.e. not a match statement), get rid of it. - score = "((%s) / %d)" % (" + ".join(ranks), len(ranks)) + if need_ranks: + score = " + ((%s) / %d)" % (" + ".join(ranks), len(ranks)) + else: + score = "" joins = " ".join(get_table_joins(tables)) offset = (page - 1) * page_size ## TODO: handle pretty query = QUERY_TEMPLATE % (score, joins, cond, page_size, offset) - return query, tuple(arglist * 2) + return query, tuple(arglist * 2 if need_ranks else arglist) From 1fbe0e823bccf86358090979993c72c35e66d5bd Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 May 2014 15:04:19 -0400 Subject: [PATCH 13/14] Implement most of _get_codelets_from_ids(). --- bitshift/codelet.py | 14 +++++++++++--- bitshift/database/__init__.py | 29 +++++++++++++++++++++++++---- bitshift/query/tree.py | 5 +---- 3 files changed, 37 insertions(+), 11 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index acaa52b..92debf4 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -18,12 +18,14 @@ class Codelet(object): code was last modified. :ivar rank: (float) A quanitification of the source code's quality, as per available ratings (stars, forks, upvotes, etc.). - :ivar symbols: (dict) Dictionary containing dictionaries of functions, classes, - variable definitions, etc. + :ivar symbols: (dict) Dictionary containing dictionaries of functions, + classes, variable definitions, etc. + :ivar origin: (tuple) 3-tuple of (site_name, site_url, image_blob), as + added by the database. """ def __init__(self, name, code, filename, language, authors, code_url, - date_created, date_modified, rank): + date_created, date_modified, rank, symbols=None, origin=None): """ Create a Codelet instance. @@ -36,6 +38,8 @@ class Codelet(object): :param date_created: see :attr:`self.date_created` :param date_modified: see :attr:`self.date_modified` :param rank: see :attr:`self.rank` + :param symbols: see :attr:`self.symbols` + :param origin: see :attr:`self.origin` :type name: see :attr:`self.name` :type code: see :attr:`self.code` @@ -46,6 +50,8 @@ class Codelet(object): :type date_created: see :attr:`self.date_created` :type date_modified: see :attr:`self.date_modified` :type rank: see :attr:`self.rank` + :type symbols: see :attr:`self.symbols` + :type origin: see :attr:`self.origin` """ self.name = name @@ -57,3 +63,5 @@ class Codelet(object): self.date_created = date_created self.date_modified = date_modified self.rank = rank + self.symbols = symbols or {} + self.origin = origin or (None, None, None) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 07db422..68af79c 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -65,10 +65,30 @@ class Database(object): num_results = 0 # TODO: NotImplemented return ids, num_results - def _get_codelets_from_ids(self, cursor, ids): + def _get_codelets_from_ids(self, ids): """Return a list of Codelet objects given a list of codelet IDs.""" - # TODO: remember that codelets need an origin field - raise NotImplementedError() # TODO + query = """SELECT * + FROM codelets + INNER JOIN code ON codelet_code_id = code_id + INNER JOIN origins ON codelet_origin = origin_id + WHERE codelet_id = ?""" + + with self._conn.cursor(oursql.DictCursor) as cursor: + cursor.executemany(query, [(id,) for id in ids]) + for row in cursor.fetchone(): + if row["origin_url_base"]: + url = row["codelet_url"] + else: + url = row["origin_url_base"] + row["codelet_url"] + origin = (row["origin_name"], row["origin_url"], + row["origin_image"]) + authors = NotImplemented # TODO: list of 3-tuples (author_name, author_url or None) + symbols = NotImplemented # TODO: dict of {sym_type: (name, decls, uses)} + yield Codelet( + row["codelet_name"], row["code_code"], None, + row["code_lang"], authors, url, + row["codelet_date_created"], row["codelet_date_modified"], + row["codelet_rank"], symbols, origin) def _decompose_url(self, cursor, url): """Break up a URL into an origin (with a URL base) and a suffix.""" @@ -133,7 +153,8 @@ class Database(object): num_mnt = num_results / (10 ** num_exp) cursor.execute(query2, (cache_id, num_mnt, num_exp)) cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) - return (num_results, self._get_codelets_from_ids(cursor, ids)) + codelet_gen = self._get_codelets_from_ids(ids) + return (num_results, list(codelet_gen)) def insert(self, codelet): """ diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index 4e8ed87..5da3f02 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -33,15 +33,13 @@ class Tree(object): """ return repr(self) - def build_query(self, page=1, page_size=10, pretty=False): + def build_query(self, page=1, page_size=10): """Convert the query tree into a parameterized SQL SELECT statement. :param page: The page number to get results for. :type page: int :param page_size: The number of results per page. :type page_size: int - :param pretty: Whether to pretty-print the SQL query or not. - :type pretty: bool :return: SQL query data. :rtype: 2-tuple of (SQL statement string, query parameter tuple) @@ -67,6 +65,5 @@ class Tree(object): joins = " ".join(get_table_joins(tables)) offset = (page - 1) * page_size - ## TODO: handle pretty query = QUERY_TEMPLATE % (score, joins, cond, page_size, offset) return query, tuple(arglist * 2 if need_ranks else arglist) From 860260d0ad5d09fa4fabf07b1f8dfec2410ef9f1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 May 2014 16:12:49 -0400 Subject: [PATCH 14/14] Finish Database.search() (closes #19) --- bitshift/database/__init__.py | 49 ++++++++++++++++++++++++++++++++++--------- bitshift/query/__init__.py | 2 +- bitshift/query/nodes.py | 9 ++++---- 3 files changed, 45 insertions(+), 15 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 68af79c..e4fa430 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -9,6 +9,7 @@ import mmh3 import oursql from .migration import VERSION, MIGRATIONS +from ..codelet import Codelet from ..query.nodes import (String, Regex, Text, Language, Author, Date, Symbol, BinaryOp, UnaryOp) @@ -65,7 +66,35 @@ class Database(object): num_results = 0 # TODO: NotImplemented return ids, num_results - def _get_codelets_from_ids(self, ids): + def _get_authors_for_codelet(self, cursor, codelet_id): + """Return a list of authors for a given codelet.""" + query = """SELECT author_name, author_url + FROM authors + WHERE author_codelet = ?""" + + cursor.execute(query, (codelet_id,)) + return cursor.fetchall() + + def _get_symbols_for_code(self, cursor, code_id): + """Return a list of symbols for a given codelet.""" + query = """SELECT symbol_type, symbol_name, sloc_type, sloc_row, + sloc_col, sloc_end_row, sloc_end_col + FROM symbols + INNER JOIN symbol_locations ON sloc_symbol = symbol_id + WHERE symbol_code = ?""" + + symbols = {type_: {} for type_ in Symbol.TYPES_INV} + cursor.execute(query, (code_id,)) + for type_, name, loc_type, row, col, erow, ecol in cursor.fetchall(): + sdict = symbols[Symbol.TYPES_INV[type_]] + if name not in sdict: + sdict[name] = ((), ()) + sdict[name][loc_type].append((row, col, erow, ecol)) + for type_, sdict in symbols.items(): + symbols[type_] = [(n, d, u) for n, (d, u) in sdict.iteritems()] + return symbols + + def _get_codelets_from_ids(self, cursor, ids): """Return a list of Codelet objects given a list of codelet IDs.""" query = """SELECT * FROM codelets @@ -73,17 +102,18 @@ class Database(object): INNER JOIN origins ON codelet_origin = origin_id WHERE codelet_id = ?""" - with self._conn.cursor(oursql.DictCursor) as cursor: - cursor.executemany(query, [(id,) for id in ids]) - for row in cursor.fetchone(): + with self._conn.cursor(oursql.DictCursor) as dict_cursor: + dict_cursor.executemany(query, [(id,) for id in ids]) + for row in dict_cursor.fetchone(): + codelet_id = row["codelet_id"] if row["origin_url_base"]: url = row["codelet_url"] else: url = row["origin_url_base"] + row["codelet_url"] origin = (row["origin_name"], row["origin_url"], row["origin_image"]) - authors = NotImplemented # TODO: list of 3-tuples (author_name, author_url or None) - symbols = NotImplemented # TODO: dict of {sym_type: (name, decls, uses)} + authors = self._get_authors_for_codelet(cursor, codelet_id) + symbols = self._get_symbols_for_code(cursor, row["code_id"]) yield Codelet( row["codelet_name"], row["code_code"], None, row["code_lang"], authors, url, @@ -103,13 +133,12 @@ class Database(object): def _insert_symbols(self, cursor, code_id, sym_type, symbols): """Insert a list of symbols of a given type into the database.""" - sym_types = ["functions", "classes", "variables"] query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)" query2 = """INSERT INTO symbol_locations VALUES (DEFAULT, ?, ?, ?, ?, ?, ?)""" for (name, decls, uses) in symbols: - cursor.execute(query1, (code_id, sym_types.index(sym_type), name)) + cursor.execute(query1, (code_id, Symbol.TYPES_INV[sym_type], name)) sym_id = cursor.lastrowid params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] + [tuple([sym_id, 1] + list(loc)) for loc in uses]) @@ -153,8 +182,8 @@ class Database(object): num_mnt = num_results / (10 ** num_exp) cursor.execute(query2, (cache_id, num_mnt, num_exp)) cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) - codelet_gen = self._get_codelets_from_ids(ids) - return (num_results, list(codelet_gen)) + codelet_gen = self._get_codelets_from_ids(cursor, ids) + return (num_results, list(codelet_gen)) def insert(self, codelet): """ diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index dab6fe0..41d01cf 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -231,7 +231,7 @@ class _QueryParser(object): index = nest.index(UnaryOp.NOT) if index == len(nest) - 1: err = "Invalid query: '%s' given without argument." - raise QueryParseException(err % UnaryOp.OPS[op]) + raise QueryParseException(err % UnaryOp.OPS[UnaryOp.NOT]) right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:])) if index > 0: left = self._parse_nest(nest[:index]) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 5a6f62c..5d157b5 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -190,11 +190,12 @@ class Symbol(_Node): Searches in symbol_type and symbol_name. """ - ALL = 0 - FUNCTION = 1 - CLASS = 2 - VARIABLE = 3 + ALL = -1 + FUNCTION = 0 + CLASS = 1 + VARIABLE = 2 TYPES = {FUNCTION: "FUNCTION", CLASS: "CLASS", VARIABLE: "VARIABLE"} + TYPES_INV = ["functions", "classes", "variables"] def __init__(self, type_, name): """