From fbc5eebad44d412490c459513dd109aef9b029ec Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 May 2014 10:57:22 -0400 Subject: [PATCH] Refactor query production; fixes. --- bitshift/database/__init__.py | 18 ++++--------- bitshift/query/nodes.py | 59 ++++++++++++++++++++++++------------------- bitshift/query/tree.py | 31 +++++++++++++++++++---- 3 files changed, 64 insertions(+), 44 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 4e2d2e8..07db422 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -53,29 +53,21 @@ class Database(object): "Run `python -m bitshift.database.migration`." raise RuntimeError(err) - def _search_with_query(self, cursor, query, page): + def _search_with_query(self, cursor, tree, page): """Execute an SQL query based on a query tree, and return results. The returned data is a 2-tuple of (list of codelet IDs, estimated number of total results). """ - base = """SELECT codelet_id - FROM codelets %s - WHERE %s - GROUP BY codelet_id ORDER BY codelet_rank DESC LIMIT 10""" - conditional, tables, args = query.parameterize() - joins = " ".join(tables) - qstring = base % (joins, conditional) - if page > 1: - qstring += " OFFSET %d" % ((page - 1) * 10) - - cursor.execute(qstring, args) - ids = [id for id, in cursor.fetchall()] + query, args = tree.build_query(page) + cursor.execute(query, args) + ids = [id for id, _ in cursor.fetchall()] num_results = 0 # TODO: NotImplemented return ids, num_results def _get_codelets_from_ids(self, cursor, ids): """Return a list of Codelet objects given a list of codelet IDs.""" + # TODO: remember that codelets need an origin field raise NotImplementedError() # TODO def _decompose_url(self, cursor, url): diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 9f01093..68bf504 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -18,9 +18,11 @@ class _Node(object): def parameterize(self, tables): """Parameterize the node. - Returns a 3-tuple of (query conditional string, table set, param list). + Returns a 3-tuple of (conditional string, rank list, parameter list). + If the rank list is empty, then it is assumed to contain the + conditional string. """ - return "", tables, [] + return "", [], [] class _Literal(object): @@ -85,14 +87,16 @@ class Text(_Node): def parameterize(self, tables): tables |= {"code", "symbols"} if isinstance(self.text, Regex): - cols = ["codelet_name", "symbol_name", "code_code"] - cond = "((" + " REGEXP ?) OR (".join(cols) + " REGEXP ?))" - return cond, [self.text.regex] * 3 - conds = ["MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE)", - "MATCH(code_code) AGAINST (? IN BOOLEAN MODE)", - "symbol_name = ?"] - cond = "((" + ") OR (".join(conds) + "))" - return cond, tables, [self.text.string] * 3 + ranks = ["(codelet_name REGEXP ?)", "(symbol_name REGEXP ?)", + "(code_code REGEXP ?)"] + cond = "(" + " OR ".join(ranks) + ")" + return cond, ranks, [self.text.regex] * 3 + else: + ranks = ["(MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE))", + "(MATCH(code_code) AGAINST (? IN BOOLEAN MODE))", + "(symbol_name = ?)"] + cond = "(" + " OR ".join(ranks) + ")" + return cond, ranks, [self.text.string] * 3 class Language(_Node): @@ -115,7 +119,7 @@ class Language(_Node): def parameterize(self, tables): tables |= {"code"} - return "(code_lang = ?)", tables, [self.lang] + return "(code_lang = ?)", [], [self.lang] class Author(_Node): @@ -139,9 +143,9 @@ class Author(_Node): def parameterize(self, tables): tables |= {"authors"} if isinstance(self.name, Regex): - return "(author_name REGEXP ?)", [self.name.regex] + return "(author_name REGEXP ?)", [], [self.name.regex] cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))" - return cond, tables, [self.name.string] + return cond, [], [self.name.string] class Date(_Node): @@ -178,7 +182,7 @@ class Date(_Node): column = {self.CREATE: "codelet_date_created", self.MODIFY: "codelet_date_modified"}[self.type] op = {self.BEFORE: "<=", self.AFTER: ">="}[self.relation] - return "(" + column + " " + op + " ?)", tables, [self.date] + return "(" + column + " " + op + " ?)", [], [self.date] class Symbol(_Node): @@ -190,8 +194,7 @@ class Symbol(_Node): FUNCTION = 1 CLASS = 2 VARIABLE = 3 - TYPES = {ALL: "ALL", FUNCTION: "FUNCTION", CLASS: "CLASS", - VARIABLE: "VARIABLE"} + TYPES = {FUNCTION: "FUNCTION", CLASS: "CLASS", VARIABLE: "VARIABLE"} def __init__(self, type_, name): """ @@ -202,7 +205,8 @@ class Symbol(_Node): self.name = name def __repr__(self): - return "Symbol({0}, {1})".format(self.TYPES[self.type], self.name) + type_ = self.TYPES.get(self.type, "ALL") + return "Symbol({0}, {1})".format(type_, self.name) def sortkey(self): return self.name.sortkey() @@ -211,10 +215,11 @@ class Symbol(_Node): tables |= {"symbols"} cond_base = "(symbol_type = ? AND symbol_name = ?)" if self.type != self.ALL: - return cond_base, tables, [self.type, self.name] - cond = "(" + " OR ".join([cond_base] * len(self.TYPES)) + ")" + return cond_base, [], [self.type, self.name] + ranks = [cond_base] * len(self.TYPES) + cond = "(" + " OR ".join(ranks) + ")" args = zip(self.TYPES.keys(), [self.name] * len(self.TYPES)) - return cond, tables, [arg for tup in args for arg in tup] + return cond, ranks, [arg for tup in args for arg in tup] class BinaryOp(_Node): @@ -236,11 +241,12 @@ class BinaryOp(_Node): return self.left.sortkey() + self.right.sortkey() def parameterize(self, tables): - left_cond, tables, left_args = self.left.parameterize(tables) - right_cond, tables, right_args = self.right.parameterize(tables) + lcond, lranks, largs = self.left.parameterize(tables) + rcond, rranks, rargs = self.right.parameterize(tables) + lranks, rranks = lranks or [lcond], rranks or [rcond] op = self.OPS[self.op] - cond = "(" + left_cond + " " + op + " " + right_cond + ")" - return cond, tables, left_args + right_args + cond = "(" + lcond + " " + op + " " + rcond + ")" + return cond, lranks + rranks, largs + rargs class UnaryOp(_Node): @@ -259,5 +265,6 @@ class UnaryOp(_Node): return self.node.sortkey() def parameterize(self, tables): - cond, tables, args = self.node.parameterize(tables) - return "(" + self.OPS[self.op] + " " + cond + ")", tables, args + cond, ranks, args = self.node.parameterize(tables) + ranks = ranks or [cond] + return "(" + self.OPS[self.op] + " " + cond + ")", ranks, args diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index 8566365..cc03f72 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -1,5 +1,12 @@ __all__ = ["Tree"] +QUERY_TEMPLATE = """SELECT codelet_id, (codelet_rank + %s) AS score +FROM codelets %s +WHERE %s +GROUP BY codelet_id +ORDER BY score DESC +LIMIT %d OFFSET %d""".replace("\n", " ") + class Tree(object): """Represents a query tree.""" @@ -26,11 +33,25 @@ class Tree(object): """ return repr(self) - def parameterize(self): - """Parameterize the query tree for an SQL SELECT statement. + def build_query(self, page=1, page_size=10, pretty=False): + """Convert the query tree into a parameterized SQL SELECT statement. + + :param page: The page number to get results for. + :type page: int + :param page_size: The number of results per page. + :type page_size: int + :param pretty: Whether to pretty-print the SQL query or not. + :type pretty: bool :return: SQL query data. - :rtype: 3-tuple of (query conditional string, table set, param tuple) + :rtype: 2-tuple of (SQL statement string, query parameter tuple) """ - conditional, tables, arglist = self._root.parameterize(set()) - return conditional, tables, tuple(arglist) + tables = set() + cond, ranks, arglist = self._root.parameterize(tables) + ranks = ranks or [cond] + score = "((%s) / %d)" % (" + ".join(ranks), len(ranks)) + joins = " ".join(tables) # TODO + offset = (page - 1) * page_size + + query = QUERY_TEMPLATE % (score, joins, cond, page_size, offset) + return query, tuple(arglist * 2)