@@ -18,12 +18,14 @@ class Codelet(object): | |||
code was last modified. | |||
:ivar rank: (float) A quantification of the source code's quality, as | |||
per available ratings (stars, forks, upvotes, etc.). | |||
:ivar symbols: (dict) Dictionary containing dictionaries of functions, classes, | |||
variable definitions, etc. | |||
:ivar symbols: (dict) Dictionary containing dictionaries of functions, | |||
classes, variable definitions, etc. | |||
:ivar origin: (tuple) 3-tuple of (site_name, site_url, image_blob), as | |||
added by the database. | |||
""" | |||
def __init__(self, name, code, filename, language, authors, code_url, | |||
date_created, date_modified, rank): | |||
date_created, date_modified, rank, symbols=None, origin=None): | |||
""" | |||
Create a Codelet instance. | |||
@@ -36,6 +38,8 @@ class Codelet(object): | |||
:param date_created: see :attr:`self.date_created` | |||
:param date_modified: see :attr:`self.date_modified` | |||
:param rank: see :attr:`self.rank` | |||
:param symbols: see :attr:`self.symbols` | |||
:param origin: see :attr:`self.origin` | |||
:type name: see :attr:`self.name` | |||
:type code: see :attr:`self.code` | |||
@@ -46,6 +50,8 @@ class Codelet(object): | |||
:type date_created: see :attr:`self.date_created` | |||
:type date_modified: see :attr:`self.date_modified` | |||
:type rank: see :attr:`self.rank` | |||
:type symbols: see :attr:`self.symbols` | |||
:type origin: see :attr:`self.origin` | |||
""" | |||
self.name = name | |||
@@ -57,3 +63,5 @@ class Codelet(object): | |||
self.date_created = date_created | |||
self.date_modified = date_modified | |||
self.rank = rank | |||
self.symbols = symbols or {} | |||
self.origin = origin or (None, None, None) |
@@ -9,6 +9,9 @@ import mmh3 | |||
import oursql | |||
from .migration import VERSION, MIGRATIONS | |||
from ..codelet import Codelet | |||
from ..query.nodes import (String, Regex, Text, Language, Author, Date, Symbol, | |||
BinaryOp, UnaryOp) | |||
__all__ = ["Database"] | |||
@@ -51,23 +54,71 @@ class Database(object): | |||
"Run `python -m bitshift.database.migration`." | |||
raise RuntimeError(err) | |||
def _search_with_query(self, cursor, query): | |||
"""Convert a query tree into SQL SELECTs, execute, and return results. | |||
def _search_with_query(self, cursor, tree, page): | |||
"""Execute an SQL query based on a query tree, and return results. | |||
The returned data is a 2-tuple of (list of codelet IDs, estimated | |||
number of total results). | |||
""" | |||
raise NotImplementedError() ## TODO | |||
results = cursor.fetchall() | |||
ids = NotImplemented ## TODO: extract ids from results | |||
num_results = NotImplemented ## TODO: num if results else 0 | |||
query, args = tree.build_query(page) | |||
cursor.execute(query, args) | |||
ids = [id for id, _ in cursor.fetchall()] | |||
num_results = 0 # TODO: NotImplemented | |||
return ids, num_results | |||
def _get_authors_for_codelet(self, cursor, codelet_id): | |||
"""Return a list of authors for a given codelet.""" | |||
query = """SELECT author_name, author_url | |||
FROM authors | |||
WHERE author_codelet = ?""" | |||
cursor.execute(query, (codelet_id,)) | |||
return cursor.fetchall() | |||
def _get_symbols_for_code(self, cursor, code_id):
    """Return a dict of symbols for a given piece of code.

    :param cursor: Database cursor used to run the lookup.
    :param code_id: Value matched against ``symbol_code``.

    :return: A dict with one key per entry of ``Symbol.TYPES_INV``. Each
        value is a list of ``(name, locations_0, locations_1)`` 3-tuples,
        where the two lists hold the ``(row, col, end_row, end_col)``
        locations whose ``sloc_type`` is 0 and 1 respectively.
    """
    query = """SELECT symbol_type, symbol_name, sloc_type, sloc_row,
                      sloc_col, sloc_end_row, sloc_end_col
               FROM symbols
               INNER JOIN symbol_locations ON sloc_symbol = symbol_id
               WHERE symbol_code = ?"""

    symbols = {type_: {} for type_ in Symbol.TYPES_INV}
    cursor.execute(query, (code_id,))
    for type_, name, loc_type, row, col, erow, ecol in cursor.fetchall():
        # symbol_type is stored as an index into Symbol.TYPES_INV.
        sdict = symbols[Symbol.TYPES_INV[type_]]
        if name not in sdict:
            # Lists, not tuples: locations are appended as rows arrive.
            sdict[name] = ([], [])
        sdict[name][loc_type].append((row, col, erow, ecol))

    # Flatten each {name: (locs0, locs1)} mapping into a list of 3-tuples.
    return {
        type_: [(name, locs0, locs1)
                for name, (locs0, locs1) in sdict.items()]
        for type_, sdict in symbols.items()
    }
def _get_codelets_from_ids(self, cursor, ids):
    """Yield a Codelet object for each of the given codelet IDs.

    This is a generator; wrap the call in ``list()`` if a list is needed.
    Codelets are produced in the order of *ids*. An ID with no matching row
    is skipped.

    :param cursor: Cursor used for the per-codelet author and symbol
        lookups.
    :param ids: Iterable of codelet IDs to load.
    """
    query = """SELECT *
               FROM codelets
               INNER JOIN code ON codelet_code_id = code_id
               INNER JOIN origins ON codelet_origin = origin_id
               WHERE codelet_id = ?"""

    with self._conn.cursor(oursql.DictCursor) as dict_cursor:
        for wanted_id in ids:
            # One SELECT per ID: executemany() is not meant for statements
            # that return rows, and fetchone() yields a single row rather
            # than an iterable of rows.
            dict_cursor.execute(query, (wanted_id,))
            rows = dict_cursor.fetchall()
            if not rows:
                continue
            row = rows[0]

            codelet_id = row["codelet_id"]
            # Prefix the origin's URL base only when one is present.
            if row["origin_url_base"]:
                url = row["origin_url_base"] + row["codelet_url"]
            else:
                url = row["codelet_url"]
            origin = (row["origin_name"], row["origin_url"],
                      row["origin_image"])
            authors = self._get_authors_for_codelet(cursor, codelet_id)
            symbols = self._get_symbols_for_code(cursor, row["code_id"])
            yield Codelet(
                row["codelet_name"], row["code_code"], None,
                row["code_lang"], authors, url,
                row["codelet_date_created"], row["codelet_date_modified"],
                row["codelet_rank"], symbols, origin)
def _decompose_url(self, cursor, url): | |||
"""Break up a URL into an origin (with a URL base) and a suffix.""" | |||
@@ -82,13 +133,12 @@ class Database(object): | |||
def _insert_symbols(self, cursor, code_id, sym_type, symbols): | |||
"""Insert a list of symbols of a given type into the database.""" | |||
sym_types = ["functions", "classes", "variables"] | |||
query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)" | |||
query2 = """INSERT INTO symbol_locations VALUES | |||
(DEFAULT, ?, ?, ?, ?, ?, ?)""" | |||
for (name, decls, uses) in symbols: | |||
cursor.execute(query1, (code_id, sym_types.index(sym_type), name)) | |||
cursor.execute(query1, (code_id, Symbol.TYPES_INV[sym_type], name)) | |||
sym_id = cursor.lastrowid | |||
params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] + | |||
[tuple([sym_id, 1] + list(loc)) for loc in uses]) | |||
@@ -132,7 +182,8 @@ class Database(object): | |||
num_mnt = num_results / (10 ** num_exp) | |||
cursor.execute(query2, (cache_id, num_mnt, num_exp)) | |||
cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) | |||
return (num_results, self._get_codelets_from_ids(cursor, ids)) | |||
codelet_gen = self._get_codelets_from_ids(cursor, ids) | |||
return (num_results, list(codelet_gen)) | |||
def insert(self, codelet): | |||
""" | |||
@@ -116,6 +116,10 @@ class _QueryParser(object): | |||
def _parse_term(self, term): | |||
"""Parse a query term into a tree node and return it.""" | |||
try: | |||
term = term.decode("unicode_escape") | |||
except UnicodeDecodeError: | |||
raise QueryParseException('Invalid query term: "%s"' % term) | |||
if ":" in term and not term[0] == ":": | |||
prefix, arg = term.split(":", 1) | |||
invert = prefix.lower() == "not" | |||
@@ -135,7 +139,7 @@ class _QueryParser(object): | |||
Returns a 2-tuple of (first_marker_found, marker_index). | |||
""" | |||
def is_escaped(query, index):
    """Return whether the character at *index* is backslash-escaped.

    It counts as escaped when the character before it is a backslash and
    that backslash is not itself preceded by another backslash.
    """
    if index <= 0 or query[index - 1] != "\\":
        return False
    return index < 2 or query[index - 2] != "\\"
@@ -143,7 +147,7 @@ class _QueryParser(object): | |||
best_marker, best_index = None, maxsize | |||
for marker in markers: | |||
index = query.find(marker) | |||
if _is_escaped(query, index): | |||
if is_escaped(query, index): | |||
_, new_index = self._scan_query(query[index + 1:], marker) | |||
index += new_index + 1 | |||
if index >= 0 and index < best_index: | |||
@@ -209,6 +213,9 @@ class _QueryParser(object): | |||
def parse_binary_op(op):
    """Split the nested query list at *op* and build a BinaryOp node.

    Raises QueryParseException when the operator is the first or last item
    of the list, i.e. when one of its operands is missing.
    """
    position = nest.index(op)
    last = len(nest) - 1
    if position in (0, last):
        message = "Invalid query: '%s' given without argument."
        raise QueryParseException(message % BinaryOp.OPS[op])
    lhs = self._parse_nest(nest[:position])
    rhs = self._parse_nest(nest[position + 1:])
    return BinaryOp(lhs, op, rhs)
@@ -222,6 +229,9 @@ class _QueryParser(object): | |||
return parse_binary_op(BinaryOp.AND) | |||
elif UnaryOp.NOT in nest: | |||
index = nest.index(UnaryOp.NOT) | |||
if index == len(nest) - 1: | |||
err = "Invalid query: '%s' given without argument." | |||
raise QueryParseException(err % UnaryOp.OPS[UnaryOp.NOT]) | |||
right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:])) | |||
if index > 0: | |||
left = self._parse_nest(nest[:index]) | |||
@@ -15,6 +15,15 @@ class _Node(object): | |||
"""Return a string sort key for the node.""" | |||
return "" | |||
def parameterize(self, tables):
    """Parameterize the node.

    Returns a 4-tuple of (conditional string, parameter list, rank list,
    should-we-rank boolean). If the rank list is empty, then it is assumed
    to contain the conditional string.

    This base implementation returns an empty condition, no parameters, no
    ranks, and does not request ranking.
    """
    condition, parameters, ranks, should_rank = "", [], [], False
    return condition, parameters, ranks, should_rank
class _Literal(object): | |||
"""Represents a literal component of a search query, present at the leaves. | |||
@@ -75,6 +84,20 @@ class Text(_Node): | |||
def sortkey(self):
    """Return the sort key of this node's text literal."""
    literal = self.text
    return literal.sortkey()
def parameterize(self, tables):
    """Build the SQL condition matching this text against names and code.

    Adds ``"code"`` and ``"symbols"`` to *tables* in place. The same text
    is matched against the codelet name, the code body and symbol names,
    with the three tests OR-ed together; each test is also returned as a
    separate rank expression.

    Returns a 4-tuple of (conditional string, parameter list, rank list,
    should-we-rank boolean); ranking is always requested.
    """
    tables.update(("code", "symbols"))
    literal = self.text
    if not isinstance(literal, Regex):
        value = literal.string
        ranks = ["(MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE))",
                 "(MATCH(code_code) AGAINST (? IN BOOLEAN MODE))",
                 "(symbol_name = ?)"]
    else:
        value = literal.regex
        ranks = ["(codelet_name REGEXP ?)", "(symbol_name REGEXP ?)",
                 "(code_code REGEXP ?)"]
    # One parameter per placeholder; all three tests use the same value.
    cond = "(%s)" % " OR ".join(ranks)
    return cond, [value, value, value], ranks, True
class Language(_Node): | |||
"""Represents a language node. | |||
@@ -94,6 +117,10 @@ class Language(_Node): | |||
def sortkey(self):
    """Return the ``LANGS`` entry for this node's language as its sort key."""
    lang = self.lang
    return LANGS[lang]
def parameterize(self, tables):
    """Build the SQL condition restricting results to this language.

    Adds ``"code"`` to *tables* in place and returns a 4-tuple of
    (conditional string, parameter list, rank list, should-we-rank
    boolean). No ranks are supplied and ranking is not requested.
    """
    tables.add("code")
    cond = "(code_lang = ?)"
    return cond, [self.lang], [], False
class Author(_Node): | |||
"""Represents a author node. | |||
@@ -113,6 +140,13 @@ class Author(_Node): | |||
def sortkey(self):
    """Return the sort key of this node's name literal."""
    literal = self.name
    return literal.sortkey()
def parameterize(self, tables):
    """Build the SQL condition matching this author name.

    Adds ``"authors"`` to *tables* in place. A regex name yields a
    ``REGEXP`` test without ranking; any other literal yields a boolean-mode
    full-text match with ranking requested.

    Returns a 4-tuple of (conditional string, parameter list, rank list,
    should-we-rank boolean); the rank list is always empty.
    """
    tables.add("authors")
    literal = self.name
    if not isinstance(literal, Regex):
        cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))"
        return cond, [literal.string], [], True
    return "(author_name REGEXP ?)", [literal.regex], [], False
class Date(_Node): | |||
"""Represents a date node. | |||
@@ -144,38 +178,59 @@ class Date(_Node): | |||
def sortkey(self):
    """Return this node's date formatted as ``YYYYmmddHHMMSS``."""
    fmt = "%Y%m%d%H%M%S"
    return self.date.strftime(fmt)
def parameterize(self, tables):
    """Build the SQL condition comparing a codelet date to this node's date.

    ``self.type`` selects the created or modified column and
    ``self.relation`` selects ``<=`` (before) or ``>=`` (after). *tables* is
    left untouched.

    Returns a 4-tuple of (conditional string, parameter list, rank list,
    should-we-rank boolean); no ranks are supplied or requested.
    """
    columns = {self.CREATE: "codelet_date_created",
               self.MODIFY: "codelet_date_modified"}
    operators = {self.BEFORE: "<=", self.AFTER: ">="}
    cond = "(%s %s ?)" % (columns[self.type], operators[self.relation])
    return cond, [self.date], [], False
class Symbol(_Node):
    """Represents a symbol node.

    Searches in symbol_type and symbol_name.
    """
    # ALL is a wildcard and has no entry in TYPES or TYPES_INV.
    ALL = -1
    # The concrete type values double as indices into TYPES_INV.
    FUNCTION = 0
    CLASS = 1
    VARIABLE = 2

    # Display names used by __repr__.
    TYPES = {FUNCTION: "FUNCTION", CLASS: "CLASS", VARIABLE: "VARIABLE"}
    # Plural type names, ordered so that TYPES_INV[FUNCTION] == "functions".
    TYPES_INV = ["functions", "classes", "variables"]

    def __init__(self, type_, name):
        """
        :type type_: int (``ALL``, ``FUNCTION``, ``CLASS``, etc.)
        :type name: :py:class:`._Literal`
        """
        self.type = type_
        self.name = name

    def __repr__(self):
        # Anything without an entry in TYPES (i.e. ALL) is shown as "ALL".
        type_ = self.TYPES.get(self.type, "ALL")
        return "Symbol({0}, {1})".format(type_, self.name)

    def sortkey(self):
        """Return the sort key of this node's name literal."""
        return self.name.sortkey()

    def parameterize(self, tables):
        """Build the SQL condition matching this symbol's name and type.

        Adds ``"code"`` and ``"symbols"`` to *tables* in place. A regex name
        is matched with ``REGEXP``, any other literal by equality. ``ALL``
        accepts every type listed in ``TYPES``; otherwise the type must
        match exactly.

        Returns a 4-tuple of (conditional string, parameter list, rank list,
        should-we-rank boolean); no ranks are supplied or requested.
        """
        tables |= {"code", "symbols"}
        if isinstance(self.name, Regex):
            cond, name = "symbol_name REGEXP ?", self.name.regex
        else:
            cond, name = "symbol_name = ?", self.name.string
        if self.type == self.ALL:
            # Sorted so the generated SQL text is deterministic.
            types = ", ".join(str(type_) for type_ in sorted(self.TYPES))
            cond += " AND symbol_type IN (%s)" % types
        else:
            cond += " AND symbol_type = %d" % self.type
        return "(" + cond + ")", [name], [], False
class BinaryOp(_Node): | |||
"""Represents a relationship between two nodes: ``and``, ``or``.""" | |||
AND = object() | |||
OR = object() | |||
OPS = {AND: "AND", OR: "OR"} | |||
def __init__(self, left, op, right): | |||
self.left = left | |||
@@ -183,25 +238,39 @@ class BinaryOp(_Node): | |||
self.right = right | |||
def __repr__(self): | |||
ops = {self.AND: "AND", self.OR: "OR"} | |||
tmpl = "BinaryOp({0}, {1}, {2})" | |||
return tmpl.format(self.left, ops[self.op], self.right) | |||
return tmpl.format(self.left, self.OPS[self.op], self.right) | |||
def sortkey(self):
    """Return the left operand's sort key followed by the right operand's."""
    left_key = self.left.sortkey()
    right_key = self.right.sortkey()
    return left_key + right_key
def parameterize(self, tables):
    """Combine both operands' conditions with this node's operator.

    The left operand is parameterized before the right one, and both receive
    the same *tables* set. An operand that supplies no ranks contributes its
    own condition as its rank. Ranking is requested when either operand
    requests it or when the operator is ``OR``.

    Returns a 4-tuple of (conditional string, parameter list, rank list,
    should-we-rank boolean).
    """
    left = self.left.parameterize(tables)
    right = self.right.parameterize(tables)
    lcond, largs, lranks, need_lranks = left
    rcond, rargs, rranks, need_rranks = right
    if not lranks:
        lranks = [lcond]
    if not rranks:
        rranks = [rcond]
    cond = "(%s %s %s)" % (lcond, self.OPS[self.op], rcond)
    need_ranks = need_lranks or need_rranks or self.op == self.OR
    return cond, largs + rargs, lranks + rranks, need_ranks
class UnaryOp(_Node):
    """Represents a transformation applied to one node: ``not``."""
    NOT = object()

    # SQL keyword / display name for each operator.
    OPS = {NOT: "NOT"}

    def __init__(self, op, node):
        """
        :param op: The operator to apply (``NOT``).
        :param node: The node the operator is applied to.
        """
        self.op = op
        self.node = node

    def __repr__(self):
        return "UnaryOp({0}, {1})".format(self.OPS[self.op], self.node)

    def sortkey(self):
        """Return the sort key of the wrapped node."""
        return self.node.sortkey()

    def parameterize(self, tables):
        """Wrap the inner node's condition with this node's operator.

        Parameters and the should-we-rank flag are passed through from the
        inner node. If the inner node supplies no ranks, its unwrapped
        condition is used as the single rank.

        Returns a 4-tuple of (conditional string, parameter list, rank list,
        should-we-rank boolean).
        """
        cond, args, ranks, need_ranks = self.node.parameterize(tables)
        new_cond = "(" + self.OPS[self.op] + " " + cond + ")"
        ranks = ranks or [cond]
        return new_cond, args, ranks, need_ranks
@@ -1,5 +1,12 @@ | |||
__all__ = ["Tree"] | |||
# Skeleton of the search SELECT, kept on a single line. The three %s slots
# take the optional score expression, the JOIN clauses and the WHERE
# condition; the two %d slots take the page size and the row offset.
QUERY_TEMPLATE = " ".join((
    "SELECT codelet_id, (codelet_rank%s) AS score",
    "FROM codelets %s",
    "WHERE %s",
    "GROUP BY codelet_id",
    "ORDER BY score DESC",
    "LIMIT %d OFFSET %d",
))
class Tree(object): | |||
"""Represents a query tree.""" | |||
@@ -9,6 +16,11 @@ class Tree(object): | |||
def __repr__(self): | |||
return "Tree({0})".format(self._root) | |||
@property
def root(self):
    """The root node of the tree (read-only view of ``_root``)."""
    node = self._root
    return node
def sortkey(self):
    """Return a string sort key for the query tree.

    The key is delegated to the root node's ``sortkey()``.
    """
    root_node = self._root
    return root_node.sortkey()
@@ -20,3 +32,38 @@ class Tree(object): | |||
:rtype: str | |||
""" | |||
return repr(self) | |||
def build_query(self, page=1, page_size=10):
    """Convert the query tree into a parameterized SQL SELECT statement.

    :param page: The page number to get results for.
    :type page: int
    :param page_size: The number of results per page.
    :type page_size: int

    :return: SQL query data.
    :rtype: 2-tuple of (SQL statement string, query parameter tuple)
    """
    # (table, join column, matching column), in the order joins are emitted.
    join_specs = (
        ("code", "codelet_code_id", "code_id"),
        ("authors", "author_codelet", "codelet_id"),
        ("symbols", "symbol_code", "code_id"),
    )

    # The nodes record the tables they need in this set as a side effect.
    tables = set()
    cond, arglist, ranks, need_ranks = self._root.parameterize(tables)
    if not ranks:
        ranks = [cond]

    if need_ranks:
        # The score is the average of the rank expressions, added to the
        # codelet's own rank. Those expressions repeat the placeholders of
        # the WHERE clause, so the parameters are supplied twice.
        score = " + ((%s) / %d)" % (" + ".join(ranks), len(ranks))
        params = tuple(arglist * 2)
    else:
        score = ""
        params = tuple(arglist)

    joins = " ".join("INNER JOIN %s ON %s = %s" % spec
                     for spec in join_specs if spec[0] in tables)
    offset = (page - 1) * page_size
    query = QUERY_TEMPLATE % (score, joins, cond, page_size, offset)
    return query, params