@@ -18,12 +18,14 @@ class Codelet(object): | |||||
code was last modified. | code was last modified. | ||||
:ivar rank: (float) A quantification of the source code's quality, as | :ivar rank: (float) A quantification of the source code's quality, as | ||||
per available ratings (stars, forks, upvotes, etc.). | per available ratings (stars, forks, upvotes, etc.). | ||||
:ivar symbols: (dict) Dictionary containing dictionaries of functions, classes, | |||||
variable definitions, etc. | |||||
:ivar symbols: (dict) Dictionary containing dictionaries of functions, | |||||
classes, variable definitions, etc. | |||||
:ivar origin: (tuple) 3-tuple of (site_name, site_url, image_blob), as | |||||
added by the database. | |||||
""" | """ | ||||
def __init__(self, name, code, filename, language, authors, code_url, | def __init__(self, name, code, filename, language, authors, code_url, | ||||
date_created, date_modified, rank): | |||||
date_created, date_modified, rank, symbols=None, origin=None): | |||||
""" | """ | ||||
Create a Codelet instance. | Create a Codelet instance. | ||||
@@ -36,6 +38,8 @@ class Codelet(object): | |||||
:param date_created: see :attr:`self.date_created` | :param date_created: see :attr:`self.date_created` | ||||
:param date_modified: see :attr:`self.date_modified` | :param date_modified: see :attr:`self.date_modified` | ||||
:param rank: see :attr:`self.rank` | :param rank: see :attr:`self.rank` | ||||
:param symbols: see :attr:`self.symbols` | |||||
:param origin: see :attr:`self.origin` | |||||
:type name: see :attr:`self.name` | :type name: see :attr:`self.name` | ||||
:type code: see :attr:`self.code` | :type code: see :attr:`self.code` | ||||
@@ -46,6 +50,8 @@ class Codelet(object): | |||||
:type date_created: see :attr:`self.date_created` | :type date_created: see :attr:`self.date_created` | ||||
:type date_modified: see :attr:`self.date_modified` | :type date_modified: see :attr:`self.date_modified` | ||||
:type rank: see :attr:`self.rank` | :type rank: see :attr:`self.rank` | ||||
:type symbols: see :attr:`self.symbols` | |||||
:type origin: see :attr:`self.origin` | |||||
""" | """ | ||||
self.name = name | self.name = name | ||||
@@ -57,3 +63,5 @@ class Codelet(object): | |||||
self.date_created = date_created | self.date_created = date_created | ||||
self.date_modified = date_modified | self.date_modified = date_modified | ||||
self.rank = rank | self.rank = rank | ||||
self.symbols = symbols or {} | |||||
self.origin = origin or (None, None, None) |
@@ -9,6 +9,9 @@ import mmh3 | |||||
import oursql | import oursql | ||||
from .migration import VERSION, MIGRATIONS | from .migration import VERSION, MIGRATIONS | ||||
from ..codelet import Codelet | |||||
from ..query.nodes import (String, Regex, Text, Language, Author, Date, Symbol, | |||||
BinaryOp, UnaryOp) | |||||
__all__ = ["Database"] | __all__ = ["Database"] | ||||
@@ -51,23 +54,71 @@ class Database(object): | |||||
"Run `python -m bitshift.database.migration`." | "Run `python -m bitshift.database.migration`." | ||||
raise RuntimeError(err) | raise RuntimeError(err) | ||||
def _search_with_query(self, cursor, query): | |||||
"""Convert a query tree into SQL SELECTs, execute, and return results. | |||||
def _search_with_query(self, cursor, tree, page): | |||||
"""Execute an SQL query based on a query tree, and return results. | |||||
The returned data is a 2-tuple of (list of codelet IDs, estimated | The returned data is a 2-tuple of (list of codelet IDs, estimated | ||||
number of total results). | number of total results). | ||||
""" | """ | ||||
raise NotImplementedError() ## TODO | |||||
results = cursor.fetchall() | |||||
ids = NotImplemented ## TODO: extract ids from results | |||||
num_results = NotImplemented ## TODO: num if results else 0 | |||||
query, args = tree.build_query(page) | |||||
cursor.execute(query, args) | |||||
ids = [id for id, _ in cursor.fetchall()] | |||||
num_results = 0 # TODO: NotImplemented | |||||
return ids, num_results | return ids, num_results | ||||
def _get_authors_for_codelet(self, cursor, codelet_id): | |||||
"""Return a list of authors for a given codelet.""" | |||||
query = """SELECT author_name, author_url | |||||
FROM authors | |||||
WHERE author_codelet = ?""" | |||||
cursor.execute(query, (codelet_id,)) | |||||
return cursor.fetchall() | |||||
def _get_symbols_for_code(self, cursor, code_id):
    """Return the symbols recorded for a given code entry.

    :param cursor: database cursor used to run the lookup.
    :param code_id: value matched against ``symbol_code``.
    :return: dict keyed by the names in ``Symbol.TYPES_INV``; each value
        is a list of ``(name, locations_0, locations_1)`` tuples, where
        each location is ``(row, col, end_row, end_col)``.
        NOTE(review): ``sloc_type`` 0/1 presumably means
        declaration/use -- confirm against the insertion code.
    """
    query = """SELECT symbol_type, symbol_name, sloc_type, sloc_row,
                      sloc_col, sloc_end_row, sloc_end_col
               FROM symbols
               INNER JOIN symbol_locations ON sloc_symbol = symbol_id
               WHERE symbol_code = ?"""
    symbols = {type_: {} for type_ in Symbol.TYPES_INV}
    cursor.execute(query, (code_id,))
    for type_, name, loc_type, row, col, erow, ecol in cursor.fetchall():
        sdict = symbols[Symbol.TYPES_INV[type_]]
        # The per-symbol containers must be lists: they are appended to
        # below, which would fail on the immutable tuples used before.
        locations = sdict.setdefault(name, ([], []))
        locations[loc_type].append((row, col, erow, ecol))
    # Flatten each {name: (locs0, locs1)} mapping into a list of 3-tuples.
    # ``items()`` works on both Python 2 and 3.
    return {type_: [(n, d, u) for n, (d, u) in sdict.items()]
            for type_, sdict in symbols.items()}
def _get_codelets_from_ids(self, cursor, ids):
    """Yield Codelet objects for the given codelet IDs, in order.

    :param cursor: cursor used for the per-codelet author and symbol
        lookups.
    :param ids: iterable of codelet IDs.

    This is a generator. IDs with no matching row are skipped.
    """
    query = """SELECT *
               FROM codelets
               INNER JOIN code ON codelet_code_id = code_id
               INNER JOIN origins ON codelet_origin = origin_id
               WHERE codelet_id = ?"""
    with self._conn.cursor(oursql.DictCursor) as dict_cursor:
        # Run one SELECT per ID and read its single row. ``executemany``
        # is meant for statements without result sets, and iterating
        # over ``fetchone()`` would walk one row's keys, not the rows.
        for wanted_id in ids:
            dict_cursor.execute(query, (wanted_id,))
            row = dict_cursor.fetchone()
            if row is None:
                continue
            codelet_id = row["codelet_id"]
            # Prefix the stored URL with the origin's base only when the
            # origin actually has one (the condition used to be inverted).
            if row["origin_url_base"]:
                url = row["origin_url_base"] + row["codelet_url"]
            else:
                url = row["codelet_url"]
            origin = (row["origin_name"], row["origin_url"],
                      row["origin_image"])
            authors = self._get_authors_for_codelet(cursor, codelet_id)
            symbols = self._get_symbols_for_code(cursor, row["code_id"])
            yield Codelet(
                row["codelet_name"], row["code_code"], None,
                row["code_lang"], authors, url,
                row["codelet_date_created"], row["codelet_date_modified"],
                row["codelet_rank"], symbols, origin)
def _decompose_url(self, cursor, url): | def _decompose_url(self, cursor, url): | ||||
"""Break up a URL into an origin (with a URL base) and a suffix.""" | """Break up a URL into an origin (with a URL base) and a suffix.""" | ||||
@@ -82,13 +133,12 @@ class Database(object): | |||||
def _insert_symbols(self, cursor, code_id, sym_type, symbols): | def _insert_symbols(self, cursor, code_id, sym_type, symbols): | ||||
"""Insert a list of symbols of a given type into the database.""" | """Insert a list of symbols of a given type into the database.""" | ||||
sym_types = ["functions", "classes", "variables"] | |||||
query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)" | query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)" | ||||
query2 = """INSERT INTO symbol_locations VALUES | query2 = """INSERT INTO symbol_locations VALUES | ||||
(DEFAULT, ?, ?, ?, ?, ?, ?)""" | (DEFAULT, ?, ?, ?, ?, ?, ?)""" | ||||
for (name, decls, uses) in symbols: | for (name, decls, uses) in symbols: | ||||
cursor.execute(query1, (code_id, sym_types.index(sym_type), name)) | |||||
cursor.execute(query1, (code_id, Symbol.TYPES_INV[sym_type], name)) | |||||
sym_id = cursor.lastrowid | sym_id = cursor.lastrowid | ||||
params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] + | params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] + | ||||
[tuple([sym_id, 1] + list(loc)) for loc in uses]) | [tuple([sym_id, 1] + list(loc)) for loc in uses]) | ||||
@@ -132,7 +182,8 @@ class Database(object): | |||||
num_mnt = num_results / (10 ** num_exp) | num_mnt = num_results / (10 ** num_exp) | ||||
cursor.execute(query2, (cache_id, num_mnt, num_exp)) | cursor.execute(query2, (cache_id, num_mnt, num_exp)) | ||||
cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) | cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) | ||||
return (num_results, self._get_codelets_from_ids(cursor, ids)) | |||||
codelet_gen = self._get_codelets_from_ids(cursor, ids) | |||||
return (num_results, list(codelet_gen)) | |||||
def insert(self, codelet): | def insert(self, codelet): | ||||
""" | """ | ||||
@@ -116,6 +116,10 @@ class _QueryParser(object): | |||||
def _parse_term(self, term): | def _parse_term(self, term): | ||||
"""Parse a query term into a tree node and return it.""" | """Parse a query term into a tree node and return it.""" | ||||
try: | |||||
term = term.decode("unicode_escape") | |||||
except UnicodeDecodeError: | |||||
raise QueryParseException('Invalid query term: "%s"' % term) | |||||
if ":" in term and not term[0] == ":": | if ":" in term and not term[0] == ":": | ||||
prefix, arg = term.split(":", 1) | prefix, arg = term.split(":", 1) | ||||
invert = prefix.lower() == "not" | invert = prefix.lower() == "not" | ||||
@@ -135,7 +139,7 @@ class _QueryParser(object): | |||||
Returns a 2-tuple of (first_marker_found, marker_index). | Returns a 2-tuple of (first_marker_found, marker_index). | ||||
""" | """ | ||||
def is_escaped(query, index):
    """Return whether a query marker is backslash-escaped.

    The marker at ``index`` counts as escaped when it is preceded by an
    odd number of consecutive backslashes: each pair of backslashes is
    itself an escaped backslash, so only a leftover one escapes the
    marker. A non-positive ``index`` (including -1 from a failed
    ``find``) is never escaped.
    """
    if index <= 0:
        return False
    backslashes = 0
    pos = index - 1
    while pos >= 0 and query[pos] == "\\":
        backslashes += 1
        pos -= 1
    return backslashes % 2 == 1
@@ -143,7 +147,7 @@ class _QueryParser(object): | |||||
best_marker, best_index = None, maxsize | best_marker, best_index = None, maxsize | ||||
for marker in markers: | for marker in markers: | ||||
index = query.find(marker) | index = query.find(marker) | ||||
if _is_escaped(query, index): | |||||
if is_escaped(query, index): | |||||
_, new_index = self._scan_query(query[index + 1:], marker) | _, new_index = self._scan_query(query[index + 1:], marker) | ||||
index += new_index + 1 | index += new_index + 1 | ||||
if index >= 0 and index < best_index: | if index >= 0 and index < best_index: | ||||
@@ -209,6 +213,9 @@ class _QueryParser(object): | |||||
def parse_binary_op(op):
    """Split the nested query list around ``op`` into a BinaryOp node.

    Raises QueryParseException when the operator is the first or last
    element of the list, i.e. when one of its operands is missing.
    """
    position = nest.index(op)
    last = len(nest) - 1
    if position in (0, last):
        err = "Invalid query: '%s' given without argument."
        raise QueryParseException(err % BinaryOp.OPS[op])
    lhs = self._parse_nest(nest[:position])
    rhs = self._parse_nest(nest[position + 1:])
    return BinaryOp(lhs, op, rhs)
@@ -222,6 +229,9 @@ class _QueryParser(object): | |||||
return parse_binary_op(BinaryOp.AND) | return parse_binary_op(BinaryOp.AND) | ||||
elif UnaryOp.NOT in nest: | elif UnaryOp.NOT in nest: | ||||
index = nest.index(UnaryOp.NOT) | index = nest.index(UnaryOp.NOT) | ||||
if index == len(nest) - 1: | |||||
err = "Invalid query: '%s' given without argument." | |||||
raise QueryParseException(err % UnaryOp.OPS[UnaryOp.NOT]) | |||||
right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:])) | right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:])) | ||||
if index > 0: | if index > 0: | ||||
left = self._parse_nest(nest[:index]) | left = self._parse_nest(nest[:index]) | ||||
@@ -15,6 +15,15 @@ class _Node(object): | |||||
"""Return a string sort key for the node.""" | """Return a string sort key for the node.""" | ||||
return "" | return "" | ||||
def parameterize(self, tables):
    """Parameterize the node.

    Returns a 4-tuple of (conditional string, parameter list, rank list,
    should-we-rank boolean). If the rank list is empty, then it is assumed
    to contain the conditional string.

    The base implementation contributes nothing and leaves ``tables``
    untouched.
    """
    cond, params, ranks, need_ranks = "", [], [], False
    return cond, params, ranks, need_ranks
class _Literal(object): | class _Literal(object): | ||||
"""Represents a literal component of a search query, present at the leaves. | """Represents a literal component of a search query, present at the leaves. | ||||
@@ -75,6 +84,20 @@ class Text(_Node): | |||||
def sortkey(self): | def sortkey(self): | ||||
return self.text.sortkey() | return self.text.sortkey() | ||||
def parameterize(self, tables):
    """Build the condition for a free-text search.

    Adds "code" and "symbols" to ``tables`` and returns
    (condition, parameters, rank expressions, True). The same text is
    bound three times, once per rank expression.
    """
    tables.update(("code", "symbols"))
    is_regex = isinstance(self.text, Regex)
    if is_regex:
        text = self.text.regex
        ranks = ["(codelet_name REGEXP ?)", "(symbol_name REGEXP ?)",
                 "(code_code REGEXP ?)"]
    else:
        text = self.text.string
        ranks = ["(MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE))",
                 "(MATCH(code_code) AGAINST (? IN BOOLEAN MODE))",
                 "(symbol_name = ?)"]
    # Any one of the three expressions matching is enough.
    cond = "(%s)" % " OR ".join(ranks)
    return cond, [text, text, text], ranks, True
class Language(_Node): | class Language(_Node): | ||||
"""Represents a language node. | """Represents a language node. | ||||
@@ -94,6 +117,10 @@ class Language(_Node): | |||||
def sortkey(self): | def sortkey(self): | ||||
return LANGS[self.lang] | return LANGS[self.lang] | ||||
def parameterize(self, tables):
    """Build the condition for a language filter.

    Adds "code" to ``tables`` and returns an equality test on
    ``code_lang`` with ``self.lang`` as its only parameter; no ranking.
    """
    tables.add("code")
    params = [self.lang]
    return "(code_lang = ?)", params, [], False
class Author(_Node): | class Author(_Node): | ||||
"""Represents a author node. | """Represents a author node. | ||||
@@ -113,6 +140,13 @@ class Author(_Node): | |||||
def sortkey(self): | def sortkey(self): | ||||
return self.name.sortkey() | return self.name.sortkey() | ||||
def parameterize(self, tables):
    """Build the condition for an author filter.

    Adds "authors" to ``tables``. A regex name produces a REGEXP test
    without ranking; a plain string produces a boolean-mode MATCH with
    ranking requested.
    """
    tables.add("authors")
    if not isinstance(self.name, Regex):
        cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))"
        return cond, [self.name.string], [], True
    return "(author_name REGEXP ?)", [self.name.regex], [], False
class Date(_Node): | class Date(_Node): | ||||
"""Represents a date node. | """Represents a date node. | ||||
@@ -144,38 +178,59 @@ class Date(_Node): | |||||
def sortkey(self): | def sortkey(self): | ||||
return self.date.strftime("%Y%m%d%H%M%S") | return self.date.strftime("%Y%m%d%H%M%S") | ||||
def parameterize(self, tables):
    """Build the condition for a date filter.

    Picks the created/modified column from ``self.type`` and the
    comparison operator from ``self.relation``; ``self.date`` is the only
    parameter. ``tables`` is not modified and no ranking is requested.
    """
    columns = {self.CREATE: "codelet_date_created",
               self.MODIFY: "codelet_date_modified"}
    operators = {self.BEFORE: "<=", self.AFTER: ">="}
    cond = "(%s %s ?)" % (columns[self.type], operators[self.relation])
    return cond, [self.date], [], False
class Symbol(_Node):
    """Represents a symbol node.

    Searches in symbol_type and symbol_name.
    """
    ALL = -1
    FUNCTION = 0
    CLASS = 1
    VARIABLE = 2
    # Display names for the concrete symbol types (ALL is not a real type).
    TYPES = {FUNCTION: "FUNCTION", CLASS: "CLASS", VARIABLE: "VARIABLE"}
    # Plural names indexed by the numeric symbol type above.
    TYPES_INV = ["functions", "classes", "variables"]

    def __init__(self, type_, name):
        """
        :type type_: int (``ALL``, ``FUNCTION``, ``CLASS``, etc.)
        :type name: :py:class:`._Literal`
        """
        self.type = type_
        self.name = name

    def __repr__(self):
        # Anything that is not a concrete type is shown as "ALL".
        type_ = self.TYPES.get(self.type, "ALL")
        return "Symbol({0}, {1})".format(type_, self.name)

    def sortkey(self):
        return self.name.sortkey()

    def parameterize(self, tables):
        """Build the condition for a symbol search.

        Adds "code" and "symbols" to ``tables`` and returns
        (condition, [name], [], False). The symbol type is embedded in
        the SQL text directly; only the name is a bound parameter.
        """
        tables |= {"code", "symbols"}
        if isinstance(self.name, Regex):
            cond, name = "symbol_name REGEXP ?", self.name.regex
        else:
            cond, name = "symbol_name = ?", self.name.string
        if self.type == self.ALL:
            # Sorted so the generated SQL text is deterministic.
            types = ", ".join(str(type_) for type_ in sorted(self.TYPES))
            cond += " AND symbol_type IN (%s)" % types
        else:
            cond += " AND symbol_type = %d" % self.type
        return "(" + cond + ")", [name], [], False
class BinaryOp(_Node): | class BinaryOp(_Node): | ||||
"""Represents a relationship between two nodes: ``and``, ``or``.""" | """Represents a relationship between two nodes: ``and``, ``or``.""" | ||||
AND = object() | AND = object() | ||||
OR = object() | OR = object() | ||||
OPS = {AND: "AND", OR: "OR"} | |||||
def __init__(self, left, op, right): | def __init__(self, left, op, right): | ||||
self.left = left | self.left = left | ||||
@@ -183,25 +238,39 @@ class BinaryOp(_Node): | |||||
self.right = right | self.right = right | ||||
def __repr__(self): | def __repr__(self): | ||||
ops = {self.AND: "AND", self.OR: "OR"} | |||||
tmpl = "BinaryOp({0}, {1}, {2})" | tmpl = "BinaryOp({0}, {1}, {2})" | ||||
return tmpl.format(self.left, ops[self.op], self.right) | |||||
return tmpl.format(self.left, self.OPS[self.op], self.right) | |||||
def sortkey(self):
    """Return the left operand's sort key followed by the right's."""
    left_key = self.left.sortkey()
    right_key = self.right.sortkey()
    return left_key + right_key
def parameterize(self, tables):
    """Combine both operands' conditions with this node's operator.

    Parameters and rank lists are concatenated left-then-right. An
    operand with an empty rank list contributes its own condition as its
    rank. Ranking is requested if either operand requests it or if the
    operator is OR.
    """
    left = self.left.parameterize(tables)
    right = self.right.parameterize(tables)
    lcond, largs, lranks, need_lranks = left
    rcond, rargs, rranks, need_rranks = right
    if not lranks:
        lranks = [lcond]
    if not rranks:
        rranks = [rcond]
    cond = "(%s %s %s)" % (lcond, self.OPS[self.op], rcond)
    need_ranks = need_lranks or need_rranks or self.op == self.OR
    return cond, largs + rargs, lranks + rranks, need_ranks
class UnaryOp(_Node):
    """Represents a transformation applied to one node: ``not``."""
    NOT = object()
    # SQL keyword / display name for each operator.
    OPS = {NOT: "NOT"}

    def __init__(self, op, node):
        self.op = op
        self.node = node

    def __repr__(self):
        return "UnaryOp({0}, {1})".format(self.OPS[self.op], self.node)

    def sortkey(self):
        return self.node.sortkey()

    def parameterize(self, tables):
        """Wrap the operand's condition in this node's operator.

        Parameters and the should-we-rank flag pass through unchanged.
        If the operand has no rank list, its un-negated condition is used
        as the rank.
        """
        cond, args, ranks, need_ranks = self.node.parameterize(tables)
        new_cond = "(" + self.OPS[self.op] + " " + cond + ")"
        ranks = ranks or [cond]
        return new_cond, args, ranks, need_ranks
@@ -1,5 +1,12 @@ | |||||
__all__ = ["Tree"] | __all__ = ["Tree"] | ||||
# Single-line SELECT skeleton. Placeholders, in order: extra score
# expression, JOIN clauses, WHERE condition, page size, row offset.
QUERY_TEMPLATE = " ".join([
    "SELECT codelet_id, (codelet_rank%s) AS score",
    "FROM codelets %s",
    "WHERE %s",
    "GROUP BY codelet_id",
    "ORDER BY score DESC",
    "LIMIT %d OFFSET %d",
])
class Tree(object): | class Tree(object): | ||||
"""Represents a query tree.""" | """Represents a query tree.""" | ||||
@@ -9,6 +16,11 @@ class Tree(object): | |||||
def __repr__(self): | def __repr__(self): | ||||
return "Tree({0})".format(self._root) | return "Tree({0})".format(self._root) | ||||
@property
def root(self):
    """Read-only access to the tree's root node (``self._root``)."""
    node = self._root
    return node
def sortkey(self):
    """Return the root node's string sort key as the tree's sort key."""
    root_node = self._root
    return root_node.sortkey()
@@ -20,3 +32,38 @@ class Tree(object): | |||||
:rtype: str | :rtype: str | ||||
""" | """ | ||||
return repr(self) | return repr(self) | ||||
def build_query(self, page=1, page_size=10):
    """Convert the query tree into a parameterized SQL SELECT statement.

    :param page: The page number to get results for.
    :type page: int
    :param page_size: The number of results per page.
    :type page_size: int

    :return: SQL query data.
    :rtype: 2-tuple of (SQL statement string, query parameter tuple)
    """
    # (table, left column, right column) for each optional join, in the
    # order the joins are emitted.
    join_specs = (
        ("code", "codelet_code_id", "code_id"),
        ("authors", "author_codelet", "codelet_id"),
        ("symbols", "symbol_code", "code_id"),
    )

    tables = set()
    cond, arglist, ranks, need_ranks = self._root.parameterize(tables)
    if not ranks:
        ranks = [cond]
    score = ""
    if need_ranks:
        # Average of the rank expressions, added to the stored rank.
        score = " + ((%s) / %d)" % (" + ".join(ranks), len(ranks))
    joins = " ".join("INNER JOIN %s ON %s = %s" % spec
                     for spec in join_specs if spec[0] in tables)
    offset = (page - 1) * page_size
    query = QUERY_TEMPLATE % (score, joins, cond, page_size, offset)
    # With ranking, the placeholders appear twice (score and WHERE), so
    # the parameters are bound twice.
    params = arglist * 2 if need_ranks else arglist
    return query, tuple(params)