Browse Source

Merge branch 'feature/db_search' into develop

tags/v1.0^2
Ben Kurtovic 10 years ago
parent
commit
54d4d88d5c
5 changed files with 214 additions and 29 deletions
  1. +11
    -3
      bitshift/codelet.py
  2. +63
    -12
      bitshift/database/__init__.py
  3. +12
    -2
      bitshift/query/__init__.py
  4. +81
    -12
      bitshift/query/nodes.py
  5. +47
    -0
      bitshift/query/tree.py

+ 11
- 3
bitshift/codelet.py View File

@@ -18,12 +18,14 @@ class Codelet(object):
code was last modified. code was last modified.
:ivar rank: (float) A quantification of the source code's quality, as :ivar rank: (float) A quantification of the source code's quality, as
per available ratings (stars, forks, upvotes, etc.). per available ratings (stars, forks, upvotes, etc.).
:ivar symbols: (dict) Dictionary containing dictionaries of functions, classes,
variable definitions, etc.
:ivar symbols: (dict) Dictionary containing dictionaries of functions,
classes, variable definitions, etc.
:ivar origin: (tuple) 3-tuple of (site_name, site_url, image_blob), as
added by the database.
""" """


def __init__(self, name, code, filename, language, authors, code_url, def __init__(self, name, code, filename, language, authors, code_url,
date_created, date_modified, rank):
date_created, date_modified, rank, symbols=None, origin=None):
""" """
Create a Codelet instance. Create a Codelet instance.


@@ -36,6 +38,8 @@ class Codelet(object):
:param date_created: see :attr:`self.date_created` :param date_created: see :attr:`self.date_created`
:param date_modified: see :attr:`self.date_modified` :param date_modified: see :attr:`self.date_modified`
:param rank: see :attr:`self.rank` :param rank: see :attr:`self.rank`
:param symbols: see :attr:`self.symbols`
:param origin: see :attr:`self.origin`


:type name: see :attr:`self.name` :type name: see :attr:`self.name`
:type code: see :attr:`self.code` :type code: see :attr:`self.code`
@@ -46,6 +50,8 @@ class Codelet(object):
:type date_created: see :attr:`self.date_created` :type date_created: see :attr:`self.date_created`
:type date_modified: see :attr:`self.date_modified` :type date_modified: see :attr:`self.date_modified`
:type rank: see :attr:`self.rank` :type rank: see :attr:`self.rank`
:type symbols: see :attr:`self.symbols`
:type origin: see :attr:`self.origin`
""" """


self.name = name self.name = name
@@ -57,3 +63,5 @@ class Codelet(object):
self.date_created = date_created self.date_created = date_created
self.date_modified = date_modified self.date_modified = date_modified
self.rank = rank self.rank = rank
self.symbols = symbols or {}
self.origin = origin or (None, None, None)

+ 63
- 12
bitshift/database/__init__.py View File

@@ -9,6 +9,9 @@ import mmh3
import oursql import oursql


from .migration import VERSION, MIGRATIONS from .migration import VERSION, MIGRATIONS
from ..codelet import Codelet
from ..query.nodes import (String, Regex, Text, Language, Author, Date, Symbol,
BinaryOp, UnaryOp)


__all__ = ["Database"] __all__ = ["Database"]


@@ -51,23 +54,71 @@ class Database(object):
"Run `python -m bitshift.database.migration`." "Run `python -m bitshift.database.migration`."
raise RuntimeError(err) raise RuntimeError(err)


def _search_with_query(self, cursor, tree, page):
    """Run the SQL generated from a query tree and collect the matching IDs.

    :param cursor: database cursor used to execute the statement
    :param tree: query tree whose ``build_query(page)`` supplies the SQL
        string and its parameters
    :param page: page number forwarded to ``tree.build_query``

    :return: 2-tuple of (list of codelet IDs, estimated number of total
        results); the estimate is not implemented yet and is always ``0``
    """
    sql, params = tree.build_query(page)
    cursor.execute(sql, params)
    # Every fetched row is a pair; only its first element (the ID) is kept.
    codelet_ids = [row_id for row_id, _ in cursor.fetchall()]
    total = 0  # TODO: NotImplemented
    return codelet_ids, total


def _get_authors_for_codelet(self, cursor, codelet_id):
    """Fetch the author rows attached to a single codelet.

    :param cursor: database cursor used to run the lookup
    :param codelet_id: value matched against ``author_codelet``

    :return: whatever ``cursor.fetchall()`` yields for the
        ``author_name, author_url`` columns
    """
    sql = """SELECT author_name, author_url
             FROM authors
             WHERE author_codelet = ?"""
    cursor.execute(sql, (codelet_id,))
    authors = cursor.fetchall()
    return authors

def _get_symbols_for_code(self, cursor, code_id):
    """Return a dict of the symbols recorded for a given piece of code.

    :param cursor: database cursor used to run the lookup
    :param code_id: value matched against ``symbol_code``

    :return: dict with one key per name in ``Symbol.TYPES_INV``; each value
        is a list of ``(name, declarations, uses)`` tuples, where the last
        two items are lists of ``(row, col, end_row, end_col)`` locations
    """
    query = """SELECT symbol_type, symbol_name, sloc_type, sloc_row,
                      sloc_col, sloc_end_row, sloc_end_col
               FROM symbols
               INNER JOIN symbol_locations ON sloc_symbol = symbol_id
               WHERE symbol_code = ?"""

    by_type = {type_: {} for type_ in Symbol.TYPES_INV}
    cursor.execute(query, (code_id,))
    for type_, name, loc_type, row, col, erow, ecol in cursor.fetchall():
        sdict = by_type[Symbol.TYPES_INV[type_]]
        if name not in sdict:
            # Slot 0 collects declarations and slot 1 collects uses (the
            # 0/1 markers written by _insert_symbols). These must be lists:
            # tuples have no append().
            sdict[name] = ([], [])
        sdict[name][loc_type].append((row, col, erow, ecol))
    # Build a fresh mapping rather than reassigning entries of the dict
    # that is being iterated.
    return {type_: [(n, d, u) for n, (d, u) in sdict.items()]
            for type_, sdict in by_type.items()}

def _get_codelets_from_ids(self, cursor, ids):
    """Yield a Codelet object for each of the given codelet IDs.

    This is a generator; callers that need a list must wrap it in
    ``list()``.

    :param cursor: database cursor used for the per-codelet author and
        symbol lookups
    :param ids: iterable of codelet IDs, looked up one at a time in the
        order given

    :return: generator of :py:class:`Codelet` objects; IDs that match no
        row produce nothing
    """
    query = """SELECT *
               FROM codelets
               INNER JOIN code ON codelet_code_id = code_id
               INNER JOIN origins ON codelet_origin = origin_id
               WHERE codelet_id = ?"""

    with self._conn.cursor(oursql.DictCursor) as dict_cursor:
        for wanted_id in ids:
            # One execute() per ID: executemany() is meant for statements
            # without result sets, so it cannot be used to read rows back.
            dict_cursor.execute(query, (wanted_id,))
            # fetchall() gives a list of row dicts; iterating fetchone()
            # would walk the contents of a single row instead.
            for row in dict_cursor.fetchall():
                codelet_id = row["codelet_id"]
                if row["origin_url_base"]:
                    # The stored URL is a suffix of the origin's base URL.
                    url = row["origin_url_base"] + row["codelet_url"]
                else:
                    url = row["codelet_url"]
                origin = (row["origin_name"], row["origin_url"],
                          row["origin_image"])
                authors = self._get_authors_for_codelet(cursor, codelet_id)
                symbols = self._get_symbols_for_code(cursor, row["code_id"])
                # The third argument (filename) is passed as None here.
                yield Codelet(
                    row["codelet_name"], row["code_code"], None,
                    row["code_lang"], authors, url,
                    row["codelet_date_created"],
                    row["codelet_date_modified"],
                    row["codelet_rank"], symbols, origin)


def _decompose_url(self, cursor, url): def _decompose_url(self, cursor, url):
"""Break up a URL into an origin (with a URL base) and a suffix.""" """Break up a URL into an origin (with a URL base) and a suffix."""
@@ -82,13 +133,12 @@ class Database(object):


def _insert_symbols(self, cursor, code_id, sym_type, symbols): def _insert_symbols(self, cursor, code_id, sym_type, symbols):
"""Insert a list of symbols of a given type into the database.""" """Insert a list of symbols of a given type into the database."""
sym_types = ["functions", "classes", "variables"]
query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)" query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)"
query2 = """INSERT INTO symbol_locations VALUES query2 = """INSERT INTO symbol_locations VALUES
(DEFAULT, ?, ?, ?, ?, ?, ?)""" (DEFAULT, ?, ?, ?, ?, ?, ?)"""


for (name, decls, uses) in symbols: for (name, decls, uses) in symbols:
cursor.execute(query1, (code_id, sym_types.index(sym_type), name))
cursor.execute(query1, (code_id, Symbol.TYPES_INV[sym_type], name))
sym_id = cursor.lastrowid sym_id = cursor.lastrowid
params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] + params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] +
[tuple([sym_id, 1] + list(loc)) for loc in uses]) [tuple([sym_id, 1] + list(loc)) for loc in uses])
@@ -132,7 +182,8 @@ class Database(object):
num_mnt = num_results / (10 ** num_exp) num_mnt = num_results / (10 ** num_exp)
cursor.execute(query2, (cache_id, num_mnt, num_exp)) cursor.execute(query2, (cache_id, num_mnt, num_exp))
cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) cursor.executemany(query3, [(cache_id, c_id) for c_id in ids])
return (num_results, self._get_codelets_from_ids(cursor, ids))
codelet_gen = self._get_codelets_from_ids(cursor, ids)
return (num_results, list(codelet_gen))


def insert(self, codelet): def insert(self, codelet):
""" """


+ 12
- 2
bitshift/query/__init__.py View File

@@ -116,6 +116,10 @@ class _QueryParser(object):


def _parse_term(self, term): def _parse_term(self, term):
"""Parse a query term into a tree node and return it.""" """Parse a query term into a tree node and return it."""
try:
term = term.decode("unicode_escape")
except UnicodeDecodeError:
raise QueryParseException('Invalid query term: "%s"' % term)
if ":" in term and not term[0] == ":": if ":" in term and not term[0] == ":":
prefix, arg = term.split(":", 1) prefix, arg = term.split(":", 1)
invert = prefix.lower() == "not" invert = prefix.lower() == "not"
@@ -135,7 +139,7 @@ class _QueryParser(object):


Returns a 2-tuple of (first_marker_found, marker_index). Returns a 2-tuple of (first_marker_found, marker_index).
""" """
def _is_escaped(query, index):
def is_escaped(query, index):
"""Return whether a query marker is backslash-escaped.""" """Return whether a query marker is backslash-escaped."""
return (index > 0 and query[index - 1] == "\\" and return (index > 0 and query[index - 1] == "\\" and
(index < 2 or query[index - 2] != "\\")) (index < 2 or query[index - 2] != "\\"))
@@ -143,7 +147,7 @@ class _QueryParser(object):
best_marker, best_index = None, maxsize best_marker, best_index = None, maxsize
for marker in markers: for marker in markers:
index = query.find(marker) index = query.find(marker)
if _is_escaped(query, index):
if is_escaped(query, index):
_, new_index = self._scan_query(query[index + 1:], marker) _, new_index = self._scan_query(query[index + 1:], marker)
index += new_index + 1 index += new_index + 1
if index >= 0 and index < best_index: if index >= 0 and index < best_index:
@@ -209,6 +213,9 @@ class _QueryParser(object):
def parse_binary_op(op): def parse_binary_op(op):
"""Parse a binary operator in a nested query list.""" """Parse a binary operator in a nested query list."""
index = nest.index(op) index = nest.index(op)
if index == 0 or index == len(nest) - 1:
err = "Invalid query: '%s' given without argument."
raise QueryParseException(err % BinaryOp.OPS[op])
left = self._parse_nest(nest[:index]) left = self._parse_nest(nest[:index])
right = self._parse_nest(nest[index + 1:]) right = self._parse_nest(nest[index + 1:])
return BinaryOp(left, op, right) return BinaryOp(left, op, right)
@@ -222,6 +229,9 @@ class _QueryParser(object):
return parse_binary_op(BinaryOp.AND) return parse_binary_op(BinaryOp.AND)
elif UnaryOp.NOT in nest: elif UnaryOp.NOT in nest:
index = nest.index(UnaryOp.NOT) index = nest.index(UnaryOp.NOT)
if index == len(nest) - 1:
err = "Invalid query: '%s' given without argument."
raise QueryParseException(err % UnaryOp.OPS[UnaryOp.NOT])
right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:])) right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:]))
if index > 0: if index > 0:
left = self._parse_nest(nest[:index]) left = self._parse_nest(nest[:index])


+ 81
- 12
bitshift/query/nodes.py View File

@@ -15,6 +15,15 @@ class _Node(object):
"""Return a string sort key for the node.""" """Return a string sort key for the node."""
return "" return ""


def parameterize(self, tables):
    """Produce the SQL pieces for this node; the base node contributes none.

    :param tables: collection of table names; left untouched here

    :return: 4-tuple of (conditional string, parameter list, rank list,
        should-we-rank boolean). An empty rank list is taken to mean that
        it contains the conditional string.
    """
    condition, params, ranks, should_rank = "", [], [], False
    return condition, params, ranks, should_rank



class _Literal(object): class _Literal(object):
"""Represents a literal component of a search query, present at the leaves. """Represents a literal component of a search query, present at the leaves.
@@ -75,6 +84,20 @@ class Text(_Node):
def sortkey(self): def sortkey(self):
return self.text.sortkey() return self.text.sortkey()


def parameterize(self, tables):
    """Build the condition matching this text against names, code, symbols.

    Adds ``code`` and ``symbols`` to ``tables``. The same search value is
    bound to all three placeholders, and ranking is always requested.

    :return: 4-tuple of (conditional string, parameter list, rank list,
        should-we-rank boolean)
    """
    tables.update(("code", "symbols"))
    if isinstance(self.text, Regex):
        needle = self.text.regex
        ranks = ["(codelet_name REGEXP ?)", "(symbol_name REGEXP ?)",
                 "(code_code REGEXP ?)"]
    else:
        needle = self.text.string
        ranks = ["(MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE))",
                 "(MATCH(code_code) AGAINST (? IN BOOLEAN MODE))",
                 "(symbol_name = ?)"]
    # Any one of the three sub-conditions is enough for a match.
    cond = "(%s)" % " OR ".join(ranks)
    return cond, [needle, needle, needle], ranks, True



class Language(_Node): class Language(_Node):
"""Represents a language node. """Represents a language node.
@@ -94,6 +117,10 @@ class Language(_Node):
def sortkey(self): def sortkey(self):
return LANGS[self.lang] return LANGS[self.lang]


def parameterize(self, tables):
    """Match ``code_lang`` against this node's language.

    Adds ``code`` to ``tables``; contributes no rank expressions and does
    not request ranking.
    """
    tables.add("code")
    condition = "(code_lang = ?)"
    return condition, [self.lang], [], False



class Author(_Node): class Author(_Node):
"""Represents an author node. """Represents an author node.
@@ -113,6 +140,13 @@ class Author(_Node):
def sortkey(self): def sortkey(self):
return self.name.sortkey() return self.name.sortkey()


def parameterize(self, tables):
    """Build the condition matching ``author_name`` against this node.

    Adds ``authors`` to ``tables``. A regex name uses ``REGEXP`` and does
    not request ranking; a plain string uses a boolean-mode full-text match
    and does.
    """
    tables.add("authors")
    name = self.name
    if not isinstance(name, Regex):
        cond = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))"
        return cond, [name.string], [], True
    return "(author_name REGEXP ?)", [name.regex], [], False



class Date(_Node): class Date(_Node):
"""Represents a date node. """Represents a date node.
@@ -144,38 +178,59 @@ class Date(_Node):
def sortkey(self): def sortkey(self):
return self.date.strftime("%Y%m%d%H%M%S") return self.date.strftime("%Y%m%d%H%M%S")


def parameterize(self, tables):
    """Build a date comparison on the creation or modification column.

    ``self.type`` selects the column and ``self.relation`` selects the
    comparison (``<=`` for BEFORE, ``>=`` for AFTER). ``tables`` is not
    modified, and ranking is not requested.
    """
    columns = {self.CREATE: "codelet_date_created",
               self.MODIFY: "codelet_date_modified"}
    column = columns[self.type]
    comparisons = {self.BEFORE: "<=", self.AFTER: ">="}
    comparison = comparisons[self.relation]
    cond = "(%s %s ?)" % (column, comparison)
    return cond, [self.date], [], False



class Symbol(_Node): class Symbol(_Node):
"""Represents a symbol node. """Represents a symbol node.


Searches in symbol_type and symbol_name. Searches in symbol_type and symbol_name.
""" """
ALL = 0
FUNCTION = 1
CLASS = 2
VARIABLE = 3
ALL = -1
FUNCTION = 0
CLASS = 1
VARIABLE = 2
TYPES = {FUNCTION: "FUNCTION", CLASS: "CLASS", VARIABLE: "VARIABLE"}
TYPES_INV = ["functions", "classes", "variables"]


def __init__(self, type_, name): def __init__(self, type_, name):
""" """
:type type_: int (``ALL``, ``FUNCTION``, ``CLASS``, etc.) :type type_: int (``ALL``, ``FUNCTION``, ``CLASS``, etc.)
:type name: :py:class:`.Literal`
:type name: :py:class:`._Literal`
""" """
self.type = type_ self.type = type_
self.name = name self.name = name


def __repr__(self): def __repr__(self):
types = {self.ALL: "ALL", self.FUNCTION: "FUNCTION",
self.CLASS: "CLASS", self.VARIABLE: "VARIABLE"}
return "Symbol({0}, {1})".format(types[self.type], self.name)
type_ = self.TYPES.get(self.type, "ALL")
return "Symbol({0}, {1})".format(type_, self.name)


def sortkey(self): def sortkey(self):
return self.name.sortkey() return self.name.sortkey()


def parameterize(self, tables):
    """Build the condition matching ``symbol_name`` and ``symbol_type``.

    Adds ``code`` and ``symbols`` to ``tables``. With type ``ALL`` every
    key of ``TYPES`` is accepted; otherwise only this node's own type is.
    The type values are formatted into the SQL text, while the name is
    passed as a parameter. Ranking is not requested.
    """
    tables.update(("code", "symbols"))
    if isinstance(self.name, Regex):
        name_cond, name = "symbol_name REGEXP ?", self.name.regex
    else:
        name_cond, name = "symbol_name = ?", self.name.string
    if self.type == self.ALL:
        accepted = ", ".join(str(type_) for type_ in self.TYPES)
        type_cond = "symbol_type IN (%s)" % accepted
    else:
        type_cond = "symbol_type = %d" % self.type
    return "(%s AND %s)" % (name_cond, type_cond), [name], [], False



class BinaryOp(_Node): class BinaryOp(_Node):
"""Represents a relationship between two nodes: ``and``, ``or``.""" """Represents a relationship between two nodes: ``and``, ``or``."""
AND = object() AND = object()
OR = object() OR = object()
OPS = {AND: "AND", OR: "OR"}


def __init__(self, left, op, right): def __init__(self, left, op, right):
self.left = left self.left = left
@@ -183,25 +238,39 @@ class BinaryOp(_Node):
self.right = right self.right = right


def __repr__(self): def __repr__(self):
ops = {self.AND: "AND", self.OR: "OR"}
tmpl = "BinaryOp({0}, {1}, {2})" tmpl = "BinaryOp({0}, {1}, {2})"
return tmpl.format(self.left, ops[self.op], self.right)
return tmpl.format(self.left, self.OPS[self.op], self.right)


def sortkey(self): def sortkey(self):
return self.left.sortkey() + self.right.sortkey() return self.left.sortkey() + self.right.sortkey()


def parameterize(self, tables):
    """Combine the parameterizations of both operands with this operator.

    The left operand is parameterized first, then the right; both receive
    the same ``tables`` object. An operand that supplies no rank list
    contributes its own condition as its single rank. Ranking is requested
    if either operand requests it or if this operator is ``OR``.
    """
    left_cond, left_args, left_ranks, left_wants = \
        self.left.parameterize(tables)
    right_cond, right_args, right_ranks, right_wants = \
        self.right.parameterize(tables)
    if not left_ranks:
        left_ranks = [left_cond]
    if not right_ranks:
        right_ranks = [right_cond]
    keyword = self.OPS[self.op]
    cond = "(%s %s %s)" % (left_cond, keyword, right_cond)
    should_rank = left_wants or right_wants or self.op == self.OR
    return cond, left_args + right_args, left_ranks + right_ranks, should_rank



class UnaryOp(_Node): class UnaryOp(_Node):
"""Represents a transformation applied to one node: ``not``.""" """Represents a transformation applied to one node: ``not``."""
NOT = object() NOT = object()
OPS = {NOT: "NOT"}


def __init__(self, op, node): def __init__(self, op, node):
self.op = op self.op = op
self.node = node self.node = node


def __repr__(self): def __repr__(self):
ops = {self.NOT: "NOT"}
return "UnaryOp({0}, {1})".format(ops[self.op], self.node)
return "UnaryOp({0}, {1})".format(self.OPS[self.op], self.node)


def sortkey(self): def sortkey(self):
return self.node.sortkey() return self.node.sortkey()

def parameterize(self, tables):
    """Wrap the operand's condition in this unary operator.

    The parameters and the should-we-rank flag come straight from the
    operand. If the operand supplies no rank list, its un-negated
    condition is used as the single rank.
    """
    inner_cond, params, ranks, should_rank = self.node.parameterize(tables)
    wrapped = "(%s %s)" % (self.OPS[self.op], inner_cond)
    if not ranks:
        ranks = [inner_cond]
    return wrapped, params, ranks, should_rank

+ 47
- 0
bitshift/query/tree.py View File

@@ -1,5 +1,12 @@
__all__ = ["Tree"] __all__ = ["Tree"]


# Skeleton of the paginated search statement. Tree.build_query fills the
# placeholders, in order, with: an optional score expression appended to
# codelet_rank, the INNER JOIN clauses, the WHERE condition, the page size
# (LIMIT), and the row offset (OFFSET). Newlines are replaced with spaces so
# the final statement is a single line.
QUERY_TEMPLATE = """SELECT codelet_id, (codelet_rank%s) AS score
FROM codelets %s
WHERE %s
GROUP BY codelet_id
ORDER BY score DESC
LIMIT %d OFFSET %d""".replace("\n", " ")

class Tree(object): class Tree(object):
"""Represents a query tree.""" """Represents a query tree."""


@@ -9,6 +16,11 @@ class Tree(object):
def __repr__(self): def __repr__(self):
return "Tree({0})".format(self._root) return "Tree({0})".format(self._root)


@property
def root(self):
    """Read-only access to the node stored as the tree's root."""
    return self._root

def sortkey(self): def sortkey(self):
"""Return a string sort key for the query tree.""" """Return a string sort key for the query tree."""
return self._root.sortkey() return self._root.sortkey()
@@ -20,3 +32,38 @@ class Tree(object):
:rtype: str :rtype: str
""" """
return repr(self) return repr(self)

def build_query(self, page=1, page_size=10):
    """Turn the query tree into a parameterized SQL SELECT statement.

    :param page: The page number to get results for.
    :type page: int
    :param page_size: The number of results per page.
    :type page_size: int

    :return: SQL query data.
    :rtype: 2-tuple of (SQL statement string, query parameter tuple)
    """
    # (table, left column, right column) for each join that may be needed,
    # in the order the joins are emitted.
    join_specs = (
        ("code", "codelet_code_id", "code_id"),
        ("authors", "author_codelet", "codelet_id"),
        ("symbols", "symbol_code", "code_id"),
    )

    tables = set()
    cond, arglist, ranks, need_ranks = self._root.parameterize(tables)
    if not ranks:
        # An empty rank list stands for the condition itself.
        ranks = [cond]

    score = ""
    if need_ranks:
        # Average of the rank expressions, added on top of codelet_rank.
        score = " + ((%s) / %d)" % (" + ".join(ranks), len(ranks))

    # Only join the tables that the nodes asked for.
    joins = " ".join("INNER JOIN %s ON %s = %s" % spec
                     for spec in join_specs if spec[0] in tables)
    offset = (page - 1) * page_size
    query = QUERY_TEMPLATE % (score, joins, cond, page_size, offset)

    if need_ranks:
        # The rank expressions placed in the SELECT list carry their own
        # placeholders, so the parameters are supplied twice.
        arglist = arglist * 2
    return query, tuple(arglist)

Loading…
Cancel
Save