Browse Source

Merge branch 'feature/db_search' into develop

tags/v1.0^2
Ben Kurtovic 10 years ago
parent
commit
54d4d88d5c
5 changed files with 214 additions and 29 deletions
  1. +11
    -3
      bitshift/codelet.py
  2. +63
    -12
      bitshift/database/__init__.py
  3. +12
    -2
      bitshift/query/__init__.py
  4. +81
    -12
      bitshift/query/nodes.py
  5. +47
    -0
      bitshift/query/tree.py

+ 11
- 3
bitshift/codelet.py View File

@@ -18,12 +18,14 @@ class Codelet(object):
code was last modified.
:ivar rank: (float) A quantification of the source code's quality, as
per available ratings (stars, forks, upvotes, etc.).
:ivar symbols: (dict) Dictionary containing dictionaries of functions, classes,
variable definitions, etc.
:ivar symbols: (dict) Dictionary containing dictionaries of functions,
classes, variable definitions, etc.
:ivar origin: (tuple) 3-tuple of (site_name, site_url, image_blob), as
added by the database.
"""

def __init__(self, name, code, filename, language, authors, code_url,
date_created, date_modified, rank):
date_created, date_modified, rank, symbols=None, origin=None):
"""
Create a Codelet instance.

@@ -36,6 +38,8 @@ class Codelet(object):
:param date_created: see :attr:`self.date_created`
:param date_modified: see :attr:`self.date_modified`
:param rank: see :attr:`self.rank`
:param symbols: see :attr:`self.symbols`
:param origin: see :attr:`self.origin`

:type name: see :attr:`self.name`
:type code: see :attr:`self.code`
@@ -46,6 +50,8 @@ class Codelet(object):
:type date_created: see :attr:`self.date_created`
:type date_modified: see :attr:`self.date_modified`
:type rank: see :attr:`self.rank`
:type symbols: see :attr:`self.symbols`
:type origin: see :attr:`self.origin`
"""

self.name = name
@@ -57,3 +63,5 @@ class Codelet(object):
self.date_created = date_created
self.date_modified = date_modified
self.rank = rank
self.symbols = symbols or {}
self.origin = origin or (None, None, None)

+ 63
- 12
bitshift/database/__init__.py View File

@@ -9,6 +9,9 @@ import mmh3
import oursql

from .migration import VERSION, MIGRATIONS
from ..codelet import Codelet
from ..query.nodes import (String, Regex, Text, Language, Author, Date, Symbol,
BinaryOp, UnaryOp)

__all__ = ["Database"]

@@ -51,23 +54,71 @@ class Database(object):
"Run `python -m bitshift.database.migration`."
raise RuntimeError(err)

def _search_with_query(self, cursor, tree, page):
    """Execute an SQL query based on a query tree, and return results.

    :param cursor: Database cursor used to run the generated statement.
    :param tree: Query tree; its ``build_query(page)`` supplies the SQL
        statement and the matching parameters.
    :param page: Page number, forwarded unchanged to ``tree.build_query``.

    The returned data is a 2-tuple of (list of codelet IDs, estimated
    number of total results).
    """
    query, args = tree.build_query(page)
    cursor.execute(query, args)
    # Rows are unpacked as 2-tuples; only the first field (the codelet ID)
    # is kept.
    ids = [codelet_id for codelet_id, _ in cursor.fetchall()]
    num_results = 0  # TODO: NotImplemented
    return ids, num_results

def _get_authors_for_codelet(self, cursor, codelet_id):
    """Look up the authors recorded for one codelet.

    Runs a SELECT of ``author_name`` and ``author_url`` on the ``authors``
    table, filtered by ``author_codelet``, and returns whatever
    ``cursor.fetchall()`` yields for it.
    """
    params = (codelet_id,)
    sql = """SELECT author_name, author_url
             FROM authors
             WHERE author_codelet = ?"""

    cursor.execute(sql, params)
    rows = cursor.fetchall()
    return rows

def _get_symbols_for_code(self, cursor, code_id):
    """Return a dict of symbols for a given code entry.

    The result has one key per name in ``Symbol.TYPES_INV``; each value is
    a list of ``(symbol_name, declarations, uses)`` tuples, where
    ``declarations`` and ``uses`` are lists of
    ``(row, col, end_row, end_col)`` locations.
    """
    query = """SELECT symbol_type, symbol_name, sloc_type, sloc_row,
                      sloc_col, sloc_end_row, sloc_end_col
               FROM symbols
               INNER JOIN symbol_locations ON sloc_symbol = symbol_id
               WHERE symbol_code = ?"""

    symbols = {type_: {} for type_ in Symbol.TYPES_INV}
    cursor.execute(query, (code_id,))
    for type_, name, loc_type, row, col, erow, ecol in cursor.fetchall():
        sdict = symbols[Symbol.TYPES_INV[type_]]
        if name not in sdict:
            # Pair of *lists* (tuples cannot be appended to); sloc_type
            # selects the slot: 0 -> declarations, 1 -> uses, matching
            # the values written by _insert_symbols.
            sdict[name] = ([], [])
        sdict[name][loc_type].append((row, col, erow, ecol))
    # Build a fresh mapping instead of rewriting the dict being iterated.
    return {type_: [(n, d, u) for n, (d, u) in sdict.items()]
            for type_, sdict in symbols.items()}

def _get_codelets_from_ids(self, cursor, ids):
    """Yield a Codelet object for each of the given codelet IDs.

    This is a generator; callers that need a list must wrap it in
    ``list()``. IDs for which the query returns no row are skipped.

    :param cursor: Cursor used for the per-codelet author and symbol
        lookups.
    :param ids: Iterable of codelet IDs.
    """
    query = """SELECT *
               FROM codelets
               INNER JOIN code ON codelet_code_id = code_id
               INNER JOIN origins ON codelet_origin = origin_id
               WHERE codelet_id = ?"""

    with self._conn.cursor(oursql.DictCursor) as dict_cursor:
        # executemany() must not be used for statements that produce
        # result sets, so run the SELECT once per ID.
        for codelet_id in ids:
            dict_cursor.execute(query, (codelet_id,))
            row = dict_cursor.fetchone()
            if row is None:
                continue
            # The stored URL is a suffix only when the origin has a base.
            if row["origin_url_base"]:
                url = row["origin_url_base"] + row["codelet_url"]
            else:
                url = row["codelet_url"]
            origin = (row["origin_name"], row["origin_url"],
                      row["origin_image"])
            authors = self._get_authors_for_codelet(cursor, codelet_id)
            symbols = self._get_symbols_for_code(cursor, row["code_id"])
            yield Codelet(
                row["codelet_name"], row["code_code"], None,
                row["code_lang"], authors, url,
                row["codelet_date_created"], row["codelet_date_modified"],
                row["codelet_rank"], symbols, origin)

def _decompose_url(self, cursor, url):
"""Break up a URL into an origin (with a URL base) and a suffix."""
@@ -82,13 +133,12 @@ class Database(object):

def _insert_symbols(self, cursor, code_id, sym_type, symbols):
"""Insert a list of symbols of a given type into the database."""
sym_types = ["functions", "classes", "variables"]
query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)"
query2 = """INSERT INTO symbol_locations VALUES
(DEFAULT, ?, ?, ?, ?, ?, ?)"""

for (name, decls, uses) in symbols:
cursor.execute(query1, (code_id, sym_types.index(sym_type), name))
cursor.execute(query1, (code_id, Symbol.TYPES_INV[sym_type], name))
sym_id = cursor.lastrowid
params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] +
[tuple([sym_id, 1] + list(loc)) for loc in uses])
@@ -132,7 +182,8 @@ class Database(object):
num_mnt = num_results / (10 ** num_exp)
cursor.execute(query2, (cache_id, num_mnt, num_exp))
cursor.executemany(query3, [(cache_id, c_id) for c_id in ids])
return (num_results, self._get_codelets_from_ids(cursor, ids))
codelet_gen = self._get_codelets_from_ids(cursor, ids)
return (num_results, list(codelet_gen))

def insert(self, codelet):
"""


+ 12
- 2
bitshift/query/__init__.py View File

@@ -116,6 +116,10 @@ class _QueryParser(object):

def _parse_term(self, term):
"""Parse a query term into a tree node and return it."""
try:
term = term.decode("unicode_escape")
except UnicodeDecodeError:
raise QueryParseException('Invalid query term: "%s"' % term)
if ":" in term and not term[0] == ":":
prefix, arg = term.split(":", 1)
invert = prefix.lower() == "not"
@@ -135,7 +139,7 @@ class _QueryParser(object):

Returns a 2-tuple of (first_marker_found, marker_index).
"""
def _is_escaped(query, index):
def is_escaped(query, index):
"""Return whether a query marker is backslash-escaped."""
return (index > 0 and query[index - 1] == "\\" and
(index < 2 or query[index - 2] != "\\"))
@@ -143,7 +147,7 @@ class _QueryParser(object):
best_marker, best_index = None, maxsize
for marker in markers:
index = query.find(marker)
if _is_escaped(query, index):
if is_escaped(query, index):
_, new_index = self._scan_query(query[index + 1:], marker)
index += new_index + 1
if index >= 0 and index < best_index:
@@ -209,6 +213,9 @@ class _QueryParser(object):
def parse_binary_op(op):
"""Parse a binary operator in a nested query list."""
index = nest.index(op)
if index == 0 or index == len(nest) - 1:
err = "Invalid query: '%s' given without argument."
raise QueryParseException(err % BinaryOp.OPS[op])
left = self._parse_nest(nest[:index])
right = self._parse_nest(nest[index + 1:])
return BinaryOp(left, op, right)
@@ -222,6 +229,9 @@ class _QueryParser(object):
return parse_binary_op(BinaryOp.AND)
elif UnaryOp.NOT in nest:
index = nest.index(UnaryOp.NOT)
if index == len(nest) - 1:
err = "Invalid query: '%s' given without argument."
raise QueryParseException(err % UnaryOp.OPS[UnaryOp.NOT])
right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:]))
if index > 0:
left = self._parse_nest(nest[:index])


+ 81
- 12
bitshift/query/nodes.py View File

@@ -15,6 +15,15 @@ class _Node(object):
"""Return a string sort key for the node."""
return ""

def parameterize(self, tables):
    """Produce the SQL pieces for this node (base implementation).

    The result is a 4-tuple: the conditional string, the list of query
    parameters, the list of rank expressions, and a boolean saying
    whether ranking is wanted. An empty rank list is taken to mean "use
    the conditional string itself". The base node contributes nothing.
    """
    conditional, parameters, ranks = "", [], []
    should_rank = False
    return conditional, parameters, ranks, should_rank


class _Literal(object):
"""Represents a literal component of a search query, present at the leaves.
@@ -75,6 +84,20 @@ class Text(_Node):
def sortkey(self):
return self.text.sortkey()

def parameterize(self, tables):
    """Build the condition for a free-text search.

    Adds ``code`` and ``symbols`` to *tables*. The text is compared
    against the codelet name, the code body and symbol names: with
    ``REGEXP`` for a regex literal, otherwise with boolean-mode
    ``MATCH ... AGAINST`` (name, code) and equality (symbol name). The
    three comparisons are OR-ed together and also returned individually
    as the rank list; ranking is always requested.
    """
    needed = {"code", "symbols"}
    tables |= needed
    literal = self.text
    if isinstance(literal, Regex):
        ranks = ["(codelet_name REGEXP ?)", "(symbol_name REGEXP ?)",
                 "(code_code REGEXP ?)"]
        value = literal.regex
    else:
        ranks = ["(MATCH(codelet_name) AGAINST (? IN BOOLEAN MODE))",
                 "(MATCH(code_code) AGAINST (? IN BOOLEAN MODE))",
                 "(symbol_name = ?)"]
        value = literal.string
    # One parameter per placeholder in the OR-ed condition.
    params = [value, value, value]
    return "(" + " OR ".join(ranks) + ")", params, ranks, True


class Language(_Node):
"""Represents a language node.
@@ -94,6 +117,10 @@ class Language(_Node):
def sortkey(self):
return LANGS[self.lang]

def parameterize(self, tables):
    """Build the condition matching this node's language.

    Adds ``code`` to *tables* and compares ``code_lang`` with
    ``self.lang``. No rank expressions are supplied and ranking is not
    requested.
    """
    needed = {"code"}
    tables |= needed
    condition = "(code_lang = ?)"
    params = [self.lang]
    return condition, params, [], False


class Author(_Node):
"""Represents a author node.
@@ -113,6 +140,13 @@ class Author(_Node):
def sortkey(self):
return self.name.sortkey()

def parameterize(self, tables):
    """Build the condition matching this node's author name.

    Adds ``authors`` to *tables*. A regex literal is matched with
    ``REGEXP`` (ranking not requested); any other literal is matched
    with a boolean-mode ``MATCH ... AGAINST`` (ranking requested).
    """
    needed = {"authors"}
    tables |= needed
    literal = self.name
    if not isinstance(literal, Regex):
        condition = "(MATCH(author_name) AGAINST (? IN BOOLEAN MODE))"
        return condition, [literal.string], [], True
    return "(author_name REGEXP ?)", [literal.regex], [], False


class Date(_Node):
"""Represents a date node.
@@ -144,38 +178,59 @@ class Date(_Node):
def sortkey(self):
return self.date.strftime("%Y%m%d%H%M%S")

def parameterize(self, tables):
    """Build the condition comparing a codelet date with ``self.date``.

    ``self.type`` picks the column (created or modified) and
    ``self.relation`` picks the operator (``<=`` for BEFORE, ``>=`` for
    AFTER). *tables* is not modified. A ``KeyError`` is raised for an
    unknown type or relation.
    """
    columns = {
        self.CREATE: "codelet_date_created",
        self.MODIFY: "codelet_date_modified",
    }
    operators = {self.BEFORE: "<=", self.AFTER: ">="}
    column = columns[self.type]
    operator = operators[self.relation]
    condition = "(" + " ".join((column, operator, "?")) + ")"
    return condition, [self.date], [], False


class Symbol(_Node):
    """Represents a symbol node.

    Searches in symbol_type and symbol_name.
    """
    # Numeric symbol types; ALL is a wildcard and has no entry in TYPES.
    ALL = -1
    FUNCTION = 0
    CLASS = 1
    VARIABLE = 2
    TYPES = {FUNCTION: "FUNCTION", CLASS: "CLASS", VARIABLE: "VARIABLE"}
    # Plural names; position i corresponds to numeric type i.
    TYPES_INV = ["functions", "classes", "variables"]

    def __init__(self, type_, name):
        """
        :type type_: int (``ALL``, ``FUNCTION``, ``CLASS``, etc.)
        :type name: :py:class:`._Literal`
        """
        self.type = type_
        self.name = name

    def __repr__(self):
        # Any type without an entry in TYPES (i.e. ALL) is shown as "ALL".
        type_ = self.TYPES.get(self.type, "ALL")
        return "Symbol({0}, {1})".format(type_, self.name)

    def sortkey(self):
        """Return the sort key of the symbol's name literal."""
        return self.name.sortkey()

    def parameterize(self, tables):
        """Build the condition matching symbols by name and type.

        Adds ``code`` and ``symbols`` to *tables*. The name is matched
        with ``REGEXP`` for a regex literal and with equality otherwise.
        For ``ALL`` the type is restricted to every known type; for any
        other value it must equal that type. No rank expressions are
        supplied and ranking is not requested.
        """
        tables |= {"code", "symbols"}
        if isinstance(self.name, Regex):
            cond, name = "symbol_name REGEXP ?", self.name.regex
        else:
            cond, name = "symbol_name = ?", self.name.string
        if self.type == self.ALL:
            # Sorted so the generated SQL does not depend on dict order.
            types = ", ".join(str(type_) for type_ in sorted(self.TYPES))
            cond += " AND symbol_type IN (%s)" % types
        else:
            cond += " AND symbol_type = %d" % self.type
        return "(" + cond + ")", [name], [], False


class BinaryOp(_Node):
"""Represents a relationship between two nodes: ``and``, ``or``."""
AND = object()
OR = object()
OPS = {AND: "AND", OR: "OR"}

def __init__(self, left, op, right):
self.left = left
@@ -183,25 +238,39 @@ class BinaryOp(_Node):
self.right = right

def __repr__(self):
    """Return ``BinaryOp(left, OP, right)`` using the class-level OPS names."""
    tmpl = "BinaryOp({0}, {1}, {2})"
    return tmpl.format(self.left, self.OPS[self.op], self.right)

def sortkey(self):
    """Return the left operand's sort key followed by the right's."""
    left_key = self.left.sortkey()
    right_key = self.right.sortkey()
    return left_key + right_key

def parameterize(self, tables):
    """Combine the parameterizations of both operands.

    Each operand is parameterized against the same *tables* set (left
    first). The conditions are joined with the operator's SQL keyword,
    the parameter and rank lists are concatenated left-then-right, and
    an operand with no rank list contributes its own condition as its
    rank. Ranking is requested if either operand requests it or the
    operator is OR.
    """
    left_cond, left_args, left_ranks, left_needs = \
        self.left.parameterize(tables)
    right_cond, right_args, right_ranks, right_needs = \
        self.right.parameterize(tables)
    if not left_ranks:
        left_ranks = [left_cond]
    if not right_ranks:
        right_ranks = [right_cond]
    keyword = self.OPS[self.op]
    combined = "(" + " ".join((left_cond, keyword, right_cond)) + ")"
    need_ranks = left_needs or right_needs or self.op == self.OR
    return (combined, left_args + right_args, left_ranks + right_ranks,
            need_ranks)


class UnaryOp(_Node):
    """Represents a transformation applied to one node: ``not``."""
    NOT = object()
    # Operator marker -> SQL keyword / display name.
    OPS = {NOT: "NOT"}

    def __init__(self, op, node):
        self.op = op
        self.node = node

    def __repr__(self):
        return "UnaryOp({0}, {1})".format(self.OPS[self.op], self.node)

    def sortkey(self):
        """Return the sort key of the wrapped node."""
        return self.node.sortkey()

    def parameterize(self, tables):
        """Apply the operator to the wrapped node's condition.

        Parameters and the should-we-rank flag are passed through from
        the wrapped node. If the wrapped node supplied no rank list, its
        own (un-negated) condition is used as the single rank expression.
        """
        cond, args, ranks, need_ranks = self.node.parameterize(tables)
        new_cond = "(" + self.OPS[self.op] + " " + cond + ")"
        ranks = ranks or [cond]
        return new_cond, args, ranks, need_ranks

+ 47
- 0
bitshift/query/tree.py View File

@@ -1,5 +1,12 @@
__all__ = ["Tree"]

# Single-line SELECT skeleton; the placeholders are, in order: extra score
# expression, table joins, WHERE condition, page size, row offset.
QUERY_TEMPLATE = " ".join((
    "SELECT codelet_id, (codelet_rank%s) AS score",
    "FROM codelets %s",
    "WHERE %s",
    "GROUP BY codelet_id",
    "ORDER BY score DESC",
    "LIMIT %d OFFSET %d",
))

class Tree(object):
"""Represents a query tree."""

@@ -9,6 +16,11 @@ class Tree(object):
def __repr__(self):
    """Return ``Tree(<root>)``, formatting the root node into the text."""
    template = "Tree({0})"
    return template.format(self._root)

@property
def root(self):
    """Read-only access to the tree's root node (``self._root``)."""
    node = self._root
    return node

def sortkey(self):
    """Return the sort key of the tree, as reported by its root node."""
    root_node = self._root
    return root_node.sortkey()
@@ -20,3 +32,38 @@ class Tree(object):
:rtype: str
"""
return repr(self)

def build_query(self, page=1, page_size=10):
    """Convert the query tree into a parameterized SQL SELECT statement.

    :param page: The page number to get results for. Values below 1 are
        treated as the first page.
    :type page: int
    :param page_size: The number of results per page.
    :type page_size: int

    :return: SQL query data.
    :rtype: 2-tuple of (SQL statement string, query parameter tuple)
    """
    def get_table_joins(tables):
        """Yield an INNER JOIN clause for each requested table."""
        data = [
            ("code", "codelet_code_id", "code_id"),
            ("authors", "author_codelet", "codelet_id"),
            ("symbols", "symbol_code", "code_id"),
        ]
        tmpl = "INNER JOIN %s ON %s = %s"
        for args in data:
            if args[0] in tables:
                yield tmpl % args

    tables = set()
    cond, arglist, ranks, need_ranks = self._root.parameterize(tables)
    if "symbols" in tables:
        # The symbols join is keyed on code_id, so the code table has to
        # be joined as well.
        tables.add("code")
    ranks = ranks or [cond]
    if need_ranks:
        # Average of the rank expressions, added on top of codelet_rank.
        score = " + ((%s) / %d)" % (" + ".join(ranks), len(ranks))
    else:
        score = ""
    joins = " ".join(get_table_joins(tables))
    # Clamp so that page < 1 cannot produce a negative OFFSET.
    offset = max(page - 1, 0) * page_size

    query = QUERY_TEMPLATE % (score, joins, cond, page_size, offset)
    # With ranking, the rank placeholders appear once in the score and
    # again in the WHERE clause, so the parameters are supplied twice.
    return query, tuple(arglist * 2 if need_ranks else arglist)

Loading…
Cancel
Save