Browse Source

Merge branch 'feature/query_parser' into develop

tags/v1.0^2
Ben Kurtovic 10 years ago
parent
commit
49a858f60e
7 changed files with 495 additions and 12 deletions
  1. +15
    -4
      bitshift/database/__init__.py
  2. +269
    -7
      bitshift/query/__init__.py
  3. +174
    -0
      bitshift/query/nodes.py
  4. +18
    -0
      bitshift/query/tree.py
  5. +2
    -1
      setup.py
  6. +0
    -0
     
  7. +17
    -0
      test/test_query_parser.py

+ 15
- 4
bitshift/database/__init__.py View File

@@ -51,6 +51,20 @@ class Database(object):
"Run `python -m bitshift.database.migration`." "Run `python -m bitshift.database.migration`."
raise RuntimeError(err) raise RuntimeError(err)


def _search_with_query(self, cursor, query):
"""Convert a query tree into SQL SELECTs, execute, and return results.

The returned data is a 2-tuple of (list of codelet IDs, estimated
number of total results).
"""
raise NotImplementedError() ## TODO

results = cursor.fetchall()
ids = NotImplemented ## TODO: extract ids from results
num_results = NotImplemented ## TODO: num if results else 0

return ids, num_results

def _get_codelets_from_ids(self, cursor, ids): def _get_codelets_from_ids(self, cursor, ids):
"""Return a list of Codelet objects given a list of codelet IDs.""" """Return a list of Codelet objects given a list of codelet IDs."""
raise NotImplementedError() ## TODO raise NotImplementedError() ## TODO
@@ -112,10 +126,7 @@ class Database(object):
num_results = results[0][1] * (10 ** results[0][2]) num_results = results[0][1] * (10 ** results[0][2])
ids = [res[0] for res in results] ids = [res[0] for res in results]
else: # Cache miss else: # Cache miss
## TODO: build and execute search query
results = cursor.fetchall()
ids = NotImplemented ## TODO: extract ids from results
num_results = NotImplemented ## TODO: num if results else 0
ids, num_results = self._search_with_query(cursor, query, page)
num_exp = max(len(str(num_results)) - 3, 0) num_exp = max(len(str(num_results)) - 3, 0)
num_results = int(round(num_results, -num_exp)) num_results = int(round(num_results, -num_exp))
num_mnt = num_results / (10 ** num_exp) num_mnt = num_results / (10 ** num_exp)


+ 269
- 7
bitshift/query/__init__.py View File

@@ -1,11 +1,273 @@
from .association import Association
from .node import Node
"""
This subpackage contains code to parse search queries received from the
frontend into trees that can be used by the database backend.
"""

from __future__ import unicode_literals
from re import IGNORECASE, search
from sys import maxsize

from dateutil.parser import parse as parse_date

from .nodes import (String, Regex, Text, Language, Author, Date, Symbol,
BinaryOp, UnaryOp)
from .tree import Tree from .tree import Tree
from ..languages import LANGS


__all__ = ["parse_query"]
__all__ = ["QueryParseException", "parse_query"]


def parse_query(query):
# gets a string, returns a Tree
# TODO: note: resultant Trees should be normalized so that "foo OR bar"
# and "bar OR foo" result in equivalent trees
class QueryParseException(Exception):
    """Error signalling that a query given to parse_query() is invalid."""


class _QueryParser(object):
"""Wrapper class with methods to parse queries. Used as a singleton."""

def __init__(self):
self._prefixes = {
self._parse_language: ["l", "lang", "language"],
self._parse_author: ["a", "author"],
self._parse_modified: ["m", "mod", "modified", "modify"],
self._parse_created: ["cr", "create", "created"],
self._parse_symbol: ["s", "sym", "symb", "symbol"],
self._parse_function: ["f", "fn", "fun", "func", "function"],
self._parse_class: ["cl", "class", "clss"],
self._parse_variable: ["v", "var", "variable"]
}

def _parse_literal(self, literal):
    """Turn a piece of a search query into a String or Regex literal.

    Text starting with ``r:``, ``re:``, ``regex:`` or ``regexp:`` becomes a
    Regex built from everything after the first colon; any other text
    becomes a String holding the text unchanged.
    """
    regex_prefixes = ("r:", "re:", "regex:", "regexp:")
    if not literal.startswith(regex_prefixes):
        return String(literal)
    _, pattern = literal.split(":", 1)
    return Regex(pattern)

def _parse_language(self, term):
    """Parse part of a query into a language node and return it.

    A regex term matches every entry of ``LANGS`` the pattern is found in
    (case-insensitively); the matches are chained together with ``OR``. A
    plain string selects the first language whose name equals it, or
    failing that the first whose name starts with it, ignoring case.

    :raises QueryParseException: if the regex is malformed or if no
        language matches the term.
    """
    from re import compile as compile_regex, error as RegexError

    term = self._parse_literal(term)
    if isinstance(term, Regex):
        # The pattern comes straight from the user, so a malformed one must
        # be reported as a query error rather than escape as re.error.
        try:
            pattern = compile_regex(term.regex, IGNORECASE)
        except RegexError:
            err = 'Invalid regex: "%s"' % term.regex
            raise QueryParseException(err)
        langs = [i for i, lang in enumerate(LANGS) if pattern.search(lang)]
        if not langs:
            err = 'No languages found for regex: "%s"' % term.regex
            raise QueryParseException(err)
        node = Language(langs.pop())
        while langs:
            node = BinaryOp(Language(langs.pop()), BinaryOp.OR, node)
        return node

    needle = term.string.lower()
    # Prefer an exact name match over a prefix match.
    for i, lang in enumerate(LANGS):
        if lang.lower() == needle:
            return Language(i)
    for i, lang in enumerate(LANGS):
        if lang.lower().startswith(needle):
            return Language(i)
    err = 'No languages found for string: "%s"' % term.string
    raise QueryParseException(err)

def _parse_author(self, term):
    """Build an author node from part of a query and return it."""
    name = self._parse_literal(term)
    return Author(name)

def _parse_date(self, term, type_):
    """Parse part of a query into a date node and return it.

    :param term: Text of the form ``<relation>:<date>``, where the relation
        is ``before``/``b`` or ``after``/``a`` (case-insensitive).
    :param type_: Date type passed through to the node (``Date.CREATE`` or
        ``Date.MODIFY``).

    :raises QueryParseException: if the relation is missing or unknown, or
        if the date string is empty or cannot be parsed.
    """
    if ":" not in term:
        err = "A date relationship is required " \
              '("before:<date>" or "after:<date>"): "%s"'
        raise QueryParseException(err % term)
    relstr, dtstr = term.split(":", 1)
    rel = relstr.lower()
    if rel in ("before", "b"):
        relation = Date.BEFORE
    elif rel in ("after", "a"):
        relation = Date.AFTER
    else:
        err = 'Bad date relationship (should be "before" or "after"): "%s"'
        raise QueryParseException(err % relstr)
    # Reject an empty date explicitly instead of relying on how the date
    # parser happens to treat an empty string.
    if not dtstr:
        raise QueryParseException('Bad date/time string: "%s"' % dtstr)
    try:
        dt = parse_date(dtstr)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: the parsed value is too large for the platform.
        raise QueryParseException('Bad date/time string: "%s"' % dtstr)
    return Date(type_, relation, dt)

def _parse_modified(self, term):
    """Build a date-modified node from part of a query and return it."""
    kind = Date.MODIFY
    return self._parse_date(term, kind)

def _parse_created(self, term):
    """Build a date-created node from part of a query and return it."""
    kind = Date.CREATE
    return self._parse_date(term, kind)

def _parse_symbol(self, term):
    """Build a symbol node of type ``ALL`` from part of a query."""
    name = self._parse_literal(term)
    return Symbol(Symbol.ALL, name)

def _parse_function(self, term):
    """Build a symbol node of type ``FUNCTION`` from part of a query."""
    name = self._parse_literal(term)
    return Symbol(Symbol.FUNCTION, name)

def _parse_class(self, term):
    """Build a symbol node of type ``CLASS`` from part of a query."""
    name = self._parse_literal(term)
    return Symbol(Symbol.CLASS, name)

def _parse_variable(self, term):
    """Build a symbol node of type ``VARIABLE`` from part of a query."""
    name = self._parse_literal(term)
    return Symbol(Symbol.VARIABLE, name)

def _parse_term(self, term):
    """Parse a query term into a tree node and return it.

    A term of the form ``<prefix>:<arg>`` whose prefix is registered in
    ``self._prefixes`` is handed to the matching parser. A leading
    ``not:`` negates the result. ``not:<arg>`` with no further prefix is
    treated as a negated text search. Anything else becomes a text node
    for the whole term.

    :raises QueryParseException: if the prefix is followed by an empty
        argument.
    """
    if ":" in term and term[0] != ":":
        prefix, arg = term.split(":", 1)
        invert = prefix.lower() == "not"
        if invert:
            if ":" not in arg:
                # "not:foo" has no second prefix to split off; unpacking
                # the split here would raise ValueError.
                if not arg:
                    err = 'Incomplete query term: "%s"' % term
                    raise QueryParseException(err)
                return UnaryOp(UnaryOp.NOT, Text(self._parse_literal(arg)))
            prefix, arg = arg.split(":", 1)
        if not arg:
            raise QueryParseException('Incomplete query term: "%s"' % term)
        prefix = prefix.lower()
        for meth, prefixes in self._prefixes.items():
            if prefix in prefixes:
                node = meth(arg)
                if invert:
                    return UnaryOp(UnaryOp.NOT, node)
                return node
    # Unknown prefix, or no prefix at all: search the whole term as text.
    return Text(self._parse_literal(term))

def _scan_query(self, query, markers):
"""Scan a query (sub)string for the first occurance of some markers.

Returns a 2-tuple of (first_marker_found, marker_index).
"""
def _is_escaped(query, index):
"""Return whether a query marker is backslash-escaped."""
return (index > 0 and query[index - 1] == "\\" and
(index < 2 or query[index - 2] != "\\"))

best_marker, best_index = None, maxsize
for marker in markers:
index = query.find(marker)
if _is_escaped(query, index):
_, new_index = self._scan_query(query[index + 1:], marker)
index += new_index + 1
if index >= 0 and index < best_index:
best_marker, best_index = marker, index
return best_marker, best_index

def _split_query(self, query, parens=False):
"""Split a query string into a nested list of query terms.

Returns a list of terms and/or nested sublists of terms. Each term and
sublist is guarenteed to be non-empty.
"""
query = query.lstrip()
if not query:
return []
marker, index = self._scan_query(query, " \"'()")
if not marker:
return [query]
nest = [query[:index]] if index > 0 else []
after = query[index + 1:]

if marker == " ":
nest += self._split_query(after, parens)
elif marker in ('"', "'"):
close_marker, close_index = self._scan_query(after, marker)
if close_marker:
if close_index > 0:
nest.append(after[:close_index])
after = after[close_index + 1:]
nest += self._split_query(after, parens)
elif after:
nest.append(after)
elif marker == "(":
inner, after = self._split_query(after, True), []
if inner and isinstance(inner[-1], tuple):
after = self._split_query(inner.pop()[0], parens)
if inner:
nest.append(inner)
if after:
nest += after
elif marker == ")":
if parens:
nest.append((after,))
else:
nest += self._split_query(after)
return nest

def _parse_boolean_operators(self, nest):
    """Replace boolean keywords in a nested query list with operators.

    The list is modified in place. Sublists are processed recursively, and
    the words ``and``, ``or`` and ``not`` (in any letter case) are swapped
    for their operator constants; other terms are left untouched.
    """
    operators = {
        "and": BinaryOp.AND,
        "or": BinaryOp.OR,
        "not": UnaryOp.NOT
    }
    for position, item in enumerate(nest):
        if not isinstance(item, list):
            nest[position] = operators.get(item.lower(), item)
            continue
        self._parse_boolean_operators(item)

def _parse_nest(self, nest):
    """Recursively parse a nested list of search query terms.

    The list is split at the first ``OR`` if there is one, otherwise at
    the first ``AND``, otherwise at the first ``NOT``. Adjacent terms with
    no operator between them are joined with an implicit ``AND``.

    :param nest: A list of terms, sublists and operator constants.

    :return: The root node of the parsed (sub)tree.

    :raises QueryParseException: if ``nest`` (or an operand of one of its
        operators) is empty.
    """
    def parse_binary_op(op):
        """Parse a binary operator in a nested query list."""
        index = nest.index(op)
        left = self._parse_nest(nest[:index])
        right = self._parse_nest(nest[index + 1:])
        return BinaryOp(left, op, right)

    if not nest:
        err = "Error while parsing query: empty nest detected."
        raise QueryParseException(err)
    if BinaryOp.OR in nest:
        return parse_binary_op(BinaryOp.OR)
    if BinaryOp.AND in nest:
        return parse_binary_op(BinaryOp.AND)
    if UnaryOp.NOT in nest:
        index = nest.index(UnaryOp.NOT)
        right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:]))
        # Anything before the NOT (even a single term) is ANDed with it;
        # only a leading NOT stands alone.
        if index > 0:
            left = self._parse_nest(nest[:index])
            return BinaryOp(left, BinaryOp.AND, right)
        return right
    if len(nest) > 1:
        # The first element may be a parenthesised sublist, so it goes
        # through _parse_nest rather than straight to _parse_term.
        left = self._parse_nest(nest[:1])
        right = self._parse_nest(nest[1:])
        return BinaryOp(left, BinaryOp.AND, right)
    if isinstance(nest[0], list):
        return self._parse_nest(nest[0])
    return self._parse_term(nest[0])

def _balance_tree(self, node):
    """Put a tree into a canonical order, in place.

    The children of every binary operator are ordered so that the one with
    the smaller ``repr()`` string is on the left.
    """
    if isinstance(node, BinaryOp):
        for child in (node.left, node.right):
            self._balance_tree(child)
        if repr(node.right) < repr(node.left):
            node.left, node.right = node.right, node.left
        return
    if isinstance(node, UnaryOp):
        self._balance_tree(node.node)

def parse(self, query):
    """
    Parse a search query.

    The resulting tree is normalized with a sorting function, so that
    ``"foo OR bar"`` and ``"bar OR foo"`` produce the same tree. This is
    important for caching purposes.

    :param query: The query to be converted.
    :type query: str

    :return: A tree storing the data in the query.
    :rtype: :py:class:`~.query.tree.Tree`

    :raises: :py:class:`.QueryParseException`
    """
    terms = self._split_query(query.rstrip())
    if terms:
        self._parse_boolean_operators(terms)
        root = self._parse_nest(terms)
        self._balance_tree(root)
        return Tree(root)
    raise QueryParseException('Empty query: "%s"' % query)


parse_query = _QueryParser().parse

+ 174
- 0
bitshift/query/nodes.py View File

@@ -0,0 +1,174 @@
from ..languages import LANGS

__all__ = ["String", "Regex", "Text", "Language", "Author", "Date", "Symbol",
"BinaryOp", "UnaryOp"]

class _Node(object):
"""Represents a single node in a query tree.

Generally speaking, a node is a constraint applied to the database. Thus,
a :py:class:`~.Language` node represents a constraint where only codelets
of a specific language are selected.
"""
pass


class _Literal(object):
"""Represents a literal component of a search query, present at the leaves.

A literal might be a string or a regular expression.
"""
pass


class String(_Literal):
    """A string literal."""

    def __init__(self, string):
        """
        :param string: The text of the literal.
        :type string: unicode
        """
        self.string = string

    def __repr__(self):
        value = self.string
        return "String({0!r})".format(value)


class Regex(_Literal):
    """A regular expression literal."""

    def __init__(self, regex):
        """
        :param regex: The pattern text of the literal.
        :type regex: unicode
        """
        self.regex = regex

    def __repr__(self):
        pattern = self.regex
        return "Regex({0!r})".format(pattern)


class Text(_Node):
    """A text node.

    Searches in codelet names (full-text search), symbols (equality), and
    source code (full-text search).
    """

    def __init__(self, text):
        """
        :param text: The literal to search for.
        :type text: :py:class:`._Literal`
        """
        self.text = text

    def __repr__(self):
        literal = self.text
        return "Text({0})".format(literal)


class Language(_Node):
    """A language node.

    Searches in the code_lang field.
    """

    def __init__(self, lang):
        """
        :param lang: Index of the language in ``LANGS``.
        :type lang: int
        """
        self.lang = lang

    def __repr__(self):
        # Show the language's name rather than its numeric index.
        name = LANGS[self.lang]
        return "Language({0})".format(name)


class Author(_Node):
    """An author node.

    Searches in the author_name field (full-text search).
    """

    def __init__(self, name):
        """
        :param name: The author name to search for.
        """
        self.name = name

    def __repr__(self):
        author = self.name
        return "Author({0})".format(author)


class Date(_Node):
    """A date node.

    Searches in the codelet_date_created or codelet_date_modified fields.
    """
    # Which date is being constrained.
    CREATE = 1
    MODIFY = 2

    # How the date relates to the given point in time.
    BEFORE = 1
    AFTER = 2

    def __init__(self, type_, relation, date):
        """
        :param type_: Which date to constrain.
        :type type_: int (``CREATE`` or ``MODIFY``)
        :param relation: Side of ``date`` that matches.
        :type relation: int (``BEFORE``, ``AFTER``)
        :param date: The point in time to compare against.
        :type date: datetime.datetime
        """
        self.type = type_
        self.relation = relation
        self.date = date

    def __repr__(self):
        type_names = {self.CREATE: "CREATE", self.MODIFY: "MODIFY"}
        relation_names = {self.BEFORE: "BEFORE", self.AFTER: "AFTER"}
        fields = (type_names[self.type], relation_names[self.relation],
                  self.date)
        return "Date({0}, {1}, {2})".format(*fields)


class Symbol(_Node):
    """A symbol node.

    Searches in symbol_type and symbol_name.
    """
    # Kinds of symbol that a node can be restricted to.
    ALL = 0
    FUNCTION = 1
    CLASS = 2
    VARIABLE = 3

    def __init__(self, type_, name):
        """
        :param type_: Kind of symbol to search.
        :type type_: int (``ALL``, ``FUNCTION``, ``CLASS``, etc.)
        :param name: The symbol name to search for.
        :type name: :py:class:`._Literal`
        """
        self.type = type_
        self.name = name

    def __repr__(self):
        type_names = {
            self.ALL: "ALL",
            self.FUNCTION: "FUNCTION",
            self.CLASS: "CLASS",
            self.VARIABLE: "VARIABLE",
        }
        kind = type_names[self.type]
        return "Symbol({0}, {1})".format(kind, self.name)


class BinaryOp(_Node):
    """A relationship between two nodes: ``and``, ``or``."""
    # Unique sentinel objects identifying each operator.
    AND = object()
    OR = object()

    def __init__(self, left, op, right):
        """
        :param left: Left operand node.
        :param op: The operator (``AND`` or ``OR``).
        :param right: Right operand node.
        """
        self.left = left
        self.op = op
        self.right = right

    def __repr__(self):
        op_names = {self.AND: "AND", self.OR: "OR"}
        fields = (self.left, op_names[self.op], self.right)
        return "BinaryOp({0}, {1}, {2})".format(*fields)


class UnaryOp(_Node):
    """A transformation applied to one node: ``not``."""
    # Unique sentinel object identifying the operator.
    NOT = object()

    def __init__(self, op, node):
        """
        :param op: The operator (``NOT``).
        :param node: The node the operator applies to.
        """
        self.op = op
        self.node = node

    def __repr__(self):
        op_names = {self.NOT: "NOT"}
        op_name = op_names[self.op]
        return "UnaryOp({0}, {1})".format(op_name, self.node)

+ 18
- 0
bitshift/query/tree.py View File

@@ -0,0 +1,18 @@
__all__ = ["Tree"]

class Tree(object):
    """A query tree, wrapped around its root node."""

    def __init__(self, root):
        """
        :param root: The root node of the tree.
        """
        self._root = root

    def __repr__(self):
        template = "Tree({0})"
        return template.format(self._root)

    def serialize(self):
        """Produce a string form of the query, for use in caching.

        :return: Query string representation.
        :rtype: str
        """
        serialized = repr(self)
        return serialized

+ 2
- 1
setup.py View File

@@ -6,7 +6,8 @@ setup(
packages = find_packages(), packages = find_packages(),
install_requires = [ install_requires = [
"Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0",
"beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
"beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3",
"python-dateutil>=2.2"],
author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
license = "MIT", license = "MIT",
url = "https://github.com/earwig/bitshift" url = "https://github.com/earwig/bitshift"


+ 0
- 0
View File


+ 17
- 0
test/test_query_parser.py View File

@@ -0,0 +1,17 @@
from __future__ import unicode_literals
import unittest

from bitshift.query import parse_query

class TestQueryParser(unittest.TestCase):
    """Unit tests for the query parser in :py:mod:`bitshift.query`."""

    def test_parse(self):
        """test basic query parsing"""
        pq = lambda s: parse_query(s).serialize()
        # The serialized tree embeds repr() of the literal's text. With
        # unicode_literals on Python 2 that repr carries a ``u`` prefix, so
        # the expected strings are built with repr() instead of hard-coding
        # the quotes.
        literal = repr("test")
        self.assertEqual("Tree(Text(String(%s)))" % literal, pq("test"))
        self.assertEqual("Tree(Text(Regex(%s)))" % literal, pq("re:test"))

if __name__ == "__main__":
unittest.main(verbosity=2)

Loading…
Cancel
Save