From 34e629b3cd5ad5d09da4f22b9ef2f48baed26923 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 14 Apr 2014 10:01:34 -0400 Subject: [PATCH 01/21] Some early work on varous query objects. --- bitshift/query/__init__.py | 18 +++++++++++++++++- bitshift/query/associations.py | 29 +++++++++++++++++++++++++++++ bitshift/query/node.py | 4 ++++ bitshift/query/tree.py | 4 ++++ 4 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 bitshift/query/associations.py create mode 100644 bitshift/query/node.py create mode 100644 bitshift/query/tree.py diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 7d6e0d5..bc70cde 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -1,9 +1,25 @@ -from .association import Association +from .associations import BinaryOp, UnaryOp from .node import Node from .tree import Tree __all__ = ["parse_query"] def parse_query(query): + """ + Parse a search query. + + :param query: The query be converted. + :type query: str + + :return: A tree storing the data in the query. 
+ :rtype: :py:class:`~.query.tree.Tree` + """ + + + + + "bubble sort lang:python" + + # gets a string, returns a Tree pass diff --git a/bitshift/query/associations.py b/bitshift/query/associations.py new file mode 100644 index 0000000..17e7a1e --- /dev/null +++ b/bitshift/query/associations.py @@ -0,0 +1,29 @@ +__all__ = ["BinaryOp", "UnaryOp"] + +class _Association(object): + pass + + +class BinaryOp(_Association): + AND = 1 + OR = 2 + + def __init__(self, left, right, op): + self.left = left + self.right = right + self.op = op + + def __str__(self): + ops = {AND: "And", OR: "Or"} + return "{0}({1}, {2})".format(ops[self.op], self.left, self.right) + + +class UnaryOp(_Association): + NOT = 1 + + def __init__(self, node, op): + self.node = node + self.op = op + + def __str__(self): + pass diff --git a/bitshift/query/node.py b/bitshift/query/node.py new file mode 100644 index 0000000..3317dac --- /dev/null +++ b/bitshift/query/node.py @@ -0,0 +1,4 @@ +__all__ = ["Node"] + +class Node(object): + pass diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py new file mode 100644 index 0000000..90702dc --- /dev/null +++ b/bitshift/query/tree.py @@ -0,0 +1,4 @@ +__all__ = ["Tree"] + +class Tree(object): + pass From 674f227b2229b817fc13eeff9bf3729fface07af Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 6 May 2014 13:13:03 -0400 Subject: [PATCH 02/21] Work more on query tree structure. 
--- bitshift/query/__init__.py | 21 +++++------ bitshift/query/associations.py | 29 --------------- bitshift/query/node.py | 4 --- bitshift/query/nodes.py | 81 ++++++++++++++++++++++++++++++++++++++++++ bitshift/query/tree.py | 11 +++++- 5 files changed, 100 insertions(+), 46 deletions(-) delete mode 100644 bitshift/query/associations.py delete mode 100644 bitshift/query/node.py create mode 100644 bitshift/query/nodes.py diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 5498b62..d183bde 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -1,5 +1,4 @@ -from .associations import BinaryOp, UnaryOp -from .node import Node +from .nodes import * ## TODO from .tree import Tree __all__ = ["parse_query"] @@ -8,20 +7,18 @@ def parse_query(query): """ Parse a search query. + The result is normalized with a sorting function so that ``"foo OR bar"`` + and ``"bar OR foo"`` result in the same tree. This is important for caching + purposes. + :param query: The query be converted. :type query: str :return: A tree storing the data in the query. 
:rtype: :py:class:`~.query.tree.Tree` """ - - - - - "bubble sort lang:python" - - - # gets a string, returns a Tree - # TODO: note: resultant Trees should be normalized so that "foo OR bar" - # and "bar OR foo" result in equivalent trees pass + + # "foo" -> Tree() + # "foo bar" -> "foo bar" OR ("foo" or "bar") + # "foo bar baz" -> ""foo bar baz" OR ("foo" OR "bar baz") OR ("foo" OR "bar baz") OR ('foo' OR 'bar' OR 'baz')" diff --git a/bitshift/query/associations.py b/bitshift/query/associations.py deleted file mode 100644 index 17e7a1e..0000000 --- a/bitshift/query/associations.py +++ /dev/null @@ -1,29 +0,0 @@ -__all__ = ["BinaryOp", "UnaryOp"] - -class _Association(object): - pass - - -class BinaryOp(_Association): - AND = 1 - OR = 2 - - def __init__(self, left, right, op): - self.left = left - self.right = right - self.op = op - - def __str__(self): - ops = {AND: "And", OR: "Or"} - return "{0}({1}, {2})".format(ops[self.op], self.left, self.right) - - -class UnaryOp(_Association): - NOT = 1 - - def __init__(self, node, op): - self.node = node - self.op = op - - def __str__(self): - pass diff --git a/bitshift/query/node.py b/bitshift/query/node.py deleted file mode 100644 index 3317dac..0000000 --- a/bitshift/query/node.py +++ /dev/null @@ -1,4 +0,0 @@ -__all__ = ["Node"] - -class Node(object): - pass diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py new file mode 100644 index 0000000..c1f2dfe --- /dev/null +++ b/bitshift/query/nodes.py @@ -0,0 +1,81 @@ +__all__ = ["Node", "Text", "BinaryOp", "UnaryOp"] + +class _Literal(object): + """Represents a literal component of a search query, present at the leaves. + + A literal might be a string or a regular expression. 
+ """ + pass + + +class _String(_Literal) + """Represents a string literal.""" + + def __init__(self, string): + self.string = string + + def __repr__(self): + return "String({0!r})".format(self.string) + + +class _Regex(_Literal): + """Represents a regular expression literal.""" + + def __init__(self, regex): + self.regex = regex + + def __repr__(self): + return "Regex({0!r})".format(self.regex) + + +class Node(object): + """Represents a single node in a query tree.""" + pass + + +class Text(Node): + """Represents a text node. + + Searches in codelet names (full-text search), symbols (equality), and + source code (full-text search). + """ + + def __init__(self, text): + self.text = text + + def __repr__(self): + return "Text({0})".format(self.text) + + +# Language -> code_lang (direct) +# DateRange -> codelet_date_created (cmp), codelet_date_modified (cmp) +# Author -> author_name (FTS) +# Symbol -> func, class, var -> symbol_type, symbol_name (direct) + + +class BinaryOp(Node): + """Represents a relationship between two nodes: ``and``, ``or``.""" + AND = 1 + OR = 2 + + def __init__(self, left, right, op): + self.left = left + self.right = right + self.op = op + + def __repr__(self): + ops = {self.AND: "And", self.OR: "Or"} + return "{0}({1}, {2})".format(ops[self.op], self.left, self.right) + + +class UnaryOp(Node): + """Represents a transformation applied to one node: ``not``.""" + NOT = 1 + + def __init__(self, node, op): + self.node = node + self.op = op + + def __repr__(self): + ops = {self.NOT: "Not"} + return "{0}({1})".format(ops[self.op], self.node) diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index 90702dc..fe65744 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -1,4 +1,13 @@ __all__ = ["Tree"] class Tree(object): - pass + """Represents a query tree.""" + + def __init__(self, root): + self._root = root + + def serialize(self): + pass + + def build_query(self): + pass From cf2b48e2177b673e004031f895d6edbb6c6cc9de Mon 
Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 7 May 2014 12:57:37 -0400 Subject: [PATCH 03/21] More work on query tree structure. --- bitshift/database/__init__.py | 19 ++++-- bitshift/query/nodes.py | 139 +++++++++++++++++++++++++++++++++++------- bitshift/query/tree.py | 3 - 3 files changed, 131 insertions(+), 30 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 75f39da..07f46c7 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -51,6 +51,20 @@ class Database(object): "Run `python -m bitshift.database.migration`." raise RuntimeError(err) + def _search_with_query(self, cursor, query): + """Convert a query tree into SQL SELECTs, execute, and return results. + + The returned data is a 2-tuple of (list of codelet IDs, estimated + number of total results). + """ + raise NotImplementedError() ## TODO + + results = cursor.fetchall() + ids = NotImplemented ## TODO: extract ids from results + num_results = NotImplemented ## TODO: num if results else 0 + + return ids, num_results + def _get_codelets_from_ids(self, cursor, ids): """Return a list of Codelet objects given a list of codelet IDs.""" raise NotImplementedError() ## TODO @@ -112,10 +126,7 @@ class Database(object): num_results = results[0][1] * (10 ** results[0][2]) ids = [res[0] for res in results] else: # Cache miss - ## TODO: build and execute search query - results = cursor.fetchall() - ids = NotImplemented ## TODO: extract ids from results - num_results = NotImplemented ## TODO: num if results else 0 + ids, num_results = self._search_with_query(cursor, query, page) num_exp = max(len(str(num_results)) - 3, 0) num_results = int(round(num_results, -num_exp)) num_mnt = num_results / (10 ** num_exp) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index c1f2dfe..8dc7fe9 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -1,4 +1,17 @@ -__all__ = ["Node", "Text", "BinaryOp", "UnaryOp"] +# from ..languages 
import LANGS + +__all__ = ["String", "Regex", "Text", "Language", "Date", "Author", "Symbol", + "BinaryOp", "UnaryOp"] + +class _Node(object): + """Represents a single node in a query tree. + + Generally speaking, a node is a constraint applied to the database. Thus, + a :py:class:`~.Language` node represents a constraint where only codelets + of a specific language are selected. + """ + pass + class _Literal(object): """Represents a literal component of a search query, present at the leaves. @@ -8,32 +21,33 @@ class _Literal(object): pass -class _String(_Literal) +class String(_Literal): """Represents a string literal.""" def __init__(self, string): + """ + :type string: unicode + """ self.string = string def __repr__(self): return "String({0!r})".format(self.string) -class _Regex(_Literal): +class Regex(_Literal): """Represents a regular expression literal.""" def __init__(self, regex): + """ + :type string: unicode + """ self.regex = regex def __repr__(self): return "Regex({0!r})".format(self.regex) -class Node(object): - """Represents a single node in a query tree.""" - pass - - -class Text(Node): +class Text(_Node): """Represents a text node. Searches in codelet names (full-text search), symbols (equality), and @@ -41,41 +55,120 @@ class Text(Node): """ def __init__(self, text): + """ + :type text: :py:class:`._Literal` + """ self.text = text def __repr__(self): return "Text({0})".format(self.text) -# Language -> code_lang (direct) -# DateRange -> codelet_date_created (cmp), codelet_date_modified (cmp) -# Author -> author_name (FTS) -# Symbol -> func, class, var -> symbol_type, symbol_name (direct) +class Language(_Node): + """Represents a language node. + Searches in the code_lang field. + """ + + def __init__(self, lang): + """ + :type lang: int + """ + self.lang = lang + + def __repr__(self): + return "Language({0})".format(LANGS[self.lang]) + + +class Date(_Node): + """Represents a date node. 
+ + Searches in the codelet_date_created or codelet_date_modified fields. + """ + CREATE = 1 + MODIFY = 2 + + BEFORE = 1 + AFTER = 2 + + def __init__(self, type_, relation, date): + """ + :type type_: int (``CREATE`` or ``MODIFY``) + :type relation: int (``BEFORE``, ``AFTER``) + :type date: datetime.datetime + """ + self.type = type_ + self.relation = relation + self.date = date + + def __repr__(self): + types = {self.CREATE: "CREATE", self.MODIFY: MODIFY} + relations = {self.BEFORE: "BEFORE", self.AFTER: "AFTER"} + tm = "Date({0}, {1}, {2})" + return tm.format(types[self.type], relations[self.relation], self.date) -class BinaryOp(Node): + +class Author(_Node): + """Represents a author node. + + Searches in the author_name field (full-text search). + """ + + def __init__(self, name): + self.name = name + + def __repr__(self): + return "Author({0})".format(self.name) + + +class Symbol(_Node): + """Represents a symbol node. + + Searches in symbol_type and symbol_name. + """ + ALL = 0 + FUNCTION = 1 + CLASS = 2 + VARIABLE = 3 + + def __init__(self, type_, name): + """ + :type type_: int (``ALL``, ``FUNCTION``, ``CLASS``, etc.) 
+ :type name: :py:class:`.Literal` + """ + self.type = type_ + self.name = name + + def __repr__(self): + types = {self.ALL: "ALL", self.FUNCTION: "FUNCTION", + self.CLASS: "CLASS", self.VARIABLE: "VARIABLE"} + return "Symbol({0}, {1})".format(types[self.type], name) + + +class BinaryOp(_Node): """Represents a relationship between two nodes: ``and``, ``or``.""" AND = 1 OR = 2 - def __init__(self, left, right, op): + def __init__(self, left, op, right): self.left = left - self.right = right self.op = op + self.right = right def __repr__(self): - ops = {self.AND: "And", self.OR: "Or"} - return "{0}({1}, {2})".format(ops[self.op], self.left, self.right) + ops = {self.AND: "AND", self.OR: "OR"} + tmpl = "BinaryOp({0}, {1}, {2})" + return tmpl.format(self.left, ops[self.op], self.right) -class UnaryOp(Node): +class UnaryOp(_Node): """Represents a transformation applied to one node: ``not``.""" NOT = 1 - def __init__(self, node, op): - self.node = node + def __init__(self, op, node): self.op = op + self.node = node def __repr__(self): - ops = {self.NOT: "Not"} - return "{0}({1})".format(ops[self.op], self.node) + ops = {self.NOT: "NOT"} + return "UnaryOp({0}, {1})".format(ops[self.op], self.node) diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index fe65744..3f09c0c 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -8,6 +8,3 @@ class Tree(object): def serialize(self): pass - - def build_query(self): - pass From b5c22d3b4a54eed654c439eebf136f0770cd81c2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 7 May 2014 13:15:49 -0400 Subject: [PATCH 04/21] More work. 
--- bitshift/query/__init__.py | 26 ++++++++++++++++++++++++-- bitshift/query/tree.py | 10 +++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index d183bde..fc602f6 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -1,7 +1,12 @@ from .nodes import * ## TODO from .tree import Tree -__all__ = ["parse_query"] +__all__ = ["QueryParseException", "parse_query"] + +class QueryParseException(Exception): + """Raised by parse_query when a query is invalid.""" + pass + def parse_query(query): """ @@ -16,8 +21,25 @@ def parse_query(query): :return: A tree storing the data in the query. :rtype: :py:class:`~.query.tree.Tree` + + :raises: :py:class:`.QueryParseException` """ - pass + for term in query.split(" "): + pass + + language:"Python" + lang: + l: + + author:"Ben Kurtovic" + + modified:before + modified:after + created:before + created:after:"Jaunary 4, 2014" + + func:"foobar" + func:re|gex:"foo?b|car" # "foo" -> Tree() # "foo bar" -> "foo bar" OR ("foo" or "bar") diff --git a/bitshift/query/tree.py b/bitshift/query/tree.py index 3f09c0c..4c1b463 100644 --- a/bitshift/query/tree.py +++ b/bitshift/query/tree.py @@ -6,5 +6,13 @@ class Tree(object): def __init__(self, root): self._root = root + def __repr__(self): + return "Tree({0})".format(self._root) + def serialize(self): - pass + """Create a string representation of the query for caching. + + :return: Query string representation. + :rtype: str + """ + return repr(self) From 816d003dd4a2a982c78c37bf52245726853db34a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 8 May 2014 13:10:57 -0400 Subject: [PATCH 05/21] More work on query parsing. 
--- bitshift/query/__init__.py | 106 +++++++++++++++++++++++++++++++++------------ bitshift/query/nodes.py | 2 +- 2 files changed, 80 insertions(+), 28 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index fc602f6..711e359 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -1,46 +1,98 @@ +""" +This subpackage contains code to parse search queries received from the +frontend into trees that can be used by the database backend. +""" + +from shlex import split + from .nodes import * ## TODO from .tree import Tree +from ..languages import LANGS __all__ = ["QueryParseException", "parse_query"] class QueryParseException(Exception): - """Raised by parse_query when a query is invalid.""" + """Raised by parse_query() when a query is invalid.""" pass +class _QueryParser(object): + """Wrapper class with methods to parse queries. Used as a singleton.""" + + def __init__(self): + prefixes = { + "language": _parse_language, + "author": _parse_author, + "modified": _parse_modified, + "created": _parse_created, + "symbol": _parse_symbol, + "function": _parse_function, + "class": _parse_class, + "variable": _parse_variable + } -def parse_query(query): - """ - Parse a search query. + def _parse_language(self, term): + pass + + def _parse_author(self, term): + pass - The result is normalized with a sorting function so that ``"foo OR bar"`` - and ``"bar OR foo"`` result in the same tree. This is important for caching - purposes. + def _parse_modified(self, term): + pass + + def _parse_created(self, term): + pass + + def _parse_symbol(self, term): + pass - :param query: The query be converted. - :type query: str + def _parse_function(self, term): + pass - :return: A tree storing the data in the query. 
- :rtype: :py:class:`~.query.tree.Tree` + def _parse_class(self, term): + pass - :raises: :py:class:`.QueryParseException` - """ - for term in query.split(" "): + def _parse_variable(self, term): pass - language:"Python" - lang: - l: + def parse(self, query): + """ + Parse a search query. + + The result is normalized with a sorting function so that ``"foo OR bar"`` + and ``"bar OR foo"`` result in the same tree. This is important for caching + purposes. + + :param query: The query be converted. + :type query: str + + :return: A tree storing the data in the query. + :rtype: :py:class:`~.query.tree.Tree` + + :raises: :py:class:`.QueryParseException` + """ + for term in split(query): + if ":" in term and not term[0] == ":": + prefix = term.split(":")[0] + + + + # language:"Python" + # lang: + # l: + + # author:"Ben Kurtovic" + + # modified:before + # modified:after + # created:before + # created:after:"Jaunary 4, 2014" - author:"Ben Kurtovic" + # func:"foobar" + # func:re|gex:"foo?b|car" - modified:before - modified:after - created:before - created:after:"Jaunary 4, 2014" + # "foo" -> Tree() + # "foo bar" -> "foo bar" OR ("foo" or "bar") + # "foo bar baz" -> ""foo bar baz" OR ("foo" OR "bar baz") OR ("foo" OR "bar baz") OR ('foo' OR 'bar' OR 'baz')" - func:"foobar" - func:re|gex:"foo?b|car" - # "foo" -> Tree() - # "foo bar" -> "foo bar" OR ("foo" or "bar") - # "foo bar baz" -> ""foo bar baz" OR ("foo" OR "bar baz") OR ("foo" OR "bar baz") OR ('foo' OR 'bar' OR 'baz')" +parse_query = _QueryParser().parse diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 8dc7fe9..ff9d21b 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -1,4 +1,4 @@ -# from ..languages import LANGS +from ..languages import LANGS __all__ = ["String", "Regex", "Text", "Language", "Date", "Author", "Symbol", "BinaryOp", "UnaryOp"] From c9258f35e48a76641cf684e68f0d98b0740d7dca Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 8 May 2014 14:39:06 -0400 Subject: 
[PATCH 06/21] Literal parsing; term parsing hooks; unit test stubs. --- bitshift/query/__init__.py | 45 ++++++++++++++++++++++++++++++++------------- test/__init__.py | 0 test/test_query_parser.py | 17 +++++++++++++++++ 3 files changed, 49 insertions(+), 13 deletions(-) create mode 100644 test/__init__.py create mode 100644 test/test_query_parser.py diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 711e359..1f115a5 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -19,15 +19,15 @@ class _QueryParser(object): """Wrapper class with methods to parse queries. Used as a singleton.""" def __init__(self): - prefixes = { - "language": _parse_language, - "author": _parse_author, - "modified": _parse_modified, - "created": _parse_created, - "symbol": _parse_symbol, - "function": _parse_function, - "class": _parse_class, - "variable": _parse_variable + self._prefixes = { + self._parse_language: ["l", "lang", "language"], + self._parse_author: ["a", "author"], + self._parse_modified: ["m", "mod", "modified", "modify"], + self._parse_created: ["cr", "create", "created"], + self._parse_symbol: ["s", "sym", "symb", "symbol"], + self._parse_function: ["f", "fn", "func", "function"], + self._parse_class: ["cl", "class", "clss"], + self._parse_variable: ["v", "var", "variable"] } def _parse_language(self, term): @@ -54,6 +54,21 @@ class _QueryParser(object): def _parse_variable(self, term): pass + def _parse_literal(self, literal): + """Parse part of a search query into a string or regular expression.""" + if literal.startswith(("r:", "re:", "regex:", "regexp:")): + return Regex(literal.split(":", 1)[1]) + return String(literal) + + def _parse_term(self, term): + """Parse a query term into a tree node and return it.""" + if ":" in term and not term[0] == ":": + prefix, arg = term.split(":", 1) + for meth, prefixes in self._prefixes.iteritems(): + if prefix in prefixes: + return meth(arg) + return Text(self._parse_literal(term)) + 
def parse(self, query): """ Parse a search query. @@ -70,11 +85,15 @@ class _QueryParser(object): :raises: :py:class:`.QueryParseException` """ + print "input:", query for term in split(query): - if ":" in term and not term[0] == ":": - prefix = term.split(":")[0] - - + print "term: ", term + node = self._parse_term(term) + print "parse:", node + tree = Tree(node) + print "tree: ", tree + return tree + ## TODO # language:"Python" # lang: diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test_query_parser.py b/test/test_query_parser.py new file mode 100644 index 0000000..c74da4a --- /dev/null +++ b/test/test_query_parser.py @@ -0,0 +1,17 @@ +from __future__ import unicode_literals +import unittest + +from bitshift.query import parse_query + +class TestQueryParser(unittest.TestCase): + """Unit tests for the query parser in :py:mod:`bitshift.query`.""" + + def test_parse(self): + """test basic query parsing""" + pq = lambda s: parse_query(s).serialize() + self.assertEqual("Tree(Text(String('test')))", pq("test")) + self.assertEqual("Tree(Text(Regex('test')))", pq("re:test")) + + +if __name__ == "__main__": + unittest.main(verbosity=2) From 27ac48ea6c7f4b4d4abffac273d83355443b5f12 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 8 May 2014 14:46:49 -0400 Subject: [PATCH 07/21] More debug stuff. 
--- bitshift/query/__init__.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 1f115a5..3705860 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -31,7 +31,8 @@ class _QueryParser(object): } def _parse_language(self, term): - pass + ## TODO: look up language ID + return Language(0) def _parse_author(self, term): pass @@ -85,13 +86,14 @@ class _QueryParser(object): :raises: :py:class:`.QueryParseException` """ - print "input:", query - for term in split(query): - print "term: ", term + print " STRING:", query + for i, term in enumerate(split(query), 1): + ## TODO: remove enumerate when removing debug prints + print " in [%02d]:" % i, term node = self._parse_term(term) - print "parse:", node + print "out [%02d]:" % i, node tree = Tree(node) - print "tree: ", tree + print " TREE:", tree return tree ## TODO From 21d1c49eadb69d710998dc851b5fefb1a5fdbe1a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 9 May 2014 13:08:08 -0400 Subject: [PATCH 08/21] Parsing dates should work. --- bitshift/query/__init__.py | 37 +++++++++++++++++++++++++++++++++---- bitshift/query/nodes.py | 28 ++++++++++++++-------------- setup.py | 3 ++- 3 files changed, 49 insertions(+), 19 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 3705860..e5844f1 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -3,9 +3,13 @@ This subpackage contains code to parse search queries received from the frontend into trees that can be used by the database backend. 
""" +from __future__ import unicode_literals from shlex import split -from .nodes import * ## TODO +from dateutil.parser import parse as parse_date + +from .nodes import (String, Regex, Text, Language, Author, Date, Symbol, + BinaryOp, UnaryOp) from .tree import Tree from ..languages import LANGS @@ -15,6 +19,7 @@ class QueryParseException(Exception): """Raised by parse_query() when a query is invalid.""" pass + class _QueryParser(object): """Wrapper class with methods to parse queries. Used as a singleton.""" @@ -31,28 +36,52 @@ class _QueryParser(object): } def _parse_language(self, term): + """Parse part of a query into a language node and return it.""" ## TODO: look up language ID return Language(0) def _parse_author(self, term): - pass + """Parse part of a query into an author node and return it.""" + return Author(self._parse_literal(term)) + + def _parse_date(self, term, type_): + """Parse part of a query into a date node and return it.""" + if term.startswith(("before:", "b:")): + relation = Date.BEFORE + dtstr = term.split(":", 1)[1] + elif term.startswith(("after:", "a:")): + relation = Date.AFTER + dtstr = term.split(":", 1)[1] + else: + raise QueryParseException('Bad relation for date: "%s"' % term) + try: + dt = parse_date(dtstr) + except (TypeError, ValueError): + raise QueryParseException('Bad datetime for date: "%s"' % dtstr) + return Date(type_, relation, dt) def _parse_modified(self, term): - pass + """Parse part of a query into a date modified node and return it.""" + return self._parse_date(term, Date.MODIFY) def _parse_created(self, term): - pass + """Parse part of a query into a date created node and return it.""" + return self._parse_date(term, Date.CREATE) def _parse_symbol(self, term): + """Parse part of a query into a symbol node and return it.""" pass def _parse_function(self, term): + """Parse part of a query into a function node and return it.""" pass def _parse_class(self, term): + """Parse part of a query into a class node and return 
it.""" pass def _parse_variable(self, term): + """Parse part of a query into a variable node and return it.""" pass def _parse_literal(self, literal): diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index ff9d21b..8ac3684 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -1,6 +1,6 @@ from ..languages import LANGS -__all__ = ["String", "Regex", "Text", "Language", "Date", "Author", "Symbol", +__all__ = ["String", "Regex", "Text", "Language", "Author", "Date", "Symbol", "BinaryOp", "UnaryOp"] class _Node(object): @@ -80,6 +80,19 @@ class Language(_Node): return "Language({0})".format(LANGS[self.lang]) +class Author(_Node): + """Represents a author node. + + Searches in the author_name field (full-text search). + """ + + def __init__(self, name): + self.name = name + + def __repr__(self): + return "Author({0})".format(self.name) + + class Date(_Node): """Represents a date node. @@ -108,19 +121,6 @@ class Date(_Node): return tm.format(types[self.type], relations[self.relation], self.date) -class Author(_Node): - """Represents a author node. - - Searches in the author_name field (full-text search). - """ - - def __init__(self, name): - self.name = name - - def __repr__(self): - return "Author({0})".format(self.name) - - class Symbol(_Node): """Represents a symbol node. diff --git a/setup.py b/setup.py index 48d4c42..5ab7a7c 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,8 @@ setup( packages = find_packages(), install_requires = [ "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", - "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"], + "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3", + "python-dateutil>=2.2"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift" From ed6d0258b2560d119b50c545b6429afa71cbaf72 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 9 May 2014 13:10:00 -0400 Subject: [PATCH 09/21] Fix a typo. 
--- bitshift/query/nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 8ac3684..a89a8b9 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -115,7 +115,7 @@ class Date(_Node): self.date = date def __repr__(self): - types = {self.CREATE: "CREATE", self.MODIFY: MODIFY} + types = {self.CREATE: "CREATE", self.MODIFY: "MODIFY"} relations = {self.BEFORE: "BEFORE", self.AFTER: "AFTER"} tm = "Date({0}, {1}, {2})" return tm.format(types[self.type], relations[self.relation], self.date) From 064c448824b6812218e376ce58e253a4311d9dda Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 9 May 2014 13:14:22 -0400 Subject: [PATCH 10/21] Symbol parsing works. --- bitshift/query/__init__.py | 14 +++++++------- bitshift/query/nodes.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index e5844f1..111c35b 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -70,19 +70,19 @@ class _QueryParser(object): def _parse_symbol(self, term): """Parse part of a query into a symbol node and return it.""" - pass + return Symbol(Symbol.ALL, self._parse_literal(term)) def _parse_function(self, term): """Parse part of a query into a function node and return it.""" - pass + return Symbol(Symbol.FUNCTION, self._parse_literal(term)) def _parse_class(self, term): """Parse part of a query into a class node and return it.""" - pass + return Symbol(Symbol.CLASS, self._parse_literal(term)) def _parse_variable(self, term): """Parse part of a query into a variable node and return it.""" - pass + return Symbol(Symbol.VARIABLE, self._parse_literal(term)) def _parse_literal(self, literal): """Parse part of a search query into a string or regular expression.""" @@ -103,9 +103,9 @@ class _QueryParser(object): """ Parse a search query. 
- The result is normalized with a sorting function so that ``"foo OR bar"`` - and ``"bar OR foo"`` result in the same tree. This is important for caching - purposes. + The result is normalized with a sorting function so that + ``"foo OR bar"`` and ``"bar OR foo"`` result in the same tree. This is + important for caching purposes. :param query: The query be converted. :type query: str diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index a89a8b9..b2e4864 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -142,7 +142,7 @@ class Symbol(_Node): def __repr__(self): types = {self.ALL: "ALL", self.FUNCTION: "FUNCTION", self.CLASS: "CLASS", self.VARIABLE: "VARIABLE"} - return "Symbol({0}, {1})".format(types[self.type], name) + return "Symbol({0}, {1})".format(types[self.type], self.name) class BinaryOp(_Node): From 3fbe0c25593d9453fb74f8d3fb052ad1a44779c5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 9 May 2014 13:49:08 -0400 Subject: [PATCH 11/21] Finish parsing for languages. --- bitshift/query/__init__.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 111c35b..83fdf71 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -4,6 +4,7 @@ frontend into trees that can be used by the database backend. 
""" from __future__ import unicode_literals +from re import IGNORECASE, search from shlex import split from dateutil.parser import parse as parse_date @@ -35,10 +36,35 @@ class _QueryParser(object): self._parse_variable: ["v", "var", "variable"] } + def _parse_literal(self, literal): + """Parse part of a search query into a string or regular expression.""" + if literal.startswith(("r:", "re:", "regex:", "regexp:")): + return Regex(literal.split(":", 1)[1]) + return String(literal) + def _parse_language(self, term): """Parse part of a query into a language node and return it.""" - ## TODO: look up language ID - return Language(0) + term = self._parse_literal(term) + if isinstance(term, Regex): + langs = [i for i, lang in enumerate(LANGS) + if search(term.regex, lang, IGNORECASE)] + if not langs: + err = "No languages found for regex: %r" % term.regex + raise QueryParseException(err) + node = Language(langs.pop()) + while langs: + node = BinaryOp(Language(langs.pop()), BinaryOp.OR, node) + return node + + needle = term.string.lower() + for i, lang in enumerate(LANGS): + if lang.lower() == needle: + return Language(i) + for i, lang in enumerate(LANGS): + if lang.lower().startswith(needle): + return Language(i) + err = "No languages found for string: %r" % term.string + raise QueryParseException(err) def _parse_author(self, term): """Parse part of a query into an author node and return it.""" @@ -53,11 +79,11 @@ class _QueryParser(object): relation = Date.AFTER dtstr = term.split(":", 1)[1] else: - raise QueryParseException('Bad relation for date: "%s"' % term) + raise QueryParseException("Bad relation for date node: %r" % term) try: dt = parse_date(dtstr) except (TypeError, ValueError): - raise QueryParseException('Bad datetime for date: "%s"' % dtstr) + raise QueryParseException("Bad datetime for date node: %r" % dtstr) return Date(type_, relation, dt) def _parse_modified(self, term): @@ -84,12 +110,6 @@ class _QueryParser(object): """Parse part of a query into a 
variable node and return it.""" return Symbol(Symbol.VARIABLE, self._parse_literal(term)) - def _parse_literal(self, literal): - """Parse part of a search query into a string or regular expression.""" - if literal.startswith(("r:", "re:", "regex:", "regexp:")): - return Regex(literal.split(":", 1)[1]) - return String(literal) - def _parse_term(self, term): """Parse a query term into a tree node and return it.""" if ":" in term and not term[0] == ":": From 461f0fe052a4751af3156215cd3ae60fa40f6f59 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 10 May 2014 20:07:47 -0400 Subject: [PATCH 12/21] More complete and descriptive error messages. --- bitshift/query/__init__.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 83fdf71..5f6d9cf 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -49,7 +49,7 @@ class _QueryParser(object): langs = [i for i, lang in enumerate(LANGS) if search(term.regex, lang, IGNORECASE)] if not langs: - err = "No languages found for regex: %r" % term.regex + err = 'No languages found for regex: "%s"' % term.regex raise QueryParseException(err) node = Language(langs.pop()) while langs: @@ -63,7 +63,7 @@ class _QueryParser(object): for i, lang in enumerate(LANGS): if lang.lower().startswith(needle): return Language(i) - err = "No languages found for string: %r" % term.string + err = 'No languages found for string: "%s"' % term.string raise QueryParseException(err) def _parse_author(self, term): @@ -72,18 +72,22 @@ class _QueryParser(object): def _parse_date(self, term, type_): """Parse part of a query into a date node and return it.""" - if term.startswith(("before:", "b:")): + if ":" not in term: + err = "A date relationship is required " \ + '("before:" or "after:"): "%s"' + raise QueryParseException(err % term) + relstr, dtstr = term.split(":", 1) + if relstr.lower() in ("before", "b"): relation = Date.BEFORE - 
dtstr = term.split(":", 1)[1] - elif term.startswith(("after:", "a:")): + elif relstr.lower() in ("after", "a"): relation = Date.AFTER - dtstr = term.split(":", 1)[1] else: - raise QueryParseException("Bad relation for date node: %r" % term) + err = 'Bad date relationship (should be "before" or "after"): "%s"' + raise QueryParseException(err % relstr) try: dt = parse_date(dtstr) except (TypeError, ValueError): - raise QueryParseException("Bad datetime for date node: %r" % dtstr) + raise QueryParseException('Bad date/time string: "%s"' % dtstr) return Date(type_, relation, dt) def _parse_modified(self, term): @@ -114,6 +118,8 @@ class _QueryParser(object): """Parse a query term into a tree node and return it.""" if ":" in term and not term[0] == ":": prefix, arg = term.split(":", 1) + if not arg: + raise QueryParseException('Incomplete query term: "%s"' % term) for meth, prefixes in self._prefixes.iteritems(): if prefix in prefixes: return meth(arg) From 525b49b4b2d1986a19db265b3bcd26a8ff67b784 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 10 May 2014 20:29:13 -0400 Subject: [PATCH 13/21] Construct full trees; support inverting with not:. 
--- bitshift/query/__init__.py | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 5f6d9cf..a15b5e0 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -118,10 +118,15 @@ class _QueryParser(object): """Parse a query term into a tree node and return it.""" if ":" in term and not term[0] == ":": prefix, arg = term.split(":", 1) + invert = prefix.lower() == "not" + if invert: + prefix, arg = arg.split(":", 1) if not arg: raise QueryParseException('Incomplete query term: "%s"' % term) for meth, prefixes in self._prefixes.iteritems(): - if prefix in prefixes: + if prefix.lower() in prefixes: + if invert: + return UnaryOp(UnaryOp.NOT, meth(arg)) return meth(arg) return Text(self._parse_literal(term)) @@ -141,34 +146,12 @@ class _QueryParser(object): :raises: :py:class:`.QueryParseException` """ - print " STRING:", query - for i, term in enumerate(split(query), 1): - ## TODO: remove enumerate when removing debug prints - print " in [%02d]:" % i, term + root = None + for term in split(query): node = self._parse_term(term) - print "out [%02d]:" % i, node - tree = Tree(node) - print " TREE:", tree - return tree - ## TODO - - # language:"Python" - # lang: - # l: - - # author:"Ben Kurtovic" - - # modified:before - # modified:after - # created:before - # created:after:"Jaunary 4, 2014" - - # func:"foobar" - # func:re|gex:"foo?b|car" - - # "foo" -> Tree() - # "foo bar" -> "foo bar" OR ("foo" or "bar") - # "foo bar baz" -> ""foo bar baz" OR ("foo" OR "bar baz") OR ("foo" OR "bar baz") OR ('foo' OR 'bar' OR 'baz')" + root = BinaryOp(root, BinaryOp.AND, node) if root else node + tree = Tree(root) + return tree parse_query = _QueryParser().parse From 4100107e93d6a5089f1343dd5f3ce0b4358a78e9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 11 May 2014 01:28:39 -0400 Subject: [PATCH 14/21] Change UnaryOp.NOT to work with BinaryOp.{OR,AND} 
--- bitshift/query/nodes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index b2e4864..dcf297d 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -163,7 +163,7 @@ class BinaryOp(_Node): class UnaryOp(_Node): """Represents a transformation applied to one node: ``not``.""" - NOT = 1 + NOT = 3 def __init__(self, op, node): self.op = op From 9c867b4dc3dd17365474c019806aef19f729fd71 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 11 May 2014 01:29:16 -0400 Subject: [PATCH 15/21] code --- bitshift/query/__init__.py | 109 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 103 insertions(+), 6 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index a15b5e0..0b893bd 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -5,7 +5,7 @@ frontend into trees that can be used by the database backend. from __future__ import unicode_literals from re import IGNORECASE, search -from shlex import split +from sys import maxsize from dateutil.parser import parse as parse_date @@ -146,12 +146,109 @@ class _QueryParser(object): :raises: :py:class:`.QueryParseException` """ - root = None + ## TODO: balance tree + ## -------------------------------------------------------------------- + + def SCAN_FOR_MARKER(string, markers): + best_marker, best_index = None, maxsize + for marker in markers: + index = string.find(marker) + if index > 0 and string[index - 1] == "\\" and (index == 1 or string[index - 2] != "\\"): + _, new_index = SCAN_FOR_MARKER(string[index + 1:], marker) + index += new_index + 1 + if index >= 0 and index < best_index: + best_marker, best_index = marker, index + return best_marker, best_index + + def SPLIT_QUERY_STRING(string, parens=False): + string = string.lstrip() + if not string: + return [] + marker, index = SCAN_FOR_MARKER(string, " \"'()") + + if not marker: + return [string] + + before = [string[:index]] if 
index > 0 else [] + after = string[index + 1:] + + if marker == " ": + return before + SPLIT_QUERY_STRING(after, parens) + + elif marker in ('"', "'"): + close_marker, close_index = SCAN_FOR_MARKER(after, marker) + if not close_marker: + return before + [after] + quoted, after = after[:close_index], after[close_index + 1:] + return before + [quoted] + SPLIT_QUERY_STRING(after, parens) + + elif marker == "(": + inner = SPLIT_QUERY_STRING(after, True) + if inner and isinstance(inner[-1], tuple): + after, inner = inner.pop()[0], [inner] if inner else [] + return before + inner + SPLIT_QUERY_STRING(after, parens) + return before + [inner] + + elif marker == ")": + if parens: + return before + [(after,)] + return before + SPLIT_QUERY_STRING(after) + + nest = SPLIT_QUERY_STRING(query.rstrip()) + if not nest: + raise QueryParseException('Empty query: "%s"' % query) + + return nest + + ########### + + group = _NodeGroup() for term in split(query): - node = self._parse_term(term) - root = BinaryOp(root, BinaryOp.AND, node) if root else node - tree = Tree(root) - return tree + + while term.startswith("("): + group = _NodeList(group, explicit=True) + term = term[1:] + + closes = 0 + while term.endswith(")"): + closes += 1 + term = term[:-1] + + if not term: + for i in xrange(closes): + group = reduce_group(group, explicit=True) + continue + + lcase = term.lower() + + if lcase == "not": + UnaryOp.NOT + elif lcase == "or": + BinaryOp.OR + elif lcase == "and": + if group.pending_op: + pass + else: + group.pending_op = BinaryOP.AND + else: + group.nodes.append(self._parse_term(term)) + + return Tree(reduce_group(group, explicit=False)) + + ## -------------------------------------------------------------------- + + # root = None + # for node in reversed(nodes): + # root = BinaryOp(node, BinaryOp.AND, root) if root else node + # tree = Tree(root) + # return tree + +class _NodeGroup(object): + def __init__(self, parent=None): + self.parent = parent + self.op = None + self.left = 
None + self.right = None parse_query = _QueryParser().parse From da8759372a2d1276b9dca7bca4d197c450d45e52 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 11 May 2014 02:32:30 -0400 Subject: [PATCH 16/21] Clean up SPLIT_QUERY_STRING. --- bitshift/query/__init__.py | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 0b893bd..e10e5cb 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -149,12 +149,12 @@ class _QueryParser(object): ## TODO: balance tree ## -------------------------------------------------------------------- - def SCAN_FOR_MARKER(string, markers): + def SCAN_FOR_MARKERS(string, markers): best_marker, best_index = None, maxsize for marker in markers: index = string.find(marker) if index > 0 and string[index - 1] == "\\" and (index == 1 or string[index - 2] != "\\"): - _, new_index = SCAN_FOR_MARKER(string[index + 1:], marker) + _, new_index = SCAN_FOR_MARKERS(string[index + 1:], marker) index += new_index + 1 if index >= 0 and index < best_index: best_marker, best_index = marker, index @@ -164,35 +164,43 @@ class _QueryParser(object): string = string.lstrip() if not string: return [] - marker, index = SCAN_FOR_MARKER(string, " \"'()") + marker, index = SCAN_FOR_MARKERS(string, " \"'()") if not marker: return [string] - before = [string[:index]] if index > 0 else [] + nest = [string[:index]] if index > 0 else [] after = string[index + 1:] if marker == " ": - return before + SPLIT_QUERY_STRING(after, parens) + nest += SPLIT_QUERY_STRING(after, parens) elif marker in ('"', "'"): - close_marker, close_index = SCAN_FOR_MARKER(after, marker) - if not close_marker: - return before + [after] - quoted, after = after[:close_index], after[close_index + 1:] - return before + [quoted] + SPLIT_QUERY_STRING(after, parens) + close_marker, close_index = SCAN_FOR_MARKERS(after, marker) + if close_marker: + if close_index > 
0: + nest.append(after[:close_index]) + after = after[close_index + 1:] + nest += SPLIT_QUERY_STRING(after, parens) + elif after: + nest.append(after) elif marker == "(": - inner = SPLIT_QUERY_STRING(after, True) + inner, after = SPLIT_QUERY_STRING(after, True), [] if inner and isinstance(inner[-1], tuple): - after, inner = inner.pop()[0], [inner] if inner else [] - return before + inner + SPLIT_QUERY_STRING(after, parens) - return before + [inner] + after = SPLIT_QUERY_STRING(inner.pop()[0], parens) + if inner: + nest.append(inner) + if after: + nest += after elif marker == ")": if parens: - return before + [(after,)] - return before + SPLIT_QUERY_STRING(after) + nest.append((after,)) + else: + nest += SPLIT_QUERY_STRING(after) + + return nest nest = SPLIT_QUERY_STRING(query.rstrip()) if not nest: From bcab13f0272fe2cd3cf999d232096bcfb2294bc6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 11 May 2014 02:52:14 -0400 Subject: [PATCH 17/21] Finish _scan_query(), _split_query(). --- bitshift/query/__init__.py | 112 +++++++++++++++++++++++---------------------- 1 file changed, 58 insertions(+), 54 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index e10e5cb..7cfa592 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -130,6 +130,63 @@ class _QueryParser(object): return meth(arg) return Text(self._parse_literal(term)) + def _scan_query(self, query, markers): + """Scan a query (sub)string for the first occurrence of some markers. + + Returns a 2-tuple of (first_marker_found, marker_index).
+ """ + def _is_escaped(query, index): + """Return whether a query marker is backslash-escaped.""" + return (index > 0 and query[index - 1] == "\\" and + (index < 2 or query[index - 2] != "\\")) + + best_marker, best_index = None, maxsize + for marker in markers: + index = query.find(marker) + if _is_escaped(query, index): + _, new_index = self._scan_query(query[index + 1:], marker) + index += new_index + 1 + if index >= 0 and index < best_index: + best_marker, best_index = marker, index + return best_marker, best_index + + def _split_query(self, query, parens=False): + """Split a query string into a nested list of query terms.""" + query = query.lstrip() + if not query: + return [] + marker, index = self._scan_query(query, " \"'()") + if not marker: + return [query] + nest = [query[:index]] if index > 0 else [] + after = query[index + 1:] + + if marker == " ": + nest += self._split_query(after, parens) + elif marker in ('"', "'"): + close_marker, close_index = self._scan_query(after, marker) + if close_marker: + if close_index > 0: + nest.append(after[:close_index]) + after = after[close_index + 1:] + nest += self._split_query(after, parens) + elif after: + nest.append(after) + elif marker == "(": + inner, after = self._split_query(after, True), [] + if inner and isinstance(inner[-1], tuple): + after = self._split_query(inner.pop()[0], parens) + if inner: + nest.append(inner) + if after: + nest += after + elif marker == ")": + if parens: + nest.append((after,)) + else: + nest += self._split_query(after) + return nest + def parse(self, query): """ Parse a search query. 
@@ -149,60 +206,7 @@ class _QueryParser(object): ## TODO: balance tree ## -------------------------------------------------------------------- - def SCAN_FOR_MARKERS(string, markers): - best_marker, best_index = None, maxsize - for marker in markers: - index = string.find(marker) - if index > 0 and string[index - 1] == "\\" and (index == 1 or string[index - 2] != "\\"): - _, new_index = SCAN_FOR_MARKERS(string[index + 1:], marker) - index += new_index + 1 - if index >= 0 and index < best_index: - best_marker, best_index = marker, index - return best_marker, best_index - - def SPLIT_QUERY_STRING(string, parens=False): - string = string.lstrip() - if not string: - return [] - marker, index = SCAN_FOR_MARKERS(string, " \"'()") - - if not marker: - return [string] - - nest = [string[:index]] if index > 0 else [] - after = string[index + 1:] - - if marker == " ": - nest += SPLIT_QUERY_STRING(after, parens) - - elif marker in ('"', "'"): - close_marker, close_index = SCAN_FOR_MARKERS(after, marker) - if close_marker: - if close_index > 0: - nest.append(after[:close_index]) - after = after[close_index + 1:] - nest += SPLIT_QUERY_STRING(after, parens) - elif after: - nest.append(after) - - elif marker == "(": - inner, after = SPLIT_QUERY_STRING(after, True), [] - if inner and isinstance(inner[-1], tuple): - after = SPLIT_QUERY_STRING(inner.pop()[0], parens) - if inner: - nest.append(inner) - if after: - nest += after - - elif marker == ")": - if parens: - nest.append((after,)) - else: - nest += SPLIT_QUERY_STRING(after) - - return nest - - nest = SPLIT_QUERY_STRING(query.rstrip()) + nest = self._split_query(query.rstrip()) if not nest: raise QueryParseException('Empty query: "%s"' % query) From d319bde7db69e1c62e7b1133eec5a66cfac1ea6c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 12 May 2014 00:02:13 -0400 Subject: [PATCH 18/21] Begin work on a model for boolean logic parsing. 
--- bitshift/query/__init__.py | 118 ++++++++++++++++++++++++--------------------- 1 file changed, 63 insertions(+), 55 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 7cfa592..1a2a65c 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -151,7 +151,11 @@ class _QueryParser(object): return best_marker, best_index def _split_query(self, query, parens=False): - """Split a query string into a nested list of query terms.""" + """Split a query string into a nested list of query terms. + + Returns a list of terms and/or nested sublists of terms. Each term and + sublist is guaranteed to be non-empty. + """ query = query.lstrip() if not query: return [] @@ -187,6 +191,63 @@ class _QueryParser(object): nest += self._split_query(after) return nest + def _parse_nest(self, nest): + """Recursively parse a nested list of search query terms.""" + + class _NodeGroup(object): + def __init__(self, left=None, op=None, right=None): + self.left = left or [] + self.op = op + self.right = right or [] + + def append(self, node): + self.right.append(node) if self.op else self.left.append(node) + + " a AND b OR c AND d NOT e f" + "((a) AND (b)) OR ((c) AND (d (NOT (e f))))" + "((a) AND (b)) OR ((c) AND (d" + + group = _NodeGroup() + + for term in nest: + if isinstance(term, list): + group.append(self._parse_nest(term)) + else: + lcase = term.lower() + + if lcase == "not": + + if group.op: + group.append(_NodeGroup(None, UnaryOp.NOT) ## TODO + else: + pass + + elif lcase == "or": + + if group.op == BinaryOp.AND: + group = _NodeGroup(group, BinaryOp.OR) + elif group.op == BinaryOp.OR: + pass + elif group.op == UnaryOp.NOT: + pass + else: + group.op = BinaryOp.OR + + elif lcase == "and": + + if group.op == BinaryOp.OR: + group.right = _NodeGroup(group.right, BinaryOp.AND) + elif group.op == BinaryOp.AND: + pass + elif group.op == UnaryOp.NOT: + pass + else: + group.op = BinaryOP.AND + + + else: +
group.append(self._parse_term(term)) + def parse(self, query): """ Parse a search query. @@ -204,63 +265,10 @@ class _QueryParser(object): :raises: :py:class:`.QueryParseException` """ ## TODO: balance tree - ## -------------------------------------------------------------------- - nest = self._split_query(query.rstrip()) if not nest: raise QueryParseException('Empty query: "%s"' % query) - - return nest - - ########### - - group = _NodeGroup() - for term in split(query): - - while term.startswith("("): - group = _NodeList(group, explicit=True) - term = term[1:] - - closes = 0 - while term.endswith(")"): - closes += 1 - term = term[:-1] - - if not term: - for i in xrange(closes): - group = reduce_group(group, explicit=True) - continue - - lcase = term.lower() - - if lcase == "not": - UnaryOp.NOT - elif lcase == "or": - BinaryOp.OR - elif lcase == "and": - if group.pending_op: - pass - else: - group.pending_op = BinaryOP.AND - else: - group.nodes.append(self._parse_term(term)) - - return Tree(reduce_group(group, explicit=False)) - - ## -------------------------------------------------------------------- - - # root = None - # for node in reversed(nodes): - # root = BinaryOp(node, BinaryOp.AND, root) if root else node - # tree = Tree(root) - # return tree - -class _NodeGroup(object): - def __init__(self, parent=None): - self.parent = parent - self.op = None - self.left = None - self.right = None + return Tree(self._parse_nest(nest)) parse_query = _QueryParser().parse From 611a4ea9b2759fac69efe8cc8db0ed62eaa47e9e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 12 May 2014 14:06:26 -0400 Subject: [PATCH 19/21] More accurate model for boolean logic. 
--- bitshift/query/__init__.py | 86 ++++++++++++++++++---------------------------- bitshift/query/nodes.py | 6 ++-- 2 files changed, 36 insertions(+), 56 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 1a2a65c..6f338fa 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -31,7 +31,7 @@ class _QueryParser(object): self._parse_modified: ["m", "mod", "modified", "modify"], self._parse_created: ["cr", "create", "created"], self._parse_symbol: ["s", "sym", "symb", "symbol"], - self._parse_function: ["f", "fn", "func", "function"], + self._parse_function: ["f", "fn", "fun", "func", "function"], self._parse_class: ["cl", "class", "clss"], self._parse_variable: ["v", "var", "variable"] } @@ -191,62 +191,41 @@ class _QueryParser(object): nest += self._split_query(after) return nest - def _parse_nest(self, nest): - """Recursively parse a nested list of search query terms.""" - - class _NodeGroup(object): - def __init__(self, left=None, op=None, right=None): - self.left = left or [] - self.op = op - self.right = right or [] - - def append(self, node): - self.right.append(node) if self.op else self.left.append(node) - - " a AND b OR c AND d NOT e f" - "((a) AND (b)) OR ((c) AND (d (NOT (e f))))" - "((a) AND (b)) OR ((c) AND (d" - - group = _NodeGroup() - - for term in nest: + def _parse_boolean_operators(self, nest): + """Parse boolean operators in a nested query list.""" + op_lookup = { + "and": BinaryOp.AND, + "or": BinaryOp.OR, + "not": UnaryOp.NOT + } + for i, term in enumerate(nest): if isinstance(term, list): - group.append(self._parse_nest(term)) + self._parse_boolean_operators(term) else: - lcase = term.lower() - - if lcase == "not": - - if group.op: - group.append(_NodeGroup(None, UnaryOp.NOT) ## TODO - else: - pass - - elif lcase == "or": - - if group.op == BinaryOp.AND: - group = _NodeGroup(group, BinaryOp.OR) - elif group.op == BinaryOp.OR: - pass - elif group.op == UnaryOp.NOT: - pass - else: - 
group.op = BinaryOp.OR - - elif lcase == "and": - - if group.op == BinaryOp.OR: - group.right = _NodeGroup(group.right, BinaryOp.AND) - elif group.op == BinaryOp.AND: - pass - elif group.op == UnaryOp.NOT: - pass - else: - group.op = BinaryOP.AND + nest[i] = op_lookup.get(term.lower(), term) + def _parse_nest(self, nest): + """Recursively parse a nested list of search query terms.""" + def _parse_binary_op(op): + """Parse a binary operator in a nested query list.""" + index = nest.index(op) + left = self._parse_nest(nest[:index]) + right = self._parse_nest(nest[index + 1:]) + pass - else: - group.append(self._parse_term(term)) + if not nest: + raise QueryParseException("???") + elif BinaryOp.OR in nest: + return _parse_binary_op(BinaryOp.OR) + elif BinaryOp.AND in nest: + return _parse_binary_op(BinaryOp.AND) + elif UnaryOp.NOT in nest: + index = nest.index(UnaryOp.NOT) + pass + elif len(nest) > 1: + pass + else: + return self._parse_nest(nest) def parse(self, query): """ @@ -268,6 +247,7 @@ class _QueryParser(object): nest = self._split_query(query.rstrip()) if not nest: raise QueryParseException('Empty query: "%s"' % query) + self._parse_boolean_operators(nest) return Tree(self._parse_nest(nest)) diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index dcf297d..7b12a33 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -147,8 +147,8 @@ class Symbol(_Node): class BinaryOp(_Node): """Represents a relationship between two nodes: ``and``, ``or``.""" - AND = 1 - OR = 2 + AND = object() + OR = object() def __init__(self, left, op, right): self.left = left @@ -163,7 +163,7 @@ class BinaryOp(_Node): class UnaryOp(_Node): """Represents a transformation applied to one node: ``not``.""" - NOT = 3 + NOT = object() def __init__(self, op, node): self.op = op From c9b558d0b343e3b47163d756938c5a7d52a12764 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 13 May 2014 13:15:34 -0400 Subject: [PATCH 20/21] Finish query parsing, mostly. 
--- bitshift/query/__init__.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 6f338fa..3348dc9 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -206,26 +206,34 @@ class _QueryParser(object): def _parse_nest(self, nest): """Recursively parse a nested list of search query terms.""" - def _parse_binary_op(op): + def parse_binary_op(op): """Parse a binary operator in a nested query list.""" index = nest.index(op) left = self._parse_nest(nest[:index]) right = self._parse_nest(nest[index + 1:]) - pass + return BinaryOp(left, op, right) if not nest: - raise QueryParseException("???") + err = "Error while parsing query: empty nest detected." + raise QueryParseException(err) elif BinaryOp.OR in nest: - return _parse_binary_op(BinaryOp.OR) + return parse_binary_op(BinaryOp.OR) elif BinaryOp.AND in nest: - return _parse_binary_op(BinaryOp.AND) + return parse_binary_op(BinaryOp.AND) elif UnaryOp.NOT in nest: index = nest.index(UnaryOp.NOT) - pass + right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:])) + if index > 1: + left = self._parse_nest(nest[:index]) + return BinaryOp(left, BinaryOp.AND, right) + return right elif len(nest) > 1: - pass + left, right = self._parse_term(nest[0]), self._parse_nest(nest[1:]) + return BinaryOp(left, BinaryOp.AND, right) + elif isinstance(nest[0], list): + return self._parse_nest(nest[0]) else: - return self._parse_nest(nest) + return self._parse_term(nest[0]) def parse(self, query): """ From 983f9a5d94146c301eb5c6bde84bc67acc879188 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 14 May 2014 11:02:35 -0400 Subject: [PATCH 21/21] Implement basic tree balancing. 
--- bitshift/query/__init__.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 3348dc9..2ed29c2 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -235,6 +235,16 @@ class _QueryParser(object): else: return self._parse_term(nest[0]) + def _balance_tree(self, node): + """Auto-balance a tree using a string sorting function.""" + if isinstance(node, BinaryOp): + self._balance_tree(node.left) + self._balance_tree(node.right) + if repr(node.right) < repr(node.left): + node.left, node.right = node.right, node.left + elif isinstance(node, UnaryOp): + self._balance_tree(node.node) + def parse(self, query): """ Parse a search query. @@ -251,12 +261,13 @@ class _QueryParser(object): :raises: :py:class:`.QueryParseException` """ - ## TODO: balance tree nest = self._split_query(query.rstrip()) if not nest: raise QueryParseException('Empty query: "%s"' % query) self._parse_boolean_operators(nest) - return Tree(self._parse_nest(nest)) + root = self._parse_nest(nest) + self._balance_tree(root) + return Tree(root) parse_query = _QueryParser().parse