A semantic search engine for source code https://bitshift.benkurtovic.com/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

284 lines
11 KiB

  1. """
  2. This subpackage contains code to parse search queries received from the
  3. frontend into trees that can be used by the database backend.
  4. """
  5. from __future__ import unicode_literals
  6. from re import IGNORECASE, search
  7. from sys import maxsize
  8. from dateutil.parser import parse as parse_date
  9. from .nodes import (String, Regex, Text, Language, Author, Date, Symbol,
  10. BinaryOp, UnaryOp)
  11. from .tree import Tree
  12. from ..languages import LANGS
  # Public names of this subpackage: the exception type raised for invalid
  # queries and the parser entry point bound at the bottom of the module.
  __all__ = ["QueryParseException", "parse_query"]
  class QueryParseException(Exception):
      """Raised by parse_query() when a query is invalid."""
  class _QueryParser(object):
      """Wrapper class with methods to parse queries. Used as a singleton.

      Parsing happens in stages: the raw query string is split into a nested
      list of terms (:meth:`_split_query`), boolean keywords are replaced by
      operator constants (:meth:`_parse_boolean_operators`), the nested list
      is turned into a node tree (:meth:`_parse_nest`), and finally the tree
      is normalized (:meth:`_balance_tree`).
      """

      def __init__(self):
          # Maps each prefix handler to the lowercase prefixes that select it,
          # e.g. "lang:python" is dispatched to _parse_language("python").
          self._prefixes = {
              self._parse_language: ["l", "lang", "language"],
              self._parse_author: ["a", "author"],
              self._parse_modified: ["m", "mod", "modified", "modify"],
              self._parse_created: ["cr", "create", "created"],
              self._parse_symbol: ["s", "sym", "symb", "symbol"],
              self._parse_function: ["f", "fn", "fun", "func", "function"],
              self._parse_class: ["cl", "class", "clss"],
              self._parse_variable: ["v", "var", "variable"]
          }

      def _parse_literal(self, literal):
          """Parse part of a search query into a string or regular expression.

          A literal starting with ``r:``, ``re:``, ``regex:`` or ``regexp:``
          becomes a ``Regex`` built from the text after the first colon;
          anything else becomes a ``String``.
          """
          if literal.startswith(("r:", "re:", "regex:", "regexp:")):
              return Regex(literal.split(":", 1)[1])
          return String(literal)

      def _parse_language(self, term):
          """Parse part of a query into a language node and return it.

          A regex term matches (case-insensitively) against every name in
          ``LANGS`` and yields an OR-chain of all matching languages. A plain
          string is resolved by exact case-insensitive match first, then by
          prefix match.

          :raises: :py:class:`.QueryParseException` if no language matches or
              the regex is not valid.
          """
          # Function-scope import: only this method needs the exception type.
          from re import error as regex_error

          term = self._parse_literal(term)
          if isinstance(term, Regex):
              try:
                  langs = [i for i, lang in enumerate(LANGS)
                           if search(term.regex, lang, IGNORECASE)]
              except regex_error as exc:
                  # The regex comes straight from the user; report a bad
                  # pattern as a query error instead of letting it escape.
                  err = 'Bad language regex: "%s" (%s)' % (term.regex, exc)
                  raise QueryParseException(err)
              if not langs:
                  err = 'No languages found for regex: "%s"' % term.regex
                  raise QueryParseException(err)
              node = Language(langs.pop())
              while langs:
                  node = BinaryOp(Language(langs.pop()), BinaryOp.OR, node)
              return node

          needle = term.string.lower()
          # Exact matches take priority over prefix matches.
          for i, lang in enumerate(LANGS):
              if lang.lower() == needle:
                  return Language(i)
          for i, lang in enumerate(LANGS):
              if lang.lower().startswith(needle):
                  return Language(i)
          err = 'No languages found for string: "%s"' % term.string
          raise QueryParseException(err)

      def _parse_author(self, term):
          """Parse part of a query into an author node and return it."""
          return Author(self._parse_literal(term))

      def _parse_date(self, term, type_):
          """Parse part of a query into a date node and return it.

          :param term: Text of the form ``<relation>:<date>``, where the
              relation is ``before``/``b`` or ``after``/``a`` (any case).
          :param type_: The date type constant passed through to ``Date``
              (``Date.MODIFY`` or ``Date.CREATE`` from the callers here).
          :raises: :py:class:`.QueryParseException` if the relation is missing
              or unknown, or the date cannot be parsed.
          """
          if ":" not in term:
              err = "A date relationship is required " \
                    '("before:<date>" or "after:<date>"): "%s"'
              raise QueryParseException(err % term)
          relstr, dtstr = term.split(":", 1)
          relname = relstr.lower()
          if relname in ("before", "b"):
              relation = Date.BEFORE
          elif relname in ("after", "a"):
              relation = Date.AFTER
          else:
              err = 'Bad date relationship (should be "before" or "after"): "%s"'
              raise QueryParseException(err % relstr)
          try:
              dt = parse_date(dtstr)
          except (TypeError, ValueError, OverflowError):
              # dateutil documents OverflowError for dates that exceed the
              # platform's integer range; treat it like any other bad date.
              raise QueryParseException('Bad date/time string: "%s"' % dtstr)
          return Date(type_, relation, dt)

      def _parse_modified(self, term):
          """Parse part of a query into a date modified node and return it."""
          return self._parse_date(term, Date.MODIFY)

      def _parse_created(self, term):
          """Parse part of a query into a date created node and return it."""
          return self._parse_date(term, Date.CREATE)

      def _parse_symbol(self, term):
          """Parse part of a query into a symbol node and return it."""
          return Symbol(Symbol.ALL, self._parse_literal(term))

      def _parse_function(self, term):
          """Parse part of a query into a function node and return it."""
          return Symbol(Symbol.FUNCTION, self._parse_literal(term))

      def _parse_class(self, term):
          """Parse part of a query into a class node and return it."""
          return Symbol(Symbol.CLASS, self._parse_literal(term))

      def _parse_variable(self, term):
          """Parse part of a query into a variable node and return it."""
          return Symbol(Symbol.VARIABLE, self._parse_literal(term))

      def _parse_term(self, term):
          """Parse a query term into a tree node and return it.

          ``prefix:arg`` terms with a known prefix are dispatched to the
          matching handler; ``not:prefix:arg`` wraps the handler's result in a
          NOT node. Terms with no colon, a leading colon, or an unknown prefix
          are treated as plain text.

          :raises: :py:class:`.QueryParseException` if the term cannot be
              unescaped or a prefix is given without an argument.
          """
          try:
              term = term.decode("unicode_escape")
          except UnicodeError:
              # UnicodeError rather than UnicodeDecodeError: on Python 2,
              # decoding a unicode object first encodes it as ASCII, so a
              # non-ASCII term raises UnicodeEncodeError instead.
              raise QueryParseException('Invalid query term: "%s"' % term)

          if ":" in term and term[0] != ":":
              prefix, _, arg = term.partition(":")
              invert = prefix.lower() == "not"
              if invert:
                  # partition() tolerates a missing inner colon ("not:foo"),
                  # leaving arg empty so the check below reports it cleanly.
                  prefix, _, arg = arg.partition(":")
              if not arg:
                  raise QueryParseException('Incomplete query term: "%s"' % term)
              prefix = prefix.lower()
              for meth, prefixes in self._prefixes.items():
                  if prefix in prefixes:
                      node = meth(arg)
                      return UnaryOp(UnaryOp.NOT, node) if invert else node
          return Text(self._parse_literal(term))

      def _scan_query(self, query, markers):
          """Scan a query (sub)string for the first occurrence of some markers.

          Occurrences preceded by a single backslash are treated as escaped
          and skipped.

          :param query: The string to scan.
          :param markers: An iterable of single-character markers.
          :return: A 2-tuple of ``(first_marker_found, marker_index)``, or
              ``(None, sys.maxsize)`` if no unescaped marker is present.
          """
          def is_escaped(index):
              """Return whether the marker at *index* is backslash-escaped."""
              return (index > 0 and query[index - 1] == "\\" and
                      (index < 2 or query[index - 2] != "\\"))

          best_marker, best_index = None, maxsize
          for marker in markers:
              index = query.find(marker)
              # Step past escaped occurrences; find() returns -1 when the
              # marker is exhausted, which is_escaped() reports as False.
              while is_escaped(index):
                  index = query.find(marker, index + 1)
              if 0 <= index < best_index:
                  best_marker, best_index = marker, index
          return best_marker, best_index

      def _split_query(self, query, parens=False):
          """Split a query string into a nested list of query terms.

          Terms are separated by spaces; single- or double-quoted sections
          become a single term; parenthesized sections become nested sublists.
          An unmatched ``)`` outside of parentheses is dropped.

          :param query: The (remaining) query text to split.
          :param parens: True while splitting the inside of a parenthesized
              group. In that mode, the text following the closing ``)`` is
              returned as a trailing 1-tuple for the caller to continue with.
          :return: A list of terms and/or nested sublists of terms. Each term
              and sublist is guaranteed to be non-empty.
          """
          query = query.lstrip()
          if not query:
              return []
          marker, index = self._scan_query(query, " \"'()")
          if not marker:
              return [query]

          nest = [query[:index]] if index > 0 else []
          after = query[index + 1:]

          if marker == " ":
              nest += self._split_query(after, parens)
          elif marker in ('"', "'"):
              close_marker, close_index = self._scan_query(after, marker)
              if close_marker:
                  if close_index > 0:
                      nest.append(after[:close_index])
                  after = after[close_index + 1:]
                  nest += self._split_query(after, parens)
              elif after:
                  # Unterminated quote: the rest of the query is one term.
                  nest.append(after)
          elif marker == "(":
              inner, after = self._split_query(after, True), []
              if inner and isinstance(inner[-1], tuple):
                  # The trailing tuple carries the text after the matching
                  # ")"; keep splitting it at the current nesting level.
                  after = self._split_query(inner.pop()[0], parens)
              if inner:
                  nest.append(inner)
              if after:
                  nest += after
          elif marker == ")":
              if parens:
                  nest.append((after,))
              else:
                  nest += self._split_query(after)
          return nest

      def _parse_boolean_operators(self, nest):
          """Parse boolean operators in a nested query list.

          Replaces, in place and recursively, every term equal to ``and``,
          ``or`` or ``not`` (in any case) with the matching operator constant.
          """
          op_lookup = {
              "and": BinaryOp.AND,
              "or": BinaryOp.OR,
              "not": UnaryOp.NOT
          }
          for i, term in enumerate(nest):
              if isinstance(term, list):
                  self._parse_boolean_operators(term)
              else:
                  nest[i] = op_lookup.get(term.lower(), term)

      def _parse_nest(self, nest):
          """Recursively parse a nested list of search query terms.

          Operators are applied in the order OR, AND, NOT, splitting at the
          first occurrence of each. Adjacent terms with no operator between
          them are joined with an implicit AND.

          :raises: :py:class:`.QueryParseException` if the nest is empty or an
              operator is missing an argument.
          """
          def parse_binary_op(op):
              """Parse a binary operator in a nested query list."""
              index = nest.index(op)
              if index == 0 or index == len(nest) - 1:
                  err = "Invalid query: '%s' given without argument."
                  raise QueryParseException(err % BinaryOp.OPS[op])
              left = self._parse_nest(nest[:index])
              right = self._parse_nest(nest[index + 1:])
              return BinaryOp(left, op, right)

          if not nest:
              err = "Error while parsing query: empty nest detected."
              raise QueryParseException(err)
          if BinaryOp.OR in nest:
              return parse_binary_op(BinaryOp.OR)
          if BinaryOp.AND in nest:
              return parse_binary_op(BinaryOp.AND)
          if UnaryOp.NOT in nest:
              index = nest.index(UnaryOp.NOT)
              if index == len(nest) - 1:
                  err = "Invalid query: '%s' given without argument."
                  # Look the name up by the NOT constant; there is no local
                  # "op" variable in this scope.
                  raise QueryParseException(err % UnaryOp.OPS[UnaryOp.NOT])
              right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:]))
              if index > 0:
                  left = self._parse_nest(nest[:index])
                  return BinaryOp(left, BinaryOp.AND, right)
              return right
          if len(nest) > 1:
              # The first element may itself be a parenthesized sublist, so
              # parse it as a one-element nest rather than as a bare term.
              left = self._parse_nest(nest[:1])
              right = self._parse_nest(nest[1:])
              return BinaryOp(left, BinaryOp.AND, right)
          if isinstance(nest[0], list):
              return self._parse_nest(nest[0])
          return self._parse_term(nest[0])

      def _balance_tree(self, node):
          """Auto-balance a tree using a string sorting function.

          Recursively reorders the children of every binary node, in place, so
          that the child with the smaller ``sortkey()`` is on the left.
          """
          if isinstance(node, BinaryOp):
              self._balance_tree(node.left)
              self._balance_tree(node.right)
              if node.right.sortkey() < node.left.sortkey():
                  node.left, node.right = node.right, node.left
          elif isinstance(node, UnaryOp):
              self._balance_tree(node.node)

      def parse(self, query):
          """
          Parse a search query.

          The result is normalized with a sorting function so that
          ``"foo OR bar"`` and ``"bar OR foo"`` result in the same tree. This is
          important for caching purposes.

          :param query: The query to be converted.
          :type query: str

          :return: A tree storing the data in the query.
          :rtype: :py:class:`~.query.tree.Tree`

          :raises: :py:class:`.QueryParseException`
          """
          nest = self._split_query(query.rstrip())
          if not nest:
              raise QueryParseException('Empty query: "%s"' % query)
          self._parse_boolean_operators(nest)
          root = self._parse_nest(nest)
          self._balance_tree(root)
          return Tree(root)
  # Module-level entry point: the bound ``parse`` method of a single shared
  # _QueryParser instance created at import time.
  parse_query = _QueryParser().parse