A semantic search engine for source code https://bitshift.benkurtovic.com/
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

296 lines
11 KiB

  1. """
  2. This subpackage contains code to parse search queries received from the
  3. frontend into trees that can be used by the database backend.
  4. """
  5. from __future__ import unicode_literals
  6. from re import IGNORECASE, search
  7. from sys import maxsize
  8. from dateutil.parser import parse as parse_date
  9. from .nodes import (String, Regex, Text, Language, Author, Date, Symbol,
  10. BinaryOp, UnaryOp)
  11. from .tree import Tree
  12. from ..languages import LANGS
  13. __all__ = ["QueryParseException", "parse_query"]
  14. class QueryParseException(Exception):
  15. """Raised by parse_query() when a query is invalid."""
  16. pass
  17. class _QueryParser(object):
  18. """Wrapper class with methods to parse queries. Used as a singleton."""
  19. def __init__(self):
  20. self._prefixes = {
  21. self._parse_language: ["l", "lang", "language"],
  22. self._parse_author: ["a", "author"],
  23. self._parse_modified: ["m", "mod", "modified", "modify"],
  24. self._parse_created: ["cr", "create", "created"],
  25. self._parse_symbol: ["s", "sym", "symb", "symbol"],
  26. self._parse_function: ["f", "fn", "fun", "func", "function"],
  27. self._parse_class: ["cl", "class", "clss"],
  28. self._parse_variable: ["v", "var", "variable"]
  29. }
  30. def _scan_query(self, query, markers):
  31. """Scan a query (sub)string for the first occurance of some markers.
  32. Returns a 2-tuple of (first_marker_found, marker_index).
  33. """
  34. def is_escaped(query, index):
  35. """Return whether a query marker is backslash-escaped."""
  36. return (index > 0 and query[index - 1] == "\\" and
  37. (index < 2 or query[index - 2] != "\\"))
  38. best_marker, best_index = None, maxsize
  39. for marker in markers:
  40. index = query.find(marker)
  41. if is_escaped(query, index):
  42. _, new_index = self._scan_query(query[index + 1:], marker)
  43. index += new_index + 1
  44. if index >= 0 and index < best_index:
  45. best_marker, best_index = marker, index
  46. return best_marker, best_index
  47. def _split_query(self, query, markers, parens=False):
  48. """Split a query string into a nested list of query terms.
  49. Returns a list of terms and/or nested sublists of terms. Each term and
  50. sublist is guarenteed to be non-empty.
  51. """
  52. query = query.lstrip()
  53. if not query:
  54. return []
  55. marker, index = self._scan_query(query, markers)
  56. if not marker:
  57. return [query]
  58. nest = [query[:index]] if index > 0 else []
  59. after = query[index + 1:]
  60. if marker == " ":
  61. nest += self._split_query(after, markers, parens)
  62. elif marker in ('"', "'"):
  63. close_marker, close_index = self._scan_query(after, marker)
  64. if close_marker:
  65. if close_index > 0:
  66. nest.append(after[:close_index])
  67. after = after[close_index + 1:]
  68. nest += self._split_query(after, markers, parens)
  69. elif after:
  70. nest.append(after)
  71. elif marker == "(":
  72. inner, after = self._split_query(after, markers, True), []
  73. if inner and isinstance(inner[-1], tuple):
  74. after = self._split_query(inner.pop()[0], markers, parens)
  75. if inner:
  76. nest.append(inner)
  77. if after:
  78. nest += after
  79. elif marker == ")":
  80. if parens:
  81. nest.append((after,))
  82. else:
  83. nest += self._split_query(after, markers)
  84. return nest
  85. def _parse_literal(self, literal):
  86. """Parse part of a search query into a string or regular expression."""
  87. if literal.startswith(("r:", "re:", "regex:", "regexp:")):
  88. arg = literal.split(":", 1)[1]
  89. if not arg:
  90. err = 'Incomplete query term: "%s"' % literal
  91. raise QueryParseException(err)
  92. return Regex(arg)
  93. return String(literal)
  94. def _parse_language(self, term):
  95. """Parse part of a query into a language node and return it."""
  96. term = self._parse_literal(term)
  97. if isinstance(term, Regex):
  98. langs = [i for i, lang in enumerate(LANGS)
  99. if search(term.regex, lang, IGNORECASE)]
  100. if not langs:
  101. err = 'No languages found for regex: "%s"' % term.regex
  102. raise QueryParseException(err)
  103. node = Language(langs.pop())
  104. while langs:
  105. node = BinaryOp(Language(langs.pop()), BinaryOp.OR, node)
  106. return node
  107. needle = term.string.lower()
  108. for i, lang in enumerate(LANGS):
  109. if lang.lower() == needle:
  110. return Language(i)
  111. for i, lang in enumerate(LANGS):
  112. if lang.lower().startswith(needle):
  113. return Language(i)
  114. err = 'No languages found for string: "%s"' % term.string
  115. raise QueryParseException(err)
  116. def _parse_author(self, term):
  117. """Parse part of a query into an author node and return it."""
  118. return Author(self._parse_literal(term))
  119. def _parse_date(self, term, type_):
  120. """Parse part of a query into a date node and return it."""
  121. if ":" not in term:
  122. err = "A date relationship is required " \
  123. '("before:<date>" or "after:<date>"): "%s"'
  124. raise QueryParseException(err % term)
  125. relstr, dtstr = term.split(":", 1)
  126. if relstr.lower() in ("before", "b"):
  127. relation = Date.BEFORE
  128. elif relstr.lower() in ("after", "a"):
  129. relation = Date.AFTER
  130. else:
  131. err = 'Bad date relationship (should be "before" or "after"): "%s"'
  132. raise QueryParseException(err % relstr)
  133. try:
  134. dt = parse_date(dtstr)
  135. except (TypeError, ValueError):
  136. raise QueryParseException('Bad date/time string: "%s"' % dtstr)
  137. return Date(type_, relation, dt)
  138. def _parse_modified(self, term):
  139. """Parse part of a query into a date modified node and return it."""
  140. return self._parse_date(term, Date.MODIFY)
  141. def _parse_created(self, term):
  142. """Parse part of a query into a date created node and return it."""
  143. return self._parse_date(term, Date.CREATE)
  144. def _parse_symbol(self, term, stype=Symbol.ALL):
  145. """Parse part of a query into a symbol node and return it."""
  146. literal = self._parse_literal(term)
  147. if isinstance(literal, String):
  148. make_symbol = lambda lit: Symbol(stype, String(lit))
  149. symbols = self._split_query(literal.string, " \"'")
  150. node = make_symbol(symbols.pop())
  151. while symbols:
  152. node = BinaryOp(make_symbol(symbols.pop()), BinaryOp.OR, node)
  153. return node
  154. return Symbol(stype, literal)
  155. def _parse_function(self, term):
  156. """Parse part of a query into a function node and return it."""
  157. return self._parse_symbol(term, Symbol.FUNCTION)
  158. def _parse_class(self, term):
  159. """Parse part of a query into a class node and return it."""
  160. return self._parse_symbol(term, Symbol.CLASS)
  161. def _parse_variable(self, term):
  162. """Parse part of a query into a variable node and return it."""
  163. return self._parse_symbol(term, Symbol.VARIABLE)
  164. def _parse_term(self, term):
  165. """Parse a query term into a tree node and return it."""
  166. try:
  167. term = term.decode("unicode_escape")
  168. except UnicodeDecodeError:
  169. raise QueryParseException('Invalid query term: "%s"' % term)
  170. if ":" in term and not term[0] == ":":
  171. prefix, arg = term.split(":", 1)
  172. invert = prefix.lower() == "not"
  173. if invert:
  174. prefix, arg = arg.split(":", 1)
  175. if not arg:
  176. raise QueryParseException('Incomplete query term: "%s"' % term)
  177. for meth, prefixes in self._prefixes.iteritems():
  178. if prefix.lower() in prefixes:
  179. if invert:
  180. return UnaryOp(UnaryOp.NOT, meth(arg))
  181. return meth(arg)
  182. return Text(self._parse_literal(term))
  183. def _parse_boolean_operators(self, nest):
  184. """Parse boolean operators in a nested query list."""
  185. op_lookup = {
  186. "and": BinaryOp.AND,
  187. "or": BinaryOp.OR,
  188. "not": UnaryOp.NOT
  189. }
  190. for i, term in enumerate(nest):
  191. if isinstance(term, list):
  192. self._parse_boolean_operators(term)
  193. else:
  194. nest[i] = op_lookup.get(term.lower(), term)
  195. def _parse_nest(self, nest):
  196. """Recursively parse a nested list of search query terms."""
  197. def parse_binary_op(op):
  198. """Parse a binary operator in a nested query list."""
  199. index = nest.index(op)
  200. if index == 0 or index == len(nest) - 1:
  201. err = "Invalid query: '%s' given without argument."
  202. raise QueryParseException(err % BinaryOp.OPS[op])
  203. left = self._parse_nest(nest[:index])
  204. right = self._parse_nest(nest[index + 1:])
  205. return BinaryOp(left, op, right)
  206. if not nest:
  207. err = "Error while parsing query: empty nest detected."
  208. raise QueryParseException(err)
  209. elif BinaryOp.OR in nest:
  210. return parse_binary_op(BinaryOp.OR)
  211. elif BinaryOp.AND in nest:
  212. return parse_binary_op(BinaryOp.AND)
  213. elif UnaryOp.NOT in nest:
  214. index = nest.index(UnaryOp.NOT)
  215. if index == len(nest) - 1:
  216. err = "Invalid query: '%s' given without argument."
  217. raise QueryParseException(err % UnaryOp.OPS[UnaryOp.NOT])
  218. right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:]))
  219. if index > 0:
  220. left = self._parse_nest(nest[:index])
  221. return BinaryOp(left, BinaryOp.AND, right)
  222. return right
  223. elif len(nest) > 1:
  224. left, right = self._parse_term(nest[0]), self._parse_nest(nest[1:])
  225. return BinaryOp(left, BinaryOp.AND, right)
  226. elif isinstance(nest[0], list):
  227. return self._parse_nest(nest[0])
  228. else:
  229. return self._parse_term(nest[0])
  230. def _balance_tree(self, node):
  231. """Auto-balance a tree using a string sorting function."""
  232. if isinstance(node, BinaryOp):
  233. self._balance_tree(node.left)
  234. self._balance_tree(node.right)
  235. if node.right.sortkey() < node.left.sortkey():
  236. node.left, node.right = node.right, node.left
  237. elif isinstance(node, UnaryOp):
  238. self._balance_tree(node.node)
  239. def parse(self, query):
  240. """
  241. Parse a search query.
  242. The result is normalized with a sorting function so that
  243. ``"foo OR bar"`` and ``"bar OR foo"`` result in the same tree. This is
  244. important for caching purposes.
  245. :param query: The query be converted.
  246. :type query: str
  247. :return: A tree storing the data in the query.
  248. :rtype: :py:class:`~.query.tree.Tree`
  249. :raises: :py:class:`.QueryParseException`
  250. """
  251. nest = self._split_query(query.rstrip(), " \"'()")
  252. if not nest:
  253. raise QueryParseException('Empty query: "%s"' % query)
  254. self._parse_boolean_operators(nest)
  255. root = self._parse_nest(nest)
  256. self._balance_tree(root)
  257. return Tree(root)
  258. parse_query = _QueryParser().parse