A semantic search engine for source code https://bitshift.benkurtovic.com/
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

321 lines
13 KiB

  1. """
  2. This subpackage contains code to parse search queries received from the
  3. frontend into trees that can be used by the database backend.
  4. """
  5. from __future__ import unicode_literals
  6. from re import IGNORECASE, search
  7. from sys import maxsize
  8. from dateutil.parser import parse as parse_date
  9. from .nodes import (String, Regex, Text, Language, Author, Date, Symbol,
  10. BinaryOp, UnaryOp)
  11. from .tree import Tree
  12. from ..languages import LANGS
  13. __all__ = ["QueryParseException", "parse_query"]
  14. class QueryParseException(Exception):
  15. """Raised by parse_query() when a query is invalid."""
  16. pass
  17. class _QueryParser(object):
  18. """Wrapper class with methods to parse queries. Used as a singleton."""
  19. def __init__(self):
  20. self._prefixes = {
  21. self._parse_language: ["l", "lang", "language"],
  22. self._parse_author: ["a", "author"],
  23. self._parse_modified: ["m", "mod", "modified", "modify"],
  24. self._parse_created: ["cr", "create", "created"],
  25. self._parse_symbol: ["s", "sym", "symb", "symbol"],
  26. self._parse_function: ["f", "fn", "fun", "func", "function",
  27. "meth", "method"],
  28. self._parse_class: ["cl", "class", "clss"],
  29. self._parse_variable: ["v", "var", "variable"],
  30. self._parse_namespace: ["n", "ns", "namespace", "module"],
  31. self._parse_interface: ["in", "inter", "interface", "implements"],
  32. self._parse_import: ["im", "imp", "import", "include", "require",
  33. "imports", "requires"]
  34. }
  35. def _scan_query(self, query, markers):
  36. """Scan a query (sub)string for the first occurance of some markers.
  37. Returns a 2-tuple of (first_marker_found, marker_index).
  38. """
  39. def is_escaped(query, index):
  40. """Return whether a query marker is backslash-escaped."""
  41. return (index > 0 and query[index - 1] == "\\" and
  42. (index < 2 or query[index - 2] != "\\"))
  43. best_marker, best_index = None, maxsize
  44. for marker in markers:
  45. index = query.find(marker)
  46. if is_escaped(query, index):
  47. _, new_index = self._scan_query(query[index + 1:], marker)
  48. index += new_index + 1
  49. if index >= 0 and index < best_index:
  50. best_marker, best_index = marker, index
  51. return best_marker, best_index
  52. def _split_query(self, query, markers, parens=False):
  53. """Split a query string into a nested list of query terms.
  54. Returns a list of terms and/or nested sublists of terms. Each term and
  55. sublist is guarenteed to be non-empty.
  56. """
  57. query = query.lstrip()
  58. if not query:
  59. return []
  60. marker, index = self._scan_query(query, markers)
  61. if not marker:
  62. return [query]
  63. nest = [query[:index]] if index > 0 else []
  64. after = query[index + 1:]
  65. if marker == " ":
  66. nest += self._split_query(after, markers, parens)
  67. elif marker in ('"', "'"):
  68. close_marker, close_index = self._scan_query(after, marker)
  69. if close_marker:
  70. if close_index > 0:
  71. nest.append(after[:close_index])
  72. after = after[close_index + 1:]
  73. nest += self._split_query(after, markers, parens)
  74. elif after:
  75. nest.append(after)
  76. elif marker == "(":
  77. inner, after = self._split_query(after, markers, True), []
  78. if inner and isinstance(inner[-1], tuple):
  79. after = self._split_query(inner.pop()[0], markers, parens)
  80. if inner:
  81. nest.append(inner)
  82. if after:
  83. nest += after
  84. elif marker == ")":
  85. if parens:
  86. nest.append((after,))
  87. else:
  88. nest += self._split_query(after, markers)
  89. return nest
  90. def _parse_literal(self, literal):
  91. """Parse part of a search query into a string or regular expression."""
  92. if literal.startswith(("r:", "re:", "regex:", "regexp:")):
  93. arg = literal.split(":", 1)[1]
  94. if not arg:
  95. err = 'Incomplete query term: "%s"' % literal
  96. raise QueryParseException(err)
  97. return Regex(arg)
  98. return String(literal)
  99. def _parse_language(self, term):
  100. """Parse part of a query into a language node and return it."""
  101. term = self._parse_literal(term)
  102. if isinstance(term, Regex):
  103. langs = [i for i, lang in enumerate(LANGS)
  104. if search(term.regex, lang, IGNORECASE)]
  105. if not langs:
  106. err = 'No languages found for regex: "%s"' % term.regex
  107. raise QueryParseException(err)
  108. node = Language(langs.pop())
  109. while langs:
  110. node = BinaryOp(Language(langs.pop()), BinaryOp.OR, node)
  111. return node
  112. needle = term.string.lower()
  113. for i, lang in enumerate(LANGS):
  114. if lang.lower() == needle:
  115. return Language(i)
  116. for i, lang in enumerate(LANGS):
  117. if lang.lower().startswith(needle):
  118. return Language(i)
  119. err = 'No languages found for string: "%s"' % term.string
  120. raise QueryParseException(err)
  121. def _parse_author(self, term):
  122. """Parse part of a query into an author node and return it."""
  123. return Author(self._parse_literal(term))
  124. def _parse_date(self, term, type_):
  125. """Parse part of a query into a date node and return it."""
  126. if ":" not in term:
  127. err = "A date relationship is required " \
  128. '("before:<date>" or "after:<date>"): "%s"'
  129. raise QueryParseException(err % term)
  130. relstr, dtstr = term.split(":", 1)
  131. if relstr.lower() in ("before", "b"):
  132. relation = Date.BEFORE
  133. elif relstr.lower() in ("after", "a"):
  134. relation = Date.AFTER
  135. else:
  136. err = 'Bad date relationship (should be "before" or "after"): "%s"'
  137. raise QueryParseException(err % relstr)
  138. try:
  139. dt = parse_date(dtstr)
  140. except (TypeError, ValueError):
  141. raise QueryParseException('Bad date/time string: "%s"' % dtstr)
  142. return Date(type_, relation, dt)
  143. def _parse_modified(self, term):
  144. """Parse part of a query into a date modified node and return it."""
  145. return self._parse_date(term, Date.MODIFY)
  146. def _parse_created(self, term):
  147. """Parse part of a query into a date created node and return it."""
  148. return self._parse_date(term, Date.CREATE)
  149. def _parse_symbol(self, term, stype=Symbol.ALL):
  150. """Parse part of a query into a symbol node and return it."""
  151. defines = ("a:", "assign:", "assignment:", "d:", "def:", "definition:",
  152. "decl:", "declare:", "declaration:")
  153. uses = ("u:", "use:", "c:", "call:")
  154. if term.startswith(defines) or term.startswith(uses):
  155. context = Symbol.DEFINE if term.startswith(defines) else Symbol.USE
  156. term_part = term.split(":", 1)[1]
  157. if not term_part:
  158. raise QueryParseException('Incomplete query term: "%s"' % term)
  159. term = term_part
  160. else:
  161. context = Symbol.ALL
  162. literal = self._parse_literal(term)
  163. if isinstance(literal, String):
  164. make_symbol = lambda lit: Symbol(context, stype, String(lit))
  165. symbols = self._split_query(literal.string, " \"'")
  166. node = make_symbol(symbols.pop())
  167. while symbols:
  168. node = BinaryOp(make_symbol(symbols.pop()), BinaryOp.OR, node)
  169. return node
  170. return Symbol(context, stype, literal)
  171. def _parse_function(self, term):
  172. """Parse part of a query into a function node and return it."""
  173. return self._parse_symbol(term, Symbol.FUNCTION)
  174. def _parse_class(self, term):
  175. """Parse part of a query into a class node and return it."""
  176. return self._parse_symbol(term, Symbol.CLASS)
  177. def _parse_variable(self, term):
  178. """Parse part of a query into a variable node and return it."""
  179. return self._parse_symbol(term, Symbol.VARIABLE)
  180. def _parse_namespace(self, term):
  181. """Parse part of a query into a namespace node and return it."""
  182. return self._parse_symbol(term, Symbol.NAMESPACE)
  183. def _parse_interface(self, term):
  184. """Parse part of a query into a interface node and return it."""
  185. return self._parse_symbol(term, Symbol.INTERFACE)
  186. def _parse_import(self, term):
  187. """Parse part of a query into a import node and return it."""
  188. return self._parse_symbol(term, Symbol.IMPORT)
  189. def _parse_term(self, term):
  190. """Parse a query term into a tree node and return it."""
  191. term = term.replace('\\"', '"').replace("\\\\", "\\")
  192. if ":" in term and not term[0] == ":":
  193. prefix, arg = term.split(":", 1)
  194. invert = prefix.lower() == "not"
  195. if invert:
  196. prefix, arg = arg.split(":", 1)
  197. if not arg:
  198. raise QueryParseException('Incomplete query term: "%s"' % term)
  199. for meth, prefixes in self._prefixes.iteritems():
  200. if prefix.lower() in prefixes:
  201. if invert:
  202. return UnaryOp(UnaryOp.NOT, meth(arg))
  203. return meth(arg)
  204. return Text(self._parse_literal(term))
  205. def _parse_boolean_operators(self, nest):
  206. """Parse boolean operators in a nested query list."""
  207. op_lookup = {
  208. "and": BinaryOp.AND,
  209. "or": BinaryOp.OR,
  210. "not": UnaryOp.NOT
  211. }
  212. for i, term in enumerate(nest):
  213. if isinstance(term, list):
  214. self._parse_boolean_operators(term)
  215. else:
  216. nest[i] = op_lookup.get(term.lower(), term)
  217. def _parse_nest(self, nest):
  218. """Recursively parse a nested list of search query terms."""
  219. def parse_binary_op(op):
  220. """Parse a binary operator in a nested query list."""
  221. index = nest.index(op)
  222. if index == 0 or index == len(nest) - 1:
  223. err = "Invalid query: '%s' given without argument."
  224. raise QueryParseException(err % BinaryOp.OPS[op])
  225. left = self._parse_nest(nest[:index])
  226. right = self._parse_nest(nest[index + 1:])
  227. return BinaryOp(left, op, right)
  228. if not nest:
  229. err = "Error while parsing query: empty nest detected."
  230. raise QueryParseException(err)
  231. elif BinaryOp.OR in nest:
  232. return parse_binary_op(BinaryOp.OR)
  233. elif BinaryOp.AND in nest:
  234. return parse_binary_op(BinaryOp.AND)
  235. elif UnaryOp.NOT in nest:
  236. index = nest.index(UnaryOp.NOT)
  237. if index == len(nest) - 1:
  238. err = "Invalid query: '%s' given without argument."
  239. raise QueryParseException(err % UnaryOp.OPS[UnaryOp.NOT])
  240. right = UnaryOp(UnaryOp.NOT, self._parse_nest(nest[index + 1:]))
  241. if index > 0:
  242. left = self._parse_nest(nest[:index])
  243. return BinaryOp(left, BinaryOp.AND, right)
  244. return right
  245. elif len(nest) > 1:
  246. left, right = self._parse_term(nest[0]), self._parse_nest(nest[1:])
  247. return BinaryOp(left, BinaryOp.AND, right)
  248. elif isinstance(nest[0], list):
  249. return self._parse_nest(nest[0])
  250. else:
  251. return self._parse_term(nest[0])
  252. def _balance_tree(self, node):
  253. """Auto-balance a tree using a string sorting function."""
  254. if isinstance(node, BinaryOp):
  255. self._balance_tree(node.left)
  256. self._balance_tree(node.right)
  257. if node.right.sortkey() < node.left.sortkey():
  258. node.left, node.right = node.right, node.left
  259. elif isinstance(node, UnaryOp):
  260. self._balance_tree(node.node)
  261. def parse(self, query):
  262. """
  263. Parse a search query.
  264. The result is normalized with a sorting function so that
  265. ``"foo OR bar"`` and ``"bar OR foo"`` result in the same tree. This is
  266. important for caching purposes.
  267. :param query: The query be converted.
  268. :type query: str
  269. :return: A tree storing the data in the query.
  270. :rtype: :py:class:`~.query.tree.Tree`
  271. :raises: :py:class:`.QueryParseException`
  272. """
  273. nest = self._split_query(query.rstrip(), " \"'()")
  274. if not nest:
  275. raise QueryParseException('Empty query: "%s"' % query)
  276. self._parse_boolean_operators(nest)
  277. root = self._parse_nest(nest)
  278. self._balance_tree(root)
  279. return Tree(root)
# Public entry point: the bound parse() method of a single _QueryParser
# instance that is created once, when this module is imported.
parse_query = _QueryParser().parse