A semantic search engine for source code https://bitshift.benkurtovic.com/
  1. """
  2. :synopsis: Contains a singleton GitIndexer class, which clones and indexes git
  3. repositories.
  4. """
  5. from datetime import datetime
  6. import logging
  7. import os
  8. import Queue
  9. import shutil
  10. import string
  11. import time
  12. import threading
  13. from bs4 import UnicodeDammit
  14. import git
  15. from ..database import Database
  16. from ..parser import parse, UnsupportedFileError
  17. from ..codelet import Codelet
  18. GIT_CLONE_DIR = "/tmp/bitshift"
  19. THREAD_QUEUE_SLEEP = 0.5
  20. MAX_INDEX_QUEUE_SIZE = 10
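
# An overview of the indexing pipeline implemented below (summarizing the
# class docstrings in this module): crawler.GitHubCrawler pushes GitRepository
# metadata onto a clone queue; the _GitCloner thread clones each repository to
# disk and forwards it on an index queue; the GitIndexer thread turns every
# indexable file into a Codelet and inserts it into the Database singleton.
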
class GitRepository(object):
    """
    A representation of a Git repository's metadata.

    :ivar url: (str) The repository's url.
    :ivar name: (str) The name of the repository.
    :ivar framework_name: (str) The name of the online Git framework that the
        repository belongs to (e.g., GitHub, BitBucket).
    :ivar rank: (float) The rank of the repository, as assigned by
        :class:`crawler.GitHubCrawler`.
    :ivar path: (str) The repository's on-disk directory path.
    :ivar repo: (git.Repo) A git.Repo representation of the repository.
    """

    def __init__(self, url, name, framework_name, rank):
        """
        Create a GitRepository instance.

        :param url: see :attr:`GitRepository.url`
        :param name: see :attr:`GitRepository.name`
        :param framework_name: see :attr:`GitRepository.framework_name`
        :param rank: see :attr:`GitRepository.rank`

        :type url: str
        :type name: str
        :type framework_name: str
        :type rank: float
        """
        self.url = url
        self.name = name
        self.framework_name = framework_name
        self.rank = rank

        dirname = name.replace("/", "-") + "-" + str(int(time.time()))
        self.path = os.path.join(GIT_CLONE_DIR, dirname)
        self.repo = None

class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    :class:`GitIndexer` indexes the repositories cloned by the
    :class:`_GitCloner` singleton.

    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
        :class:`GitRepository` objects for every new repository successfully
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner,
        which feeds :class:`GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, run_event):
        """
        Create an instance of the singleton `GitIndexer`.

        :param clone_queue: see :attr:`_GitCloner.clone_queue`
        :type clone_queue: :class:`Queue.Queue`
        """
        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        self.run_event = run_event
        self.git_cloner = _GitCloner(clone_queue, self.index_queue, run_event)
        self.git_cloner.start()
        self.database = Database()
        self._logger = logging.getLogger("%s.%s" %
                                         (__name__, self.__class__.__name__))
        self._logger.info("Starting.")

        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)

        super(GitIndexer, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly cloned repositories and index them.

        Blocks until new repositories appear in :attr:`self.index_queue`, then
        retrieves one, and attempts indexing it. Should any errors occur, the
        new repository will be discarded and the indexer will index the next in
        the queue.
        """
        while True:
            while self.index_queue.empty() and self.run_event.is_set():
                time.sleep(THREAD_QUEUE_SLEEP)
            if not self.run_event.is_set():
                break
            repo = self.index_queue.get()
            self.index_queue.task_done()
            self._index_repository(repo)

    def _index_repository(self, repo):
        """
        Index a Git repository (create and insert Codelets for its files).

        Call `_insert_repository_codelets()` on the repository cloned at
        **repo.path**, then remove the on-disk clone.

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """
        self._logger.info(u"Indexing repo: %s", repo.name)
        try:
            self._insert_repository_codelets(repo)
        except Exception:
            self._logger.exception("Exception raised while indexing:")
        finally:
            if os.path.isdir(repo.path):
                shutil.rmtree(repo.path)

    def _insert_repository_codelets(self, repo):
        """
        Create and insert a Codelet for each file inside a Git repository.

        Create a new Codelet, and insert it into the Database singleton, for
        every file inside the cloned repository's default branch (usually
        *master*).

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """
        file_meta = self._get_file_metadata(repo)
        if file_meta is None:
            return

        for filename, data in file_meta.iteritems():
            authors = [(author, None) for author in data["authors"]]
            encoded_source = data["blob"].data_stream.read()
            source = UnicodeDammit(encoded_source).unicode_markup
            url = self._generate_file_url(filename, repo)
            codelet = Codelet("%s: %s" % (repo.name, filename), source,
                              filename, None, authors, url,
                              data["time_created"],
                              data["time_last_modified"], repo.rank)
            self._logger.debug("Indexing file: %s", codelet.name)
            try:
                parse(codelet)
            except UnsupportedFileError:
                continue
            self.database.insert(codelet)

    def _generate_file_url(self, filename, repo):
        """
        Return a url for a filename from a Git wrapper framework.

        :param filename: The path of the file.
        :param repo: The git repo.

        :type filename: str
        :type repo: :class:`GitRepository`

        :return: The file's full url on the given framework, if successfully
            derived.
        :rtype: str, or None
        """
        if repo.framework_name == "GitHub":
            default_branch = repo.repo.active_branch.name
            parts = [repo.url, "blob", default_branch, filename]
        elif repo.framework_name == "Bitbucket":
            try:
                commit_hash = repo.repo.head.commit.hexsha
            except ValueError:  # No commits
                return None
            parts = [repo.url, "src", commit_hash, filename]
        else:  # Unrecognized framework; no URL can be derived
            return None
        return "/".join(s.strip("/") for s in parts)
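
    # For illustration (repository and file names below are hypothetical),
    # the join above produces URLs shaped like:
    #   GitHub:    https://github.com/user/repo/blob/master/path/to/file.py
    #   Bitbucket: https://bitbucket.org/user/repo/src/<commit sha>/path/to/file.py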

    def _walk_history(self, files, head):
        """Walk a repository's history for metadata."""
        def update_entry(commit, entry, new_file):
            if commit.author.name not in entry["authors"]:
                entry["authors"].append(commit.author.name)
            commit_ts = datetime.utcfromtimestamp(commit.committed_date)
            if commit_ts > entry["time_last_modified"]:
                entry["time_last_modified"] = commit_ts
            if new_file:
                entry["time_created"] = commit_ts

        def get_diffs(commit, parent):
            cache_key = parent.binsha + commit.binsha
            if cache_key in diff_cache:
                return diff_cache[cache_key]
            diffs = parent.diff(commit, create_patch=True)
            for diff in diffs:
                del diff.diff
            diff_cache[cache_key] = diffs
            return diffs

        def handle_commit(commit, paths):
            if not commit.parents:
                for item in commit.tree.traverse():
                    if item.type == "blob" and item.path in paths:
                        update_entry(commit, files[paths[item.path]], True)
                return
            for parent in commit.parents:
                for diff in get_diffs(commit, parent):
                    if not diff.b_blob:  # Happens when file modes are changed
                        continue
                    pth = diff.rename_to if diff.renamed else diff.b_blob.path
                    if pth not in paths:
                        continue
                    update_entry(commit, files[paths[pth]], diff.new_file)
                    if diff.renamed:
                        paths[diff.rename_from] = paths[pth]
                        del paths[pth]
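
        # Worklist traversal from HEAD toward the root commits: `paths` maps a
        # file's name at the commit being visited back to its name at HEAD, so
        # renames can be followed through history; `processed` records which
        # path mappings each parent commit has already been queued with, so a
        # commit reachable through several branches is not walked twice with
        # the same mapping; `diff_cache` avoids recomputing the same
        # commit/parent diff when a commit is visited more than once.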
        pending = [(head, {path: path for path in files})]
        diff_cache = {}
        processed = {}
        while pending:
            commit, paths = pending.pop()
            handle_commit(commit, paths)
            hash_key = hash(frozenset(paths.items()))
            for parent in commit.parents:
                new_paths = paths.copy() if len(commit.parents) > 1 else paths
                if parent.binsha in processed:
                    if hash_key not in processed[parent.binsha]:
                        pending.append((parent, new_paths))
                        processed[parent.binsha].append(hash_key)
                else:
                    pending.append((parent, new_paths))
                    processed[parent.binsha] = [hash_key]

    def _get_file_metadata(self, repo):
        """
        Return a dictionary containing every valuable tracked file's metadata.

        :return: A dictionary with author names, time of creation, and time of
            last modification for every filename key.

            .. code-block:: python

                sample_returned_dict = {
                    "my_file": {
                        "blob": (GitPython Blob) <object>,
                        "authors": (str list) ["author1", "author2"],
                        "time_created": (`datetime.datetime`) <object>,
                        "time_last_modified": (`datetime.datetime`) <object>
                    }
                }

        :rtype: dictionary of dictionaries
        """
        try:
            tree = repo.repo.head.commit.tree
        except ValueError:  # No commits
            return {}

        files = {}
        for item in tree.traverse():
            if item.type == "blob" and self._is_ascii(item.data_stream):
                files[item.path] = {
                    "blob": item,
                    "authors": [],
                    "time_last_modified": datetime.utcfromtimestamp(0),
                    "time_created": datetime.utcfromtimestamp(0)
                }

        self._logger.debug("Building file metadata")
        self._walk_history(files, repo.repo.head.commit)
        return files

    def _is_ascii(self, source):
        """
        Heuristically determine whether a file is ASCII text or binary.

        If a portion of the file contains null bytes, or the percentage of
        bytes that aren't ASCII is greater than 30%, then the file is
        concluded to be binary. This heuristic is used by the `file` utility,
        Perl's inbuilt `-T` operator, and is the de-facto method for
        determining whether a file is ASCII.

        :param source: The file object to test.

        :type source: `file`

        :return: Whether the file is probably ASCII.
        :rtype: Boolean
        """
        file_snippet = source.read(512)

        if not file_snippet:
            return True

        ascii_characters = "".join(map(chr, range(32, 127)) +
                                   list("\n\r\t\b"))
        null_trans = string.maketrans("", "")

        if "\0" in file_snippet:
            return False

        # Deleting every printable ASCII byte leaves only the non-ASCII ones.
        non_ascii = file_snippet.translate(null_trans, ascii_characters)
        return not float(len(non_ascii)) / len(file_snippet) > 0.30

class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, index_queue, run_event):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        :type index_queue: see :attr:`self.index_queue`
        """
        self.clone_queue = clone_queue
        self.index_queue = index_queue
        self.run_event = run_event
        self._logger = logging.getLogger("%s.%s" %
                                         (__name__, self.__class__.__name__))
        self._logger.info("Starting.")

        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Blocks until new :class:`GitRepository` objects appear in
        :attr:`self.clone_queue`, then attempts cloning them. If
        successful, the cloned repository is added to :attr:`self.index_queue`
        for the :class:`GitIndexer` to index; otherwise, it is discarded.
        """
        while True:
            while self.clone_queue.empty() and self.run_event.is_set():
                time.sleep(THREAD_QUEUE_SLEEP)
            if not self.run_event.is_set():
                break
            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception:
                self._logger.exception("Exception raised while cloning:")

    def _clone_repository(self, repo):
        """
        Attempt cloning a Git repository.

        :param repo: Metadata about the repository to clone.

        :type repo: :class:`GitRepository`
        """
        self._logger.info("Cloning repo: %s", repo.url)
        repo.repo = git.Repo.clone_from(repo.url, to_path=repo.path, bare=True,
                                        single_branch=True)

        # Apply back-pressure: wait until the index queue has room (or the
        # indexer is shutting down) before handing the clone off.
        while self.index_queue.full() and self.run_event.is_set():
            time.sleep(THREAD_QUEUE_SLEEP)
        if self.run_event.is_set():
            self.index_queue.put(repo)
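
# A minimal usage sketch, assuming a crawler thread that fills a Queue.Queue
# with GitRepository objects (every name below other than the classes defined
# in this module is illustrative):
#
#   import threading
#   import Queue
#
#   run_event = threading.Event()
#   run_event.set()
#   clone_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
#   indexer = GitIndexer(clone_queue, run_event)  # also starts its _GitCloner
#   indexer.start()
#   crawler = GitHubCrawler(clone_queue, run_event)  # hypothetical wiring
#   crawler.start()
#   ...
#   run_event.clear()  # both threads exit their polling loops and stop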