A semantic search engine for source code https://bitshift.benkurtovic.com/
Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.
 
 
 
 
 
 

510 řádky
17 KiB

  1. """
  2. :synopsis: Contains a singleton GitIndexer class, which clones and indexes git
  3. repositories.
  4. """
  5. import datetime
  6. import logging
  7. import os
  8. import Queue
  9. import shutil
  10. import string
  11. import subprocess
  12. import time
  13. import threading
  14. import bs4
  15. from ..database import Database
  16. from ..parser import parse, UnsupportedFileError
  17. from ..languages import LANGS
  18. from ..codelet import Codelet
# Directory under which every repository is cloned; each clone lives at
# GIT_CLONE_DIR/<repo.name>.
GIT_CLONE_DIR = "/tmp/bitshift"
# Seconds the worker threads sleep between polls of their queues.
THREAD_QUEUE_SLEEP = 0.5
  21. class GitRepository(object):
  22. """
  23. A representation of a Git repository's metadata.
  24. :ivar url: (str) The repository's url.
  25. :ivar name: (str) The name of the repository.
  26. :ivar framework_name: (str) The name of the online Git framework that the
  27. repository belongs to (eg, GitHub, BitBucket).
  28. :ivar rank: (float) The rank of the repository, as assigned by
  29. :class:`crawler.GitHubCrawler`.
  30. """
  31. def __init__(self, url, name, framework_name, rank):
  32. """
  33. Create a GitRepository instance.
  34. :param url: see :attr:`GitRepository.url`
  35. :param name: see :attr:`GitRepository.name`
  36. :param framework_name: see :attr:`GitRepository.framework_name`
  37. :param rank: see :attr:`GitRepository.rank`
  38. :type url: str
  39. :type name: str
  40. :type framework_name: str
  41. :type rank: float
  42. """
  43. self.url = url
  44. self.name = name
  45. self.framework_name = framework_name
  46. self.rank = rank
  47. class GitIndexer(threading.Thread):
  48. """
  49. A singleton Git repository indexer.
  50. :class:`GitIndexer` indexes the repositories cloned by the
  51. :class:`_GitCloner` singleton.
  52. :ivar index_queue: (:class:`Queue.Queue`) A queue containing
  53. :class:`GitRepository` objects for every new repository succesfully
  54. cloned by :class:`_GitCloner`, which are to be indexed.
  55. :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner,
  56. which feeds :class:`GitIndexer`.
  57. :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
  58. """
  59. def __init__(self, clone_queue, run_event):
  60. """
  61. Create an instance of the singleton `GitIndexer`.
  62. :param clone_queue: see :attr:`self.index_queue`
  63. :type index_queue: see :attr:`self.index_queue`
  64. """
  65. MAX_INDEX_QUEUE_SIZE = 10
  66. self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
  67. self.run_event = run_event
  68. self.git_cloner = _GitCloner(clone_queue, self.index_queue, run_event)
  69. self.git_cloner.start()
  70. self.database = Database()
  71. self._logger = logging.getLogger("%s.%s" %
  72. (__name__, self.__class__.__name__))
  73. self._logger.info("Starting.")
  74. if not os.path.exists(GIT_CLONE_DIR):
  75. os.makedirs(GIT_CLONE_DIR)
  76. super(GitIndexer, self).__init__(name=self.__class__.__name__)
  77. def run(self):
  78. """
  79. Retrieve metadata about newly cloned repositories and index them.
  80. Blocks until new repositories appear in :attr:`self.index_queue`, then
  81. retrieves one, and attempts indexing it. Should any errors occur, the
  82. new repository will be discarded and the indexer will index the next in
  83. the queue.
  84. """
  85. while self.run_event.is_set():
  86. while self.index_queue.empty():
  87. time.sleep(THREAD_QUEUE_SLEEP)
  88. repo = self.index_queue.get()
  89. self.index_queue.task_done()
  90. self._index_repository(repo)
  91. def _index_repository(self, repo):
  92. """
  93. Clone and index (create and insert Codeletes for) a Git repository.
  94. `git clone` the Git repository located at **repo.url**, call
  95. `_insert_repository_codelets()`, then remove said repository.
  96. :param repo_url: The metadata of the repository to be indexed.
  97. :type repo_url: :class:`GitRepository`
  98. """
  99. with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
  100. try:
  101. self._insert_repository_codelets(repo)
  102. except Exception:
  103. self._logger.exception("Exception raised while indexing:")
  104. finally:
  105. if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
  106. if len([obj for obj in os.listdir('.') if
  107. os.path.isdir(obj)]) <= 1:
  108. shutil.rmtree("%s/%s" % (
  109. GIT_CLONE_DIR, repo.name.split("/")[0]))
  110. else:
  111. shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
  112. def _insert_repository_codelets(self, repo):
  113. """
  114. Create and insert a Codelet for the files inside a Git repository.
  115. Create a new Codelet, and insert it into the Database singleton, for
  116. every file inside the current working directory's default branch
  117. (usually *master*).
  118. :param repo_url: The metadata of the repository to be indexed.
  119. :type repo_url: :class:`GitRepository`
  120. """
  121. commits_meta = self._get_commits_metadata()
  122. if commits_meta is None:
  123. return
  124. for filename in commits_meta.keys():
  125. try:
  126. with open(filename) as source_file:
  127. source = self._decode(source_file.read())
  128. if source is None:
  129. continue
  130. except IOError:
  131. continue
  132. authors = [(self._decode(author), None) for author in
  133. commits_meta[filename]["authors"]]
  134. codelet = Codelet("%s:%s" % (repo.name, filename), source, filename,
  135. None, authors, self._generate_file_url(filename,
  136. repo.url, repo.framework_name),
  137. commits_meta[filename]["time_created"],
  138. commits_meta[filename]["time_last_modified"],
  139. repo.rank)
  140. try:
  141. parse(codelet)
  142. except UnsupportedFileError:
  143. continue
  144. self.database.insert(codelet)
  145. def _generate_file_url(self, filename, repo_url, framework_name):
  146. """
  147. Return a url for a filename from a Git wrapper framework.
  148. :param filename: The path of the file.
  149. :param repo_url: The url of the file's parent repository.
  150. :param framework_name: The name of the framework the repository is from.
  151. :type filename: str
  152. :type repo_url: str
  153. :type framework_name: str
  154. :return: The file's full url on the given framework, if successfully
  155. derived.
  156. :rtype: str, or None
  157. .. warning::
  158. Various Git subprocesses will occasionally fail, and, seeing as the
  159. information they provide is a crucial component of some repository
  160. file urls, None may be returned.
  161. """
  162. try:
  163. if framework_name == "GitHub":
  164. default_branch = subprocess.check_output("git branch"
  165. " --no-color", shell=True)[2:-1]
  166. parts = [repo_url, "blob", default_branch, filename]
  167. elif framework_name == "Bitbucket":
  168. commit_hash = subprocess.check_output("git rev-parse HEAD",
  169. shell=True).replace("\n", "")
  170. parts = [repo_url, "src", commit_hash, filename]
  171. return "/".join(s.strip("/") for s in parts)
  172. except subprocess.CalledProcessError:
  173. return None
  174. def _get_git_commits(self):
  175. """
  176. Return the current working directory's formatted commit data.
  177. Uses `git log` to generate metadata about every single file in the
  178. repository's commit history.
  179. :return: The author, timestamp, and names of all modified files of every
  180. commit.
  181. .. code-block:: python
  182. sample_returned_array = [
  183. {
  184. "author" : (str) "author"
  185. "timestamp" : (`datetime.datetime`) <object>,
  186. "filenames" : (str array) ["file1", "file2"]
  187. }
  188. ]
  189. :rtype: array of dictionaries
  190. """
  191. git_log = subprocess.check_output(("git --no-pager log --name-only"
  192. " --pretty=format:'%n%n%an%n%at' -z"), shell=True)
  193. commits = []
  194. for commit in git_log.split("\n\n"):
  195. fields = commit.split("\n")
  196. if len(fields) > 2:
  197. commits.append({
  198. "author" : fields[0],
  199. "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])),
  200. "filenames" : fields[2].split("\x00")[:-2]
  201. })
  202. return commits
  203. def _get_tracked_files(self):
  204. """
  205. Return a list of the filenames of all valuable files in the Git repository.
  206. Get a list of the filenames of the non-binary (Perl heuristics used for
  207. filetype identification) files currently inside the current working
  208. directory's Git repository. Then, weed out any boilerplate/non-code files
  209. that match the regex rules in GIT_IGNORE_FILES.
  210. :return: The filenames of all index-worthy non-binary files.
  211. :rtype: str array
  212. """
  213. files = []
  214. for dirname, subdir_names, filenames in os.walk("."):
  215. for filename in filenames:
  216. path = os.path.join(dirname, filename)
  217. if self._is_ascii(path):
  218. files.append(path[2:])
  219. return files
  220. def _get_commits_metadata(self):
  221. """
  222. Return a dictionary containing every valuable tracked file's metadata.
  223. :return: A dictionary with author names, time of creation, and time of last
  224. modification for every filename key.
  225. .. code-block:: python
  226. sample_returned_dict = {
  227. "my_file" : {
  228. "authors" : (str array) ["author1", "author2"],
  229. "time_created" : (`datetime.datetime`) <object>,
  230. "time_last_modified" : (`datetime.datetime`) <object>
  231. }
  232. }
  233. :rtype: dictionary of dictionaries
  234. """
  235. commits = self._get_git_commits()
  236. tracked_files = self._get_tracked_files()
  237. files_meta = {}
  238. for commit in commits:
  239. for filename in commit["filenames"]:
  240. if filename not in tracked_files:
  241. continue
  242. if filename not in files_meta.keys():
  243. files_meta[filename] = {
  244. "authors" : [commit["author"]],
  245. "time_last_modified" : commit["timestamp"],
  246. "time_created" : commit["timestamp"]
  247. }
  248. else:
  249. if commit["author"] not in files_meta[filename]["authors"]:
  250. files_meta[filename]["authors"].append(commit["author"])
  251. files_meta[filename]["time_created"] = commit["timestamp"]
  252. return files_meta
  253. def _decode(self, raw):
  254. """
  255. Return a decoded a raw string.
  256. :param raw: The string to string.
  257. :type raw: (str)
  258. :return: If the original encoding is successfully inferenced, return the
  259. decoded string.
  260. :rtype: str, or None
  261. .. warning::
  262. The raw string's original encoding is identified by heuristics which
  263. can, and occasionally will, fail. Decoding will then fail, and None
  264. will be returned.
  265. """
  266. try:
  267. encoding = bs4.BeautifulSoup(raw).original_encoding
  268. return raw.decode(encoding) if encoding is not None else None
  269. except (LookupError, UnicodeDecodeError, UserWarning) as exception:
  270. return None
  271. def _is_ascii(self, filename):
  272. """
  273. Heuristically determine whether a file is ASCII text or binary.
  274. If a portion of the file contains null bytes, or the percentage of bytes
  275. that aren't ASCII is greater than 30%, then the file is concluded to be
  276. binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T`
  277. operator, and is the de-facto method for in : passdetermining whether a
  278. file is ASCII.
  279. :param filename: The path of the file to test.
  280. :type filename: str
  281. :return: Whether the file is probably ASCII.
  282. :rtype: Boolean
  283. """
  284. try:
  285. with open(filename) as source:
  286. file_snippet = source.read(512)
  287. if not file_snippet:
  288. return True
  289. ascii_characters = "".join(map(chr, range(32, 127)) +
  290. list("\n\r\t\b"))
  291. null_trans = string.maketrans("", "")
  292. if "\0" in file_snippet:
  293. return False
  294. non_ascii = file_snippet.translate(null_trans, ascii_characters)
  295. return not float(len(non_ascii)) / len(file_snippet) > 0.30
  296. except IOError:
  297. return False
  298. class _GitCloner(threading.Thread):
  299. """
  300. A singleton Git repository cloner.
  301. Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
  302. :class:`GitIndexer` to index.
  303. :ivar clone_queue: (:class:`Queue.Queue`) see
  304. :attr:`crawler.GitHubCrawler.clone_queue`.
  305. :ivar index_queue: (:class:`Queue.Queue`) see
  306. :attr:`GitIndexer.index_queue`.
  307. :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
  308. """
  309. def __init__(self, clone_queue, index_queue, run_event):
  310. """
  311. Create an instance of the singleton :class:`_GitCloner`.
  312. :param clone_queue: see :attr:`self.clone_queue`
  313. :param index_queue: see :attr:`self.index_queue`
  314. :type clone_queue: see :attr:`self.clone_queue`
  315. :type index_queue: see :attr:`self.index_queue`
  316. """
  317. self.clone_queue = clone_queue
  318. self.index_queue = index_queue
  319. self.run_event = run_event
  320. self._logger = logging.getLogger("%s.%s" %
  321. (__name__, self.__class__.__name__))
  322. self._logger.info("Starting.")
  323. super(_GitCloner, self).__init__(name=self.__class__.__name__)
  324. def run(self):
  325. """
  326. Retrieve metadata about newly crawled repositories and clone them.
  327. Blocks until new :class:`GitRepository` appear in
  328. :attr:`self.clone_queue`, then attempts cloning them. If
  329. succcessful, the cloned repository is added to :attr:`self.index_queue`
  330. for the `GitIndexer` to clone; otherwise, it is discarded.
  331. """
  332. while self.run_event.is_set():
  333. while self.clone_queue.empty():
  334. time.sleep(THREAD_QUEUE_SLEEP)
  335. repo = self.clone_queue.get()
  336. self.clone_queue.task_done()
  337. try:
  338. self._clone_repository(repo)
  339. except Exception:
  340. pass
  341. def _clone_repository(self, repo):
  342. """
  343. Attempt cloning a Git repository.
  344. :param repo: Metadata about the repository to clone.
  345. :type repo: :class:`GitRepository`
  346. """
  347. GIT_CLONE_TIMEOUT = 500
  348. queue_percent_full = (float(self.index_queue.qsize()) /
  349. self.index_queue.maxsize) * 100
  350. exit_code = None
  351. command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
  352. " --single-branch %s %s/%s || pkill -f git")
  353. command_attempt = 0
  354. while exit_code is None:
  355. try:
  356. exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
  357. repo.url, GIT_CLONE_DIR, repo.name), shell=True)
  358. except Exception:
  359. time.sleep(1)
  360. command_attempt += 1
  361. if command_attempt == 20:
  362. break
  363. else:
  364. continue
  365. else:
  366. break
  367. if exit_code != 0:
  368. if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
  369. shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
  370. return
  371. while self.index_queue.full():
  372. time.sleep(THREAD_QUEUE_SLEEP)
  373. self.index_queue.put(repo)
  374. class _ChangeDir(object):
  375. """
  376. A wrapper class for os.chdir(), to map onto `with` and handle exceptions.
  377. :ivar new_path: (str) The path to change the current directory to.
  378. :ivar old_path: (str) The path of the directory to return to.
  379. """
  380. def __init__(self, new_path):
  381. """
  382. Create a _ChangeDir instance.
  383. :param new_path: The directory to enter.
  384. :type new_path: str
  385. """
  386. self.new_path = new_path
  387. def __enter__(self):
  388. """
  389. Change the current working-directory to **new_path**.
  390. """
  391. self.old_path = os.getcwd()
  392. os.chdir(self.new_path)
  393. def __exit__(self, *exception):
  394. """
  395. Change the current working-directory to **old_path**.
  396. :param exception: Various exception arguments passed by `with`.
  397. :type exception: varargs
  398. """
  399. os.chdir(self.old_path)