A semantic search engine for source code https://bitshift.benkurtovic.com/
  1. """
  2. :synopsis: Contains a singleton GitIndexer class, which clones and indexes git
  3. repositories.
  4. """
  5. import bs4, os, Queue, re, shutil, string, subprocess, time, threading
  6. from ..database import Database
  7. from ..codelet import Codelet
  8. GIT_CLONE_DIR = "/tmp/bitshift"
  9. THREAD_QUEUE_SLEEP = 0.5

class GitRepository(object):
    """
    A representation of a Git repository's metadata.

    :ivar url: (str) The repository's url.
    :ivar name: (str) The name of the repository.
    :ivar framework_name: (str) The name of the online Git framework that the
        repository belongs to (e.g., GitHub, Bitbucket).
    """

    def __init__(self, url, name, framework_name):
        """
        Create a GitRepository instance.

        :param url: see :attr:`GitRepository.url`
        :param name: see :attr:`GitRepository.name`
        :param framework_name: see :attr:`GitRepository.framework_name`

        :type url: str
        :type name: str
        :type framework_name: str
        """

        self.url = url
        self.name = name
        self.framework_name = framework_name
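
# A minimal sketch, not part of the original module: how a crawler thread
# might hand a repository to the cloner. The URL and names are made up.
def _example_enqueue_repository(clone_queue):
    repo = GitRepository("https://github.com/user/project", "user/project",
                         "GitHub")
    clone_queue.put(repo)  # picked up by _GitCloner.run()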

class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    :class:`GitIndexer` indexes the repositories cloned by the
    :class:`_GitCloner` singleton.

    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
        :class:`GitRepository` objects for every new repository successfully
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository
        cloner, which feeds :class:`GitIndexer`.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitIndexer`.

        :param clone_queue: see :attr:`_GitCloner.clone_queue`
        :type clone_queue: :class:`Queue.Queue`
        """

        MAX_INDEX_QUEUE_SIZE = 10

        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        self.git_cloner = _GitCloner(clone_queue, self.index_queue)
        self.git_cloner.start()

        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)

        super(GitIndexer, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly cloned repositories and index them.

        Blocks until new repositories appear in :attr:`self.index_queue`, then
        retrieves one and attempts to index it. Should any error occur, the
        repository is discarded and the indexer moves on to the next one in
        the queue.
        """

        while True:
            while self.index_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.index_queue.get()
            self.index_queue.task_done()

            try:
                self._index_repository(repo.url, repo.name,
                                       repo.framework_name)
            except Exception as exception:
                # Indexing errors are swallowed so that a single bad
                # repository cannot kill the indexer thread.
                pass

    def _index_repository(self, repo_url, repo_name, framework_name):
        """
        Index (create and insert Codelets for) a cloned Git repository.

        Enter the repository cloned from **repo_url**, call
        :meth:`_insert_repository_codelets` on it, then remove the local
        clone.

        :param repo_url: The url the Git repository was cloned from.
        :param repo_name: The name of the repository.
        :param framework_name: The name of the framework the repository is
            from.

        :type repo_url: str
        :type repo_name: str
        :type framework_name: str
        """

        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir:
            try:
                self._insert_repository_codelets(repo_url, repo_name,
                                                 framework_name)
            except Exception as exception:
                pass

        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)):
            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))

    def _insert_repository_codelets(self, repo_url, repo_name, framework_name):
        """
        Create and insert a Codelet for every file inside a Git repository.

        Create a new Codelet, and insert it into the Database singleton, for
        every file inside the current working directory's default branch
        (usually *master*).

        :param repo_url: The url the Git repository was cloned from.
        :param repo_name: The name of the repository.
        :param framework_name: The name of the framework the repository is
            from.

        :type repo_url: str
        :type repo_name: str
        :type framework_name: str
        """

        commits_meta = _get_commits_metadata()
        if commits_meta is None:
            return

        for filename in commits_meta.keys():
            try:
                source = ""
                with open(filename) as source_file:
                    source = _decode(source_file.read())
                    if source is None:
                        continue
            except IOError as exception:
                continue

            authors = [(_decode(author),) for author in
                       commits_meta[filename]["authors"]]
            codelet = Codelet("%s:%s" % (repo_name, filename), source,
                              filename, None, authors,
                              _generate_file_url(filename, repo_url,
                                                 framework_name),
                              commits_meta[filename]["time_created"],
                              commits_meta[filename]["time_last_modified"])

            # Database.insert(codelet)
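
# A minimal sketch, not part of the original module: the kind of Codelet
# built above for a single file, with made-up values and the same positional
# arguments as the call in _insert_repository_codelets().
def _example_codelet():
    return Codelet("user/project:src/main.py",  # "repo_name:filename"
                   u"print 'hello'\n",          # decoded file contents
                   "src/main.py",               # path inside the repository
                   None,                        # unset, as in the call above
                   [(u"Alice",), (u"Bob",)],    # author tuples
                   "github.com/user/project/blob/master/src/main.py",
                   1395939566,                  # time_created
                   1396920409)                  # time_last_modified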

class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    """

    def __init__(self, clone_queue, index_queue):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        :type index_queue: see :attr:`self.index_queue`
        """

        self.clone_queue = clone_queue
        self.index_queue = index_queue
        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Blocks until new :class:`GitRepository` objects appear in
        :attr:`self.clone_queue`, then attempts to clone them. If successful,
        the cloned repository is added to :attr:`self.index_queue` for the
        :class:`GitIndexer` to index; otherwise, it is discarded.
        """

        while True:
            while self.clone_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception as exception:
                pass

    def _clone_repository(self, repo):
        """
        Attempt to clone a Git repository.

        :param repo: Metadata about the repository to clone.

        :type repo: :class:`GitRepository`
        """

        GIT_CLONE_TIMEOUT = 500

        # Note: computed but not currently used below.
        queue_percent_full = (float(self.index_queue.qsize()) /
                              self.index_queue.maxsize) * 100

        exit_code = None
        # Wrap `git clone` in a Perl alarm() so the clone is killed after
        # GIT_CLONE_TIMEOUT seconds; `pkill -f git` cleans up leftover git
        # processes if the clone fails or times out.
        command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
                   " --single-branch %s %s/%s || pkill -f git")
        command_attempt = 0

        while exit_code is None:
            try:
                exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
                                            repo.url, GIT_CLONE_DIR,
                                            repo.name), shell=True)
            except Exception as exception:
                time.sleep(1)
                command_attempt += 1
                if command_attempt == 20:
                    break
                else:
                    continue
            else:
                break

        if exit_code != 0:
            if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
                shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
            return

        while self.index_queue.full():
            time.sleep(THREAD_QUEUE_SLEEP)

        self.index_queue.put(repo)
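
# A minimal sketch, not part of the original module: the shell command that
# _clone_repository() runs, formatted for a made-up repository. Perl's
# alarm() enforces the timeout, and `pkill -f git` runs only if the clone
# fails or times out.
def _example_clone_command():
    command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
               " --single-branch %s %s/%s || pkill -f git")
    return command % (500, "https://github.com/user/project",
                      GIT_CLONE_DIR, "user/project")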

class _ChangeDir(object):
    """
    A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

    :ivar new_path: (str) The path to change the current directory to.
    :ivar old_path: (str) The path of the directory to return to.
    """

    def __init__(self, new_path):
        """
        Create a _ChangeDir instance.

        :param new_path: The directory to enter.

        :type new_path: str
        """

        self.new_path = new_path

    def __enter__(self):
        """
        Change the current working directory to **new_path**.
        """

        self.old_path = os.getcwd()
        os.chdir(self.new_path)

    def __exit__(self, *exception):
        """
        Change the current working directory back to **old_path**.

        :param exception: Various exception arguments passed by `with`.

        :type exception: varargs
        """

        os.chdir(self.old_path)
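
# A minimal sketch, not part of the original module: using _ChangeDir as a
# context manager. The path is made up; the previous working directory is
# restored when the block exits.
def _example_change_dir():
    with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, "user/project")):
        contents = os.listdir(".")  # runs inside the cloned repository
    return contents                 # back in the original directory here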

def _generate_file_url(filename, repo_url, framework_name):
    """
    Return a url for a filename from a Git wrapper framework.

    :param filename: The path of the file.
    :param repo_url: The url of the file's parent repository.
    :param framework_name: The name of the framework the repository is from.

    :type filename: str
    :type repo_url: str
    :type framework_name: str

    :return: The file's full url on the given framework, if successfully
        derived.
    :rtype: str, or None

    .. warning::
        Various Git subprocesses will occasionally fail, and, seeing as the
        information they provide is a crucial component of some repository
        file urls, None may be returned.
    """

    try:
        if framework_name == "GitHub":
            default_branch = subprocess.check_output("git branch"
                                                     " --no-color",
                                                     shell=True)[2:-1]
            return ("%s/blob/%s/%s" % (repo_url, default_branch,
                                       filename)).replace("//", "/")
        elif framework_name == "Bitbucket":
            commit_hash = subprocess.check_output("git rev-parse HEAD",
                                                  shell=True).replace("\n", "")
            return ("%s/src/%s/%s" % (repo_url, commit_hash,
                                      filename)).replace("//", "/")
    except subprocess.CalledProcessError as exception:
        return None
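
# A minimal sketch, not part of the original module: the URL shapes produced
# above, with made-up repository paths. The examples are scheme-less because
# the `.replace("//", "/")` calls above would also collapse the "//" in a
# scheme such as "https://".
def _example_file_urls():
    github = "github.com/user/project/blob/master/src/main.py"
    bitbucket = "bitbucket.org/user/project/src/a1b2c3d/src/main.py"
    return github, bitbucket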

def _get_git_commits():
    """
    Return the current working directory's formatted commit data.

    Uses `git log` to generate metadata about every single file in the
    repository's commit history.

    :return: The author, timestamp, and names of all modified files of every
        commit.

        .. code-block:: python

            sample_returned_array = [
                {
                    "author" : (str) "author",
                    "timestamp" : (int) 1396919293,
                    "filenames" : (str array) ["file1", "file2"]
                }
            ]
    :rtype: list of dictionaries
    """

    git_log = subprocess.check_output(("git --no-pager log --name-only"
                                       " --pretty=format:'%n%n%an%n%at' -z"),
                                      shell=True)

    commits = []
    for commit in git_log.split("\n\n"):
        fields = commit.split("\n")
        if len(fields) > 2:
            commits.append({
                "author" : fields[0],
                "timestamp" : int(fields[1]),
                "filenames" : fields[2].split("\x00")[:-2]
            })

    return commits
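
# A note on the format parsed above (a reading of this code, not of git's
# documented output): each blank-line-separated record produced by the
# `--pretty=format:'%n%n%an%n%at' -z` invocation is expected to carry the
# author on its first line, the Unix timestamp on its second, and the
# NUL-separated filenames on its third; the `[:-2]` slice presumably trims
# separator residue from the end of that filename list.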

def _get_tracked_files():
    """
    Return a list of the filenames of all valuable files in the Git repository.

    Get a list of the filenames of the non-binary (Perl heuristics used for
    filetype identification) files currently inside the current working
    directory's Git repository. Then, weed out any boilerplate/non-code files
    that match the regex rules in GIT_IGNORE_FILES or GIT_IGNORE_EXTENSIONS.

    :return: The filenames of all index-worthy non-binary files.
    :rtype: str array
    """

    GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]
    GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?",
                             "md(wn|t[e]?xt)?", "rst"]

    files = []
    for dirname, subdir_names, filenames in os.walk("."):
        for filename in filenames:
            path = os.path.join(dirname, filename)
            if _is_ascii(path):
                files.append(path)

    valuable_files = []
    for filename in files:
        filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
                              for pattern in GIT_IGNORE_FILES])
        extension = filename.split(".")[-1]
        # Match the extension patterns against the extension itself rather
        # than the full path.
        extension_match = any([re.match(pattern, extension,
                                        flags=re.IGNORECASE)
                               for pattern in GIT_IGNORE_EXTENSIONS])
        if not (filename_match or extension_match):
            valuable_files.append(filename[2:])

    return valuable_files

def _get_commits_metadata():
    """
    Return a dictionary containing every valuable tracked file's metadata.

    :return: A dictionary with author names, time of creation, and time of last
        modification for every filename key.

        .. code-block:: python

            sample_returned_dict = {
                "my_file" : {
                    "authors" : (str array) ["author1", "author2"],
                    "time_created" : (int) 1395939566,
                    "time_last_modified" : (int) 1396920409
                }
            }
    :rtype: dictionary
    """

    commits = _get_git_commits()
    tracked_files = _get_tracked_files()

    files_meta = {}
    for commit in commits:
        for filename in commit["filenames"]:
            if filename not in tracked_files:
                continue

            # `git log` lists commits newest-first, so the first commit seen
            # for a file fixes time_last_modified, and each older commit
            # overwrites time_created until the oldest one wins.
            if filename not in files_meta.keys():
                files_meta[filename] = {
                    "authors" : [commit["author"]],
                    "time_last_modified" : commit["timestamp"],
                    "time_created" : commit["timestamp"]
                }
            else:
                if commit["author"] not in files_meta[filename]["authors"]:
                    files_meta[filename]["authors"].append(commit["author"])
                files_meta[filename]["time_created"] = commit["timestamp"]

    return files_meta

def _decode(raw):
    """
    Return a decoded raw string.

    :param raw: The string to decode.

    :type raw: str

    :return: If the original encoding is successfully inferred, return the
        decoded string.
    :rtype: str, or None

    .. warning::
        The raw string's original encoding is identified by heuristics which
        can, and occasionally will, fail. Decoding will then fail, and None
        will be returned.
    """

    try:
        encoding = bs4.BeautifulSoup(raw).original_encoding
        return raw.decode(encoding) if encoding is not None else None
    except (LookupError, UnicodeDecodeError, UserWarning) as exception:
        return None
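
# A minimal sketch, not part of the original module: _decode() returns a
# unicode string when BeautifulSoup can infer the input's encoding, and None
# when it cannot. The input here is a made-up UTF-8 byte string.
def _example_decode():
    return _decode("caf\xc3\xa9 au lait")  # likely inferred as UTF-8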

def _is_ascii(filename):
    """
    Heuristically determine whether a file is ASCII text or binary.

    If a portion of the file contains null bytes, or the percentage of bytes
    that aren't ASCII is greater than 30%, then the file is concluded to be
    binary. This heuristic is used by the `file` utility and Perl's built-in
    `-T` operator, and is the de facto method for determining whether a file
    is ASCII.

    :param filename: The path of the file to test.

    :type filename: str

    :return: Whether the file is probably ASCII.
    :rtype: Boolean
    """

    try:
        with open(filename) as source:
            file_snippet = source.read(512)

            if not file_snippet:
                return True

            ascii_characters = "".join(map(chr, range(32, 127)) +
                                       list("\n\r\t\b"))
            null_trans = string.maketrans("", "")

            if "\0" in file_snippet:
                return False

            non_ascii = file_snippet.translate(null_trans, ascii_characters)
            return not float(len(non_ascii)) / len(file_snippet) > 0.30
    except IOError as exception:
        return False
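
# A minimal sketch, not part of the original module: how the pieces fit
# together. A crawler (e.g. crawler.GitHubCrawler) shares a clone queue with
# GitIndexer, whose internal _GitCloner clones each repository before it is
# indexed. The queue size and repository are made up.
def _example_pipeline():
    clone_queue = Queue.Queue(maxsize=50)

    indexer = GitIndexer(clone_queue)  # also starts its _GitCloner thread
    indexer.start()

    # Normally crawler.GitHubCrawler fills the queue; enqueue one by hand:
    clone_queue.put(GitRepository("https://github.com/user/project",
                                  "user/project", "GitHub"))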