A semantic search engine for source code https://bitshift.benkurtovic.com/

  1. """
  2. :synopsis: Main crawler module, to oversee all site-specific crawlers.
  3. Contains all website/framework-specific Class crawlers.
  4. """
  5. import logging
  6. import math
  7. import time
  8. import threading
  9. import requests
  10. from . import indexer


class GitHubCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of GitHub's public repositories.

    GitHubCrawler is a threaded singleton that queries GitHub's API for urls
    to its public repositories, which it inserts into a :class:`Queue.Queue`
    shared with :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
        instances with repository metadata retrieved by :class:`GitHubCrawler`
        and other Git crawlers, to be processed by :class:`indexer.GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    AUTHENTICATION = {
        "client_id" : "436cb884ae09be7f2a4e",
        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
    }
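
    # These OAuth application credentials are passed as query parameters on
    # every request; under the GitHub v3 API scheme in use at the time, this
    # presumably raised the unauthenticated 60 requests/hour rate limit to
    # the authenticated quota of 5,000.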

    def __init__(self, clone_queue, run_event):
        """
        Create an instance of the singleton `GitHubCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`
        :type clone_queue: see :attr:`self.clone_queue`
        """
        self.clone_queue = clone_queue
        self.run_event = run_event
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the GitHub API for data about every public repository.

        Pull all of GitHub's repositories by making calls to its API in a
        loop, accessing each subsequent page of results via the "next" URL
        returned in an API response header. Uses Severyn Kozak's (sevko)
        authentication credentials. For every new repository, a
        :class:`GitRepository` is inserted into :attr:`self.clone_queue`.
        """
        next_api_url = "https://api.github.com/repositories"
        api_request_interval = 5e3 / 60 ** 2
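        # Presumably derived from the 5,000 requests/hour quota: this waits
        # 5e3 / 60**2 ~= 1.39 s between calls, which is in fact more
        # conservative than the minimum spacing of 3600/5000 = 0.72 s.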

        while next_api_url and self.run_event.is_set():
            start_time = time.time()

            try:
                resp = requests.get(next_api_url, params=self.AUTHENTICATION)
            except requests.ConnectionError:
                self._logger.exception("API %s call failed:" % next_api_url)
                time.sleep(0.5)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    (self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full))

            repo_names = [repo["full_name"] for repo in resp.json()]
            repo_ranks = self.get_ranks(repo_names)

            for repo in resp.json():
                while self.clone_queue.full():
                    time.sleep(1)

                self.clone_queue.put(indexer.GitRepository(
                        repo["html_url"], repo["full_name"], "GitHub",
                        repo_ranks[repo["full_name"]]))

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time()
                if sleep_time > 0:
                    time.sleep(sleep_time)
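
            # The "link" response header has the form
            # '<https://api.github.com/repositories?since=N>; rel="next", ...';
            # the slice below extracts the URL between "<" and ">".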
            next_api_url = resp.headers["link"].split(">")[0][1:]

            sleep_time = api_request_interval - (time.time() - start_time)
            if sleep_time > 0:
                time.sleep(sleep_time)

    @classmethod
    def get_ranks(cls, repo_names):
        """
        Return the ranks for several repositories.

        Queries the GitHub API for the number of stargazers for the given
        repositories, and blocks if the query limit is exceeded. The rank is
        calculated from these numbers.

        :param repo_names: An array of repository names, in
            `username/repository_name` format.
        :type repo_names: list of str

        :return: A dictionary mapping repository names to ranks. Example:

            .. code-block:: python

                {
                    "user/repository" : 0.2564949357461537
                }

        :rtype: dictionary
        """
        API_URL = "https://api.github.com/search/repositories"
        REPOS_PER_QUERY = 25
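
        # Batch up to 25 "repo:user/name" qualifiers into one search query,
        # so a single API call resolves star counts for a whole page of
        # repositories.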
        repo_ranks = {}
        for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
                xrange(0, len(repo_names), REPOS_PER_QUERY)]:
            query_url = "%s?q=%s" % (API_URL,
                    "+".join("repo:%s" % name for name in names))

            params = cls.AUTHENTICATION
            resp = requests.get(query_url,
                    params=params,
                    headers={
                        "Accept" : "application/vnd.github.preview"
                    })

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time() + 1
                if sleep_time > 0:
                    logging.info("API quota exceeded. Sleep time: %d." %
                            sleep_time)
                    time.sleep(sleep_time)

            for repo in resp.json()["items"]:
                stars = repo["stargazers_count"]
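                # Rank is the log base 5000 of the star count, capped at 1.0:
                # e.g. ~9 stars yields roughly 0.26, while 5000 or more stars
                # yields 1.0.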
                rank = min(math.log(max(stars, 1), 5000), 1.0)
                repo_ranks[repo["full_name"]] = rank
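
        # Repositories absent from the search response (presumably renamed,
        # deleted, or otherwise unmatched) fall back to a small default rank.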
        for name in repo_names:
            if name not in repo_ranks:
                repo_ranks[name] = 0.1

        return repo_ranks


class BitbucketCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of Bitbucket's public repositories.

    BitbucketCrawler is a threaded singleton that queries Bitbucket's API for
    urls to its public repositories, and inserts them as
    :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with
    :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
        :class:`indexer.GitRepository` repository urls into.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, run_event):
        """
        Create an instance of the singleton `BitbucketCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`
        :type clone_queue: see :attr:`self.clone_queue`
        """
        self.clone_queue = clone_queue
        self.run_event = run_event
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the Bitbucket API for data about every public repository.

        Query the Bitbucket API's "/repositories" endpoint and read its
        paginated responses in a loop; any "git" repositories have their
        clone urls and names inserted into a :class:`indexer.GitRepository`
        in :attr:`self.clone_queue`.
        """
        next_api_url = "https://api.bitbucket.org/2.0/repositories"

        while self.run_event.is_set():
            try:
                response = requests.get(next_api_url).json()
            except requests.ConnectionError:
                self._logger.exception("API %s call failed:", next_api_url)
                time.sleep(0.5)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    (self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full))

            for repo in response["values"]:
                if repo["scm"] == "git":
                    while self.clone_queue.full():
                        time.sleep(1)

                    clone_links = repo["links"]["clone"]
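                    # Bitbucket lists both "https" and "ssh" clone links;
                    # take the HTTPS one, whichever position it occupies.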
                    clone_url = (clone_links[0]["href"] if
                            clone_links[0]["name"] == "https" else
                            clone_links[1]["href"])

                    try:
                        watchers = requests.get(
                                repo["links"]["watchers"]["href"])
                        num = len(watchers.json()["values"])
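                        # Note: "values" holds only one page of a paginated
                        # response, so this likely undercounts watchers for
                        # popular repositories.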
                        rank = min(math.log(max(num, 1), 500), 1.0)
                    except requests.ConnectionError:
                        self._logger.exception("API %s call failed:",
                                next_api_url)
                        time.sleep(0.5)
                        continue

                    self.clone_queue.put(indexer.GitRepository(
                            clone_url, repo["full_name"], "Bitbucket", rank))

            next_api_url = response["next"]
            time.sleep(0.2)
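

# ---------------------------------------------------------------------------
# Usage sketch: a minimal, hypothetical illustration of how these crawlers
# might be wired together. The queue size, shutdown handling, and indexer
# hookup are assumptions; only the crawler classes and their
# (clone_queue, run_event) constructor signature come from the module above.
#
#     import Queue
#     import threading
#     import time
#
#     clone_queue = Queue.Queue(maxsize=50)  # shared with indexer.GitIndexer
#     run_event = threading.Event()
#     run_event.set()                        # crawlers loop while this is set
#
#     crawlers = [GitHubCrawler(clone_queue, run_event),
#                 BitbucketCrawler(clone_queue, run_event)]
#     for crawler in crawlers:
#         crawler.start()
#
#     try:
#         while True:
#             time.sleep(1)
#     except KeyboardInterrupt:
#         run_event.clear()                  # ask the crawler threads to exit
#         for crawler in crawlers:
#             crawler.join()
# ---------------------------------------------------------------------------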