A semantic search engine for source code https://bitshift.benkurtovic.com/
  1. """
  2. :synopsis: Main crawler module, to oversee all site-specific crawlers.
  3. Contains all website/framework-specific Class crawlers.
  4. """
  5. import logging, requests, time, threading
  6. from bitshift.crawler import indexer
  7. from ..codelet import Codelet
  8. from ..database import Database


class GitHubCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of GitHub's public repositories.

    GitHubCrawler is a threaded singleton that queries GitHub's API for urls
    to its public repositories, which it inserts into a :class:`Queue.Queue`
    shared with :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
        with repository metadata retrieved by :class:`GitHubCrawler`, and
        other Git crawlers, to be processed by :class:`indexer.GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    AUTHENTICATION = {
        "client_id": "436cb884ae09be7f2a4e",
        "client_secret": "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
    }

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitHubCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`
        :type clone_queue: see :attr:`self.clone_queue`
        """
        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the GitHub API for data about every public repository.

        Pull all of GitHub's repositories by making calls to its API in a
        loop, accessing a subsequent page of results via the "next" URL
        returned in an API response header. Uses Severyn Kozak's (sevko)
        authentication credentials. For every new repository, a
        :class:`GitRepository` is inserted into :attr:`self.clone_queue`.
        """
        next_api_url = "https://api.github.com/repositories"
        # Minimum interval between API requests, in seconds; keeps the
        # crawler safely under GitHub's limit of 5000 requests per hour.
        api_request_interval = 5e3 / 60 ** 2

        while next_api_url:
            start_time = time.time()

            try:
                resp = requests.get(next_api_url, params=self.AUTHENTICATION)
            except requests.ConnectionError as excep:
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                time.sleep(0.5)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    (self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full))

            repo_names = [repo["full_name"] for repo in resp.json()]
            repo_stars = self._get_repositories_stars(repo_names)

            for repo in resp.json():
                while self.clone_queue.full():
                    time.sleep(1)

                self.clone_queue.put(indexer.GitRepository(
                        repo["html_url"], repo["full_name"].replace("/", ""),
                        "GitHub", repo_stars[repo["full_name"]]))

            # Block until the rate-limit window resets if the quota is spent.
            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time()
                if sleep_time > 0:
                    time.sleep(sleep_time)

            # The "link" header holds pagination URLs, e.g.
            #   <https://api.github.com/repositories?since=367>; rel="next"
            # so the next page's URL sits between the first "<" and ">".
            next_api_url = resp.headers["link"].split(">")[0][1:]

            sleep_time = api_request_interval - (time.time() - start_time)
            if sleep_time > 0:
                time.sleep(sleep_time)

    def _get_repositories_stars(self, repo_names):
        """
        Return a normalized stargazer rank for several repositories.

        Queries the GitHub API for the number of stargazers for any given
        repositories, normalizes the counts into ranks, and blocks if the
        query limit is exceeded.

        :param repo_names: An array of repository names, in
            `username/repository_name` format.
        :type repo_names: list of str

        :return: A dictionary with repository name keys, and corresponding
            normalized stargazer rank values. Example dictionary:

            .. code-block:: python

                {
                    "user/repository" : 0.1
                }

        :rtype: dictionary
        """
        API_URL = "https://api.github.com/search/repositories"
        REPOS_PER_QUERY = 25

        repo_stars = {}
        # Batch repository names into search queries of the form
        # `?q=repo:user1/repo1+repo:user2/repo2+...`, 25 repos at a time.
        for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
                xrange(0, len(repo_names), REPOS_PER_QUERY)]:
            query_url = "%s?q=%s" % (API_URL,
                    "+".join("repo:%s" % name for name in names))

            resp = requests.get(query_url,
                    params=self.AUTHENTICATION,
                    headers={
                        "Accept": "application/vnd.github.preview"
                    })

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time() + 1
                if sleep_time > 0:
                    self._logger.info("API quota exceeded. Sleep time: %d." %
                            sleep_time)
                    time.sleep(sleep_time)

            for repo in resp.json()["items"]:
                rank = float(repo["stargazers_count"]) / 1000
                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0

        for name in repo_names:
            if name not in repo_stars:
                repo_stars[name] = 0.5

        return repo_stars
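
    # Illustrative values for the normalization above (assumed inputs, not
    # from the source): 100 stars -> 0.1; 2500 stars -> 1.0 (capped); a
    # repository absent from the search results -> 0.5.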


class BitbucketCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of Bitbucket's public repositories.

    BitbucketCrawler is a threaded singleton that queries Bitbucket's API for
    urls to its public repositories, and inserts them as
    :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with
    :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
        :class:`indexer.GitRepository` repository urls into.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `BitbucketCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`
        :type clone_queue: see :attr:`self.clone_queue`
        """
        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the Bitbucket API for data about every public repository.

        Query the Bitbucket API's "/repositories" endpoint and read its
        paginated responses in a loop; any "git" repositories have their
        clone-urls and names inserted into a :class:`indexer.GitRepository`
        in :attr:`self.clone_queue`.
        """
        next_api_url = "https://api.bitbucket.org/2.0/repositories"

        while next_api_url:
            try:
                response = requests.get(next_api_url).json()
            except requests.ConnectionError as excep:
                time.sleep(0.5)
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    (self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full))

            for repo in response["values"]:
                if repo["scm"] == "git":
                    while self.clone_queue.full():
                        time.sleep(1)

                    # Prefer the HTTPS clone link over the SSH one.
                    clone_links = repo["links"]["clone"]
                    clone_url = (clone_links[0]["href"] if
                            clone_links[0]["name"] == "https" else
                            clone_links[1]["href"])

                    try:
                        # Note: this counts only the first page of watchers
                        # returned by the API, normalized against 100.
                        watchers = requests.get(
                                repo["links"]["watchers"]["href"])
                        rank = len(watchers.json()["values"]) / 100.0
                    except requests.ConnectionError as excep:
                        time.sleep(0.5)
                        self._logger.warning("API %s call failed: %s: %s",
                                next_api_url, excep.__class__.__name__, excep)
                        continue

                    self.clone_queue.put(indexer.GitRepository(
                            clone_url, repo["full_name"], "Bitbucket",
                            rank if rank < 1.0 else 1.0))

            # The final page of results carries no "next" URL.
            next_api_url = response.get("next")
            time.sleep(0.2)
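

# A minimal usage sketch, not part of the original module: wire both
# crawlers to a shared queue, as the docstrings above describe. The queue
# capacity here is an assumption; in bitshift proper, this wiring (and the
# indexer.GitIndexer consumer that drains the queue) is set up elsewhere
# in the package.
if __name__ == "__main__":
    import Queue

    logging.basicConfig(level=logging.INFO)
    clone_queue = Queue.Queue(maxsize=30)  # assumed capacity

    crawlers = [GitHubCrawler(clone_queue), BitbucketCrawler(clone_queue)]
    for crawler in crawlers:
        crawler.start()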