From 1b2739f8c4439219d18a5f4f3d9bd02d3360ef85 Mon Sep 17 00:00:00 2001
From: Severyn Kozak
Date: Wed, 30 Apr 2014 15:20:15 -0400
Subject: [PATCH] Add GitHub repo star count, simple logging.

Add:
    bitshift/crawler/crawler.py
        -add `_get_repo_stars()` to `GitHubCrawler`, which queries the GitHub
        API for the number of stars that a given repository has.
        -log the `next_api_url` every time it's generated by `GitHubCrawler`
        and `BitbucketCrawler` to two respective log-files.
---
 bitshift/crawler/crawler.py | 51 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 6 deletions(-)

diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py
index 6196a13..e4b4929 100644
--- a/bitshift/crawler/crawler.py
+++ b/bitshift/crawler/crawler.py
@@ -24,6 +24,11 @@ class GitHubCrawler(threading.Thread):
     crawlers, to be processed by :class:`indexer.GitIndexer`.
     """
 
+    AUTHENTICATION = {
+        "client_id" : "436cb884ae09be7f2a4e",
+        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
+    }
+
     def __init__(self, clone_queue):
         """
         Create an instance of the singleton `GitHubCrawler`.
@@ -48,10 +53,6 @@ class GitHubCrawler(threading.Thread):
         """
 
         next_api_url = "https://api.github.com/repositories"
-        authentication_params = {
-            "client_id" : "436cb884ae09be7f2a4e",
-            "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
-        }
         api_request_interval = 5e3 / 60 ** 2
 
         while len(next_api_url) > 0:
@@ -59,7 +60,7 @@ class GitHubCrawler(threading.Thread):
 
             try:
                 response = requests.get(next_api_url,
-                        params=authentication_params)
+                        params=self.AUTHENTICATION)
             except ConnectionError as exception:
                 continue
 
@@ -76,14 +77,49 @@ class GitHubCrawler(threading.Thread):
 
             if int(response.headers["x-ratelimit-remaining"]) == 0:
                 time.sleep(int(response.headers["x-ratelimit-reset"]) -
-                        time.time())
+                    time.time())
 
             next_api_url = response.headers["link"].split(">")[0][1:]
+            with open(".github_api.log", "w") as log_file:
+                log_file.write("%s\n" % next_api_url)
 
             sleep_time = api_request_interval - (time.time() - start_time)
             if sleep_time > 0:
                 time.sleep(sleep_time)
 
+    def _get_repo_stars(self, repo_name):
+        """
+        Return the number of stargazers for a repository.
+
+        Queries the GitHub API for the number of stargazers for a given
+        repository, and blocks if the query limit is exceeded.
+
+        :param repo_name: The name of the repository, in
+            `username/repository_name` format.
+
+        :type repo_name: str
+
+        :return: The number of stargazers for the repository.
+        :rtype: int
+        """
+
+        API_URL = "https://api.github.com/search/repositories"
+
+
+        params = self.AUTHENTICATION
+        params["q"] = "repo:%s" % repo_name
+
+        resp = requests.get(API_URL,
+                params=params,
+                headers={
+                    "Accept" : "application/vnd.github.preview"
+                })
+
+        if int(resp.headers["x-ratelimit-remaining"]) == 0:
+            time.sleep(int(resp.headers["x-ratelimit-reset"]) - time.time())
+
+        return int(resp.json()["items"][0]["stargazers_count"])
+
 class BitbucketCrawler(threading.Thread):
     """
     Crawler that retrieves links to all of Bitbucket's public repositories.
@@ -145,4 +181,7 @@ class BitbucketCrawler(threading.Thread):
                     clone_url, repo["full_name"], "Bitbucket"))
 
             next_api_url = response["next"]
+            with open(".bitbucket_api.log", "w") as log_file:
+                log_file.write("%s\n" % next_api_url)
+
             time.sleep(0.2)
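
Two details in the new hunks are worth flagging. In `_get_repo_stars()`, `params = self.AUTHENTICATION` binds the class-level dict rather than copying it, so `params["q"] = ...` adds a `q` key to the credentials that `run()` reuses via `params=self.AUTHENTICATION`. Separately, `time.sleep()` raises `ValueError` on a negative argument, which can happen if the rate-limit window has already reset by the time the delay is computed. Below is a minimal sketch of the method with both issues addressed; it assumes the same endpoint, headers, and response shape as the patch, and the module-level `requests` and `time` imports the rest of crawler.py already relies on:

    def _get_repo_stars(self, repo_name):
        """
        Return the number of stargazers for a repository.

        Sketch variant of the patched method: copies the credential dict
        and clamps the rate-limit sleep; otherwise mirrors the hunk above.
        """

        api_url = "https://api.github.com/search/repositories"

        # dict() copies the class attribute, so adding "q" here cannot
        # leak into the `params=self.AUTHENTICATION` request in run().
        params = dict(self.AUTHENTICATION)
        params["q"] = "repo:%s" % repo_name

        resp = requests.get(api_url, params=params,
                headers={"Accept" : "application/vnd.github.preview"})

        if int(resp.headers["x-ratelimit-remaining"]) == 0:
            # The reset timestamp may already be in the past; a negative
            # argument would make time.sleep() raise ValueError.
            time.sleep(max(0, int(resp.headers["x-ratelimit-reset"]) -
                    time.time()))

        return int(resp.json()["items"][0]["stargazers_count"])

A related wrinkle in the logging hunks: both log files are opened with mode "w", which truncates the file on every iteration, so each file only ever holds the most recent `next_api_url`. Opening with mode "a" would preserve the URL history that the commit message describes.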