Browse Source

Add GitHub repo star count, simple logging.

Add:
    bitshift/crawler/crawler.py
        -add `_get_repo_stars()` to `GitHubCrawler`, which queries the GitHub
        API for the number of stars that a given repository has.
        -log the `next_api_url` every time it's generated by `GitHubCrawler` and
        `BitbucketCrawler` to two respective log-files.
tags/v1.0^2
Severyn Kozak 10 years ago
parent
commit
1b2739f8c4
1 changed file with 45 additions and 6 deletions
  1. +45
    -6
      bitshift/crawler/crawler.py

+ 45
- 6
bitshift/crawler/crawler.py View File

@@ -24,6 +24,11 @@ class GitHubCrawler(threading.Thread):
crawlers, to be processed by :class:`indexer.GitIndexer`.
"""

AUTHENTICATION = {
"client_id" : "436cb884ae09be7f2a4e",
"client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
}

def __init__(self, clone_queue):
"""
Create an instance of the singleton `GitHubCrawler`.
@@ -48,10 +53,6 @@ class GitHubCrawler(threading.Thread):
"""

next_api_url = "https://api.github.com/repositories"
authentication_params = {
"client_id" : "436cb884ae09be7f2a4e",
"client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
}
api_request_interval = 5e3 / 60 ** 2

while len(next_api_url) > 0:
@@ -59,7 +60,7 @@ class GitHubCrawler(threading.Thread):

try:
response = requests.get(next_api_url,
params=authentication_params)
params=self.AUTHENTICATION)
except ConnectionError as exception:
continue

@@ -76,14 +77,49 @@ class GitHubCrawler(threading.Thread):

if int(response.headers["x-ratelimit-remaining"]) == 0:
time.sleep(int(response.headers["x-ratelimit-reset"]) -
time.time())
time.time())

next_api_url = response.headers["link"].split(">")[0][1:]
with open(".github_api.log", "w") as log_file:
log_file.write("%s\n" % next_api_url)

sleep_time = api_request_interval - (time.time() - start_time)
if sleep_time > 0:
time.sleep(sleep_time)

def _get_repo_stars(self, repo_name):
    """
    Return the number of stargazers for a repository.

    Queries the GitHub search API for the given repository and blocks
    (sleeps until the rate-limit window resets) if the query limit has
    been exhausted.

    :param repo_name: The name of the repository, in
        `username/repository_name` format.

    :type repo_name: str

    :return: The number of stargazers for the repository, or 0 if the
        search returned no results.
    :rtype: int
    """

    API_URL = "https://api.github.com/search/repositories"

    # Copy the shared credentials dict: assigning `params["q"]` directly on
    # `self.AUTHENTICATION` would mutate the class-level dict and leak the
    # `q` parameter into every other request (e.g. those made by `run()`).
    params = dict(self.AUTHENTICATION)
    params["q"] = "repo:%s" % repo_name

    resp = requests.get(API_URL,
            params=params,
            headers={
                # Preview media type required by the search API at the time.
                "Accept" : "application/vnd.github.preview"
            })

    if int(resp.headers["x-ratelimit-remaining"]) == 0:
        # Sleep until the rate-limit reset; guard against a negative
        # duration when the reset timestamp has already passed.
        sleep_time = int(resp.headers["x-ratelimit-reset"]) - time.time()
        if sleep_time > 0:
            time.sleep(sleep_time)

    items = resp.json().get("items")
    if not items:
        # No matching repository found; avoid an IndexError on items[0].
        return 0
    return int(items[0]["stargazers_count"])

class BitbucketCrawler(threading.Thread):
"""
Crawler that retrieves links to all of Bitbucket's public repositories.
@@ -145,4 +181,7 @@ class BitbucketCrawler(threading.Thread):
clone_url, repo["full_name"], "Bitbucket"))

next_api_url = response["next"]
with open(".bitbucket_api.log", "w") as log_file:
log_file.write("%s\n" % next_api_url)

time.sleep(0.2)

Loading…
Cancel
Save