瀏覽代碼

Add GitHub stars, Bitbucket watchers; close #14.

Add:
    bitshift/crawler/crawler.py
        -Add a more efficient method of querying GitHub's API for stargazer
        counts, by batching 25 repositories per request.
        -Add watcher counts for Bitbucket repositories, by querying the
        Bitbucket API once per repository (inefficient, but the API in question
        isn't sufficiently robust to accommodate a better approach, and Git
        repositories surface so infrequently that there shouldn't be any query
        limit problems).
tags/v1.0^2
Severyn Kozak 10 年之前
父節點
當前提交
7c5c9fc7e1
共有 2 個文件被更改,包括 71 次插入41 次删除
  1. +1
    -0
      bitshift/crawler/__init__.py
  2. +70
    -41
      bitshift/crawler/crawler.py

+ 1
- 0
bitshift/crawler/__init__.py 查看文件

@@ -39,6 +39,7 @@ def _configure_logging():
os.mkdir(LOG_FILE_DIR)

logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

formatter = logging.Formatter(
fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"


+ 70
- 41
bitshift/crawler/crawler.py 查看文件

@@ -63,8 +63,7 @@ class GitHubCrawler(threading.Thread):
start_time = time.time()

try:
response = requests.get(next_api_url,
params=self.AUTHENTICATION)
resp = requests.get(next_api_url, params=self.AUTHENTICATION)
except ConnectionError as excep:
self._logger.warning("API %s call failed: %s: %s",
next_api_url, excep.__class__.__name__, excep)
@@ -77,66 +76,84 @@ class GitHubCrawler(threading.Thread):
((self.clone_queue.qsize(), self.clone_queue.maxsize,
queue_percent_full)))

for repo in response.json():
repo_names = [repo["full_name"] for repo in resp.json()]
repo_stars = self._get_repositories_stars(repo_names)

for repo in resp.json():
while self.clone_queue.full():
time.sleep(1)

self.clone_queue.put(indexer.GitRepository(
repo["html_url"], repo["full_name"].replace("/", ""),
"GitHub",
#self._get_repo_stars(repo["full_name"]))
0))
"GitHub", repo_stars[repo["full_name"]]))

if int(response.headers["x-ratelimit-remaining"]) == 0:
time.sleep(int(response.headers["x-ratelimit-reset"]) -
if int(resp.headers["x-ratelimit-remaining"]) == 0:
time.sleep(int(resp.headers["x-ratelimit-reset"]) -
time.time())

next_api_url = response.headers["link"].split(">")[0][1:]
next_api_url = resp.headers["link"].split(">")[0][1:]

sleep_time = api_request_interval - (time.time() - start_time)
if sleep_time > 0:
time.sleep(sleep_time)

def _get_repo_stars(self, repo_name):
def _get_repositories_stars(self, repo_names):
"""
Return the number of stargazers for a repository.
Return the number of stargazers for several repositories.

Queries the GitHub API for the number of stargazers for a given
repository, and blocks if the query limit is exceeded.
Queries the GitHub API for the number of stargazers for any given
repositories, and blocks if the query limit is exceeded.

:param repo_name: The name of the repository, in
:param repo_names: An array of repository names, in
`username/repository_name` format.

:type repo_name: str

:return: The number of stargazers for the repository.
:rtype: int
"""

API_URL = "https://api.github.com/search/repositories"
:type repo_names: str

params = self.AUTHENTICATION
params["q"] = "repo:%s" % repo_name
:return: A dictionary with repository name keys, and corresponding
stargazer count values.

resp = requests.get(API_URL,
params=params,
headers={
"Accept" : "application/vnd.github.preview"
})
Example dictionary:
.. code-block:: python
{
"user/repository" : 100
}

if int(resp.headers["x-ratelimit-remaining"]) == 0:
sleep_time = int(resp.headers["x-ratelimit-reset"]) - time.time()
if sleep_time > 0:
logging.info("API quota exceeded. Sleep time: %d." % sleep_time)
time.sleep(sleep_time)
:rtype: dictionary
"""

if "items" not in resp.json() or len(resp.json()["items"]) == 0:
self._logger.critical("No API result: %s. Result: %s" % (resp.url,
str(resp.json())))
return 0
else:
rank = float(resp.json()["items"][0]["stargazers_count"]) / 1000
return rank if rank < 1.0 else 1.0
API_URL = "https://api.github.com/search/repositories"
REPOS_PER_QUERY = 25

repo_stars = {}
for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
xrange(0, len(repo_names), REPOS_PER_QUERY)]:
query_url = "%s?q=%s" % (API_URL,
"+".join("repo:%s" % name for name in names))

params = self.AUTHENTICATION
resp = requests.get(query_url,
params=params,
headers={
"Accept" : "application/vnd.github.preview"
})

if int(resp.headers["x-ratelimit-remaining"]) == 0:
sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
time.time() + 1
if sleep_time > 0:
logging.info("API quota exceeded. Sleep time: %d." %
sleep_time)
time.sleep(sleep_time)

for repo in resp.json()["items"]:
rank = float(repo["stargazers_count"]) / 1000
repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0

for name in repo_names:
if name not in repo_stars:
repo_stars[name] = 0.5

return repo_stars

class BitbucketCrawler(threading.Thread):
"""
@@ -204,8 +221,20 @@ class BitbucketCrawler(threading.Thread):
clone_links[0]["name"] == "https" else
clone_links[1]["href"])
links.append("clone_url")

try:
watchers = requests.get(
repo["links"]["watchers"]["href"])
rank = len(watchers.json()["values"]) / 100
except ConnectionError as exception:
time.sleep(0.5)
self._logger.warning("API %s call failed: %s: %s",
next_api_url, excep.__class__.__name__, excep)
continue

self.clone_queue.put(indexer.GitRepository(
clone_url, repo["full_name"], "Bitbucket"))
clone_url, repo["full_name"], "Bitbucket"),
rank if rank < 1.0 else 1.0)

next_api_url = response["next"]
time.sleep(0.2)

Loading…
取消
儲存