
Use logs to calculate ranks (closes #61).

tags/v1.0^2
Ben Kurtovic, 10 years ago
commit ddcb5b221f
1 changed file, with 19 additions and 17 deletions:

bitshift/crawler/crawler.py

@@ -5,6 +5,7 @@ Contains all website/framework-specific Class crawlers.
 """
 
 import logging
+import math
 import time
 import threading
 
@@ -78,7 +79,7 @@ class GitHubCrawler(threading.Thread):
                     queue_percent_full)))
 
             repo_names = [repo["full_name"] for repo in resp.json()]
-            repo_stars = self._get_repositories_stars(repo_names)
+            repo_ranks = self._get_repository_ranks(repo_names)
 
             for repo in resp.json():
                 while self.clone_queue.full():
@@ -86,7 +87,7 @@ class GitHubCrawler(threading.Thread):
 
                 self.clone_queue.put(indexer.GitRepository(
                     repo["html_url"], repo["full_name"].replace("/", ""),
-                    "GitHub", repo_stars[repo["full_name"]]))
+                    "GitHub", repo_ranks[repo["full_name"]]))
 
             if int(resp.headers["x-ratelimit-remaining"]) == 0:
                 time.sleep(int(resp.headers["x-ratelimit-reset"]) -
@@ -98,25 +99,25 @@ class GitHubCrawler(threading.Thread):
             if sleep_time > 0:
                 time.sleep(sleep_time)
 
-    def _get_repositories_stars(self, repo_names):
+    def _get_repository_ranks(self, repo_names):
         """
-        Return the number of stargazers for several repositories.
+        Return the ranks for several repositories.
 
         Queries the GitHub API for the number of stargazers for any given
-        repositories, and blocks if the query limit is exceeded.
+        repositories, and blocks if the query limit is exceeded. The rank is
+        calculated using these numbers.
 
         :param repo_names: An array of repository names, in
             `username/repository_name` format.
 
         :type repo_names: str
 
-        :return: A dictionary with repository name keys, and corresponding
-            stargazer count values.
+        :return: A dictionary mapping repository names to ranks.
 
         Example dictionary:
         .. code-block:: python
             {
-                "user/repository" : 100
+                "user/repository" : 0.2564949357461537
             }
 
         :rtype: dictionary
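
A note on the docstring's example value (not part of the commit): with the base-5000 log formula introduced further down in this diff, a rank of about 0.256 backs out to a repository with roughly 9 stars, since 5000 ** 0.2564949357461537 is about 8.9. A quick check in Python 2, matching the codebase:

    import math

    print 5000 ** 0.2564949357461537  # ~8.9, the star count behind the example rank
    print math.log(9, 5000)           # ~0.258, close to the docstring's example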
@@ -125,7 +126,7 @@ class GitHubCrawler(threading.Thread):
         API_URL = "https://api.github.com/search/repositories"
         REPOS_PER_QUERY = 25
 
-        repo_stars = {}
+        repo_ranks = {}
         for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
                 xrange(0, len(repo_names), REPOS_PER_QUERY)]:
             query_url = "%s?q=%s" % (API_URL,
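
For readers unfamiliar with the slicing idiom above: it splits repo_names into chunks of REPOS_PER_QUERY names, one GitHub search query per chunk. A minimal standalone sketch of the same pattern (Python 2, as in the codebase; the names are hypothetical):

    REPOS_PER_QUERY = 25
    repo_names = ["user/repo%d" % i for i in xrange(60)]  # 60 fake names

    # Same slicing pattern as the diff: step through the list 25 names at a time.
    batches = [repo_names[ind:ind + REPOS_PER_QUERY]
               for ind in xrange(0, len(repo_names), REPOS_PER_QUERY)]
    print [len(batch) for batch in batches]  # [25, 25, 10]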
@@ -147,14 +148,15 @@ class GitHubCrawler(threading.Thread):
                 time.sleep(sleep_time)
 
             for repo in resp.json()["items"]:
-                rank = float(repo["stargazers_count"]) / 1000
-                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0
+                stars = repo["stargazers_count"]
+                rank = min(math.log(max(stars, 1), 5000), 1.0)
+                repo_ranks[repo["full_name"]] = rank
 
         for name in repo_names:
-            if name not in repo_stars:
-                repo_stars[name] = 0.5
+            if name not in repo_ranks:
+                repo_ranks[name] = 0.1
 
-        return repo_stars
+        return repo_ranks
 
 class BitbucketCrawler(threading.Thread):
     """
@@ -225,7 +227,8 @@ class BitbucketCrawler(threading.Thread):
                 try:
                     watchers = requests.get(
                         repo["links"]["watchers"]["href"])
-                    rank = len(watchers.json()["values"]) / 100
+                    num = len(watchers.json()["values"])
+                    rank = min(math.log(max(num, 1), 500), 1.0)
                 except requests.ConnectionError:
                     err = "API %s call failed:" % next_api_url
                     self._logger.exception(err)
@@ -233,8 +236,7 @@ class BitbucketCrawler(threading.Thread):
                     continue
 
                 self.clone_queue.put(indexer.GitRepository(
-                    clone_url, repo["full_name"], "Bitbucket"),
-                    rank if rank < 1.0 else 1.0)
+                    clone_url, repo["full_name"], "Bitbucket"), rank)
 
                 next_api_url = response["next"]
                 time.sleep(0.2)
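
The Bitbucket branch applies the same idea with a base of 500 over the watcher count; presumably the smaller base reflects that Bitbucket watcher counts run far lower than GitHub star counts (the base is the commit's choice, the reasoning is an assumption). A matching sketch with a hypothetical helper (Python 2):

    import math

    def bitbucket_rank(num_watchers):
        # base-500 log of the watcher count, clamped to [0.0, 1.0]
        return min(math.log(max(num_watchers, 1), 500), 1.0)

    print bitbucket_rank(1)    # 0.0
    print bitbucket_rank(25)   # ~0.518
    print bitbucket_rank(500)  # 1.0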
