Browse Source

Use logs to calculate ranks (closes #61).

tags/v1.0^2
Ben Kurtovic 10 years ago
parent
commit
ddcb5b221f
1 changed file with 19 additions and 17 deletions
  1. +19
    -17
      bitshift/crawler/crawler.py

+ 19
- 17
bitshift/crawler/crawler.py View File

@@ -5,6 +5,7 @@ Contains all website/framework-specific Class crawlers.
 """
 
 
 import logging
+import math
 import time
 import threading

@@ -78,7 +79,7 @@ class GitHubCrawler(threading.Thread):
                         queue_percent_full)))
 
             repo_names = [repo["full_name"] for repo in resp.json()]
-            repo_stars = self._get_repositories_stars(repo_names)
+            repo_ranks = self._get_repository_ranks(repo_names)
 
             for repo in resp.json():
                 while self.clone_queue.full():
@@ -86,7 +87,7 @@ class GitHubCrawler(threading.Thread):
 
                 self.clone_queue.put(indexer.GitRepository(
                     repo["html_url"], repo["full_name"].replace("/", ""),
-                    "GitHub", repo_stars[repo["full_name"]]))
+                    "GitHub", repo_ranks[repo["full_name"]]))
 
             if int(resp.headers["x-ratelimit-remaining"]) == 0:
                 time.sleep(int(resp.headers["x-ratelimit-reset"]) -
@@ -98,25 +99,25 @@ class GitHubCrawler(threading.Thread):
             if sleep_time > 0:
                 time.sleep(sleep_time)
 
-    def _get_repositories_stars(self, repo_names):
+    def _get_repository_ranks(self, repo_names):
         """
-        Return the number of stargazers for several repositories.
+        Return the ranks for several repositories.
 
         Queries the GitHub API for the number of stargazers for any given
-        repositories, and blocks if the query limit is exceeded.
+        repositories, and blocks if the query limit is exceeded. The rank is
+        calculated using these numbers.
 
         :param repo_names: An array of repository names, in
             `username/repository_name` format.
 
         :type repo_names: str
 
-        :return: A dictionary with repository name keys, and corresponding
-            stargazer count values.
+        :return: A dictionary mapping repository names to ranks.
 
             Example dictionary:
             .. code-block:: python
                 {
-                    "user/repository" : 100
+                    "user/repository" : 0.2564949357461537
                 }
 
         :rtype: dictionary
@@ -125,7 +126,7 @@ class GitHubCrawler(threading.Thread):
         API_URL = "https://api.github.com/search/repositories"
         REPOS_PER_QUERY = 25
 
-        repo_stars = {}
+        repo_ranks = {}
         for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
                 xrange(0, len(repo_names), REPOS_PER_QUERY)]:
             query_url = "%s?q=%s" % (API_URL,
@@ -147,14 +148,15 @@ class GitHubCrawler(threading.Thread):
                 time.sleep(sleep_time)
 
             for repo in resp.json()["items"]:
-                rank = float(repo["stargazers_count"]) / 1000
-                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0
+                stars = repo["stargazers_count"]
+                rank = min(math.log(max(stars, 1), 5000), 1.0)
+                repo_ranks[repo["full_name"]] = rank
 
         for name in repo_names:
-            if name not in repo_stars:
-                repo_stars[name] = 0.5
+            if name not in repo_ranks:
+                repo_ranks[name] = 0.1
 
-        return repo_stars
+        return repo_ranks
 
 
 class BitbucketCrawler(threading.Thread):
     """
@@ -225,7 +227,8 @@ class BitbucketCrawler(threading.Thread):
             try:
                 watchers = requests.get(
                         repo["links"]["watchers"]["href"])
-                rank = len(watchers.json()["values"]) / 100
+                num = len(watchers.json()["values"])
+                rank = min(math.log(max(num, 1), 500), 1.0)
             except requests.ConnectionError:
                 err = "API %s call failed:" % next_api_url
                 self._logger.exception(err)
@@ -233,8 +236,7 @@ class BitbucketCrawler(threading.Thread):
                 continue
 
             self.clone_queue.put(indexer.GitRepository(
-                clone_url, repo["full_name"], "Bitbucket"),
-                rank if rank < 1.0 else 1.0)
+                clone_url, repo["full_name"], "Bitbucket"), rank)
 
             next_api_url = response["next"]
             time.sleep(0.2)

Loading…
Cancel
Save