From 7c5c9fc7e1c99c1d67146570c43e60d0b04c899f Mon Sep 17 00:00:00 2001
From: Severyn Kozak
Date: Sat, 3 May 2014 22:20:12 -0400
Subject: [PATCH] Add GitHub stars, Bitbucket watchers; close #14.

Add:
    bitshift/crawler/crawler.py
        -Add a more efficient method of querying GitHub's API for stargazer
        counts, by batching 25 repositories per request.
        -Add watcher counts for Bitbucket repositories, by querying the
        Bitbucket API once per repository (inefficient, but that API isn't
        robust enough to accommodate a better approach, and Git repositories
        surface so infrequently that there shouldn't be any query-limit
        problems).
---
 bitshift/crawler/__init__.py |   1 +
 bitshift/crawler/crawler.py  | 111 +++++++++++++++++++++++++++----------------
 2 files changed, 71 insertions(+), 41 deletions(-)

diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py
index cfec64c..73b1c22 100644
--- a/bitshift/crawler/__init__.py
+++ b/bitshift/crawler/__init__.py
@@ -39,6 +39,7 @@ def _configure_logging():
         os.mkdir(LOG_FILE_DIR)
 
     logging.getLogger("requests").setLevel(logging.WARNING)
+    logging.getLogger("urllib3").setLevel(logging.WARNING)
 
     formatter = logging.Formatter(
         fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py
index 785ac61..9501bd0 100644
--- a/bitshift/crawler/crawler.py
+++ b/bitshift/crawler/crawler.py
@@ -63,8 +63,7 @@ class GitHubCrawler(threading.Thread):
             start_time = time.time()
 
             try:
-                response = requests.get(next_api_url,
-                        params=self.AUTHENTICATION)
+                resp = requests.get(next_api_url, params=self.AUTHENTICATION)
             except ConnectionError as excep:
                 self._logger.warning("API %s call failed: %s: %s",
                         next_api_url, excep.__class__.__name__, excep)
@@ -77,66 +76,84 @@
                     ((self.clone_queue.qsize(), self.clone_queue.maxsize,
                     queue_percent_full)))
 
-            for repo in response.json():
+            repo_names = [repo["full_name"] for repo in resp.json()]
+            repo_stars = self._get_repositories_stars(repo_names)
+
+            for repo in resp.json():
                 while self.clone_queue.full():
                     time.sleep(1)
 
                 self.clone_queue.put(indexer.GitRepository(
                         repo["html_url"], repo["full_name"].replace("/", ""),
-                        "GitHub",
-                        #self._get_repo_stars(repo["full_name"]))
-                        0))
+                        "GitHub", repo_stars[repo["full_name"]]))
 
-            if int(response.headers["x-ratelimit-remaining"]) == 0:
-                time.sleep(int(response.headers["x-ratelimit-reset"]) -
+            if int(resp.headers["x-ratelimit-remaining"]) == 0:
+                time.sleep(int(resp.headers["x-ratelimit-reset"]) -
                         time.time())
 
-            next_api_url = response.headers["link"].split(">")[0][1:]
+            next_api_url = resp.headers["link"].split(">")[0][1:]
 
             sleep_time = api_request_interval - (time.time() - start_time)
             if sleep_time > 0:
                 time.sleep(sleep_time)
 
-    def _get_repo_stars(self, repo_name):
+    def _get_repositories_stars(self, repo_names):
         """
-        Return the number of stargazers for a repository.
+        Return a normalized stargazer rank for each given repository.
 
-        Queries the GitHub API for the number of stargazers for a given
-        repository, and blocks if the query limit is exceeded.
+        Queries the GitHub API for the stargazer counts of the given
+        repositories, and blocks if the query limit is exceeded.
 
-        :param repo_name: The name of the repository, in
+        :param repo_names: A list of repository names, in
             `username/repository_name` format.
 
-        :type repo_name: str
-
-        :return: The number of stargazers for the repository.
-        :rtype: int
-        """
-
-        API_URL = "https://api.github.com/search/repositories"
+        :type repo_names: list of str
 
-        params = self.AUTHENTICATION
-        params["q"] = "repo:%s" % repo_name
+        :return: A dictionary mapping repository names to normalized
+            stargazer ranks (stargazer count / 1000, capped at 1.0).
 
-        resp = requests.get(API_URL,
-                params=params,
-                headers={
-                    "Accept" : "application/vnd.github.preview"
-                })
+            Example dictionary:
+            .. code-block:: python
+
+                {
+                    "user/repository" : 0.1
+                }
 
-        if int(resp.headers["x-ratelimit-remaining"]) == 0:
-            sleep_time = int(resp.headers["x-ratelimit-reset"]) - time.time()
-            if sleep_time > 0:
-                logging.info("API quota exceeded. Sleep time: %d." % sleep_time)
-                time.sleep(sleep_time)
+        :rtype: dict
+        """
 
-        if "items" not in resp.json() or len(resp.json()["items"]) == 0:
-            self._logger.critical("No API result: %s. Result: %s" % (resp.url,
-                    str(resp.json())))
-            return 0
-        else:
-            rank = float(resp.json()["items"][0]["stargazers_count"]) / 1000
-            return rank if rank < 1.0 else 1.0
+        API_URL = "https://api.github.com/search/repositories"
+        REPOS_PER_QUERY = 25
+
+        repo_stars = {}
+        # Batch the repositories into search queries with multiple "repo:"
+        # qualifiers, rather than issuing one request per repository.
+        for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
+                xrange(0, len(repo_names), REPOS_PER_QUERY)]:
+            # Build the query string by hand; requests would URL-encode the
+            # "+" separators if the query were passed via `params`.
+            query_url = "%s?q=%s" % (API_URL,
+                    "+".join("repo:%s" % name for name in names))
+
+            resp = requests.get(query_url,
+                    params=self.AUTHENTICATION,
+                    headers={
+                        "Accept" : "application/vnd.github.preview"
+                    })
+
+            if int(resp.headers["x-ratelimit-remaining"]) == 0:
+                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
+                        time.time() + 1
+                if sleep_time > 0:
+                    self._logger.info("API quota exceeded. Sleep time: %d." %
+                            sleep_time)
+                    time.sleep(sleep_time)
+
+            for repo in resp.json()["items"]:
+                rank = float(repo["stargazers_count"]) / 1000
+                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0
+
+        # Repositories the search API didn't return (renamed, deleted, etc.)
+        # fall back to a neutral middle-of-the-range rank.
+        for name in repo_names:
+            if name not in repo_stars:
+                repo_stars[name] = 0.5
+
+        return repo_stars
 
 class BitbucketCrawler(threading.Thread):
     """
@@ -204,8 +221,20 @@ class BitbucketCrawler(threading.Thread):
                     clone_links[0]["name"] == "https" else
                     clone_links[1]["href"])
             links.append("clone_url")
+
+            try:
+                watchers = requests.get(
+                        repo["links"]["watchers"]["href"])
+                # Use a float literal so the rank survives Python 2's
+                # integer division.
+                rank = len(watchers.json()["values"]) / 100.0
+            except ConnectionError as excep:
+                time.sleep(0.5)
+                self._logger.warning("API %s call failed: %s: %s",
+                        next_api_url, excep.__class__.__name__, excep)
+                continue
+
             self.clone_queue.put(indexer.GitRepository(
-                    clone_url, repo["full_name"], "Bitbucket"))
+                    clone_url, repo["full_name"], "Bitbucket",
+                    rank if rank < 1.0 else 1.0))
 
             next_api_url = response["next"]
             time.sleep(0.2)
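
Note on the batched stargazer queries: the patch joins up to 25 "repo:"
qualifiers into a single search request, which is why the query URL is built
by hand instead of through a `params` dict (requests would URL-encode the "+"
separators). Below is a minimal standalone sketch of the same technique; the
function name fetch_star_ranks and the example repository list are
illustrative only, and authentication plus rate-limit handling are omitted
for brevity:

    import requests

    def fetch_star_ranks(repo_names, batch_size=25):
        # Map "owner/name" repository names to a 0.0-1.0 rank derived from
        # stargazer counts, querying the search API in batches.
        api_url = "https://api.github.com/search/repositories"
        repo_stars = {}
        for ind in range(0, len(repo_names), batch_size):
            batch = repo_names[ind:ind + batch_size]
            # Join the "repo:" qualifiers manually to keep the "+"
            # separators intact.
            query_url = "%s?q=%s" % (api_url,
                    "+".join("repo:%s" % name for name in batch))
            resp = requests.get(query_url,
                    headers={"Accept": "application/vnd.github.preview"})
            for repo in resp.json().get("items", []):
                rank = repo["stargazers_count"] / 1000.0
                repo_stars[repo["full_name"]] = min(rank, 1.0)
        return repo_stars

    print(fetch_star_ranks(["torvalds/linux", "user/repository"]))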
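
A similar sketch for the Bitbucket side, assuming the watchers URL that the
API's repository links point at (watcher_rank and the example name are
illustrative). Note that the Bitbucket 2.0 API paginates, so len(values)
counts only the first page of watchers, a limitation the patch shares;
preferring a total "size" field, when the response carries one, avoids
undercounting:

    import requests

    def watcher_rank(full_name):
        # Scale a repository's watcher count into the crawler's 0.0-1.0
        # rank range (100 or more watchers saturates the rank).
        url = ("https://api.bitbucket.org/2.0/repositories/%s/watchers"
                % full_name)
        payload = requests.get(url).json()
        count = payload.get("size", len(payload.get("values", [])))
        return min(count / 100.0, 1.0)

    print(watcher_rank("user/repository"))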