From b680756f8dba4f5ab3690f069f5520978846fc06 Mon Sep 17 00:00:00 2001
From: Severyn Kozak
Date: Wed, 16 Apr 2014 13:32:04 -0400
Subject: [PATCH] Test crawler, complete documentation.

Add, Fix:
    bitshift/crawler/
        __init__.py
            -add module and crawl() docstrings.
            -add repository_queue size limit.

        crawler.py
            -account for time spent executing an API query in the run() loop
            sleep() interval.
---
 bitshift/crawler/__init__.py                    |  18 +++-
 bitshift/crawler/crawler.py                     | 106 +++++++++++++++++-------
 bitshift/crawler/{git_indexer.py => indexer.py} |   0
 3 files changed, 91 insertions(+), 33 deletions(-)
 rename bitshift/crawler/{git_indexer.py => indexer.py} (100%)

diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py
index f38a187..6c13be9 100644
--- a/bitshift/crawler/__init__.py
+++ b/bitshift/crawler/__init__.py
@@ -1,3 +1,9 @@
+"""
+:synopsis: Parent crawler module, which supervises all crawlers.
+
+Contains functions for initializing all subsidiary, threaded crawlers.
+"""
+
 import Queue
 
 from bitshift.crawler import crawler
@@ -5,8 +11,18 @@ from bitshift.crawler import git_indexer
 
 __all__ = ["crawl"]
 
+MAX_URL_QUEUE_SIZE = 5e3
+
 def crawl():
-    repository_queue = Queue.Queue()
+    """
+    Initialize all crawlers (and indexers).
+
+    Start the:
+        1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`
+        2. Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer`
+    """
+
+    repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
     indexer = git_indexer.GitIndexer(repository_queue)
 
diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py
index fc1aadb..5b0f600 100644
--- a/bitshift/crawler/crawler.py
+++ b/bitshift/crawler/crawler.py
@@ -12,46 +12,88 @@ from ..codelet import Codelet
 from ..database import Database
 
 class GitHubCrawler(threading.Thread):
+    """
+    Crawler that retrieves links to all of GitHub's public repositories.
+
+    GitHubCrawler is a threaded singleton that queries GitHub's API for URLs
+    to its public repositories, which it inserts into a :class:`Queue.Queue`
+    shared with :class:`bitshift.crawler.git_indexer.GitIndexer`.
+
+    :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with
+        repository information retrieved by `GitHubCrawler`, and other Git
+        crawlers, to be processed by
+        :class:`bitshift.crawler.git_indexer.GitIndexer`.
+    """
+
     def __init__(self, repository_queue):
+        """
+        Create an instance of the singleton `GitHubCrawler`.
+
+        :param repository_queue: A queue containing dictionaries of repository
+            metadata retrieved by `GitHubCrawler`, meant to be processed by an
+            instance of :class:`bitshift.crawler.git_indexer.GitIndexer`.
+
+            .. code-block:: python
+                sample_dict = {
+                    "url" : "https://github.com/user/repo",
+                    "name" : "repo",
+                    "framework_name" : "GitHub"
+                }
+
+        :type repository_queue: :class:`Queue.Queue`
+        """
+
         self.repository_queue = repository_queue
         super(GitHubCrawler, self).__init__()
 
-    def run():
-        _github()
+    def run(self):
+        """
+        Query the GitHub API for data about every public repository.
 
-def _github():
-    """
-    Query the GitHub API for data about every public repository.
+        Pull all of GitHub's repositories by making calls to its API in a loop,
+        accessing a subsequent page of results via the "next" URL returned in an
+        API response header. Uses Severyn Kozak's (sevko) authentication
+        credentials.
+ """ - Pull all of GitHub's repositories by making calls to its API in a loop, - accessing a subsequent page of results via the "next" URL returned in an - API response header. Uses Severyn Kozak's (sevko) authentication - credentials. - """ + next_api_url = "https://api.github.com/repositories" + authentication_params = { + "client_id" : "436cb884ae09be7f2a4e", + "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" + } + api_request_interval = 5e3 / 60 ** 2 - next_api_url = "https://api.github.com/repositories" - authentication_params = { - "client_id" : "436cb884ae09be7f2a4e", - "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" - } - api_request_interval = 5e3 / 60 ** 2 - - while len(next_api_url) > 0: - start_time = time.time() - response = requests.get(next_api_url, params=authentication_params) - - for repo in response.json(): - self.repository_queue.put({ - "url" : repo["html_url"], - "framework_name" : "GitHub" + while len(next_api_url) > 0: + # DEBUG + db.log.insert({ + "time" : str(time.time()).split(".")[0][-4:], + "qsize" : self.repository_queue.qsize() }) - self.repository_queue.task_done() - if int(response.headers["x-ratelimit-remaining"]) == 0: - time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) + start_time = time.time() + response = requests.get(next_api_url, params=authentication_params) + + for repo in response.json(): + logging.basicConfig(filename="crawler.log", level=logging.DEBUG) + logging.debug("crawler: %-20s: %-5s: %-5s: %s", + str(time.time()).split(".")[0], + self.repository_queue.qsize(), repo["id"], + repo["name"]) + while self.repository_queue.full(): + pass + self.repository_queue.put({ + "url" : repo["html_url"], + "name" : repo["html_url"].split("/")[-1], + "framework_name" : "GitHub" + }) + + if int(response.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(response.headers["x-ratelimit-reset"]) - + time.time()) - next_api_url = response.headers["link"].split(">")[0][1:] + next_api_url = response.headers["link"].split(">")[0][1:] - sleep_time = api_request_interval - (time.time() - start_time) - if sleep_time > 0: - time.sleep(sleep_time) + sleep_time = api_request_interval - (time.time() - start_time) + if sleep_time > 0: + time.sleep(sleep_time) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/indexer.py similarity index 100% rename from bitshift/crawler/git_indexer.py rename to bitshift/crawler/indexer.py