
Test crawler, complete documentation.

Add, Fix:
    bitshift/crawler/
        __init__.py
            -add module and crawl() docstrings.
            -add repository_queue size limit.

        crawler.py
            -account for the time spent executing an API query in the run()
            loop's sleep() interval (see the sketch below).
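The second item above comes down to one pattern: measure how long the API request itself took and sleep only for the remainder of the fixed per-request interval. A minimal sketch of that pattern follows; REQUESTS_PER_HOUR, poll(), and the bare requests.get() call are illustrative stand-ins, not the crawler's own names.

    # Sketch only: keep a steady request cadence by subtracting each request's
    # own latency from the target interval before sleeping.
    import time

    import requests

    REQUESTS_PER_HOUR = 5e3                          # GitHub's authenticated limit
    REQUEST_INTERVAL = 60 ** 2 / REQUESTS_PER_HOUR   # seconds between request starts

    def poll(url, pages):
        for _ in range(pages):
            start_time = time.time()
            response = requests.get(url)             # the latency being accounted for
            # ... process `response` here ...

            elapsed = time.time() - start_time
            if elapsed < REQUEST_INTERVAL:
                time.sleep(REQUEST_INTERVAL - elapsed)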
tags/v1.0^2
Severyn Kozak, 10 years ago
commit b680756f8d
3 changed files with 91 additions and 33 deletions
  1. bitshift/crawler/__init__.py (+17, -1)
  2. bitshift/crawler/crawler.py (+74, -32)
  3. bitshift/crawler/indexer.py (+0, -0)

bitshift/crawler/__init__.py (+17, -1)

@@ -1,3 +1,9 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import Queue

from bitshift.crawler import crawler
@@ -5,8 +11,18 @@ from bitshift.crawler import git_indexer

 __all__ = ["crawl"]
 
+MAX_URL_QUEUE_SIZE = 5e3
+
 def crawl():
-    repository_queue = Queue.Queue()
+    """
+    Initialize all crawlers (and indexers).
+
+    Start the:
+        1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`
+        2. Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer`
+    """
+
+    repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
     indexer = git_indexer.GitIndexer(repository_queue)
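
The maxsize added above turns repository_queue into a bounded buffer between the GitHub crawler (producer) and the Git indexer (consumer). Below is a small producer/consumer sketch of that behaviour, written against Python 2's Queue module as imported in the diff; the worker functions, URLs, and the sentinel value are illustrative, not part of bitshift.

    # Sketch only: Queue.Queue(maxsize=N) makes put() block once N items are
    # waiting, which caps memory use whenever the consumer falls behind.
    import Queue
    import threading

    MAX_QUEUE_SIZE = 5000

    repo_queue = Queue.Queue(maxsize=MAX_QUEUE_SIZE)

    def producer():
        for number in range(20):
            repo_queue.put({"url": "https://example.com/repo/%d" % number})  # blocks when full
        repo_queue.put(None)                 # sentinel: nothing more to index

    def consumer():
        while True:
            item = repo_queue.get()
            if item is None:
                break
            # ... index the repository described by `item` ...
            repo_queue.task_done()           # the consumer, not the producer, marks items done

    threads = [threading.Thread(target=producer), threading.Thread(target=consumer)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

Because put() already blocks on a full queue, a producer does not strictly need to busy-wait on full() before inserting; a consumer that keeps up is enough to drain the queue.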



bitshift/crawler/crawler.py (+74, -32)

@@ -12,46 +12,88 @@ from ..codelet import Codelet
 from ..database import Database
 
 class GitHubCrawler(threading.Thread):
+    """
+    Crawler that retrieves links to all of GitHub's public repositories.
+
+    GitHubCrawler is a threaded singleton that queries GitHub's API for URLs
+    to its public repositories, which it inserts into a :class:`Queue.Queue`
+    shared with :class:`bitshift.crawler.git_indexer.GitIndexer`.
+
+    :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with
+        repository information retrieved by `GitHubCrawler`, and other Git
+        crawlers, to be processed by
+        :class:`bitshift.crawler.git_indexer.GitIndexer`.
+    """
+
     def __init__(self, repository_queue):
+        """
+        Create an instance of the singleton `GitHubCrawler`.
+
+        :param repository_queue: A queue containing dictionaries of repository
+            metadata retrieved by `GitHubCrawler`, meant to be processed by an
+            instance of :class:`bitshift.crawler.git_indexer.GitIndexer`.
+
+            .. code-block:: python
+                sample_dict = {
+                    "url" : "https://github.com/user/repo",
+                    "name" : "repo",
+                    "framework_name" : "GitHub"
+                }
+
+        :type repository_queue: :class:`Queue.Queue`
+        """
+
         self.repository_queue = repository_queue
         super(GitHubCrawler, self).__init__()
 
-    def run():
-        _github()
+    def run(self):
+        """
+        Query the GitHub API for data about every public repository.
 
-def _github():
-    """
-    Query the GitHub API for data about every public repository.
-
-    Pull all of GitHub's repositories by making calls to its API in a loop,
-    accessing a subsequent page of results via the "next" URL returned in an
-    API response header. Uses Severyn Kozak's (sevko) authentication
-    credentials.
-    """
-
-    next_api_url = "https://api.github.com/repositories"
-    authentication_params = {
-        "client_id" : "436cb884ae09be7f2a4e",
-        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
-    }
-    api_request_interval = 5e3 / 60 ** 2
-
-    while len(next_api_url) > 0:
-        start_time = time.time()
-        response = requests.get(next_api_url, params=authentication_params)
-
-        for repo in response.json():
-            self.repository_queue.put({
-                "url" : repo["html_url"],
-                "framework_name" : "GitHub"
-            })
-            self.repository_queue.task_done()
-
-        if int(response.headers["x-ratelimit-remaining"]) == 0:
-            time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time())
-
-        next_api_url = response.headers["link"].split(">")[0][1:]
-
-        sleep_time = api_request_interval - (time.time() - start_time)
-        if sleep_time > 0:
-            time.sleep(sleep_time)
+        Pull all of GitHub's repositories by making calls to its API in a loop,
+        accessing a subsequent page of results via the "next" URL returned in an
+        API response header. Uses Severyn Kozak's (sevko) authentication
+        credentials.
+        """
+
+        next_api_url = "https://api.github.com/repositories"
+        authentication_params = {
+            "client_id" : "436cb884ae09be7f2a4e",
+            "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
+        }
+        api_request_interval = 5e3 / 60 ** 2
+
+        while len(next_api_url) > 0:
+            # DEBUG
+            db.log.insert({
+                "time" : str(time.time()).split(".")[0][-4:],
+                "qsize" : self.repository_queue.qsize()
+            })
+
+            start_time = time.time()
+            response = requests.get(next_api_url, params=authentication_params)
+
+            for repo in response.json():
+                logging.basicConfig(filename="crawler.log", level=logging.DEBUG)
+                logging.debug("crawler: %-20s: %-5s: %-5s: %s",
+                        str(time.time()).split(".")[0],
+                        self.repository_queue.qsize(), repo["id"],
+                        repo["name"])
+                while self.repository_queue.full():
+                    pass
+                self.repository_queue.put({
+                    "url" : repo["html_url"],
+                    "name" : repo["html_url"].split("/")[-1],
+                    "framework_name" : "GitHub"
+                })
+
+            if int(response.headers["x-ratelimit-remaining"]) == 0:
+                time.sleep(int(response.headers["x-ratelimit-reset"]) -
+                        time.time())
+
+            next_api_url = response.headers["link"].split(">")[0][1:]
+
+            sleep_time = api_request_interval - (time.time() - start_time)
+            if sleep_time > 0:
+                time.sleep(sleep_time)
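
The run() loop above follows pagination by slicing the raw Link header (split(">")[0][1:]). For reference, the snippet below shows what that slice yields on a Link header of the shape GitHub returns for this endpoint, alongside requests' own parsed view of the same header; the literal header value is an illustrative example, not captured output.

    # Sketch: pulling the "next" page URL out of a GitHub-style Link header.
    LINK_HEADER = ('<https://api.github.com/repositories?since=369>; rel="next", '
                   '<https://api.github.com/repositories{?since}>; rel="first"')

    # 1. The slicing used in the diff: keep the text before the first ">" and
    #    drop the leading "<".
    next_url = LINK_HEADER.split(">")[0][1:]
    assert next_url == "https://api.github.com/repositories?since=369"

    # 2. With a live requests.Response, the library parses the header itself:
    #    next_url = response.links["next"]["url"]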

bitshift/crawler/git_indexer.py → bitshift/crawler/indexer.py (renamed)

