From b680756f8dba4f5ab3690f069f5520978846fc06 Mon Sep 17 00:00:00 2001
From: Severyn Kozak
Date: Wed, 16 Apr 2014 13:32:04 -0400
Subject: [PATCH] Test crawler, complete documentation.

Add, Fix:
    bitshift/crawler/
        __init__.py
            -add module and crawl() docstrings.
            -add repository_queue size limit.

        crawler.py
            -account for time spent executing an API query in the run() loop
            sleep() interval.
---
 bitshift/crawler/__init__.py                    |  18 +++-
 bitshift/crawler/crawler.py                     | 106 +++++++++++++++++-------
 bitshift/crawler/{git_indexer.py => indexer.py} |   0
 3 files changed, 91 insertions(+), 33 deletions(-)
 rename bitshift/crawler/{git_indexer.py => indexer.py} (100%)

diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py
index f38a187..6c13be9 100644
--- a/bitshift/crawler/__init__.py
+++ b/bitshift/crawler/__init__.py
@@ -1,3 +1,9 @@
+"""
+:synopsis: Parent crawler module, which supervises all crawlers.
+
+Contains functions for initializing all subsidiary, threaded crawlers.
+"""
+
 import Queue
 
 from bitshift.crawler import crawler
@@ -5,8 +11,18 @@ from bitshift.crawler import git_indexer
 
 __all__ = ["crawl"]
 
+MAX_URL_QUEUE_SIZE = 5e3
+
 def crawl():
-    repository_queue = Queue.Queue()
+    """
+    Initialize all crawlers (and indexers).
+
+    Start the:
+        1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`
+        2. Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer`
+    """
+
+    repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
     indexer = git_indexer.GitIndexer(repository_queue)
 
diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py
index fc1aadb..5b0f600 100644
--- a/bitshift/crawler/crawler.py
+++ b/bitshift/crawler/crawler.py
@@ -12,46 +12,88 @@ from ..codelet import Codelet
 from ..database import Database
 
 class GitHubCrawler(threading.Thread):
+    """
+    Crawler that retrieves links to all of GitHub's public repositories.
+
+    GitHubCrawler is a threaded singleton that queries GitHub's API for URLs
+    to its public repositories, which it inserts into a :class:`Queue.Queue`
+    shared with :class:`bitshift.crawler.git_indexer.GitIndexer`.
+
+    :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with
+        repository information retrieved by `GitHubCrawler`, and other Git
+        crawlers, to be processed by
+        :class:`bitshift.crawler.git_indexer.GitIndexer`.
+    """
+
     def __init__(self, repository_queue):
+        """
+        Create an instance of the singleton `GitHubCrawler`.
+
+        :param repository_queue: A queue containing dictionaries of repository
+            metadata retrieved by `GitHubCrawler`, meant to be processed by an
+            instance of :class:`bitshift.crawler.git_indexer.GitIndexer`.
+
+            .. code-block:: python
+                sample_dict = {
+                    "url" : "https://github.com/user/repo",
+                    "name" : "repo",
+                    "framework_name" : "GitHub"
+                }
+
+        :type repository_queue: :class:`Queue.Queue`
+        """
+
         self.repository_queue = repository_queue
         super(GitHubCrawler, self).__init__()
 
-    def run():
-        _github()
+    def run(self):
+        """
+        Query the GitHub API for data about every public repository.
 
-def _github():
-    """
-    Query the GitHub API for data about every public repository.
+        Pull all of GitHub's repositories by making calls to its API in a loop,
+        accessing a subsequent page of results via the "next" URL returned in an
+        API response header. Uses Severyn Kozak's (sevko) authentication
+        credentials.
+ """ - Pull all of GitHub's repositories by making calls to its API in a loop, - accessing a subsequent page of results via the "next" URL returned in an - API response header. Uses Severyn Kozak's (sevko) authentication - credentials. - """ + next_api_url = "https://api.github.com/repositories" + authentication_params = { + "client_id" : "436cb884ae09be7f2a4e", + "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" + } + api_request_interval = 5e3 / 60 ** 2 - next_api_url = "https://api.github.com/repositories" - authentication_params = { - "client_id" : "436cb884ae09be7f2a4e", - "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" - } - api_request_interval = 5e3 / 60 ** 2 - - while len(next_api_url) > 0: - start_time = time.time() - response = requests.get(next_api_url, params=authentication_params) - - for repo in response.json(): - self.repository_queue.put({ - "url" : repo["html_url"], - "framework_name" : "GitHub" + while len(next_api_url) > 0: + # DEBUG + db.log.insert({ + "time" : str(time.time()).split(".")[0][-4:], + "qsize" : self.repository_queue.qsize() }) - self.repository_queue.task_done() - if int(response.headers["x-ratelimit-remaining"]) == 0: - time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) + start_time = time.time() + response = requests.get(next_api_url, params=authentication_params) + + for repo in response.json(): + logging.basicConfig(filename="crawler.log", level=logging.DEBUG) + logging.debug("crawler: %-20s: %-5s: %-5s: %s", + str(time.time()).split(".")[0], + self.repository_queue.qsize(), repo["id"], + repo["name"]) + while self.repository_queue.full(): + pass + self.repository_queue.put({ + "url" : repo["html_url"], + "name" : repo["html_url"].split("/")[-1], + "framework_name" : "GitHub" + }) + + if int(response.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(response.headers["x-ratelimit-reset"]) - + time.time()) - next_api_url = response.headers["link"].split(">")[0][1:] + next_api_url = response.headers["link"].split(">")[0][1:] - sleep_time = api_request_interval - (time.time() - start_time) - if sleep_time > 0: - time.sleep(sleep_time) + sleep_time = api_request_interval - (time.time() - start_time) + if sleep_time > 0: + time.sleep(sleep_time) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/indexer.py similarity index 100% rename from bitshift/crawler/git_indexer.py rename to bitshift/crawler/indexer.py