From b7ccec05015cbd011a7ddaa7e2a69462d518af9e Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Tue, 15 Apr 2014 11:08:53 -0400 Subject: [PATCH] Add untested threaded indexer/crawler prototype. Additions are not tested and not yet documented. Add: crawler.py -add threaded GitHubCrawler class, which interacts with a GitIndexer via a Queue. git_indexer.py -add threaded GitIndexer class, which interacts with GitHubCrawler via a Queue. -rename context-manager ChangeDir class to _ChangeDir, because it's essentially "private". __init__.py -add body to crawl(), which creates instances of GitHubCrawler and GitIndexer and starts them. --- bitshift/crawler/__init__.py | 12 ++++++++++-- bitshift/crawler/crawler.py | 20 ++++++++++++++++---- bitshift/crawler/git_indexer.py | 28 ++++++++++++++++++++-------- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index a518970..f38a187 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -1,6 +1,14 @@ -import crawler +import Queue + +from bitshift.crawler import crawler +from bitshift.crawler import git_indexer __all__ = ["crawl"] def crawl(): - pass + repository_queue = Queue.Queue() + github_crawler = crawler.GitHubCrawler(repository_queue) + indexer = git_indexer.GitIndexer(repository_queue) + + for thread in [github_crawler, indexer]: + thread.start() diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 34f2819..fc1aadb 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -4,14 +4,22 @@ ...more info soon... """ -import requests, time +import requests, time, threading -import git_indexer +import bitshift.crawler.git_indexer from ..codelet import Codelet from ..database import Database -def github(): +class GitHubCrawler(threading.Thread): + def __init__(self, repository_queue): + self.repository_queue = repository_queue + super(GitHubCrawler, self).__init__() + + def run(): + _github() + +def _github(): """ Query the GitHub API for data about every public repository. @@ -33,7 +41,11 @@ def github(): response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - print repo["id"] + self.repository_queue.put({ + "url" : repo["html_url"], + "framework_name" : "GitHub" + }) + self.repository_queue.task_done() if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index 8cd3ae3..2268895 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -5,14 +5,26 @@ Add documentation, threaded Indexer class. """ -import shutil, subprocess, os +import os, shutil, subprocess, threading from ..database import Database from ..codelet import Codelet GIT_CLONE_DIR = "/tmp" -class ChangeDir(object): +class GitIndexer(threading.Thread): + def __init__(self, repository_queue): + self.repository_queue = repository_queue + super(GitIndexer, self).__init__() + + def run(self): + while True: + while self.repository_queue.empty(): + pass + new_repo = self.repository_queue.get() + _index_repository(new_repo["url"], new_repo["framework_name"]) + +class _ChangeDir(object): """ A wrapper class for os.chdir(), to map onto `with` and handle exceptions. @@ -22,7 +34,7 @@ class ChangeDir(object): def __init__(self, new_path): """ - Create a ChangeDir instance. + Create a _ChangeDir instance. :param new_path: The directory to enter. @@ -50,7 +62,7 @@ class ChangeDir(object): os.chdir(self.old_path) -def index_repository(repo_url, framework_name): +def _index_repository(repo_url, framework_name): """ Clone and index (create and insert Codeletes for) a Git repository. @@ -70,9 +82,9 @@ def index_repository(repo_url, framework_name): repo_name = repo_url.split("/")[-1] codelets = [] - with ChangeDir(GIT_CLONE_DIR) as git_clone_dir: + with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: subprocess.call("git clone %s" % repo_url, shell=True) - with ChangeDir(repo_name) as repository_dir: + with _ChangeDir(repo_name) as repository_dir: codelets = _insert_repository_codelets(repo_url, repo_name, framework_name) shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) @@ -128,7 +140,7 @@ def _generate_file_url(filename, repo_url, framework_name): :rtype: str """ - if framework_name == "github": + if framework_name == "GitHub": default_branch = subprocess.check_output("git branch --no-color", shell=True)[2:-1] return "%s/blob/%s/%s" % (repo_url, default_branch, filename) @@ -164,7 +176,7 @@ def _get_git_commits(): commits.append({ "author" : fields[0], "timestamp" : int(fields[1]), - "filenames" : fields[2].split("\0")[:-2] + "filenames" : fields[2].split("\x00")[:-2] }) return commits