diff --git a/bitshift/crawler/crawl.py b/bitshift/crawler/crawl.py index 2e5ace5..e7b45bf 100644 --- a/bitshift/crawler/crawl.py +++ b/bitshift/crawler/crawl.py @@ -12,11 +12,14 @@ import sys import time from threading import Event -from bitshift.crawler import crawler, indexer -from bitshift.parser import start_parse_servers +from .crawler import GitHubCrawler, BitbucketCrawler +from .indexer import GitIndexer, GitRepository +from ..parser import start_parse_servers __all__ = ["crawl"] +MAX_URL_QUEUE_SIZE = 5e3 + def crawl(): """ Initialize all crawlers (and indexers). @@ -28,17 +31,24 @@ def crawl(): """ _configure_logging() - - MAX_URL_QUEUE_SIZE = 5e3 + parse_servers = start_parse_servers() repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) run_event = Event() run_event.set() - threads = [crawler.GitHubCrawler(repo_clone_queue, run_event), - crawler.BitbucketCrawler(repo_clone_queue, run_event), - indexer.GitIndexer(repo_clone_queue, run_event)] + threads = [GitIndexer(repo_clone_queue, run_event)] + + if sys.argv[1:]: + names = sys.argv[1:] + ranks = GitHubCrawler.get_ranks(names) + for name in names: + repo = GitRepository("https://github.com/" + name, name, "GitHub", + ranks[name]) + repo_clone_queue.put(repo) + else: + threads += [GitHubCrawler(repo_clone_queue, run_event), + BitbucketCrawler(repo_clone_queue, run_event)] - parse_servers = start_parse_servers() time.sleep(5) for thread in threads: thread.start() @@ -48,6 +58,8 @@ def crawl(): time.sleep(0.1) except KeyboardInterrupt: run_event.clear() + with repo_clone_queue.mutex: + repo_clone_queue.queue.clear() for thread in threads: thread.join() for server in parse_servers: @@ -69,13 +81,17 @@ def _configure_logging(): fmt=("%(asctime)s %(levelname)s %(name)s:%(funcName)s" " %(message)s"), datefmt="%y-%m-%d %H:%M:%S") - handler = logging.handlers.TimedRotatingFileHandler( + file_handler = logging.handlers.TimedRotatingFileHandler( "%s/%s" % (log_dir, "app.log"), when="H", interval=1, backupCount=20) - handler.setFormatter(formatter) + stream_handler = logging.StreamHandler() + file_handler.setFormatter(formatter) + stream_handler.setFormatter(formatter) root_logger = logging.getLogger() - root_logger.addHandler(handler) + root_logger.handlers = [] + root_logger.addHandler(file_handler) + root_logger.addHandler(stream_handler) root_logger.setLevel(logging.NOTSET) if __name__ == "__main__": diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index ba04412..0161eec 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -79,7 +79,7 @@ class GitHubCrawler(threading.Thread): queue_percent_full))) repo_names = [repo["full_name"] for repo in resp.json()] - repo_ranks = self._get_repository_ranks(repo_names) + repo_ranks = self.get_ranks(repo_names) for repo in resp.json(): while self.clone_queue.full(): @@ -99,7 +99,8 @@ class GitHubCrawler(threading.Thread): if sleep_time > 0: time.sleep(sleep_time) - def _get_repository_ranks(self, repo_names): + @classmethod + def get_ranks(cls, repo_names): """ Return the ranks for several repositories. @@ -132,7 +133,7 @@ class GitHubCrawler(threading.Thread): query_url = "%s?q=%s" % (API_URL, "+".join("repo:%s" % name for name in names)) - params = self.AUTHENTICATION + params = cls.AUTHENTICATION resp = requests.get(query_url, params=params, headers={ diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index b35559a..8e3b9ac 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -127,6 +127,7 @@ class GitIndexer(threading.Thread): :type repo_url: :class:`GitRepository` """ + self._logger.info(u"Indexing repo: %s", repo.name) with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): try: self._insert_repository_codelets(repo) @@ -170,6 +171,7 @@ class GitIndexer(threading.Thread): commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"], repo.rank) + self._logger.debug("Indexing file: %s", codelet.name) try: parse(codelet) except UnsupportedFileError: diff --git a/bitshift/languages.yml b/bitshift/languages.yml index cd29c7e..93c6004 100644 --- a/bitshift/languages.yml +++ b/bitshift/languages.yml @@ -9,7 +9,6 @@ languages: - Python console session - Python Traceback - NumPy - - PyPy Log - C - Java - Ruby: @@ -276,6 +275,7 @@ languages: - Properties - Protocol Buffer - Puppet + - PyPy Log - QBasic - QML - Racket