|
|
@@ -15,20 +15,22 @@ def crawl(): |
|
|
|
Initialize all crawlers (and indexers). |
|
|
|
|
|
|
|
Start the: |
|
|
|
1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler` |
|
|
|
2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer` |
|
|
|
1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`. |
|
|
|
2. Bitbucket crawler, :class:`bitshift.crawler.crawler.BitbucketCrawler`. |
|
|
|
3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`. |
|
|
|
""" |
|
|
|
|
|
|
|
MAX_URL_QUEUE_SIZE = 5e3 |
|
|
|
DEBUG_FILE = "crawler.log" |
|
|
|
|
|
|
|
logging.basicConfig(filename=DEBUG_FILE, |
|
|
|
format="%(asctime)s:\t%(threadName)s:\t%(message)s", |
|
|
|
format="%(levelname)s %(asctime)s:\t%(threadName)s:\t%(message)s", |
|
|
|
level=logging.DEBUG) |
|
|
|
|
|
|
|
repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) |
|
|
|
github_crawler = crawler.GitHubCrawler(repository_queue) |
|
|
|
git_indexer = indexer.GitIndexer(repository_queue) |
|
|
|
repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) |
|
|
|
threads = [crawler.GitHubCrawler(repo_clone_queue), |
|
|
|
crawler.BitbucketCrawler(repo_clone_queue), |
|
|
|
indexer.GitIndexer(repo_clone_queue)] |
|
|
|
|
|
|
|
for thread in [github_crawler, git_indexer]: |
|
|
|
for thread in threads: |
|
|
|
thread.start() |