Add partially integrated BitbucketCrawler().

Add:
    bitshift/crawler/
        __init__.py
            - Initialize 'BitbucketCrawler()' singleton.
            - Instantiate all thread instances on-the-fly in a 'threads' array,
              as opposed to individual named variables.
        crawler.py
            - Add 'BitbucketCrawler()', to crawl Bitbucket for repositories.
            - Not entirely tested for proper functionality.
            - The Bitbucket framework is not yet accounted for in
              'indexer._generate_file_url()'.
10 年前 · 2954161747
--- a/bitshift/crawler/__init__.py
+++ b/bitshift/crawler/__init__.py
@@ -15,20 +15,22 @@ def crawl():
    Initialize all crawlers (and indexers).

    Start the:
-    1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`
-    2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`
+    1. GitHub crawler, :class:`crawler.GitHubCrawler`.
+    2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
+    3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
    """

    MAX_URL_QUEUE_SIZE = 5e3
    DEBUG_FILE = "crawler.log"

    logging.basicConfig(filename=DEBUG_FILE,
-            format="%(asctime)s:\t%(threadName)s:\t%(message)s",
+            format="%(levelname)s %(asctime)s:\t%(threadName)s:\t%(message)s",
            level=logging.DEBUG)

-    repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
-    github_crawler = crawler.GitHubCrawler(repository_queue)
-    git_indexer = indexer.GitIndexer(repository_queue)
+    repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
+    threads = [crawler.GitHubCrawler(repo_clone_queue),
+            crawler.BitbucketCrawler(repo_clone_queue),
+            indexer.GitIndexer(repo_clone_queue)]

-    for thread in [github_crawler, git_indexer]:
+    for thread in threads:
        thread.start()