- """
- :synopsis: Parent crawler module, which supervises all crawlers.
-
- Contains functions for initializing all subsidiary, threaded crawlers.
- """
-
- import logging, logging.handlers, os, Queue
-
- from bitshift.crawler import crawler, indexer
-
- __all__ = ["crawl"]
-
def crawl():
    """
    Initialize all crawlers (and indexers).

    Start the:
    1. GitHub crawler, :class:`crawler.GitHubCrawler`.
    2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
    3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.

    Logging is configured first through :func:`_configure_logging`. All three
    workers are handed the same bounded queue; each one is then started and
    this function returns without joining them.
    """

    _configure_logging()

    # ``Queue.Queue`` documents ``maxsize`` as an integer, so use an int
    # literal rather than the float ``5e3``.
    MAX_URL_QUEUE_SIZE = 5000

    repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
    threads = [crawler.GitHubCrawler(repo_clone_queue),
               crawler.BitbucketCrawler(repo_clone_queue),
               indexer.GitIndexer(repo_clone_queue)]

    for thread in threads:
        thread.start()
-
def _configure_logging():
    """
    Route all log records to an hourly-rotated file under ``log/``.

    Creates the ``log`` directory (relative to the current working
    directory) if it is missing, raises the ``requests`` and ``urllib3``
    loggers to ``WARNING``, and attaches a
    :class:`logging.handlers.TimedRotatingFileHandler` writing to
    ``log/app.log`` to the root logger, whose level is set to ``NOTSET``.

    Calling this function more than once is safe: if the root logger already
    has a handler for the same file, no second handler is added, so records
    are not written twice.

    :raise OSError: If ``log`` cannot be created and is not already a
        directory.
    """

    LOG_FILE_DIR = "log"
    log_file_path = os.path.join(LOG_FILE_DIR, "app.log")

    # Attempt the mkdir and tolerate "already exists" instead of checking
    # first; a check-then-create sequence can race with another process.
    try:
        os.mkdir(LOG_FILE_DIR)
    except OSError:
        if not os.path.isdir(LOG_FILE_DIR):
            raise

    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.NOTSET)

    # File handlers store their target as an absolute path in
    # ``baseFilename``; use it to detect an earlier call.
    absolute_log_path = os.path.abspath(log_file_path)
    for existing in root_logger.handlers:
        if (isinstance(existing, logging.handlers.TimedRotatingFileHandler)
                and existing.baseFilename == absolute_log_path):
            return

    formatter = logging.Formatter(
        fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
             " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

    # Rotate every hour, keeping the 20 most recent rotated files.
    handler = logging.handlers.TimedRotatingFileHandler(
        log_file_path, when="H", interval=1, backupCount=20)
    handler.setFormatter(formatter)

    root_logger.addHandler(handler)
|