Add: bitshift/crawler/(__init__, crawl).py

Move `__init__.py` to `crawl.py`, and add a `main` block to allow running the crawler via `python -m`.
@@ -1,55 +0,0 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import logging, logging.handlers, os, Queue

from bitshift.crawler import crawler, indexer

__all__ = ["crawl"]

def crawl():
    """
    Initialize all crawlers (and indexers).

    Start the:
        1. GitHub crawler, :class:`crawler.GitHubCrawler`.
        2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
        3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
    """

    _configure_logging()

    MAX_URL_QUEUE_SIZE = 5e3

    repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
    threads = [crawler.GitHubCrawler(repo_clone_queue),
            crawler.BitbucketCrawler(repo_clone_queue),
            indexer.GitIndexer(repo_clone_queue)]

    for thread in threads:
        thread.start()

def _configure_logging():
    LOG_FILE_DIR = "log"

    if not os.path.exists(LOG_FILE_DIR):
        os.mkdir(LOG_FILE_DIR)

    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    formatter = logging.Formatter(
            fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
                 " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

    handler = logging.handlers.TimedRotatingFileHandler(
            "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
            backupCount=20)
    handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.NOTSET)
@@ -0,0 +1,59 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import logging, logging.handlers, os, Queue

from bitshift.crawler import crawler, indexer

__all__ = ["crawl"]

def crawl():
    """
    Initialize all crawlers (and indexers).

    Start the:
        1. GitHub crawler, :class:`crawler.GitHubCrawler`.
        2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
        3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
    """

    _configure_logging()

    MAX_URL_QUEUE_SIZE = 5e3

    repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
    threads = [crawler.GitHubCrawler(repo_clone_queue),
            crawler.BitbucketCrawler(repo_clone_queue),
            indexer.GitIndexer(repo_clone_queue)]

    for thread in threads:
        thread.start()

def _configure_logging():
    LOG_FILE_DIR = "log"

    if not os.path.exists(LOG_FILE_DIR):
        os.mkdir(LOG_FILE_DIR)

    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    formatter = logging.Formatter(
            fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
                 " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

    handler = logging.handlers.TimedRotatingFileHandler(
            "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
            backupCount=20)
    handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.NOTSET)

if __name__ == "__main__":
    _configure_logging()
    crawl()
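With the `__main__` block above, the crawler can now be launched directly. A minimal usage sketch, assuming the repository root is on `PYTHONPATH` and a Python 2 interpreter (the module imports the Python 2 `Queue` library):

```python
# Run the crawler as a module (exercises the new __main__ block in crawl.py):
#   python -m bitshift.crawler.crawl
#
# Or start it programmatically:
from bitshift.crawler import crawl

crawl.crawl()  # spawns the GitHub/Bitbucket crawler threads and the Git indexer
```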
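`crawl()` itself only wires the pieces together: the two crawler threads act as producers that push repository URLs onto the shared, bounded `repo_clone_queue`, and the indexer thread consumes them. The real `GitHubCrawler`, `BitbucketCrawler`, and `GitIndexer` classes live in `crawler.py` and `indexer.py` and are not part of this diff, so the sketch below uses hypothetical stand-in classes purely to show the queue hand-off:

```python
import Queue, threading

class FakeCrawler(threading.Thread):
    """Hypothetical stand-in for crawler.GitHubCrawler / BitbucketCrawler."""
    def __init__(self, clone_queue):
        threading.Thread.__init__(self)
        self.clone_queue = clone_queue

    def run(self):
        # A real crawler queries the GitHub/Bitbucket APIs; this just enqueues
        # a single repository URL and exits.
        self.clone_queue.put("git://example.com/user/repo.git")

class FakeIndexer(threading.Thread):
    """Hypothetical stand-in for indexer.GitIndexer."""
    def __init__(self, clone_queue):
        threading.Thread.__init__(self)
        self.clone_queue = clone_queue

    def run(self):
        url = self.clone_queue.get()  # blocks until a crawler enqueues a URL
        print "would clone and index %s" % url

if __name__ == "__main__":
    repo_clone_queue = Queue.Queue(maxsize=10)
    for thread in [FakeCrawler(repo_clone_queue), FakeIndexer(repo_clone_queue)]:
        thread.start()
```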
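`_configure_logging()` attaches a single `TimedRotatingFileHandler` to the root logger, so any logger created in the crawler or indexer modules writes to `log/app.log`, rotated hourly with 20 backups kept, while the `requests` and `urllib3` loggers are quieted to `WARNING`. A rough sketch of what that looks like from a crawler thread; the logger name and message are assumptions, not taken from this diff:

```python
import logging

def run():
    # Imagine this is crawler.GitHubCrawler.run(), after _configure_logging()
    # has already attached the rotating file handler to the root logger.
    logger = logging.getLogger("crawler.GitHubCrawler")  # hypothetical logger name
    logger.info("Cloning %s", "git://example.com/user/repo.git")

run()
# With the formatter configured above, the record lands in log/app.log roughly as:
#   14-05-07 16:02:11 INFO crawler.GitHubCrawler run Cloning git://example.com/user/repo.git
```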