Browse Source

Part of #26. Move __init__.py to crawl.py.

Add:
    bitshift/crawler/(__init__, crawl).py
        -Move `__init__.py` to `crawl.py`, and add a `main` block to allow
        running the crawler via `python -m`.
tags/v1.0^2
Severyn Kozak 10 years ago
parent
commit
f8436fa484
2 changed files with 59 additions and 55 deletions
  1. +0
    -55
      bitshift/crawler/__init__.py
  2. +59
    -0
      bitshift/crawler/crawl.py

+ 0
- 55
bitshift/crawler/__init__.py View File

@@ -1,55 +0,0 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import logging, logging.handlers, os, Queue

from bitshift.crawler import crawler, indexer

__all__ = ["crawl"]

def crawl():
"""
Initialize all crawlers (and indexers).

Start the:
1. GitHub crawler, :class:`crawler.GitHubCrawler`.
2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
"""

_configure_logging()

MAX_URL_QUEUE_SIZE = 5e3

repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
threads = [crawler.GitHubCrawler(repo_clone_queue),
crawler.BitbucketCrawler(repo_clone_queue),
indexer.GitIndexer(repo_clone_queue)]

for thread in threads:
thread.start()

def _configure_logging():
LOG_FILE_DIR = "log"

if not os.path.exists(LOG_FILE_DIR):
os.mkdir(LOG_FILE_DIR)

logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

formatter = logging.Formatter(
fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
" %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

handler = logging.handlers.TimedRotatingFileHandler(
"%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
backupCount=20)
handler.setFormatter(formatter)

root_logger = logging.getLogger()
root_logger.addHandler(handler)
root_logger.setLevel(logging.NOTSET)

+ 59
- 0
bitshift/crawler/crawl.py View File

@@ -0,0 +1,59 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import logging, logging.handlers, os, Queue

from bitshift.crawler import crawler, indexer

__all__ = ["crawl"]

def crawl():
"""
Initialize all crawlers (and indexers).

Start the:
1. GitHub crawler, :class:`crawler.GitHubCrawler`.
2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
"""

_configure_logging()

MAX_URL_QUEUE_SIZE = 5e3

repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
threads = [crawler.GitHubCrawler(repo_clone_queue),
crawler.BitbucketCrawler(repo_clone_queue),
indexer.GitIndexer(repo_clone_queue)]

for thread in threads:
thread.start()

def _configure_logging():
LOG_FILE_DIR = "log"

if not os.path.exists(LOG_FILE_DIR):
os.mkdir(LOG_FILE_DIR)

logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

formatter = logging.Formatter(
fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
" %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

handler = logging.handlers.TimedRotatingFileHandler(
"%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
backupCount=20)
handler.setFormatter(formatter)

root_logger = logging.getLogger()
root_logger.addHandler(handler)
root_logger.setLevel(logging.NOTSET)

if __name__ == "__main__":
_configure_logging()
crawl()

Loading…
Cancel
Save