|
|
@@ -12,11 +12,14 @@ import sys |
|
|
|
import time |
|
|
|
from threading import Event |
|
|
|
|
|
|
|
from bitshift.crawler import crawler, indexer |
|
|
|
from bitshift.parser import start_parse_servers |
|
|
|
from .crawler import GitHubCrawler, BitbucketCrawler |
|
|
|
from .indexer import GitIndexer, GitRepository |
|
|
|
from ..parser import start_parse_servers |
|
|
|
|
|
|
|
__all__ = ["crawl"] |
|
|
|
|
|
|
|
MAX_URL_QUEUE_SIZE = 5e3 |
|
|
|
|
|
|
|
def crawl(): |
|
|
|
""" |
|
|
|
Initialize all crawlers (and indexers). |
|
|
@@ -28,17 +31,24 @@ def crawl(): |
|
|
|
""" |
|
|
|
|
|
|
|
_configure_logging() |
|
|
|
|
|
|
|
MAX_URL_QUEUE_SIZE = 5e3 |
|
|
|
parse_servers = start_parse_servers() |
|
|
|
|
|
|
|
repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) |
|
|
|
run_event = Event() |
|
|
|
run_event.set() |
|
|
|
threads = [crawler.GitHubCrawler(repo_clone_queue, run_event), |
|
|
|
crawler.BitbucketCrawler(repo_clone_queue, run_event), |
|
|
|
indexer.GitIndexer(repo_clone_queue, run_event)] |
|
|
|
threads = [GitIndexer(repo_clone_queue, run_event)] |
|
|
|
|
|
|
|
if sys.argv[1:]: |
|
|
|
names = sys.argv[1:] |
|
|
|
ranks = GitHubCrawler.get_ranks(names) |
|
|
|
for name in names: |
|
|
|
repo = GitRepository("https://github.com/" + name, name, "GitHub", |
|
|
|
ranks[name]) |
|
|
|
repo_clone_queue.put(repo) |
|
|
|
else: |
|
|
|
threads += [GitHubCrawler(repo_clone_queue, run_event), |
|
|
|
BitbucketCrawler(repo_clone_queue, run_event)] |
|
|
|
|
|
|
|
parse_servers = start_parse_servers() |
|
|
|
time.sleep(5) |
|
|
|
for thread in threads: |
|
|
|
thread.start() |
|
|
@@ -48,6 +58,8 @@ def crawl(): |
|
|
|
time.sleep(0.1) |
|
|
|
except KeyboardInterrupt: |
|
|
|
run_event.clear() |
|
|
|
with repo_clone_queue.mutex: |
|
|
|
repo_clone_queue.queue.clear() |
|
|
|
for thread in threads: |
|
|
|
thread.join() |
|
|
|
for server in parse_servers: |
|
|
@@ -69,13 +81,17 @@ def _configure_logging(): |
|
|
|
fmt=("%(asctime)s %(levelname)s %(name)s:%(funcName)s" |
|
|
|
" %(message)s"), datefmt="%y-%m-%d %H:%M:%S") |
|
|
|
|
|
|
|
handler = logging.handlers.TimedRotatingFileHandler( |
|
|
|
file_handler = logging.handlers.TimedRotatingFileHandler( |
|
|
|
"%s/%s" % (log_dir, "app.log"), when="H", interval=1, |
|
|
|
backupCount=20) |
|
|
|
handler.setFormatter(formatter) |
|
|
|
stream_handler = logging.StreamHandler() |
|
|
|
file_handler.setFormatter(formatter) |
|
|
|
stream_handler.setFormatter(formatter) |
|
|
|
|
|
|
|
root_logger = logging.getLogger() |
|
|
|
root_logger.addHandler(handler) |
|
|
|
root_logger.handlers = [] |
|
|
|
root_logger.addHandler(file_handler) |
|
|
|
root_logger.addHandler(stream_handler) |
|
|
|
root_logger.setLevel(logging.NOTSET) |
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|