A semantic search engine for source code https://bitshift.benkurtovic.com/
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.
 
 
 
 
 
 

95 lignes
2.7 KiB

  1. """
  2. :synopsis: Parent crawler module, which supervises all crawlers.
  3. Contains functions for initializing all subsidiary, threaded crawlers.
  4. """
  5. import logging
  6. import logging.handlers
  7. import os
  8. import Queue
  9. import sys
  10. import time
  11. from threading import Event
  12. from .crawler import GitHubCrawler, BitbucketCrawler
  13. from .indexer import GitIndexer, GitRepository
  14. __all__ = ["crawl"]
  # Upper bound on the number of repositories waiting in the clone queue.
  # Kept as an integer because it is passed as ``Queue(maxsize=...)``, which
  # is an item count.
  MAX_URL_QUEUE_SIZE = 5000
  def crawl():
      """
      Initialize all crawlers (and indexers).

      Start the:

      1. GitHub crawler, :class:`crawler.GitHubCrawler`.
      2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
      3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.

      When repository names are passed on the command line, only those GitHub
      repositories are queued for the indexer and neither crawler is started.

      The call blocks until a ``KeyboardInterrupt`` arrives. It then clears the
      shared run event (presumably the threads' stop signal -- confirm against
      the crawler/indexer classes), empties the clone queue and joins every
      thread.
      """
      _configure_logging()
      time.sleep(5)

      repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
      run_event = Event()
      run_event.set()
      threads = [GitIndexer(repo_clone_queue, run_event)]

      names = sys.argv[1:]
      if names:
          ranks = GitHubCrawler.get_ranks(names)
          for name in names:
              repo = GitRepository("https://github.com/" + name, name, "GitHub",
                                   ranks[name])
              # NOTE(review): no consumer thread has been started yet, so this
              # blocking put() would stall if more than MAX_URL_QUEUE_SIZE
              # names were supplied.
              repo_clone_queue.put(repo)
      else:
          threads += [GitHubCrawler(repo_clone_queue, run_event),
                      BitbucketCrawler(repo_clone_queue, run_event)]

      for thread in threads:
          thread.start()

      try:
          # Idle in the main thread so that Ctrl-C is delivered here.
          while True:
              time.sleep(0.1)
      except KeyboardInterrupt:
          run_event.clear()
          with repo_clone_queue.mutex:
              repo_clone_queue.queue.clear()
              # The deque was emptied behind the Queue's back, so nothing has
              # signalled waiting producers. Wake any thread blocked in put()
              # on a formerly full queue; otherwise join() below could hang.
              repo_clone_queue.not_full.notify_all()
          for thread in threads:
              thread.join()
  def _configure_logging():
      """
      Configure the root logger to write to ``logs/app.log`` and to a stream.

      The ``logs`` directory lives two levels above this module's directory and
      is created if missing. The log file rotates every hour and keeps 20
      backups. Any handlers already attached to the root logger are dropped.
      The ``requests`` and ``urllib3`` loggers are limited to WARNING and above.

      :raise OSError: If the log directory cannot be created.
      """
      # This isn't ideal, since it means the bitshift python package must be
      # kept inside the app, but it works for now:
      root = os.path.abspath(
          os.path.join(os.path.dirname(__file__), "..", ".."))
      log_dir = os.path.join(root, "logs")

      # Create first and inspect the failure afterwards: checking for existence
      # before creating is a race if another process makes the directory.
      try:
          os.makedirs(log_dir)
      except OSError:
          if not os.path.isdir(log_dir):
              raise

      logging.getLogger("requests").setLevel(logging.WARNING)
      logging.getLogger("urllib3").setLevel(logging.WARNING)

      formatter = logging.Formatter(
          fmt=("%(asctime)s %(levelname)s %(name)s %(message)s"),
          datefmt="%y-%m-%d %H:%M:%S")

      file_handler = logging.handlers.TimedRotatingFileHandler(
          os.path.join(log_dir, "app.log"), when="H", interval=1,
          backupCount=20)
      stream_handler = logging.StreamHandler()
      file_handler.setFormatter(formatter)
      stream_handler.setFormatter(formatter)

      root_logger = logging.getLogger()
      # Replace, rather than extend, whatever handlers were installed before.
      root_logger.handlers = []
      root_logger.addHandler(file_handler)
      root_logger.addHandler(stream_handler)
      root_logger.setLevel(logging.NOTSET)
  # Script entry point: start crawling only when this module is executed
  # directly, not when it is imported.
  if __name__ == "__main__":
      crawl()