
Support crawling specific repos; add some logging.

Commit 1c0c4104e5 (tags/v1.0^2)
Ben Kurtovic, 10 years ago
4 changed files with 34 additions and 15 deletions
  1. bitshift/crawler/crawl.py   (+27, -11)
  2. bitshift/crawler/crawler.py (+4, -3)
  3. bitshift/crawler/indexer.py (+2, -0)
  4. bitshift/languages.yml      (+1, -1)

bitshift/crawler/crawl.py (+27, -11)

@@ -12,11 +12,14 @@ import sys
 import time
 from threading import Event
 
-from bitshift.crawler import crawler, indexer
-from bitshift.parser import start_parse_servers
+from .crawler import GitHubCrawler, BitbucketCrawler
+from .indexer import GitIndexer, GitRepository
+from ..parser import start_parse_servers
 
 __all__ = ["crawl"]
 
+MAX_URL_QUEUE_SIZE = 5e3
+
 def crawl():
     """
     Initialize all crawlers (and indexers).
@@ -28,17 +31,24 @@ def crawl():
     """
 
     _configure_logging()
 
-    MAX_URL_QUEUE_SIZE = 5e3
-    parse_servers = start_parse_servers()
-
     repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     run_event = Event()
     run_event.set()
-    threads = [crawler.GitHubCrawler(repo_clone_queue, run_event),
-            crawler.BitbucketCrawler(repo_clone_queue, run_event),
-            indexer.GitIndexer(repo_clone_queue, run_event)]
+    threads = [GitIndexer(repo_clone_queue, run_event)]
+
+    if sys.argv[1:]:
+        names = sys.argv[1:]
+        ranks = GitHubCrawler.get_ranks(names)
+        for name in names:
+            repo = GitRepository("https://github.com/" + name, name, "GitHub",
+                                 ranks[name])
+            repo_clone_queue.put(repo)
+    else:
+        threads += [GitHubCrawler(repo_clone_queue, run_event),
+                    BitbucketCrawler(repo_clone_queue, run_event)]
 
+    parse_servers = start_parse_servers()
     time.sleep(5)
     for thread in threads:
         thread.start()
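Note: the argv branch is the headline change. Given repository names on the command line, crawl() seeds the clone queue itself and starts only the GitIndexer; with no arguments it falls back to the full GitHub and Bitbucket crawl. A minimal sketch of the same target-building logic, with a stand-in GitRepository and made-up names and ranks:

    import sys
    from collections import namedtuple

    # Stand-in for bitshift's GitRepository; the field order matches the
    # call in crawl() above: clone URL, name, framework, rank.
    GitRepository = namedtuple("GitRepository", "url name framework rank")

    def targets_from_argv(argv, ranks):
        """Build crawl targets the way crawl() does in single-repo mode."""
        return [GitRepository("https://github.com/" + name, name, "GitHub",
                              ranks.get(name, 0.0))
                for name in argv[1:]]

    # e.g. invoking the script with "django/django" yields one GitHub target:
    print(targets_from_argv(["crawl.py", "django/django"],
                            {"django/django": 0.9}))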
@@ -48,6 +58,8 @@ def crawl():
             time.sleep(0.1)
     except KeyboardInterrupt:
         run_event.clear()
+        with repo_clone_queue.mutex:
+            repo_clone_queue.queue.clear()
         for thread in threads:
             thread.join()
         for server in parse_servers:
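Note: on Ctrl-C the queue is now emptied before the threads are joined, so workers have no backlog left to drain. Queue.Queue has no public clear(), so this reaches into CPython internals: mutex is the lock guarding the underlying deque, which is exposed as the queue attribute. A minimal sketch of the pattern:

    import Queue  # Python 2; the module is `queue` on Python 3

    def drain(q):
        # Rely on CPython Queue internals: hold the mutex, then clear
        # the underlying deque directly.
        with q.mutex:
            q.queue.clear()
        # Caveat: unfinished_tasks is not reset, so q.join() could still
        # block; crawl() joins the worker threads instead, which is fine.

    q = Queue.Queue(maxsize=5)
    q.put("earwig/bitshift")
    drain(q)
    assert q.empty()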
@@ -69,13 +81,17 @@ def _configure_logging():
         fmt=("%(asctime)s %(levelname)s %(name)s:%(funcName)s"
              " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")
 
-    handler = logging.handlers.TimedRotatingFileHandler(
+    file_handler = logging.handlers.TimedRotatingFileHandler(
             "%s/%s" % (log_dir, "app.log"), when="H", interval=1,
             backupCount=20)
-    handler.setFormatter(formatter)
+    stream_handler = logging.StreamHandler()
+    file_handler.setFormatter(formatter)
+    stream_handler.setFormatter(formatter)
 
     root_logger = logging.getLogger()
-    root_logger.addHandler(handler)
+    root_logger.handlers = []
+    root_logger.addHandler(file_handler)
+    root_logger.addHandler(stream_handler)
     root_logger.setLevel(logging.NOTSET)
 
 if __name__ == "__main__":
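Note: _configure_logging() now mirrors every record to the console. The single rotating-file handler becomes a file/stream pair sharing one formatter, and root_logger.handlers is reset first so repeated calls don't stack duplicate handlers. A condensed sketch of the resulting setup ("logs/app.log" stands in for the log_dir-based path built above):

    import logging
    import logging.handlers

    formatter = logging.Formatter(
        fmt=("%(asctime)s %(levelname)s %(name)s:%(funcName)s"
             " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

    # Rotate hourly, keep 20 old files; the directory must already exist.
    file_handler = logging.handlers.TimedRotatingFileHandler(
        "logs/app.log", when="H", interval=1, backupCount=20)
    stream_handler = logging.StreamHandler()  # stderr by default
    file_handler.setFormatter(formatter)
    stream_handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.handlers = []  # drop anything added earlier
    root_logger.addHandler(file_handler)
    root_logger.addHandler(stream_handler)
    root_logger.setLevel(logging.NOTSET)  # let handlers see every record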


bitshift/crawler/crawler.py (+4, -3)

@@ -79,7 +79,7 @@ class GitHubCrawler(threading.Thread):
                     queue_percent_full)))
 
             repo_names = [repo["full_name"] for repo in resp.json()]
-            repo_ranks = self._get_repository_ranks(repo_names)
+            repo_ranks = self.get_ranks(repo_names)
 
             for repo in resp.json():
                 while self.clone_queue.full():
@@ -99,7 +99,8 @@ class GitHubCrawler(threading.Thread):
             if sleep_time > 0:
                 time.sleep(sleep_time)
 
-    def _get_repository_ranks(self, repo_names):
+    @classmethod
+    def get_ranks(cls, repo_names):
         """
         Return the ranks for several repositories.
@@ -132,7 +133,7 @@ class GitHubCrawler(threading.Thread):
         query_url = "%s?q=%s" % (API_URL,
             "+".join("repo:%s" % name for name in names))
 
-        params = self.AUTHENTICATION
+        params = cls.AUTHENTICATION
         resp = requests.get(query_url,
                             params=params,
                             headers={
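Note: renaming _get_repository_ranks() to get_ranks() and making it a classmethod is what lets crawl() call it without constructing a crawler (which would need a queue and run event); AUTHENTICATION is a class attribute, so only the self -> cls swap was needed in the body. Assuming valid GitHub names, usage looks like:

    from bitshift.crawler.crawler import GitHubCrawler

    # No instance required; the names are examples, and the call hits
    # the GitHub search API.
    ranks = GitHubCrawler.get_ranks(["django/django", "earwig/bitshift"])
    print(ranks["django/django"])  # the rank value, used as ranks[name] in crawl()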


bitshift/crawler/indexer.py (+2, -0)

@@ -127,6 +127,7 @@ class GitIndexer(threading.Thread):
         :type repo_url: :class:`GitRepository`
         """
 
+        self._logger.info(u"Indexing repo: %s", repo.name)
         with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
             try:
                 self._insert_repository_codelets(repo)
@@ -170,6 +171,7 @@ class GitIndexer(threading.Thread):
                     commits_meta[filename]["time_created"],
                     commits_meta[filename]["time_last_modified"],
                     repo.rank)
+                self._logger.debug("Indexing file: %s", codelet.name)
                 try:
                     parse(codelet)
                 except UnsupportedFileError:
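Note: the two new log calls sit at different levels on purpose: one INFO record per repository, one DEBUG record per file, so a logger or handler set to INFO reports progress without per-file noise. A small self-contained sketch of the same convention (index_repo and the logger name are hypothetical):

    import logging

    logging.basicConfig(level=logging.INFO)  # DEBUG lines stay hidden
    logger = logging.getLogger("indexer-demo")

    def index_repo(name, filenames):
        # One INFO record per repo, one DEBUG record per file, matching
        # the levels GitIndexer uses above.
        logger.info(u"Indexing repo: %s", name)
        for filename in filenames:
            logger.debug("Indexing file: %s", filename)

    index_repo("earwig/bitshift", ["crawl.py", "indexer.py"])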


bitshift/languages.yml (+1, -1)

@@ -9,7 +9,6 @@ languages:
         - Python console session
         - Python Traceback
         - NumPy
-        - PyPy Log
     - C
     - Java
     - Ruby:
@@ -276,6 +275,7 @@ languages:
     - Properties
     - Protocol Buffer
     - Puppet
+    - PyPy Log
    - QBasic
     - QML
     - Racket

