
Support crawling specific repos; add some logging.

tags/v1.0^2
Ben Kurtovic, 10 years ago
parent commit 1c0c4104e5
4 changed files with 34 additions and 15 deletions:
  1. bitshift/crawler/crawl.py     +27  -11
  2. bitshift/crawler/crawler.py    +4   -3
  3. bitshift/crawler/indexer.py    +2   -0
  4. bitshift/languages.yml         +1   -1

bitshift/crawler/crawl.py (+27 -11)

@@ -12,11 +12,14 @@ import sys
 import time
 from threading import Event

-from bitshift.crawler import crawler, indexer
-from bitshift.parser import start_parse_servers
+from .crawler import GitHubCrawler, BitbucketCrawler
+from .indexer import GitIndexer, GitRepository
+from ..parser import start_parse_servers

 __all__ = ["crawl"]

+MAX_URL_QUEUE_SIZE = 5e3
+
 def crawl():
     """
     Initialize all crawlers (and indexers).
@@ -28,17 +31,24 @@ def crawl():
     """

     _configure_logging()

-    MAX_URL_QUEUE_SIZE = 5e3
-    parse_servers = start_parse_servers()
-
     repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     run_event = Event()
     run_event.set()
-    threads = [crawler.GitHubCrawler(repo_clone_queue, run_event),
-               crawler.BitbucketCrawler(repo_clone_queue, run_event),
-               indexer.GitIndexer(repo_clone_queue, run_event)]
+    threads = [GitIndexer(repo_clone_queue, run_event)]
+
+    if sys.argv[1:]:
+        names = sys.argv[1:]
+        ranks = GitHubCrawler.get_ranks(names)
+        for name in names:
+            repo = GitRepository("https://github.com/" + name, name, "GitHub",
+                                 ranks[name])
+            repo_clone_queue.put(repo)
+    else:
+        threads += [GitHubCrawler(repo_clone_queue, run_event),
+                    BitbucketCrawler(repo_clone_queue, run_event)]

+    parse_servers = start_parse_servers()
     time.sleep(5)
     for thread in threads:
         thread.start()
@@ -48,6 +58,8 @@ def crawl():
             time.sleep(0.1)
     except KeyboardInterrupt:
         run_event.clear()
+        with repo_clone_queue.mutex:
+            repo_clone_queue.queue.clear()
         for thread in threads:
             thread.join()
         for server in parse_servers:
@@ -69,13 +81,17 @@ def _configure_logging():
         fmt=("%(asctime)s %(levelname)s %(name)s:%(funcName)s"
              " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

-    handler = logging.handlers.TimedRotatingFileHandler(
+    file_handler = logging.handlers.TimedRotatingFileHandler(
             "%s/%s" % (log_dir, "app.log"), when="H", interval=1,
             backupCount=20)
-    handler.setFormatter(formatter)
+    stream_handler = logging.StreamHandler()
+    file_handler.setFormatter(formatter)
+    stream_handler.setFormatter(formatter)

     root_logger = logging.getLogger()
-    root_logger.addHandler(handler)
+    root_logger.handlers = []
+    root_logger.addHandler(file_handler)
+    root_logger.addHandler(stream_handler)
     root_logger.setLevel(logging.NOTSET)

 if __name__ == "__main__":
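
Taken together, the crawl.py changes mean that any names passed on the command line are treated as GitHub "owner/name" strings and fed straight to the GitIndexer, bypassing the GitHubCrawler and BitbucketCrawler threads. Below is a minimal standalone sketch of that path (not the project's actual entry point): it assumes the bitshift package is importable and that GitHubCrawler.AUTHENTICATION holds valid GitHub API credentials, since get_ranks makes a live API call; the repo names are placeholders.

# Sketch only: seed a clone queue with user-specified GitHub repos, mirroring
# the new sys.argv[1:] branch in crawl.py above. Python 2, like the project.
import Queue

from bitshift.crawler.crawler import GitHubCrawler
from bitshift.crawler.indexer import GitRepository

MAX_URL_QUEUE_SIZE = 5e3

def seed_queue(names):
    queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
    ranks = GitHubCrawler.get_ranks(names)  # classmethod added in this commit
    for name in names:
        # Each name becomes a GitRepository pointing at github.com/<owner>/<name>.
        queue.put(GitRepository("https://github.com/" + name, name, "GitHub",
                                ranks[name]))
    return queue

if __name__ == "__main__":
    seed_queue(["torvalds/linux", "django/django"])  # placeholder repo names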


bitshift/crawler/crawler.py (+4 -3)

@@ -79,7 +79,7 @@ class GitHubCrawler(threading.Thread):
                             queue_percent_full)))

             repo_names = [repo["full_name"] for repo in resp.json()]
-            repo_ranks = self._get_repository_ranks(repo_names)
+            repo_ranks = self.get_ranks(repo_names)

             for repo in resp.json():
                 while self.clone_queue.full():
@@ -99,7 +99,8 @@ class GitHubCrawler(threading.Thread):
             if sleep_time > 0:
                 time.sleep(sleep_time)

-    def _get_repository_ranks(self, repo_names):
+    @classmethod
+    def get_ranks(cls, repo_names):
         """
         Return the ranks for several repositories.

@@ -132,7 +133,7 @@ class GitHubCrawler(threading.Thread):
         query_url = "%s?q=%s" % (API_URL,
                                  "+".join("repo:%s" % name for name in names))

-        params = self.AUTHENTICATION
+        params = cls.AUTHENTICATION
         resp = requests.get(query_url,
                             params=params,
                             headers={
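
Renaming _get_repository_ranks to get_ranks and promoting it to a classmethod is what lets crawl.py look up ranks without constructing a crawler (no clone queue or run event needed). A hedged sketch of the new call pattern, again assuming an importable bitshift package and valid credentials in GitHubCrawler.AUTHENTICATION; the repo names are placeholders.

# Sketch only: get_ranks() can now be called on the class itself.
from bitshift.crawler.crawler import GitHubCrawler

ranks = GitHubCrawler.get_ranks(["torvalds/linux", "git/git"])
for name, rank in ranks.items():
    print "%s: %s" % (name, rank)  # Python 2 print statement, like the project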


bitshift/crawler/indexer.py (+2 -0)

@@ -127,6 +127,7 @@ class GitIndexer(threading.Thread):
         :type repo_url: :class:`GitRepository`
         """

+        self._logger.info(u"Indexing repo: %s", repo.name)
         with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
             try:
                 self._insert_repository_codelets(repo)
@@ -170,6 +171,7 @@ class GitIndexer(threading.Thread):
                     commits_meta[filename]["time_created"],
                     commits_meta[filename]["time_last_modified"],
                     repo.rank)
+            self._logger.debug("Indexing file: %s", codelet.name)
             try:
                 parse(codelet)
             except UnsupportedFileError:


bitshift/languages.yml (+1 -1)

@@ -9,7 +9,6 @@ languages:
         - Python console session
         - Python Traceback
         - NumPy
-        - PyPy Log
     - C
     - Java
     - Ruby:
@@ -276,6 +275,7 @@ languages:
     - Properties
     - Protocol Buffer
     - Puppet
+    - PyPy Log
     - QBasic
     - QML
     - Racket

