From 755dce6ae3ca2be4f72e16b09eb9fa6ef9614420 Mon Sep 17 00:00:00 2001
From: Severyn Kozak
Date: Thu, 17 Apr 2014 09:53:27 -0400
Subject: [PATCH] Add logging to crawler/indexer.

Add:
    bitshift/crawler/(__init__, crawler, indexer).py
        -add `logging` module to all `bitshift.crawler` modules, for some
        basic diagnostic output.
---
 bitshift/crawler/__init__.py | 11 ++++++++---
 bitshift/crawler/crawler.py  |  7 +++++--
 bitshift/crawler/indexer.py  | 26 ++++++++++++++++----------
 3 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py
index 4875712..39a1a28 100644
--- a/bitshift/crawler/__init__.py
+++ b/bitshift/crawler/__init__.py
@@ -4,14 +4,12 @@
 Contains functions for initializing all subsidiary, threaded crawlers.
 """
 
-import Queue
+import logging, Queue
 
 from bitshift.crawler import crawler, indexer
 
 __all__ = ["crawl"]
 
-MAX_URL_QUEUE_SIZE = 5e3
-
 def crawl():
     """
     Initialize all crawlers (and indexers).
@@ -21,6 +19,13 @@ def crawl():
         2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`
     """
 
+    MAX_URL_QUEUE_SIZE = 5e3
+    DEBUG_FILE = "crawler.log"
+
+    logging.basicConfig(filename=DEBUG_FILE,
+            format="%(asctime)s:\t%(threadName)s:\t%(message)s",
+            level=logging.DEBUG)
+
     repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
     git_indexer = indexer.GitIndexer(repository_queue)
diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py
index 8b9576d..edd8eaf 100644
--- a/bitshift/crawler/crawler.py
+++ b/bitshift/crawler/crawler.py
@@ -4,7 +4,7 @@
 Contains all website/framework-specific Class crawlers.
 """
 
-import requests, time, threading
+import logging, requests, time, threading
 
 import bitshift.crawler.indexer
 
@@ -44,7 +44,8 @@ class GitHubCrawler(threading.Thread):
         """
 
         self.repository_queue = repository_queue
-        super(GitHubCrawler, self).__init__()
+        logging.info("Starting.")
+        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)
 
     def run(self):
         """
@@ -66,6 +67,8 @@ class GitHubCrawler(threading.Thread):
         while len(next_api_url) > 0:
             start_time = time.time()
             response = requests.get(next_api_url, params=authentication_params)
+            logging.info("API call made. Limit remaining: %s." %
+                    response.headers["x-ratelimit-remaining"])
 
             for repo in response.json():
                 while self.repository_queue.full():
diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py
index 50dbe8c..b1e8e34 100644
--- a/bitshift/crawler/indexer.py
+++ b/bitshift/crawler/indexer.py
@@ -3,7 +3,7 @@
 repositories.
 """
 
-import bs4, os, re, shutil, subprocess, threading
+import bs4, logging, os, re, shutil, subprocess, threading
 
 from ..database import Database
 from ..codelet import Codelet
@@ -35,7 +35,8 @@ class GitIndexer(threading.Thread):
         if not os.path.exists(GIT_CLONE_DIR):
             os.makedirs(GIT_CLONE_DIR)
 
-        super(GitIndexer, self).__init__()
+        logging.info("Starting.")
+        super(GitIndexer, self).__init__(name=self.__class__.__name__)
 
     def run(self):
         """
@@ -53,12 +54,8 @@
 
             repo = self.repository_queue.get()
             self.repository_queue.task_done()
-
-            try:
-                _index_repository(repo["url"], repo["name"],
-                        repo["framework_name"])
-            except:
-                pass
+            _index_repository(repo["url"], repo["name"],
+                    repo["framework_name"])
 
 class _ChangeDir(object):
     """
@@ -116,15 +113,23 @@ def _index_repository(repo_url, repo_name, framework_name):
 
     GIT_CLONE_TIMEOUT = 600
 
+    logging.info("Indexing repository %s." % repo_url)
     with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
         if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \
                 clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0:
+            logging.debug("_index_repository(): Cloning %s failed." % repo_url)
             if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)):
                 shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
             return
 
         with _ChangeDir(repo_name) as repository_dir:
-            _insert_repository_codelets(repo_url, repo_name, framework_name)
+            try:
+                _insert_repository_codelets(repo_url, repo_name,
+                        framework_name)
+            except Exception as exception:
+                logging.warning("%s: _insert_repository_codelets"
+                        " failed %s." % (exception, repo_url))
+                pass
         shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
 
 
@@ -312,5 +317,6 @@ def _decode(raw):
         encoding = bs4.BeautifulSoup(raw).original_encoding
         return raw.decode(encoding) if encoding is not None else None
 
-    except:
+    except Exception as exception:
+        logging.warning("_decode(): %s", exception)
         return None
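
Note (not part of the patch): the `name=self.__class__.__name__` argument now
passed to `Thread.__init__` is what makes the `%(threadName)s` field in the
`logging.basicConfig` format useful -- records logged from each thread's run()
are tagged with the class name instead of the default "Thread-N". A minimal
runnable sketch of that interaction, with a hypothetical `DemoWorker` standing
in for `GitHubCrawler`/`GitIndexer` (file name and format string mirror the
patch):

import logging, threading

logging.basicConfig(filename="crawler.log",
        format="%(asctime)s:\t%(threadName)s:\t%(message)s",
        level=logging.DEBUG)

class DemoWorker(threading.Thread):
    def __init__(self):
        # Executes in the spawning thread, so this record is tagged
        # "MainThread", not "DemoWorker".
        logging.info("Starting.")
        super(DemoWorker, self).__init__(name=self.__class__.__name__)

    def run(self):
        # Tagged with the thread's name in crawler.log, e.g.:
        # 2014-04-17 09:53:27,000:	DemoWorker:	Working.
        logging.info("Working.")

if __name__ == "__main__":
    worker = DemoWorker()
    worker.start()
    worker.join()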