
Add logging to crawler/indexer.

Add:
    bitshift/crawler/(__init__, crawler, indexer).py
        -add `logging` module to all `bitshift.crawler` modules, for some basic
        diagnostic output.
Severyn Kozak · 10 years ago · commit 755dce6ae3 · tags/v1.0^2
3 changed files with 29 additions and 15 deletions:
  1. bitshift/crawler/__init__.py (+8, -3)
  2. bitshift/crawler/crawler.py (+5, -2)
  3. bitshift/crawler/indexer.py (+16, -10)

bitshift/crawler/__init__.py (+8, -3)

@@ -4,14 +4,12 @@
 Contains functions for initializing all subsidiary, threaded crawlers.
 """
 
-import Queue
+import logging, Queue
 
 from bitshift.crawler import crawler, indexer
 
 __all__ = ["crawl"]
 
-MAX_URL_QUEUE_SIZE = 5e3
-
 def crawl():
     """
     Initialize all crawlers (and indexers).
@@ -21,6 +19,13 @@ def crawl():
         2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`
     """
 
+    MAX_URL_QUEUE_SIZE = 5e3
+    DEBUG_FILE = "crawler.log"
+
+    logging.basicConfig(filename=DEBUG_FILE,
+            format="%(asctime)s:\t%(threadName)s:\t%(message)s",
+            level=logging.DEBUG)
+
     repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
     git_indexer = indexer.GitIndexer(repository_queue)
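For context, a minimal standalone sketch of the logging pattern this hunk sets up. Because `logging.basicConfig()` configures the root logger, the plain `logging.info()` and `logging.debug()` calls added to crawler.py and indexer.py all land in the same crawler.log file. The filename and format string are taken from the diff above; the worker thread and the sample record are illustrative only.

import logging, threading

# Same configuration as bitshift/crawler/__init__.py above; the root logger
# is shared by every module that calls logging.info()/logging.debug().
logging.basicConfig(filename="crawler.log",
        format="%(asctime)s:\t%(threadName)s:\t%(message)s",
        level=logging.DEBUG)

def work():
    # %(threadName)s in the format string expands to the thread's name.
    logging.info("Starting.")

thread = threading.Thread(target=work, name="GitHubCrawler")  # illustrative name
thread.start()
thread.join()

# crawler.log then contains a record along the lines of:
# 2014-04-25 16:02:11,042:    GitHubCrawler:    Starting.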


bitshift/crawler/crawler.py (+5, -2)

@@ -4,7 +4,7 @@
 Contains all website/framework-specific Class crawlers.
 """
 
-import requests, time, threading
+import logging, requests, time, threading
 
 import bitshift.crawler.indexer
 
@@ -44,7 +44,8 @@ class GitHubCrawler(threading.Thread):
         """
 
         self.repository_queue = repository_queue
-        super(GitHubCrawler, self).__init__()
+        logging.info("Starting.")
+        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)
 
     def run(self):
         """
@@ -66,6 +67,8 @@ class GitHubCrawler(threading.Thread):
         while len(next_api_url) > 0:
            start_time = time.time()
            response = requests.get(next_api_url, params=authentication_params)
+           logging.info("API call made. Limit remaining: %s." %
+                   response.headers["x-ratelimit-remaining"])
 
            for repo in response.json():
                while self.repository_queue.full():
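The `name=self.__class__.__name__` argument now passed to `threading.Thread.__init__` is what makes the `%(threadName)s` field of the log format useful: without it, records from the crawler and indexer threads would all show up as Thread-1, Thread-2, and so on. A stripped-down sketch of the idea; the class body here is simplified and the message is illustrative.

import logging, threading

logging.basicConfig(format="%(threadName)s: %(message)s", level=logging.INFO)

class GitHubCrawler(threading.Thread):
    def __init__(self):
        # Name the thread after the class, as the hunk above does, so log
        # records can be attributed to a specific crawler or indexer.
        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        logging.info("API call made.")  # logged as "GitHubCrawler: API call made."

GitHubCrawler().start()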


bitshift/crawler/indexer.py (+16, -10)

@@ -3,7 +3,7 @@
 repositories.
 """
 
-import bs4, os, re, shutil, subprocess, threading
+import bs4, logging, os, re, shutil, subprocess, threading
 
 from ..database import Database
 from ..codelet import Codelet
@@ -35,7 +35,8 @@ class GitIndexer(threading.Thread):
         if not os.path.exists(GIT_CLONE_DIR):
             os.makedirs(GIT_CLONE_DIR)
 
-        super(GitIndexer, self).__init__()
+        logging.info("Starting.")
+        super(GitIndexer, self).__init__(name=self.__class__.__name__)
 
     def run(self):
         """
@@ -53,12 +54,8 @@
 
            repo = self.repository_queue.get()
            self.repository_queue.task_done()
-
-           try:
-               _index_repository(repo["url"], repo["name"],
-                       repo["framework_name"])
-           except:
-               pass
+           _index_repository(repo["url"], repo["name"],
+                   repo["framework_name"])
 
 class _ChangeDir(object):
     """
@@ -116,15 +113,23 @@ def _index_repository(repo_url, repo_name, framework_name):
 
     GIT_CLONE_TIMEOUT = 600
 
+    logging.info("Indexing repository %s." % repo_url)
     with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
         if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \
                 clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0:
+            logging.debug("_index_repository(): Cloning %s failed." % repo_url)
             if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)):
                 shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
             return
 
         with _ChangeDir(repo_name) as repository_dir:
-            _insert_repository_codelets(repo_url, repo_name, framework_name)
+            try:
+                _insert_repository_codelets(repo_url, repo_name,
+                        framework_name)
+            except Exception as exception:
+                logging.warning("%s: _insert_repository_codelets"
+                        " failed %s." % (exception, repo_url))
+                pass
 
     shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
@@ -312,5 +317,6 @@ def _decode(raw):
         encoding = bs4.BeautifulSoup(raw).original_encoding
         return raw.decode(encoding) if encoding is not None else None
 
-    except:
+    except Exception as exception:
+        logging.warning("_debug(): %s", exception)
         return None
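Both changed except blocks follow the same pattern: catch a broad `Exception` and record it with `logging.warning()` instead of silently swallowing it with a bare `except: pass`, so one bad repository no longer kills the indexer thread yet still leaves a trace in crawler.log. A minimal sketch of that pattern; the failing helper and the URL are made up for illustration.

import logging

logging.basicConfig(filename="crawler.log", level=logging.DEBUG)

def _insert_codelets(repo_url):
    # Hypothetical stand-in for _insert_repository_codelets(); it simply fails.
    raise ValueError("unparseable file")

repo_url = "https://github.com/user/repo"  # illustrative URL
try:
    _insert_codelets(repo_url)
except Exception as exception:
    # Log and continue, mirroring the hunks above: the worker keeps draining
    # its queue instead of dying on the first bad repository.
    logging.warning("%s: indexing %s failed." % (exception, repo_url))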
