
Add logging to crawler/indexer.

Add:
    bitshift/crawler/(__init__, crawler, indexer).py
        - add `logging` module to all `bitshift.crawler` modules, for some basic
        diagnostic output.
Severyn Kozak, 10 years ago
Commit 755dce6ae3 (tags/v1.0^2)
3 changed files with 29 additions and 15 deletions:
  1. bitshift/crawler/__init__.py (+8, -3)
  2. bitshift/crawler/crawler.py (+5, -2)
  3. bitshift/crawler/indexer.py (+16, -10)
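
In short, the change configures Python's standard `logging` module once in `bitshift/crawler/__init__.py` and has each crawler and indexer thread log through it under its own thread name. A minimal standalone sketch of that pattern follows (the worker function and message are illustrative, not taken verbatim from the diff):

    import logging, threading

    # Same idea as the commit's basicConfig() call: every record is stamped
    # with the time and the name of the thread that emitted it.
    logging.basicConfig(filename="crawler.log",
            format="%(asctime)s:\t%(threadName)s:\t%(message)s",
            level=logging.DEBUG)

    def work():
        logging.info("Starting.")  # record is tagged with the thread's name

    # Naming the thread (the diff does this via name=self.__class__.__name__)
    # is what makes the %(threadName)s field useful in the log file.
    worker = threading.Thread(target=work, name="GitHubCrawler")
    worker.start()
    worker.join()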

bitshift/crawler/__init__.py (+8, -3)

@@ -4,14 +4,12 @@
 Contains functions for initializing all subsidiary, threaded crawlers.
 """
 
-import Queue
+import logging, Queue
 
 from bitshift.crawler import crawler, indexer
 
 __all__ = ["crawl"]
 
-MAX_URL_QUEUE_SIZE = 5e3
-
 def crawl():
     """
     Initialize all crawlers (and indexers).
@@ -21,6 +19,13 @@ def crawl():
         2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`
     """
 
+    MAX_URL_QUEUE_SIZE = 5e3
+    DEBUG_FILE = "crawler.log"
+
+    logging.basicConfig(filename=DEBUG_FILE,
+            format="%(asctime)s:\t%(threadName)s:\t%(message)s",
+            level=logging.DEBUG)
+
     repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
     git_indexer = indexer.GitIndexer(repository_queue)
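
With that format string, each record written to crawler.log is the timestamp, thread name, and message separated by tab characters; a line would look roughly like this (timestamp and message illustrative):

    2014-04-23 18:25:43,511:	GitIndexer:	Starting.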


bitshift/crawler/crawler.py (+5, -2)

@@ -4,7 +4,7 @@
 Contains all website/framework-specific Class crawlers.
 """
 
-import requests, time, threading
+import logging, requests, time, threading
 
 import bitshift.crawler.indexer
 
@@ -44,7 +44,8 @@ class GitHubCrawler(threading.Thread):
         """
 
         self.repository_queue = repository_queue
-        super(GitHubCrawler, self).__init__()
+        logging.info("Starting.")
+        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)
 
     def run(self):
         """
@@ -66,6 +67,8 @@ class GitHubCrawler(threading.Thread):
         while len(next_api_url) > 0:
             start_time = time.time()
             response = requests.get(next_api_url, params=authentication_params)
+            logging.info("API call made. Limit remaining: %s." %
+                    response.headers["x-ratelimit-remaining"])
 
             for repo in response.json():
                 while self.repository_queue.full():
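
For reference, the new log line in run() reports GitHub's remaining request budget, which the API returns in the x-ratelimit-remaining response header. A small standalone sketch of the same check (the URL is illustrative and the crawler's authentication parameters are omitted):

    import logging, requests

    logging.basicConfig(level=logging.DEBUG)

    response = requests.get("https://api.github.com/repositories")
    # requests exposes headers case-insensitively; .get() guards against the
    # header being absent rather than raising a KeyError.
    remaining = response.headers.get("x-ratelimit-remaining", "unknown")
    logging.info("API call made. Limit remaining: %s.", remaining)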


bitshift/crawler/indexer.py (+16, -10)

@@ -3,7 +3,7 @@
 repositories.
 """
 
-import bs4, os, re, shutil, subprocess, threading
+import bs4, logging, os, re, shutil, subprocess, threading
 
 from ..database import Database
 from ..codelet import Codelet
@@ -35,7 +35,8 @@ class GitIndexer(threading.Thread):
         if not os.path.exists(GIT_CLONE_DIR):
             os.makedirs(GIT_CLONE_DIR)
 
-        super(GitIndexer, self).__init__()
+        logging.info("Starting.")
+        super(GitIndexer, self).__init__(name=self.__class__.__name__)
 
     def run(self):
         """
@@ -53,12 +54,8 @@ class GitIndexer(threading.Thread):
 
             repo = self.repository_queue.get()
             self.repository_queue.task_done()
-
-            try:
-                _index_repository(repo["url"], repo["name"],
-                        repo["framework_name"])
-            except:
-                pass
+            _index_repository(repo["url"], repo["name"],
+                    repo["framework_name"])
 
 class _ChangeDir(object):
     """
@@ -116,15 +113,23 @@ def _index_repository(repo_url, repo_name, framework_name):
 
     GIT_CLONE_TIMEOUT = 600
 
+    logging.info("Indexing repository %s." % repo_url)
     with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
         if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \
                 clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0:
+            logging.debug("_index_repository(): Cloning %s failed." % repo_url)
             if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)):
                 shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
             return
 
         with _ChangeDir(repo_name) as repository_dir:
-            _insert_repository_codelets(repo_url, repo_name, framework_name)
+            try:
+                _insert_repository_codelets(repo_url, repo_name,
+                        framework_name)
+            except Exception as exception:
+                logging.warning("%s: _insert_repository_codelets"
+                        " failed %s." % (exception, repo_url))
+                pass
 
     shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))

@@ -312,5 +317,6 @@ def _decode(raw):
         encoding = bs4.BeautifulSoup(raw).original_encoding
         return raw.decode(encoding) if encoding is not None else None
 
-    except:
+    except Exception as exception:
+        logging.warning("_debug(): %s", exception)
         return None
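
The cloning step in _index_repository() leans on a perl alarm wrapper to impose a hard timeout, since the Python 2-era subprocess module used here (note the `import Queue`) has no timeout argument of its own; on failure or timeout, the new logging.debug() call records which repository could not be cloned. A stripped-down sketch of that pattern, with an illustrative repository URL:

    import logging, subprocess

    GIT_CLONE_TIMEOUT = 600  # seconds before alarm() kills the clone
    repo_url = "https://github.com/user/repo.git"  # illustrative

    # perl's alarm() fires after the timeout and terminates the exec'd git
    # process, so a hung clone cannot block the indexer forever.
    command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone %s"
            % (GIT_CLONE_TIMEOUT, repo_url))
    if subprocess.call(command, shell=True) != 0:
        logging.debug("Cloning %s failed or timed out.", repo_url)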
