Kaynağa Gözat

Add untested threaded indexer/crawler prototype.

Additions are not tested and not yet documented.

Add:
    crawler.py
        -add threaded GitHubCrawler class, which interacts with a GitIndexer
        via a Queue.

    git_indexer.py
        -add threaded GitIndexer class, which interacts with GitHubCrawler via
        a Queue.
        -rename context-manager ChangeDir class to _ChangeDir, because it's
        essentially "private".

    __init__.py
        -add body to crawl(), which creates instances of GitHubCrawler and
        GitIndexer and starts them.
tags/v1.0^2
Severyn Kozak 10 yıl önce
ebeveyn
işleme
b7ccec0501
3 değiştirilmiş dosya ile 46 ekleme ve 14 silme
  1. +10
    -2
      bitshift/crawler/__init__.py
  2. +16
    -4
      bitshift/crawler/crawler.py
  3. +20
    -8
      bitshift/crawler/git_indexer.py

+ 10
- 2
bitshift/crawler/__init__.py Dosyayı Görüntüle

@@ -1,6 +1,14 @@
import crawler
import Queue

from bitshift.crawler import crawler
from bitshift.crawler import git_indexer

__all__ = ["crawl"]

def crawl():
pass
repository_queue = Queue.Queue()
github_crawler = crawler.GitHubCrawler(repository_queue)
indexer = git_indexer.GitIndexer(repository_queue)

for thread in [github_crawler, indexer]:
thread.start()

+ 16
- 4
bitshift/crawler/crawler.py Dosyayı Görüntüle

@@ -4,14 +4,22 @@
...more info soon...
"""

import requests, time
import requests, time, threading

import git_indexer
import bitshift.crawler.git_indexer

from ..codelet import Codelet
from ..database import Database

def github():
class GitHubCrawler(threading.Thread):
def __init__(self, repository_queue):
self.repository_queue = repository_queue
super(GitHubCrawler, self).__init__()

def run():
_github()

def _github():
"""
Query the GitHub API for data about every public repository.

@@ -33,7 +41,11 @@ def github():
response = requests.get(next_api_url, params=authentication_params)

for repo in response.json():
print repo["id"]
self.repository_queue.put({
"url" : repo["html_url"],
"framework_name" : "GitHub"
})
self.repository_queue.task_done()

if int(response.headers["x-ratelimit-remaining"]) == 0:
time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time())


+ 20
- 8
bitshift/crawler/git_indexer.py Dosyayı Görüntüle

@@ -5,14 +5,26 @@
Add documentation, threaded Indexer class.
"""

import shutil, subprocess, os
import os, shutil, subprocess, threading

from ..database import Database
from ..codelet import Codelet

GIT_CLONE_DIR = "/tmp"

class ChangeDir(object):
class GitIndexer(threading.Thread):
def __init__(self, repository_queue):
self.repository_queue = repository_queue
super(GitIndexer, self).__init__()

def run(self):
while True:
while self.repository_queue.empty():
pass
new_repo = self.repository_queue.get()
_index_repository(new_repo["url"], new_repo["framework_name"])

class _ChangeDir(object):
"""
A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

@@ -22,7 +34,7 @@ class ChangeDir(object):

def __init__(self, new_path):
"""
Create a ChangeDir instance.
Create a _ChangeDir instance.

:param new_path: The directory to enter.

@@ -50,7 +62,7 @@ class ChangeDir(object):

os.chdir(self.old_path)

def index_repository(repo_url, framework_name):
def _index_repository(repo_url, framework_name):
"""
Clone and index (create and insert Codelets for) a Git repository.

@@ -70,9 +82,9 @@ def index_repository(repo_url, framework_name):
repo_name = repo_url.split("/")[-1]
codelets = []

with ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
subprocess.call("git clone %s" % repo_url, shell=True)
with ChangeDir(repo_name) as repository_dir:
with _ChangeDir(repo_name) as repository_dir:
codelets = _insert_repository_codelets(repo_url, repo_name,
framework_name)
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
@@ -128,7 +140,7 @@ def _generate_file_url(filename, repo_url, framework_name):
:rtype: str
"""

if framework_name == "github":
if framework_name == "GitHub":
default_branch = subprocess.check_output("git branch --no-color",
shell=True)[2:-1]
return "%s/blob/%s/%s" % (repo_url, default_branch, filename)
@@ -164,7 +176,7 @@ def _get_git_commits():
commits.append({
"author" : fields[0],
"timestamp" : int(fields[1]),
"filenames" : fields[2].split("\0")[:-2]
"filenames" : fields[2].split("\x00")[:-2]
})

return commits


Yükleniyor…
İptal
Kaydet