Browse Source

Add untested threaded indexer/crawler prototype.

Additions are not tested and not yet documented.

Add:
    crawler.py
        -add threaded GitHubCrawler class, which interacts with a GitIndexer
        via a Queue.

    git_indexer.py
        -add threaded GitIndexer class, which interacts with GitHubCrawler via
        a Queue.
        -rename context-manager ChangeDir class to _ChangeDir, because it's
        essentially "private".

    __init__.py
        -add body to crawl(), which creates instances of GitHubCrawler and
        GitIndexer and starts them.
tags/v1.0^2
Severyn Kozak 10 years ago
parent
commit
b7ccec0501
3 changed files with 46 additions and 14 deletions
  1. +10
    -2
      bitshift/crawler/__init__.py
  2. +16
    -4
      bitshift/crawler/crawler.py
  3. +20
    -8
      bitshift/crawler/git_indexer.py

+ 10
- 2
bitshift/crawler/__init__.py View File

@@ -1,6 +1,14 @@
import crawler
import Queue

from bitshift.crawler import crawler
from bitshift.crawler import git_indexer


__all__ = ["crawl"] __all__ = ["crawl"]


def crawl():
    """
    Start the crawler and indexer threads, linked by a shared queue.

    A single queue is created and handed to both a GitHubCrawler and a
    GitIndexer instance; each thread is then started, crawler first.
    """

    shared_queue = Queue.Queue()
    workers = (
        crawler.GitHubCrawler(shared_queue),
        git_indexer.GitIndexer(shared_queue),
    )

    for worker in workers:
        worker.start()

+ 16
- 4
bitshift/crawler/crawler.py View File

@@ -4,14 +4,22 @@
...more info soon... ...more info soon...
""" """


import requests, time
import requests, time, threading


import git_indexer
import bitshift.crawler.git_indexer


from ..codelet import Codelet from ..codelet import Codelet
from ..database import Database from ..database import Database


def github():
class GitHubCrawler(threading.Thread):
    """
    Crawler thread that feeds GitHub repositories into a shared queue.

    :ivar repository_queue: Queue that crawled repositories are added to.
    """

    def __init__(self, repository_queue):
        """
        Create a GitHubCrawler instance.

        :param repository_queue: Queue to which crawled repositories are
            added.
        """

        super(GitHubCrawler, self).__init__()
        self.repository_queue = repository_queue

    def run(self):
        """
        Thread entry point; crawl GitHub by delegating to `_github()`.

        `threading.Thread.start()` invokes this as a bound method, so it must
        accept `self`.
        """

        # NOTE(review): `_github()` reads `self.repository_queue`, so it has
        # to be an instance method that accepts `self` -- confirm its
        # signature matches this call.
        self._github()

def _github():
""" """
Query the GitHub API for data about every public repository. Query the GitHub API for data about every public repository.


@@ -33,7 +41,11 @@ def github():
response = requests.get(next_api_url, params=authentication_params) response = requests.get(next_api_url, params=authentication_params)


for repo in response.json(): for repo in response.json():
print repo["id"]
self.repository_queue.put({
"url" : repo["html_url"],
"framework_name" : "GitHub"
})
self.repository_queue.task_done()


if int(response.headers["x-ratelimit-remaining"]) == 0: if int(response.headers["x-ratelimit-remaining"]) == 0:
time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time())


+ 20
- 8
bitshift/crawler/git_indexer.py View File

@@ -5,14 +5,26 @@
Add documentation, threaded Indexer class. Add documentation, threaded Indexer class.
""" """


import shutil, subprocess, os
import os, shutil, subprocess, threading


from ..database import Database from ..database import Database
from ..codelet import Codelet from ..codelet import Codelet


GIT_CLONE_DIR = "/tmp" GIT_CLONE_DIR = "/tmp"


class ChangeDir(object):
class GitIndexer(threading.Thread):
    """
    Worker thread that indexes repositories taken from a shared queue.

    Every item read from the queue must be a dict with "url" and
    "framework_name" keys; both values are passed to `_index_repository()`.

    :ivar repository_queue: Queue of repositories awaiting indexing.
    """

    def __init__(self, repository_queue):
        """
        Create a GitIndexer instance.

        :param repository_queue: Queue from which repositories to index are
            read.
        """

        super(GitIndexer, self).__init__()
        self.repository_queue = repository_queue

    def run(self):
        """
        Index repositories from the queue, one at a time, indefinitely.

        The loop never terminates on its own; it only ends if indexing or
        item lookup raises.
        """

        while True:
            # Queue.get() blocks until an item is available, so there is no
            # need to spin on empty() (which would peg a CPU core while idle).
            new_repo = self.repository_queue.get()
            _index_repository(new_repo["url"], new_repo["framework_name"])

class _ChangeDir(object):
""" """
A wrapper class for os.chdir(), to map onto `with` and handle exceptions. A wrapper class for os.chdir(), to map onto `with` and handle exceptions.


@@ -22,7 +34,7 @@ class ChangeDir(object):


def __init__(self, new_path): def __init__(self, new_path):
""" """
Create a ChangeDir instance.
Create a _ChangeDir instance.


:param new_path: The directory to enter. :param new_path: The directory to enter.


@@ -50,7 +62,7 @@ class ChangeDir(object):


os.chdir(self.old_path) os.chdir(self.old_path)


def index_repository(repo_url, framework_name):
def _index_repository(repo_url, framework_name):
""" """
Clone and index (create and insert Codelets for) a Git repository. Clone and index (create and insert Codelets for) a Git repository.


@@ -70,9 +82,9 @@ def index_repository(repo_url, framework_name):
repo_name = repo_url.split("/")[-1] repo_name = repo_url.split("/")[-1]
codelets = [] codelets = []


with ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
subprocess.call("git clone %s" % repo_url, shell=True) subprocess.call("git clone %s" % repo_url, shell=True)
with ChangeDir(repo_name) as repository_dir:
with _ChangeDir(repo_name) as repository_dir:
codelets = _insert_repository_codelets(repo_url, repo_name, codelets = _insert_repository_codelets(repo_url, repo_name,
framework_name) framework_name)
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
@@ -128,7 +140,7 @@ def _generate_file_url(filename, repo_url, framework_name):
:rtype: str :rtype: str
""" """


if framework_name == "github":
if framework_name == "GitHub":
default_branch = subprocess.check_output("git branch --no-color", default_branch = subprocess.check_output("git branch --no-color",
shell=True)[2:-1] shell=True)[2:-1]
return "%s/blob/%s/%s" % (repo_url, default_branch, filename) return "%s/blob/%s/%s" % (repo_url, default_branch, filename)
@@ -164,7 +176,7 @@ def _get_git_commits():
commits.append({ commits.append({
"author" : fields[0], "author" : fields[0],
"timestamp" : int(fields[1]), "timestamp" : int(fields[1]),
"filenames" : fields[2].split("\0")[:-2]
"filenames" : fields[2].split("\x00")[:-2]
}) })


return commits return commits


Loading…
Cancel
Save