"""
:synopsis: Contains a singleton GitIndexer class, which clones and indexes git
    repositories.
"""

import os
import re
import shutil
import subprocess
import threading

import bs4

from ..database import Database
from ..codelet import Codelet

# Parent directory that repositories are cloned into before indexing.
# NOTE(review): the garbled source showed "/tmp/bitshift " with a trailing
# space inside the string -- almost certainly stray whitespace, removed here.
GIT_CLONE_DIR = "/tmp/bitshift"
class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    `GitIndexer` clones and indexes the repositories at urls found by the
    :mod:`bitshift.crawler.crawler` Git crawlers.

    :ivar repository_queue: (:class:`Queue.Queue`) A queue containing urls
        found by the :mod:`bitshift.crawler.crawler` Git crawlers.
    """

    def __init__(self, repository_queue):
        """
        Create an instance of the singleton `GitIndexer`.

        :param repository_queue: see :attr:`GitIndexer.repository_queue`
        :type repository_queue: see :attr:`GitIndexer.repository_queue`
        """

        self.repository_queue = repository_queue
        super(GitIndexer, self).__init__()

    def run(self):
        """
        Retrieve new repository urls, clone, and index them.

        Blocks until new urls appear in :attr:`GitIndexer.repository_queue`,
        then retrieves one, and attempts cloning/indexing it. Should any
        errors occur, the new repository will be discarded and the indexer
        will move on to the next in the queue.
        """

        while True:
            # Queue.get() blocks until an item is available; the original
            # busy-waited on `while self.repository_queue.empty(): pass`,
            # burning a full core while idle.
            repo = self.repository_queue.get()
            self.repository_queue.task_done()
            try:
                _index_repository(repo["url"], repo["name"],
                                  repo["framework_name"])
            # Deliberately best-effort (one bad repository must not kill the
            # indexer thread), but narrowed from a bare `except:`, which
            # would also swallow KeyboardInterrupt/SystemExit.
            except Exception:
                pass
class _ChangeDir(object):
"""
@@ -62,7 +94,7 @@ class _ChangeDir(object):
os.chdir(self.old_path)
def _index_repository(repo_url, repo_name, framework_name):
    """
    Clone and index (create and insert Codelets for) a Git repository.

    `git clone` the Git repository located at **repo_url**, call
    `_insert_repository_codelets()` on it, then remove said repository.

    :param repo_url: The url the Git repository was cloned from.
    :param repo_name: The name of the repository.
    :param framework_name: The name of the framework the repository is from.

    :type repo_url: str
    :type repo_name: str
    :type framework_name: str
    """

    GIT_CLONE_TIMEOUT = 60

    with _ChangeDir(GIT_CLONE_DIR):
        # `subprocess.call()` has no timeout parameter in Python 2, so a
        # Perl `alarm` wrapper kills `git clone` if it exceeds the timeout.
        clone_failed = subprocess.call(
                "perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone %s" %
                (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0

        try:
            if not clone_failed:
                with _ChangeDir(repo_name):
                    _insert_repository_codelets(repo_url, repo_name,
                                                framework_name)
        finally:
            # Clean up even when the clone failed or timed out: a killed
            # `git clone` can leave a partial directory behind, which the
            # original leaked by returning before the rmtree.
            if os.path.isdir(repo_name):
                shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
def _insert_repository_codelets(repo_url, repo_name, framework_name):
    """
    Create and insert a Codelet for the files inside a Git repository.

    Create a new Codelet, and insert it into the Database singleton, for
    every file inside the current working directory's default branch
    (usually *master*).

    :param repo_url: The url the Git repository was cloned from.
    :param repo_name: The name of the repository.
    :param framework_name: The name of the framework the repository is from.

    :type repo_url: str
    :type repo_name: str
    :type framework_name: str
    """

    commits_meta = _get_commits_metadata()
    for filename in commits_meta:
        with open(filename, "r") as source_file:
            source = _decode(source_file.read())

        # Skip just this file when its encoding cannot be inferred; the
        # original `return`ed here, silently discarding every remaining
        # file in the repository.
        if source is None:
            continue

        authors = [(_decode(author),) for author in
                   commits_meta[filename]["authors"]]
        codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
                          None, authors,
                          _generate_file_url(filename, repo_url,
                                             framework_name),
                          commits_meta[filename]["time_created"],
                          commits_meta[filename]["time_last_modified"])

        # TODO(review): persist `codelet` once the Database API is wired up.
        # The previous `db.codelets.insert(...)` referenced an undefined
        # name `db` and raised NameError on the first file.
        # Database.insert(codelet)
def _generate_file_url(filename, repo_url, framework_name):
"""
@@ -142,7 +178,7 @@ def _generate_file_url(filename, repo_url, framework_name):
if framework_name == "GitHub":
default_branch = subprocess.check_output("git branch --no-color",
shell=True)[2:-1]
shell=True)[2:-1]
return "%s/blob/%s/%s" % (repo_url, default_branch, filename)
def _get_git_commits():
@@ -165,8 +201,7 @@ def _get_git_commits():
:rtype: dictionary
"""
git_log = subprocess.check_output(
("git --no-pager log --name-only"
git_log = subprocess.check_output(("git --no-pager log --name-only"
" --pretty=format:'%n%n%an%n%at' -z"), shell=True)
commits = []
@@ -183,24 +218,34 @@ def _get_git_commits():
def _get_tracked_files():
    """
    Return a list of the filenames of all valuable files in the Git repository.

    Get a list of the filenames of the non-binary (Perl heuristics used for
    filetype identification) files currently inside the current working
    directory's Git repository. Then, weed out any boilerplate/non-code files
    that match the regex rules in GIT_IGNORE_FILES.

    :return: The filenames of all index-worthy non-binary files.
    :rtype: str array
    """

    GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]

    # Perl's -f/-T tests drop directories and (heuristically) binary files;
    # `find` prunes the .git directory itself.
    tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if"
            " -f && -T }' $(find . -type d -name .git -prune -o -print)"),
            shell=True).split("\n")[:-1]

    # Compile the ignore patterns once, outside the per-file loop.
    ignore_regexes = [re.compile(pattern, re.IGNORECASE)
                      for pattern in GIT_IGNORE_FILES]

    # Filenames arrive prefixed with "./"; the match is applied to the full
    # relative path (so e.g. a "readme/" directory excludes its contents),
    # and the prefix is stripped from the returned names.
    return [filename[2:] for filename in tracked_files
            if not any(regex.match(filename) for regex in ignore_regexes)]
def _get_commits_metadata():
"""
Return a dictionary containing every tracked file's metadata.
Return a dictionary containing every valuable tracked file's metadata.
:return: A dictionary with author names, time of creation, and time of last
modification for every filename key.
@@ -236,3 +281,27 @@ def _get_commits_metadata():
files_meta[filename]["time_created"] = commit["timestamp"]
return files_meta
def _decode(raw):
    """
    Return a decoded version of a raw string.

    :param raw: The string to decode.

    :type raw: str

    :return: If the original encoding is successfully inferenced, the decoded
        string; otherwise, None.
    :rtype: str, or None

    .. warning::
        The raw string's original encoding is identified by heuristics which
        can, and occasionally will, fail. Decoding will then fail, and None
        will be returned.
    """

    try:
        encoding = bs4.BeautifulSoup(raw).original_encoding
        # original_encoding is None when detection fails entirely;
        # str.decode(None) would raise an uncaught TypeError.
        if encoding is None:
            return None
        # LookupError covers detected-but-unknown codec names.
        return raw.decode(encoding)
    except (UnicodeDecodeError, LookupError, UserWarning):
        return None