@@ -1,28 +1,60 @@
"""
"""
:synopsis: Index all the files in a Git repository.
.. todo::
Add documentation, threaded Indexer class.
:synopsis: Contains a singleton GitIndexer class, which clones and indexes git
repositories.
"""
"""
import os, shutil, subprocess, threading
import bs4, os, re , shutil, subprocess, threading
from ..database import Database
from ..database import Database
from ..codelet import Codelet
from ..codelet import Codelet
# Directory under which repositories are temporarily cloned for indexing.
# NOTE: the previous value carried a trailing space ("/tmp/bitshift "),
# which broke every path built from it ("%s/%s" interpolation and chdir).
GIT_CLONE_DIR = "/tmp/bitshift"
class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    `GitIndexer` clones and indexes the repositories at urls found by the
    :mod:`bitshift.crawler.crawler` Git crawlers.

    :ivar repository_queue: (:class:`Queue.Queue`) A queue containing urls
        found by the :mod:`bitshift.crawler.crawler` Git crawlers.
    """

    def __init__(self, repository_queue):
        """
        Create an instance of the singleton `GitIndexer`.

        :param repository_queue: see :attr:`GitIndexer.repository_queue`
        :type repository_queue: see :attr:`GitIndexer.repository_queue`
        """
        self.repository_queue = repository_queue
        super(GitIndexer, self).__init__()

    def run(self):
        """
        Retrieve new repository urls, clone, and index them.

        Blocks until new urls appear in
        :attr:`GitIndexer.repository_queue`, then retrieves one, and
        attempts cloning/indexing it. Should any errors occur, the new
        repository will be discarded and the indexer will move on to the
        next in the queue.
        """
        while True:
            # Queue.get() blocks until an item is available; the previous
            # `while queue.empty(): pass` spin-wait burned a full CPU core
            # for no benefit.
            repo = self.repository_queue.get()
            self.repository_queue.task_done()

            try:
                _index_repository(repo["url"], repo["name"],
                                  repo["framework_name"])
            except Exception:
                # Deliberately best-effort: one failed clone/index must not
                # kill the indexer thread. Narrowed from a bare `except:`,
                # which also swallowed KeyboardInterrupt/SystemExit.
                pass
class _ChangeDir(object):
class _ChangeDir(object):
"""
"""
@@ -62,7 +94,7 @@ class _ChangeDir(object):
os.chdir(self.old_path)
os.chdir(self.old_path)
def _index_repository(repo_url, repo_name, framework_name):
    """
    Clone and index (create and insert Codelets for) a Git repository.

    `git clone` the Git repository located at **repo_url**, call
    `_insert_repository_codelets` on it, then remove said repository.

    :param repo_url: The url the Git repository was cloned from.
    :param repo_name: The name of the repository.
    :param framework_name: The name of the framework the repository is from.

    :type repo_url: str
    :type repo_name: str
    :type framework_name: str
    """
    # `git clone` has no native timeout flag; Perl's alarm() kills the
    # child if the clone hangs longer than this many seconds.
    GIT_CLONE_TIMEOUT = 60

    # NOTE(review): repo_url is interpolated into a shell=True command --
    # a url containing shell metacharacters could inject commands. Confirm
    # urls are sanitized upstream, or switch to an argument list.
    with _ChangeDir(GIT_CLONE_DIR):
        if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d"
                           " git clone %s" % (GIT_CLONE_TIMEOUT, repo_url),
                           shell=True) != 0:
            # A failed or timed-out clone can leave a partial directory
            # behind; remove it before giving up (previously leaked).
            if os.path.isdir(repo_name):
                shutil.rmtree(repo_name)
            return

        try:
            with _ChangeDir(repo_name):
                _insert_repository_codelets(repo_url, repo_name,
                                            framework_name)
        finally:
            # Always reclaim the clone's disk space, even if indexing
            # raised (previously the rmtree was skipped on error).
            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
def _insert_repository_codelets(repo_url, repo_name, framework_name):
    """
    Create and insert a Codelet for the files inside a Git repository.

    Create a new Codelet, and insert it into the Database singleton, for
    every file inside the current working directory's default branch
    (usually *master*).

    :param repo_url: The url the Git repository was cloned from.
    :param repo_name: The name of the repository.
    :param framework_name: The name of the framework the repository is from.

    :type repo_url: str
    :type repo_name: str
    :type framework_name: str
    """
    commits_meta = _get_commits_metadata()
    for filename, meta in commits_meta.items():
        with open(filename, "r") as source_file:
            source = _decode(source_file.read())

        # Skip files whose encoding could not be inferred rather than
        # aborting: the previous `return` here silently dropped every
        # remaining file in the repository after the first bad one.
        if source is None:
            continue

        authors = [(_decode(author),) for author in meta["authors"]]
        codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
                          None, authors,
                          _generate_file_url(filename, repo_url,
                                             framework_name),
                          meta["time_created"],
                          meta["time_last_modified"])

        # NOTE(review): `db` is not defined anywhere in this module; this
        # insert raises NameError at runtime (currently masked by the
        # caller's blanket exception handler). Confirm the intended target
        # -- presumably the Database singleton.
        db.codelets.insert({
            "name": codelet.name,
            "authors": codelet.authors
        })
        # Database.insert(codelet)
def _generate_file_url(filename, repo_url, framework_name):
def _generate_file_url(filename, repo_url, framework_name):
"""
"""
@@ -142,7 +178,7 @@ def _generate_file_url(filename, repo_url, framework_name):
if framework_name == "GitHub":
if framework_name == "GitHub":
default_branch = subprocess.check_output("git branch --no-color",
default_branch = subprocess.check_output("git branch --no-color",
shell=True)[2:-1]
shell=True)[2:-1]
return "%s/blob/%s/%s" % (repo_url, default_branch, filename)
return "%s/blob/%s/%s" % (repo_url, default_branch, filename)
def _get_git_commits():
def _get_git_commits():
@@ -165,8 +201,7 @@ def _get_git_commits():
:rtype: dictionary
:rtype: dictionary
"""
"""
git_log = subprocess.check_output(
("git --no-pager log --name-only"
git_log = subprocess.check_output(("git --no-pager log --name-only"
" --pretty=format:'%n%n%an%n%at' -z"), shell=True)
" --pretty=format:'%n%n%an%n%at' -z"), shell=True)
commits = []
commits = []
@@ -183,24 +218,34 @@ def _get_git_commits():
def _get_tracked_files():
    """
    Return a list of the filenames of all valuable files in the Git
    repository.

    Get a list of the filenames of the non-binary (Perl heuristics used for
    filetype identification) files currently inside the current working
    directory's Git repository. Then, weed out any boilerplate/non-code
    files that match the regex rules in GIT_IGNORE_FILES.

    :return: The filenames of all index-worthy non-binary files.
    :rtype: str array
    """
    # Boilerplate files (licenses, readmes) that would add noise to the
    # index, matched case-insensitively.
    GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]

    # Perl's -T file test heuristically detects text (non-binary) files;
    # `find` prunes the .git directory itself.
    tracked_files = subprocess.check_output(
            ("perl -le 'for (@ARGV){ print if -f && -T }'"
             " $(find . -type d -name .git -prune -o -print)"),
            shell=True).split("\n")[:-1]

    # `find` prefixes every path with "./"; [2:] strips it.
    return [filename[2:] for filename in tracked_files
            if not any(re.match(pattern, filename, flags=re.IGNORECASE)
                       for pattern in GIT_IGNORE_FILES)]
def _get_commits_metadata():
def _get_commits_metadata():
"""
"""
Return a dictionary containing every tracked file's metadata.
Return a dictionary containing every valuable tracked file's metadata.
:return: A dictionary with author names, time of creation, and time of last
:return: A dictionary with author names, time of creation, and time of last
modification for every filename key.
modification for every filename key.
@@ -236,3 +281,27 @@ def _get_commits_metadata():
files_meta[filename]["time_created"] = commit["timestamp"]
files_meta[filename]["time_created"] = commit["timestamp"]
return files_meta
return files_meta
def _decode(raw):
    """
    Return a decoded raw string.

    :param raw: The string to decode.
    :type raw: (str)

    :return: If the original encoding is successfully inferenced, return
        the decoded string.
    :rtype: str, or None

    .. warning::
        The raw string's original encoding is identified by heuristics
        which can, and occasionally will, fail. Decoding will then fail,
        and None will be returned.
    """
    try:
        encoding = bs4.BeautifulSoup(raw).original_encoding
        # original_encoding is None when BeautifulSoup cannot infer one;
        # str.decode(None) would raise an uncaught TypeError, so bail out.
        if encoding is None:
            return None
        # LookupError added: an inferred-but-unknown codec name previously
        # escaped this handler.
        return raw.decode(encoding)
    except (UnicodeDecodeError, LookupError, UserWarning):
        return None