|
|
@@ -3,7 +3,7 @@ |
|
|
|
repositories. |
|
|
|
""" |
|
|
|
|
|
|
|
import datetime |
|
|
|
from datetime import datetime |
|
|
|
import logging |
|
|
|
import os |
|
|
|
import Queue |
|
|
@@ -13,7 +13,8 @@ import subprocess |
|
|
|
import time |
|
|
|
import threading |
|
|
|
|
|
|
|
import bs4 |
|
|
|
from bs4 import UnicodeDammit
|
|
|
import git |
|
|
|
|
|
|
|
from ..database import Database |
|
|
|
from ..parser import parse, UnsupportedFileError |
|
|
@@ -33,7 +34,8 @@ class GitRepository(object): |
|
|
|
repository belongs to (e.g., GitHub, Bitbucket).
|
|
|
:ivar rank: (float) The rank of the repository, as assigned by |
|
|
|
:class:`crawler.GitHubCrawler`. |
|
|
|
:ivar dirname: (str) The repository's on-disk directory name. |
|
|
|
:ivar path: (str) The repository's on-disk directory path. |
|
|
|
:ivar repo: (git.Repo) A git.Repo representation of the repository. |
|
|
|
""" |
|
|
|
|
|
|
|
def __init__(self, url, name, framework_name, rank): |
|
|
@@ -55,7 +57,9 @@ class GitRepository(object): |
|
|
|
self.name = name |
|
|
|
self.framework_name = framework_name |
|
|
|
self.rank = rank |
|
|
|
self.dirname = name.replace("-", "--").replace("/", "-") |
|
|
|
dirname = name.replace("/", "-") + "-" + str(int(time.time())) |
|
|
|
self.path = os.path.join(GIT_CLONE_DIR, dirname) |
|
|
|
self.repo = None |
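# For example, under the timestamped naming scheme a repository named
# "user/project" processed at Unix time 1400000000 would be cloned into
# GIT_CLONE_DIR as "user-project-1400000000".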
|
|
|
|
|
|
|
class GitIndexer(threading.Thread): |
|
|
|
""" |
|
|
@@ -124,20 +128,18 @@ class GitIndexer(threading.Thread): |
|
|
|
`git clone` the Git repository located at **repo.url**, call |
|
|
|
`_insert_repository_codelets()`, then remove said repository. |
|
|
|
|
|
|
|
:param repo_url: The metadata of the repository to be indexed. |
|
|
|
|
|
|
|
:type repo_url: :class:`GitRepository` |
|
|
|
:param repo: The metadata of the repository to be indexed. |
|
|
|
:type repo: :class:`GitRepository` |
|
|
|
""" |
|
|
|
|
|
|
|
self._logger.info(u"Indexing repo: %s", repo.name) |
|
|
|
with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): |
|
|
|
try: |
|
|
|
self._insert_repository_codelets(repo) |
|
|
|
except Exception: |
|
|
|
self._logger.exception("Exception raised while indexing:") |
|
|
|
finally: |
|
|
|
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): |
|
|
|
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname)) |
|
|
|
try: |
|
|
|
self._insert_repository_codelets(repo) |
|
|
|
except Exception: |
|
|
|
self._logger.exception("Exception raised while indexing:") |
|
|
|
finally: |
|
|
|
if os.path.isdir(repo.path): |
|
|
|
shutil.rmtree(repo.path) |
|
|
|
|
|
|
|
def _insert_repository_codelets(self, repo): |
|
|
|
""" |
|
|
@@ -152,27 +154,18 @@ class GitIndexer(threading.Thread): |
|
|
|
:type repo: :class:`GitRepository`
|
|
|
""" |
|
|
|
|
|
|
|
commits_meta = self._get_commits_metadata() |
|
|
|
commits_meta = self._get_commits_metadata(repo) |
|
|
|
if commits_meta is None: |
|
|
|
return |
|
|
|
|
|
|
|
for filename in commits_meta.keys(): |
|
|
|
try: |
|
|
|
with open(filename) as source_file: |
|
|
|
source = self._decode(source_file.read()) |
|
|
|
if source is None: |
|
|
|
continue |
|
|
|
except IOError: |
|
|
|
continue |
|
|
|
|
|
|
|
authors = [(self._decode(author), None) for author in |
|
|
|
commits_meta[filename]["authors"]] |
|
|
|
url = self._generate_file_url(filename, repo.url, repo.framework_name) |
|
|
|
for filename, data in commits_meta.iteritems(): |
|
|
|
authors = [(author, None) for author in data["authors"]] |
|
|
|
encoded_source = data["blob"].data_stream.read() |
|
|
|
source = UnicodeDammit(encoded_source).unicode_markup
|
|
|
url = self._generate_file_url(filename, repo) |
|
|
|
codelet = Codelet("%s: %s" % (repo.name, filename), source, |
|
|
|
filename, None, authors, url, |
|
|
|
commits_meta[filename]["time_created"], |
|
|
|
commits_meta[filename]["time_last_modified"], |
|
|
|
repo.rank) |
|
|
|
filename, None, authors, url, data["time_created"], |
|
|
|
data["time_last_modified"], repo.rank) |
|
|
|
self._logger.debug("Indexing file: %s", codelet.name) |
|
|
|
try: |
|
|
|
parse(codelet) |
|
|
@@ -180,163 +173,103 @@ class GitIndexer(threading.Thread): |
|
|
|
continue |
|
|
|
self.database.insert(codelet) |
|
|
|
|
|
|
|
def _generate_file_url(self, filename, repo_url, framework_name): |
|
|
|
def _generate_file_url(self, filename, repo): |
|
|
|
""" |
|
|
|
Return a url for a filename from a Git wrapper framework. |
|
|
|
|
|
|
|
:param filename: The path of the file. |
|
|
|
:param repo_url: The url of the file's parent repository. |
|
|
|
:param framework_name: The name of the framework the repository is from. |
|
|
|
:param repo: The git repo. |
|
|
|
|
|
|
|
:type filename: str |
|
|
|
:type repo_url: str |
|
|
|
:type framework_name: str |
|
|
|
:type repo: :class:`GitRepository` |
|
|
|
|
|
|
|
:return: The file's full url on the given framework, if successfully |
|
|
|
derived. |
|
|
|
:rtype: str, or None |
|
|
|
|
|
|
|
.. warning:: |
|
|
|
Various Git subprocesses will occasionally fail, and, seeing as the |
|
|
|
information they provide is a crucial component of some repository |
|
|
|
file urls, None may be returned. |
|
|
|
""" |
|
|
|
|
|
|
|
try: |
|
|
|
if framework_name == "GitHub": |
|
|
|
default_branch = subprocess.check_output("git branch" |
|
|
|
" --no-color", shell=True)[2:-1] |
|
|
|
parts = [repo_url, "blob", default_branch, filename] |
|
|
|
elif framework_name == "Bitbucket": |
|
|
|
commit_hash = subprocess.check_output("git rev-parse HEAD", |
|
|
|
shell=True).replace("\n", "") |
|
|
|
parts = [repo_url, "src", commit_hash, filename] |
|
|
|
return "/".join(s.strip("/") for s in parts) |
|
|
|
except subprocess.CalledProcessError: |
|
|
|
return None |
|
|
|
|
|
|
|
def _get_git_commits(self): |
|
|
|
""" |
|
|
|
Return the current working directory's formatted commit data. |
|
|
|
|
|
|
|
Uses `git log` to generate metadata about every single file in the |
|
|
|
repository's commit history. |
|
|
|
|
|
|
|
:return: The author, timestamp, and names of all modified files of every |
|
|
|
commit. |
|
|
|
.. code-block:: python |
|
|
|
sample_returned_array = [ |
|
|
|
{ |
|
|
|
"author" : (str) "author" |
|
|
|
"timestamp" : (`datetime.datetime`) <object>, |
|
|
|
"filenames" : (str array) ["file1", "file2"] |
|
|
|
} |
|
|
|
] |
|
|
|
:rtype: array of dictionaries |
|
|
|
""" |
|
|
|
|
|
|
|
git_log = subprocess.check_output(("git --no-pager log --name-only" |
|
|
|
" --pretty=format:'%n%n%an%n%at' -z"), shell=True) |
|
|
|
|
|
|
|
commits = [] |
|
|
|
for commit in git_log.split("\n\n"): |
|
|
|
fields = commit.split("\n") |
|
|
|
if len(fields) > 2: |
|
|
|
commits.append({ |
|
|
|
"author" : fields[0], |
|
|
|
"timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), |
|
|
|
"filenames" : fields[2].split("\x00")[:-2] |
|
|
|
}) |
|
|
|
|
|
|
|
return commits |
|
|
|
|
|
|
|
def _get_tracked_files(self): |
|
|
|
""" |
|
|
|
Return a list of the filenames of all valuable files in the Git repository. |
|
|
|
|
|
|
|
Get a list of the filenames of the non-binary files (Perl-style



heuristics are used for filetype identification) inside the current



working directory's Git repository. Then, weed out any boilerplate/non-code files
|
|
|
that match the regex rules in GIT_IGNORE_FILES. |
|
|
|
|
|
|
|
:return: The filenames of all index-worthy non-binary files. |
|
|
|
:rtype: str array |
|
|
|
""" |
|
|
|
|
|
|
|
files = [] |
|
|
|
for dirname, subdir_names, filenames in os.walk("."): |
|
|
|
for filename in filenames: |
|
|
|
path = os.path.join(dirname, filename) |
|
|
|
if self._is_ascii(path): |
|
|
|
files.append(path[2:]) |
|
|
|
if framework_name == "GitHub": |
|
|
|
default_branch = repo.repo.active_branch.name
|
|
|
parts = [repo_url, "blob", default_branch, filename] |
|
|
|
elif framework_name == "Bitbucket": |
|
|
|
try: |
|
|
|
commit_hash = repo.repo.head.commit.hexsha |
|
|
|
except ValueError: # No commits |
|
|
|
return None |
|
|
|
parts = [repo_url, "src", commit_hash, filename] |
|
|
|
return "/".join(s.strip("/") for s in parts) |
|
|
|
|
|
|
|
def _walk_history(self, files, head): |
|
|
|
"""Walk a repository's history for metadata.""" |
|
|
|
def update_entry(commit, entry, new_file): |
|
|
|
entry["authors"].add(commit.author.name)
|
|
|
commit_ts = datetime.utcfromtimestamp(commit.committed_date) |
|
|
|
if commit_ts > entry["time_last_modified"]: |
|
|
|
entry["time_last_modified"] = commit_ts |
|
|
|
if new_file: |
|
|
|
entry["time_created"] = commit_ts |
|
|
|
|
|
|
|
def handle_commit(commit, paths): |
|
|
|
if not commit.parents: |
|
|
|
for item in commit.tree.traverse(): |
|
|
|
if item.type == "blob" and item.path in paths: |
|
|
|
update_entry(commit, files[paths[item.path]], True) |
|
|
|
return |
|
|
|
|
|
|
|
for parent in commit.parents: |
|
|
|
for diff in parent.diff(commit, create_patch=True): |
|
|
|
pth = diff.rename_to if diff.renamed else diff.b_blob.path
|
|
|
if pth not in paths: |
|
|
|
continue |
|
|
|
update_entry(commit, files[paths[pth]], diff.new_file) |
|
|
|
if diff.renamed: |
|
|
|
paths[diff.rename_from] = paths[pth]
|
|
|
del paths[pth] |
|
|
|
|
|
|
|
|
|
|
pending = [(head, {path: path for path in files})] |
|
|
|
while pending: |
|
|
|
commit, paths = pending.pop() |
|
|
|
handle_commit(commit, paths) |
|
|
|
for parent in commit.parents: |
|
|
|
new_paths = paths.copy() if len(commit.parents) > 1 else paths |
|
|
|
pending.append((parent, new_paths)) |
|
|
|
|
|
|
|
def _get_commits_metadata(self): |
|
|
|
def _get_commits_metadata(self, repo): |
|
|
|
""" |
|
|
|
Return a dictionary containing every valuable tracked file's metadata. |
|
|
|
|
|
|
|
:return: A dictionary with author names, time of creation, and time of last |
|
|
|
modification for every filename key. |
|
|
|
:return: A dictionary with author names, time of creation, and time of |
|
|
|
last modification for every filename key. |
|
|
|
.. code-block:: python |
|
|
|
sample_returned_dict = { |
|
|
|
"my_file" : { |
|
|
|
"authors" : (str array) ["author1", "author2"], |
|
|
|
"time_created" : (`datetime.datetime`) <object>, |
|
|
|
"time_last_modified" : (`datetime.datetime`) <object> |
|
|
|
} |
|
|
|
} |
|
|
|
:rtype: dictionary of dictionaries |
|
|
|
""" |
|
|
|
|
|
|
|
commits = self._get_git_commits() |
|
|
|
tracked_files = self._get_tracked_files() |
|
|
|
|
|
|
|
files_meta = {} |
|
|
|
for commit in commits: |
|
|
|
for filename in commit["filenames"]: |
|
|
|
if filename not in tracked_files: |
|
|
|
continue |
|
|
|
|
|
|
|
if filename not in files_meta.keys(): |
|
|
|
files_meta[filename] = { |
|
|
|
"authors" : [commit["author"]], |
|
|
|
"time_last_modified" : commit["timestamp"], |
|
|
|
"time_created" : commit["timestamp"] |
|
|
|
sample_returned_dict = { |
|
|
|
"my_file" : { |
|
|
|
"blob": (GitPython Blob) <object>, |
|
|
|
"authors" : (str set) {"author1", "author2"}, |
|
|
|
"time_created" : (`datetime.datetime`) <object>, |
|
|
|
"time_last_modified" : (`datetime.datetime`) <object> |
|
|
|
} |
|
|
|
else: |
|
|
|
if commit["author"] not in files_meta[filename]["authors"]: |
|
|
|
files_meta[filename]["authors"].append(commit["author"]) |
|
|
|
files_meta[filename]["time_created"] = commit["timestamp"] |
|
|
|
|
|
|
|
return files_meta |
|
|
|
|
|
|
|
def _decode(self, raw): |
|
|
|
""" |
|
|
|
Return a decoded raw string.
|
|
|
|
|
|
|
:param raw: The string to decode.
|
|
|
|
|
|
|
:type raw: str
|
|
|
|
|
|
|
:return: If the original encoding is successfully inferred, return the
|
|
|
decoded string. |
|
|
|
:rtype: str, or None |
|
|
|
|
|
|
|
.. warning:: |
|
|
|
The raw string's original encoding is identified by heuristics which |
|
|
|
can, and occasionally will, fail. Decoding will then fail, and None |
|
|
|
will be returned. |
|
|
|
} |
|
|
|
:rtype: dictionary of dictionaries |
|
|
|
""" |
|
|
|
|
|
|
|
try: |
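# BeautifulSoup runs charset detection (via UnicodeDammit) over the raw
# bytes; original_encoding is None when every heuristic fails, in which
# case there is nothing sensible to decode.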
|
|
|
encoding = bs4.BeautifulSoup(raw).original_encoding |
|
|
|
return raw.decode(encoding) if encoding is not None else None |
|
|
|
|
|
|
|
except (LookupError, UnicodeDecodeError, UserWarning):
|
|
|
return None |
|
|
|
tree = repo.repo.head.commit.tree |
|
|
|
except ValueError: # No commits |
|
|
|
return {} |
|
|
|
|
|
|
|
files = {} |
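# Seed an entry for every ASCII blob reachable from HEAD; the epoch
# placeholders below are overwritten as _walk_history() visits the
# commits that actually touched each file.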
|
|
|
for item in tree.traverse(): |
|
|
|
if item.type == "blob" and self._is_ascii(item.data_stream): |
|
|
|
files[item.path] = { |
|
|
|
"blob": item, |
|
|
|
"authors" : set(), |
|
|
|
"time_last_modified": datetime.utcfromtimestamp(0), |
|
|
|
"time_created": datetime.utcfromtimestamp(0) |
|
|
|
} |
|
|
|
|
|
|
|
self._walk_history(files, repo.repo.head.commit) |
|
|
|
return files |
|
|
|
|
|
|
|
def _is_ascii(self, filename): |
|
|
|
def _is_ascii(self, fp): |
|
|
|
""" |
|
|
|
Heuristically determine whether a file is ASCII text or binary. |
|
|
|
|
|
|
@@ -346,34 +279,29 @@ class GitIndexer(threading.Thread): |
|
|
|
operator, and is the de-facto method for determining whether a
|
|
|
file is ASCII. |
|
|
|
|
|
|
|
:param filename: The path of the file to test. |
|
|
|
:param fp: The file object to test. |
|
|
|
|
|
|
|
:type filename: str |
|
|
|
:type fp: `file` |
|
|
|
|
|
|
|
:return: Whether the file is probably ASCII. |
|
|
|
:rtype: Boolean |
|
|
|
""" |
|
|
|
|
|
|
|
try: |
|
|
|
with open(filename) as source: |
|
|
|
file_snippet = source.read(512) |
|
|
|
|
|
|
|
if not file_snippet: |
|
|
|
return True |
|
|
|
|
|
|
|
ascii_characters = "".join(map(chr, range(32, 127)) + |
|
|
|
list("\n\r\t\b")) |
|
|
|
null_trans = string.maketrans("", "") |
|
|
|
file_snippet = fp.read(512)
|
|
|
|
|
|
|
if "\0" in file_snippet: |
|
|
|
return False |
|
|
|
if not file_snippet: |
|
|
|
return True |
|
|
|
|
|
|
|
non_ascii = file_snippet.translate(null_trans, ascii_characters) |
|
|
|
return not float(len(non_ascii)) / len(file_snippet) > 0.30 |
|
|
|
ascii_characters = "".join(map(chr, range(32, 127)) + |
|
|
|
list("\n\r\t\b")) |
|
|
|
null_trans = string.maketrans("", "") |
|
|
|
|
|
|
|
except IOError: |
|
|
|
if "\0" in file_snippet: |
|
|
|
return False |
|
|
|
|
|
|
|
non_ascii = file_snippet.translate(null_trans, ascii_characters) |
|
|
|
return not float(len(non_ascii)) / len(file_snippet) > 0.30 |
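# In other words: any NUL byte marks the file as binary, as does a
# snippet whose first 512 bytes are more than 30% characters outside
# printable ASCII and common whitespace/control characters.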
|
|
|
|
|
|
|
class _GitCloner(threading.Thread): |
|
|
|
""" |
|
|
|
A singleton Git repository cloner. |
|
|
@@ -428,7 +356,7 @@ class _GitCloner(threading.Thread): |
|
|
|
try: |
|
|
|
self._clone_repository(repo) |
|
|
|
except Exception: |
|
|
|
pass |
|
|
|
self._logger.exception("Exception raised while cloning:") |
|
|
|
|
|
|
|
def _clone_repository(self, repo): |
|
|
|
""" |
|
|
@@ -439,57 +367,10 @@ class _GitCloner(threading.Thread): |
|
|
|
:type repo: :class:`GitRepository` |
|
|
|
""" |
|
|
|
|
|
|
|
GIT_CLONE_TIMEOUT = 500 |
|
|
|
queue_percent_full = (float(self.index_queue.qsize()) / |
|
|
|
self.index_queue.maxsize) * 100 |
|
|
|
|
|
|
|
command = ["perl", "-e", "alarm shift @ARGV; exec @ARGV", |
|
|
|
str(GIT_CLONE_TIMEOUT), "git", "clone", "--single-branch", |
|
|
|
repo.url, GIT_CLONE_DIR + "/" + repo.dirname] |
|
|
|
if subprocess.call(command) != 0: |
|
|
|
subprocess.call(["pkill", "-f", "git"]) # This makes Ben K upset |
|
|
|
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): |
|
|
|
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname)) |
|
|
|
return |
|
|
|
|
|
|
|
while self.index_queue.full(): |
|
|
|
self._logger.info("Cloning repo: %s", repo.url) |
|
|
|
repo.repo = git.Repo.clone_from(repo.url, to_path=repo.path, bare=True, |
|
|
|
single_branch=True) |
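# A bare, single-branch clone is presumably sufficient here: file
# contents are read from blob data streams, so no working tree is
# needed.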
|
|
|
while self.index_queue.full() and self.run_event.is_set(): |
|
|
|
time.sleep(THREAD_QUEUE_SLEEP) |
|
|
|
self.index_queue.put(repo) |
|
|
|
|
|
|
|
class _ChangeDir(object): |
|
|
|
""" |
|
|
|
A wrapper class for os.chdir(), to map onto `with` and handle exceptions. |
|
|
|
|
|
|
|
:ivar new_path: (str) The path to change the current directory to. |
|
|
|
:ivar old_path: (str) The path of the directory to return to. |
|
|
|
""" |
|
|
|
|
|
|
|
def __init__(self, new_path): |
|
|
|
""" |
|
|
|
Create a _ChangeDir instance. |
|
|
|
|
|
|
|
:param new_path: The directory to enter. |
|
|
|
|
|
|
|
:type new_path: str |
|
|
|
""" |
|
|
|
|
|
|
|
self.new_path = new_path |
|
|
|
|
|
|
|
def __enter__(self): |
|
|
|
""" |
|
|
|
Change the current working-directory to **new_path**. |
|
|
|
""" |
|
|
|
|
|
|
|
self.old_path = os.getcwd() |
|
|
|
os.chdir(self.new_path) |
|
|
|
|
|
|
|
def __exit__(self, *exception): |
|
|
|
""" |
|
|
|
Change the current working-directory to **old_path**. |
|
|
|
|
|
|
|
:param exception: Various exception arguments passed by `with`. |
|
|
|
|
|
|
|
:type exception: varargs |
|
|
|
""" |
|
|
|
|
|
|
|
os.chdir(self.old_path) |
|
|
|
if self.run_event.is_set(): |
|
|
|
self.index_queue.put(repo) |