|
|
@@ -3,25 +3,25 @@ |
|
|
|
repositories. |
|
|
|
""" |
|
|
|
|
|
|
|
import datetime |
|
|
|
from datetime import datetime |
|
|
|
import logging |
|
|
|
import os |
|
|
|
import Queue |
|
|
|
import shutil |
|
|
|
import string |
|
|
|
import subprocess |
|
|
|
import time |
|
|
|
import threading |
|
|
|
|
|
|
|
import bs4 |
|
|
|
from bs4 import UnicodeDammit |
|
|
|
import git |
|
|
|
|
|
|
|
from ..database import Database |
|
|
|
from ..parser import parse, UnsupportedFileError |
|
|
|
from ..languages import LANGS |
|
|
|
from ..codelet import Codelet |
|
|
|
|
|
|
|
# Parent directory joined with each repository's directory name to build its
# on-disk clone path.
GIT_CLONE_DIR = "/tmp/bitshift" |
|
|
|
# Seconds slept between checks while waiting for the index queue to stop
# being full.
THREAD_QUEUE_SLEEP = 0.5 |
|
|
|
# Capacity (maxsize) of the queue of repositories awaiting indexing.
MAX_INDEX_QUEUE_SIZE = 10 |
|
|
|
|
|
|
|
class GitRepository(object): |
|
|
|
""" |
|
|
@@ -33,7 +33,8 @@ class GitRepository(object): |
|
|
|
repository belongs to (eg, GitHub, BitBucket). |
|
|
|
:ivar rank: (float) The rank of the repository, as assigned by |
|
|
|
:class:`crawler.GitHubCrawler`. |
|
|
|
:ivar dirname: (str) The repository's on-disk directory name. |
|
|
|
:ivar path: (str) The repository's on-disk directory path. |
|
|
|
:ivar repo: (git.Repo) A git.Repo representation of the repository. |
|
|
|
""" |
|
|
|
|
|
|
|
def __init__(self, url, name, framework_name, rank): |
|
|
@@ -55,7 +56,9 @@ class GitRepository(object): |
|
|
|
self.name = name |
|
|
|
self.framework_name = framework_name |
|
|
|
self.rank = rank |
|
|
|
self.dirname = name.replace("-", "--").replace("/", "-") |
|
|
|
dirname = name.replace("/", "-") + "-" + str(int(time.time())) |
|
|
|
self.path = os.path.join(GIT_CLONE_DIR, dirname) |
|
|
|
self.repo = None |
|
|
|
|
|
|
|
class GitIndexer(threading.Thread): |
|
|
|
""" |
|
|
@@ -81,8 +84,6 @@ class GitIndexer(threading.Thread): |
|
|
|
:type index_queue: see :attr:`self.index_queue` |
|
|
|
""" |
|
|
|
|
|
|
|
MAX_INDEX_QUEUE_SIZE = 10 |
|
|
|
|
|
|
|
self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) |
|
|
|
self.run_event = run_event |
|
|
|
self.git_cloner = _GitCloner(clone_queue, self.index_queue, run_event) |
|
|
@@ -124,20 +125,18 @@ class GitIndexer(threading.Thread): |
|
|
|
`git clone` the Git repository located at **repo.url**, call |
|
|
|
`_insert_repository_codelets()`, then remove said repository. |
|
|
|
|
|
|
|
:param repo_url: The metadata of the repository to be indexed. |
|
|
|
|
|
|
|
:type repo_url: :class:`GitRepository` |
|
|
|
:param repo: The metadata of the repository to be indexed. |
|
|
|
:type repo: :class:`GitRepository` |
|
|
|
""" |
|
|
|
|
|
|
|
self._logger.info(u"Indexing repo: %s", repo.name) |
|
|
|
with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): |
|
|
|
try: |
|
|
|
self._insert_repository_codelets(repo) |
|
|
|
except Exception: |
|
|
|
self._logger.exception("Exception raised while indexing:") |
|
|
|
finally: |
|
|
|
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): |
|
|
|
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname)) |
|
|
|
try: |
|
|
|
self._insert_repository_codelets(repo) |
|
|
|
except Exception: |
|
|
|
self._logger.exception("Exception raised while indexing:") |
|
|
|
finally: |
|
|
|
if os.path.isdir(repo.path): |
|
|
|
shutil.rmtree(repo.path) |
|
|
|
|
|
|
|
def _insert_repository_codelets(self, repo): |
|
|
|
""" |
|
|
@@ -152,27 +151,18 @@ class GitIndexer(threading.Thread): |
|
|
|
:type repo_url: :class:`GitRepository` |
|
|
|
""" |
|
|
|
|
|
|
|
commits_meta = self._get_commits_metadata() |
|
|
|
if commits_meta is None: |
|
|
|
file_meta = self._get_file_metadata(repo.repo) |
|
|
|
if file_meta is None: |
|
|
|
return |
|
|
|
|
|
|
|
for filename in commits_meta.keys(): |
|
|
|
try: |
|
|
|
with open(filename) as source_file: |
|
|
|
source = self._decode(source_file.read()) |
|
|
|
if source is None: |
|
|
|
continue |
|
|
|
except IOError: |
|
|
|
continue |
|
|
|
|
|
|
|
authors = [(self._decode(author), None) for author in |
|
|
|
commits_meta[filename]["authors"]] |
|
|
|
url = self._generate_file_url(filename, repo.url, repo.framework_name) |
|
|
|
for filename, data in file_meta.iteritems(): |
|
|
|
authors = [(author, None) for author in data["authors"]] |
|
|
|
encoded_source = data["blob"].data_stream.read() |
|
|
|
source = UnicodeDammit(encoded_source).unicode_markup |
|
|
|
url = self._generate_file_url(filename, repo) |
|
|
|
codelet = Codelet("%s: %s" % (repo.name, filename), source, |
|
|
|
filename, None, authors, url, |
|
|
|
commits_meta[filename]["time_created"], |
|
|
|
commits_meta[filename]["time_last_modified"], |
|
|
|
repo.rank) |
|
|
|
filename, None, authors, url, data["time_created"], |
|
|
|
data["time_last_modified"], repo.rank) |
|
|
|
self._logger.debug("Indexing file: %s", codelet.name) |
|
|
|
try: |
|
|
|
parse(codelet) |
|
|
@@ -180,163 +170,75 @@ class GitIndexer(threading.Thread): |
|
|
|
continue |
|
|
|
self.database.insert(codelet) |
|
|
|
|
|
|
|
def _generate_file_url(self, filename, repo_url, framework_name): |
|
|
|
def _generate_file_url(self, filename, repo): |
|
|
|
""" |
|
|
|
Return a url for a filename from a Git wrapper framework. |
|
|
|
|
|
|
|
:param filename: The path of the file. |
|
|
|
:param repo_url: The url of the file's parent repository. |
|
|
|
:param framework_name: The name of the framework the repository is from. |
|
|
|
:param repo: The git repo. |
|
|
|
|
|
|
|
:type filename: str |
|
|
|
:type repo_url: str |
|
|
|
:type framework_name: str |
|
|
|
:type repo: :class:`GitRepository` |
|
|
|
|
|
|
|
:return: The file's full url on the given framework, if successfully |
|
|
|
derived. |
|
|
|
:rtype: str, or None |
|
|
|
|
|
|
|
.. warning:: |
|
|
|
Various Git subprocesses will occasionally fail, and, seeing as the |
|
|
|
information they provide is a crucial component of some repository |
|
|
|
file urls, None may be returned. |
|
|
|
""" |
|
|
|
|
|
|
|
try: |
|
|
|
if framework_name == "GitHub": |
|
|
|
default_branch = subprocess.check_output("git branch" |
|
|
|
" --no-color", shell=True)[2:-1] |
|
|
|
parts = [repo_url, "blob", default_branch, filename] |
|
|
|
elif framework_name == "Bitbucket": |
|
|
|
commit_hash = subprocess.check_output("git rev-parse HEAD", |
|
|
|
shell=True).replace("\n", "") |
|
|
|
parts = [repo_url, "src", commit_hash, filename] |
|
|
|
return "/".join(s.strip("/") for s in parts) |
|
|
|
except subprocess.CalledProcessError: |
|
|
|
return None |
|
|
|
|
|
|
|
def _get_git_commits(self):
    """
    Return the current working directory's formatted commit data.

    Runs `git log` in the current working directory and parses its output
    into one record per commit.

    :return: The author, timestamp, and names of all modified files of every
        commit.

        .. code-block:: python

            sample_returned_array = [
                {
                    "author" : (str) "author",
                    "timestamp" : (`datetime.datetime`) <object>,
                    "filenames" : (str array) ["file1", "file2"]
                }
            ]
    :rtype: array of dictionaries

    :raises subprocess.CalledProcessError: If `git log` exits with a
        non-zero status.
    """

    # Imported locally under an alias: the module header imports the
    # `datetime` module and also rebinds the same name to the `datetime`
    # class, so `datetime.datetime` cannot be relied on in this scope.
    import datetime as datetime_module

    # An argument list avoids spawning a shell; the format string therefore
    # needs no quoting. `%n%n%an%n%at` emits two newlines, the author name,
    # a newline, and the author timestamp for each commit.
    git_log = subprocess.check_output([
        "git", "--no-pager", "log", "--name-only",
        "--pretty=format:%n%n%an%n%at", "-z",
    ])

    commits = []
    for commit in git_log.split("\n\n"):
        fields = commit.split("\n")
        # Chunks without author, timestamp and filename fields are skipped.
        if len(fields) <= 2:
            continue

        commits.append({
            "author" : fields[0],
            "timestamp" : datetime_module.datetime.fromtimestamp(
                int(fields[1])),
            # `-z` separates filenames with NUL bytes; the last two pieces
            # of the split are discarded (presumably the empty strings left
            # by trailing NULs -- TODO confirm against real `git log`
            # output).
            "filenames" : fields[2].split("\x00")[:-2]
        })

    return commits
|
|
|
|
|
|
|
def _get_tracked_files(self): |
|
|
|
""" |
|
|
|
Return a list of the filenames of all valuable files in the Git repository. |
|
|
|
|
|
|
|
Get a list of the filenames of the non-binary (Perl heuristics used for |
|
|
|
filetype identification) files currently inside the current working |
|
|
|
directory's Git repository. Then, weed out any boilerplate/non-code files |
|
|
|
that match the regex rules in GIT_IGNORE_FILES. |
|
|
|
|
|
|
|
:return: The filenames of all index-worthy non-binary files. |
|
|
|
:rtype: str array |
|
|
|
""" |
|
|
|
|
|
|
|
files = [] |
|
|
|
for dirname, subdir_names, filenames in os.walk("."): |
|
|
|
for filename in filenames: |
|
|
|
path = os.path.join(dirname, filename) |
|
|
|
if self._is_ascii(path): |
|
|
|
files.append(path[2:]) |
|
|
|
|
|
|
|
return files |
|
|
|
if repo.framework_name == "GitHub": |
|
|
|
default_branch = repo.repo.active_branch.name |
|
|
|
parts = [repo.url, "blob", default_branch, filename] |
|
|
|
elif repo.framework_name == "Bitbucket": |
|
|
|
try: |
|
|
|
commit_hash = repo.repo.head.commit.hexsha |
|
|
|
except ValueError: # No commits |
|
|
|
return None |
|
|
|
parts = [repo.url, "src", commit_hash, filename] |
|
|
|
return "/".join(s.strip("/") for s in parts) |
|
|
|
|
|
|
|
def _get_commits_metadata(self): |
|
|
|
def _get_file_metadata(self, repo): |
|
|
|
""" |
|
|
|
Return a dictionary containing every valuable tracked file's metadata. |
|
|
|
|
|
|
|
:return: A dictionary with author names, time of creation, and time of last |
|
|
|
modification for every filename key. |
|
|
|
:return: A dictionary with author names, time of creation, and time of |
|
|
|
last modification for every filename key. |
|
|
|
.. code-block:: python |
|
|
|
sample_returned_dict = { |
|
|
|
"my_file" : { |
|
|
|
"authors" : (str array) ["author1", "author2"], |
|
|
|
"time_created" : (`datetime.datetime`) <object>, |
|
|
|
"time_last_modified" : (`datetime.datetime`) <object> |
|
|
|
} |
|
|
|
} |
|
|
|
:rtype: dictionary of dictionaries |
|
|
|
""" |
|
|
|
|
|
|
|
commits = self._get_git_commits() |
|
|
|
tracked_files = self._get_tracked_files() |
|
|
|
|
|
|
|
files_meta = {} |
|
|
|
for commit in commits: |
|
|
|
for filename in commit["filenames"]: |
|
|
|
if filename not in tracked_files: |
|
|
|
continue |
|
|
|
|
|
|
|
if filename not in files_meta.keys(): |
|
|
|
files_meta[filename] = { |
|
|
|
"authors" : [commit["author"]], |
|
|
|
"time_last_modified" : commit["timestamp"], |
|
|
|
"time_created" : commit["timestamp"] |
|
|
|
sample_returned_dict = { |
|
|
|
"my_file" : { |
|
|
|
"blob": (GitPython Blob) <object>, |
|
|
|
"authors" : (str list) ["author1", "author2"], |
|
|
|
"time_created" : (`datetime.datetime`) <object>, |
|
|
|
"time_last_modified" : (`datetime.datetime`) <object> |
|
|
|
} |
|
|
|
else: |
|
|
|
if commit["author"] not in files_meta[filename]["authors"]: |
|
|
|
files_meta[filename]["authors"].append(commit["author"]) |
|
|
|
files_meta[filename]["time_created"] = commit["timestamp"] |
|
|
|
|
|
|
|
return files_meta |
|
|
|
|
|
|
|
def _decode(self, raw): |
|
|
|
""" |
|
|
|
Return a decoded raw string. |
|
|
|
|
|
|
|
:param raw: The string to decode. |
|
|
|
|
|
|
|
:type raw: (str) |
|
|
|
|
|
|
|
:return: If the original encoding is successfully inferred, return the |
|
|
|
decoded string. |
|
|
|
:rtype: str, or None |
|
|
|
|
|
|
|
.. warning:: |
|
|
|
The raw string's original encoding is identified by heuristics which |
|
|
|
can, and occasionally will, fail. Decoding will then fail, and None |
|
|
|
will be returned. |
|
|
|
} |
|
|
|
:rtype: dictionary of dictionaries |
|
|
|
""" |
|
|
|
|
|
|
|
try: |
|
|
|
encoding = bs4.BeautifulSoup(raw).original_encoding |
|
|
|
return raw.decode(encoding) if encoding is not None else None |
|
|
|
tree = repo.head.commit.tree |
|
|
|
except ValueError: # No commits |
|
|
|
return {} |
|
|
|
|
|
|
|
files = {} |
|
|
|
self._logger.debug("Building file metadata") |
|
|
|
for item in tree.traverse(): |
|
|
|
if item.type != "blob" or not self._is_ascii(item.data_stream): |
|
|
|
continue |
|
|
|
log = repo.git.log("--follow", '--format=%an %ct', "--", item.path) |
|
|
|
lines = log.splitlines() |
|
|
|
authors = {line.rsplit(" ", 1)[0] for line in lines} |
|
|
|
last_mod = int(lines[0].rsplit(" ", 1)[1]) |
|
|
|
created = int(lines[-1].rsplit(" ", 1)[1]) |
|
|
|
|
|
|
|
files[item.path] = { |
|
|
|
"blob": item, |
|
|
|
"authors" : authors, |
|
|
|
"time_last_modified": datetime.fromtimestamp(last_mod), |
|
|
|
"time_created": datetime.fromtimestamp(created) |
|
|
|
} |
|
|
|
|
|
|
|
except (LookupError, UnicodeDecodeError, UserWarning) as exception: |
|
|
|
return None |
|
|
|
return files |
|
|
|
|
|
|
|
def _is_ascii(self, filename): |
|
|
|
def _is_ascii(self, source): |
|
|
|
""" |
|
|
|
Heuristically determine whether a file is ASCII text or binary. |
|
|
|
|
|
|
@@ -346,34 +248,29 @@ class GitIndexer(threading.Thread): |
|
|
|
operator, and is the de-facto method for determining whether a |
|
|
|
file is ASCII. |
|
|
|
|
|
|
|
:param filename: The path of the file to test. |
|
|
|
:param source: The file object to test. |
|
|
|
|
|
|
|
:type filename: str |
|
|
|
:type source: `file` |
|
|
|
|
|
|
|
:return: Whether the file is probably ASCII. |
|
|
|
:rtype: Boolean |
|
|
|
""" |
|
|
|
|
|
|
|
try: |
|
|
|
with open(filename) as source: |
|
|
|
file_snippet = source.read(512) |
|
|
|
|
|
|
|
if not file_snippet: |
|
|
|
return True |
|
|
|
|
|
|
|
ascii_characters = "".join(map(chr, range(32, 127)) + |
|
|
|
list("\n\r\t\b")) |
|
|
|
null_trans = string.maketrans("", "") |
|
|
|
file_snippet = source.read(512) |
|
|
|
|
|
|
|
if "\0" in file_snippet: |
|
|
|
return False |
|
|
|
if not file_snippet: |
|
|
|
return True |
|
|
|
|
|
|
|
non_ascii = file_snippet.translate(null_trans, ascii_characters) |
|
|
|
return not float(len(non_ascii)) / len(file_snippet) > 0.30 |
|
|
|
ascii_characters = "".join(map(chr, range(32, 127)) + |
|
|
|
list("\n\r\t\b")) |
|
|
|
null_trans = string.maketrans("", "") |
|
|
|
|
|
|
|
except IOError: |
|
|
|
if "\0" in file_snippet: |
|
|
|
return False |
|
|
|
|
|
|
|
non_ascii = file_snippet.translate(null_trans, ascii_characters) |
|
|
|
return not float(len(non_ascii)) / len(file_snippet) > 0.30 |
|
|
|
|
|
|
|
class _GitCloner(threading.Thread): |
|
|
|
""" |
|
|
|
A singleton Git repository cloner. |
|
|
@@ -428,7 +325,7 @@ class _GitCloner(threading.Thread): |
|
|
|
try: |
|
|
|
self._clone_repository(repo) |
|
|
|
except Exception: |
|
|
|
pass |
|
|
|
self._logger.exception("Exception raised while cloning:") |
|
|
|
|
|
|
|
def _clone_repository(self, repo): |
|
|
|
""" |
|
|
@@ -439,57 +336,10 @@ class _GitCloner(threading.Thread): |
|
|
|
:type repo: :class:`GitRepository` |
|
|
|
""" |
|
|
|
|
|
|
|
GIT_CLONE_TIMEOUT = 500 |
|
|
|
queue_percent_full = (float(self.index_queue.qsize()) / |
|
|
|
self.index_queue.maxsize) * 100 |
|
|
|
|
|
|
|
command = ["perl", "-e", "alarm shift @ARGV; exec @ARGV", |
|
|
|
str(GIT_CLONE_TIMEOUT), "git", "clone", "--single-branch", |
|
|
|
repo.url, GIT_CLONE_DIR + "/" + repo.dirname] |
|
|
|
if subprocess.call(command) != 0: |
|
|
|
subprocess.call(["pkill", "-f", "git"]) # This makes Ben K upset |
|
|
|
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): |
|
|
|
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname)) |
|
|
|
return |
|
|
|
|
|
|
|
while self.index_queue.full(): |
|
|
|
self._logger.info("Cloning repo: %s", repo.url) |
|
|
|
repo.repo = git.Repo.clone_from(repo.url, to_path=repo.path, bare=True, |
|
|
|
single_branch=True) |
|
|
|
while self.index_queue.full() and self.run_event.is_set(): |
|
|
|
time.sleep(THREAD_QUEUE_SLEEP) |
|
|
|
self.index_queue.put(repo) |
|
|
|
|
|
|
|
class _ChangeDir(object): |
|
|
|
""" |
|
|
|
A wrapper class for os.chdir(), to map onto `with` and handle exceptions. |
|
|
|
|
|
|
|
:ivar new_path: (str) The path to change the current directory to. |
|
|
|
:ivar old_path: (str) The path of the directory to return to. |
|
|
|
""" |
|
|
|
|
|
|
|
def __init__(self, new_path): |
|
|
|
""" |
|
|
|
Create a _ChangeDir instance. |
|
|
|
|
|
|
|
:param new_path: The directory to enter. |
|
|
|
|
|
|
|
:type new_path: str |
|
|
|
""" |
|
|
|
|
|
|
|
self.new_path = new_path |
|
|
|
|
|
|
|
def __enter__(self): |
|
|
|
""" |
|
|
|
Change the current working-directory to **new_path**. |
|
|
|
""" |
|
|
|
|
|
|
|
self.old_path = os.getcwd() |
|
|
|
os.chdir(self.new_path) |
|
|
|
|
|
|
|
def __exit__(self, *exception): |
|
|
|
""" |
|
|
|
Change the current working-directory to **old_path**. |
|
|
|
|
|
|
|
:param exception: Various exception arguments passed by `with`. |
|
|
|
|
|
|
|
:type exception: varargs |
|
|
|
""" |
|
|
|
|
|
|
|
os.chdir(self.old_path) |
|
|
|
if self.run_event.is_set(): |
|
|
|
self.index_queue.put(repo) |