diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 01687bd..635e2b8 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,7 @@ repositories. """ -import datetime +from datetime import datetime import logging import os import Queue @@ -13,7 +13,8 @@ import subprocess import time import threading -import bs4 +from bs4 import UnicodeDammit +import git from ..database import Database from ..parser import parse, UnsupportedFileError @@ -33,7 +34,8 @@ class GitRepository(object): repository belongs to (eg, GitHub, BitBucket). :ivar rank: (float) The rank of the repository, as assigned by :class:`crawler.GitHubCrawler`. - :ivar dirname: (str) The repository's on-disk directory name. + :ivar path: (str) The repository's on-disk directory path. + :ivar repo: (git.Repo) A git.Repo representation of the repository. """ def __init__(self, url, name, framework_name, rank): @@ -55,7 +57,9 @@ class GitRepository(object): self.name = name self.framework_name = framework_name self.rank = rank - self.dirname = name.replace("-", "--").replace("/", "-") + dirname = name.replace("/", "-") + "-" + str(int(time.time())) + self.path = os.path.join(GIT_CLONE_DIR, dirname) + self.repo = None class GitIndexer(threading.Thread): """ @@ -124,20 +128,18 @@ class GitIndexer(threading.Thread): `git clone` the Git repository located at **repo.url**, call `_insert_repository_codelets()`, then remove said repository. - :param repo_url: The metadata of the repository to be indexed. - - :type repo_url: :class:`GitRepository` + :param repo: The metadata of the repository to be indexed. 
+ :type repo: :class:`GitRepository` """ self._logger.info(u"Indexing repo: %s", repo.name) - with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): - try: - self._insert_repository_codelets(repo) - except Exception: - self._logger.exception("Exception raised while indexing:") - finally: - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname)) + try: + self._insert_repository_codelets(repo) + except Exception: + self._logger.exception("Exception raised while indexing:") + finally: + if os.path.isdir(repo.path): + shutil.rmtree(repo.path) def _insert_repository_codelets(self, repo): """ @@ -152,27 +154,18 @@ class GitIndexer(threading.Thread): :type repo_url: :class:`GitRepository` """ - commits_meta = self._get_commits_metadata() + commits_meta = self._get_commits_metadata(repo) if commits_meta is None: return - for filename in commits_meta.keys(): - try: - with open(filename) as source_file: - source = self._decode(source_file.read()) - if source is None: - continue - except IOError: - continue - - authors = [(self._decode(author), None) for author in - commits_meta[filename]["authors"]] - url = self._generate_file_url(filename, repo.url, repo.framework_name) + for filename, data in commits_meta.iteritems(): + authors = [(author, None) for author in data["authors"]] + encoded_source = data["blob"].data_stream.read() + source = UnicodeDammit(encoded_source).unicode_markup + url = self._generate_file_url(filename, repo) codelet = Codelet("%s: %s" % (repo.name, filename), source, - filename, None, authors, url, - commits_meta[filename]["time_created"], - commits_meta[filename]["time_last_modified"], - repo.rank) + filename, None, authors, url, data["time_created"], + data["time_last_modified"], repo.rank) self._logger.debug("Indexing file: %s", codelet.name) try: parse(codelet) @@ -180,163 +173,103 @@ class GitIndexer(threading.Thread): continue self.database.insert(codelet) - def 
_generate_file_url(self, filename, repo_url, framework_name): + def _generate_file_url(self, filename, repo): """ Return a url for a filename from a Git wrapper framework. :param filename: The path of the file. - :param repo_url: The url of the file's parent repository. - :param framework_name: The name of the framework the repository is from. + :param repo: The git repo. :type filename: str - :type repo_url: str - :type framework_name: str + :type repo: :class:`GitRepository` :return: The file's full url on the given framework, if successfully derived. :rtype: str, or None - - .. warning:: - Various Git subprocesses will occasionally fail, and, seeing as the - information they provide is a crucial component of some repository - file urls, None may be returned. - """ - - try: - if framework_name == "GitHub": - default_branch = subprocess.check_output("git branch" - " --no-color", shell=True)[2:-1] - parts = [repo_url, "blob", default_branch, filename] - elif framework_name == "Bitbucket": - commit_hash = subprocess.check_output("git rev-parse HEAD", - shell=True).replace("\n", "") - parts = [repo_url, "src", commit_hash, filename] - return "/".join(s.strip("/") for s in parts) - except subprocess.CalledProcessError: - return None - - def _get_git_commits(self): - """ - Return the current working directory's formatted commit data. - - Uses `git log` to generate metadata about every single file in the - repository's commit history. - - :return: The author, timestamp, and names of all modified files of every - commit. - .. 
code-block:: python - sample_returned_array = [ - { - "author" : (str) "author" - "timestamp" : (`datetime.datetime`) , - "filenames" : (str array) ["file1", "file2"] - } - ] - :rtype: array of dictionaries - """ - - git_log = subprocess.check_output(("git --no-pager log --name-only" - " --pretty=format:'%n%n%an%n%at' -z"), shell=True) - - commits = [] - for commit in git_log.split("\n\n"): - fields = commit.split("\n") - if len(fields) > 2: - commits.append({ - "author" : fields[0], - "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), - "filenames" : fields[2].split("\x00")[:-2] - }) - - return commits - - def _get_tracked_files(self): - """ - Return a list of the filenames of all valuable files in the Git repository. - - Get a list of the filenames of the non-binary (Perl heuristics used for - filetype identification) files currently inside the current working - directory's Git repository. Then, weed out any boilerplate/non-code files - that match the regex rules in GIT_IGNORE_FILES. - - :return: The filenames of all index-worthy non-binary files. 
- :rtype: str array """ - files = [] - for dirname, subdir_names, filenames in os.walk("."): - for filename in filenames: - path = os.path.join(dirname, filename) - if self._is_ascii(path): - files.append(path[2:]) + if repo.framework_name == "GitHub": + default_branch = repo.repo.active_branch.name + parts = [repo.url, "blob", default_branch, filename] + elif repo.framework_name == "Bitbucket": + try: + commit_hash = repo.repo.head.commit.hexsha + except ValueError: # No commits + return None + parts = [repo.url, "src", commit_hash, filename] + return "/".join(s.strip("/") for s in parts) + + def _walk_history(self, files, head): + """Walk a repository's history for metadata.""" + def update_entry(commit, entry, new_file): + entry["authors"].add(commit.author.name) + commit_ts = datetime.utcfromtimestamp(commit.committed_date) + if commit_ts > entry["time_last_modified"]: + entry["time_last_modified"] = commit_ts + if new_file: + entry["time_created"] = commit_ts + + def handle_commit(commit, paths): + if not commit.parents: + for item in commit.tree.traverse(): + if item.type == "blob" and item.path in paths: + update_entry(commit, files[paths[item.path]], True) + return + + for parent in commit.parents: + for diff in parent.diff(commit, create_patch=True): + pth = diff.rename_to if diff.renamed else diff.b_blob.path + if pth not in paths: + continue + update_entry(commit, files[paths[pth]], diff.new_file) + if diff.renamed: + paths[diff.rename_from] = paths[pth] + del paths[pth] - return files + pending = [(head, {path: path for path in files})] + while pending: + commit, paths = pending.pop() + handle_commit(commit, paths) + for parent in commit.parents: + new_paths = paths.copy() if len(commit.parents) > 1 else paths + pending.append((parent, new_paths)) - def _get_commits_metadata(self): + def _get_commits_metadata(self, repo): """ Return a dictionary containing every valuable tracked file's metadata. 
- :return: A dictionary with author names, time of creation, and time of last - modification for every filename key. + :return: A dictionary with author names, time of creation, and time of + last modification for every filename key. .. code-block:: python - sample_returned_dict = { - "my_file" : { - "authors" : (str array) ["author1", "author2"], - "time_created" : (`datetime.datetime`) , - "time_last_modified" : (`datetime.datetime`) - } - } - :rtype: dictionary of dictionaries - """ - - commits = self._get_git_commits() - tracked_files = self._get_tracked_files() - - files_meta = {} - for commit in commits: - for filename in commit["filenames"]: - if filename not in tracked_files: - continue - - if filename not in files_meta.keys(): - files_meta[filename] = { - "authors" : [commit["author"]], - "time_last_modified" : commit["timestamp"], - "time_created" : commit["timestamp"] + sample_returned_dict = { + "my_file" : { + "blob": (GitPython Blob) , + "authors" : (str set) {"author1", "author2"}, + "time_created" : (`datetime.datetime`) , + "time_last_modified" : (`datetime.datetime`) } - else: - if commit["author"] not in files_meta[filename]["authors"]: - files_meta[filename]["authors"].append(commit["author"]) - files_meta[filename]["time_created"] = commit["timestamp"] - - return files_meta - - def _decode(self, raw): - """ - Return a decoded a raw string. - - :param raw: The string to string. - - :type raw: (str) - - :return: If the original encoding is successfully inferenced, return the - decoded string. - :rtype: str, or None - - .. warning:: - The raw string's original encoding is identified by heuristics which - can, and occasionally will, fail. Decoding will then fail, and None - will be returned. 
+ } + :rtype: dictionary of dictionaries """ try: - encoding = bs4.BeautifulSoup(raw).original_encoding - return raw.decode(encoding) if encoding is not None else None - - except (LookupError, UnicodeDecodeError, UserWarning) as exception: - return None + tree = repo.repo.head.commit.tree + except ValueError: # No commits + return {} + + files = {} + for item in tree.traverse(): + if item.type == "blob" and self._is_ascii(item.data_stream): + files[item.path] = { + "blob": item, + "authors" : set(), + "time_last_modified": datetime.utcfromtimestamp(0), + "time_created": datetime.utcfromtimestamp(0) + } + + self._walk_history(files, repo.repo.head.commit) + return files - def _is_ascii(self, filename): + def _is_ascii(self, fp): """ Heuristically determine whether a file is ASCII text or binary. @@ -346,34 +279,29 @@ class GitIndexer(threading.Thread): operator, and is the de-facto method for determining whether a file is ASCII. - :param filename: The path of the file to test. + :param fp: The file object to test. - :type filename: str + :type fp: `file` :return: Whether the file is probably ASCII. :rtype: Boolean """ - try: - with open(filename) as source: - file_snippet = source.read(512) - - if not file_snippet: - return True - - ascii_characters = "".join(map(chr, range(32, 127)) + - list("\n\r\t\b")) - null_trans = string.maketrans("", "") + file_snippet = fp.read(512) - if "\0" in file_snippet: - return False + if not file_snippet: + return True - non_ascii = file_snippet.translate(null_trans, ascii_characters) - return not float(len(non_ascii)) / len(file_snippet) > 0.30 + ascii_characters = "".join(map(chr, range(32, 127)) + + list("\n\r\t\b")) + null_trans = string.maketrans("", "") - if "\0" in file_snippet: + if "\0" in file_snippet: return False + non_ascii = file_snippet.translate(null_trans, ascii_characters) + return not float(len(non_ascii)) / len(file_snippet) > 0.30 + class _GitCloner(threading.Thread): """ A singleton Git repository cloner. 
@@ -428,7 +356,7 @@ class _GitCloner(threading.Thread): try: self._clone_repository(repo) except Exception: - pass + self._logger.exception("Exception raised while cloning:") def _clone_repository(self, repo): """ @@ -439,57 +367,10 @@ class _GitCloner(threading.Thread): :type repo: :class:`GitRepository` """ - GIT_CLONE_TIMEOUT = 500 - queue_percent_full = (float(self.index_queue.qsize()) / - self.index_queue.maxsize) * 100 - - command = ["perl", "-e", "alarm shift @ARGV; exec @ARGV", - str(GIT_CLONE_TIMEOUT), "git", "clone", "--single-branch", - repo.url, GIT_CLONE_DIR + "/" + repo.dirname] - if subprocess.call(command) != 0: - subprocess.call(["pkill", "-f", "git"]) # This makes Ben K upset - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname)) - return - - while self.index_queue.full(): + self._logger.info("Cloning repo: %s", repo.url) + repo.repo = git.Repo.clone_from(repo.url, to_path=repo.path, bare=True, + single_branch=True) + while self.index_queue.full() and self.run_event.is_set(): time.sleep(THREAD_QUEUE_SLEEP) - self.index_queue.put(repo) - -class _ChangeDir(object): - """ - A wrapper class for os.chdir(), to map onto `with` and handle exceptions. - - :ivar new_path: (str) The path to change the current directory to. - :ivar old_path: (str) The path of the directory to return to. - """ - - def __init__(self, new_path): - """ - Create a _ChangeDir instance. - - :param new_path: The directory to enter. - - :type new_path: str - """ - - self.new_path = new_path - - def __enter__(self): - """ - Change the current working-directory to **new_path**. - """ - - self.old_path = os.getcwd() - os.chdir(self.new_path) - - def __exit__(self, *exception): - """ - Change the current working-directory to **old_path**. - - :param exception: Various exception arguments passed by `with`. 
- - :type exception: varargs - """ - - os.chdir(self.old_path) + if self.run_event.is_set(): + self.index_queue.put(repo) diff --git a/setup.py b/setup.py index 869c896..cc97bc9 100644 --- a/setup.py +++ b/setup.py @@ -6,8 +6,8 @@ setup( packages = find_packages(), install_requires = [ "Flask>=0.10.1", "gunicorn>=18.0", "pygments>=1.6", "requests>=2.2.0", - "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3", - "PyYAML>=3.11", "python-dateutil>=2.2"], + "GitPython>=0.3.2.RC1", "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", + "mmh3>=2.3", "PyYAML>=3.11", "python-dateutil>=2.2", "cchardet>=0.3.5"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift"