From afc5980683d4e0949d07fea0e3837b8b7622267f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jun 2014 22:39:19 -0400 Subject: [PATCH 01/10] Rewrite much of the indexer to use GitPython. --- bitshift/crawler/indexer.py | 357 +++++++++++++++----------------------------- setup.py | 4 +- 2 files changed, 121 insertions(+), 240 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 01687bd..635e2b8 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,7 @@ repositories. """ -import datetime +from datetime import datetime import logging import os import Queue @@ -13,7 +13,8 @@ import subprocess import time import threading -import bs4 +from bs4 import UnicodeDammmit +import git from ..database import Database from ..parser import parse, UnsupportedFileError @@ -33,7 +34,8 @@ class GitRepository(object): repository belongs to (eg, GitHub, BitBucket). :ivar rank: (float) The rank of the repository, as assigned by :class:`crawler.GitHubCrawler`. - :ivar dirname: (str) The repository's on-disk directory name. + :ivar path: (str) The repository's on-disk directory path. + :ivar repo: (git.Repo) A git.Repo representation of the repository. """ def __init__(self, url, name, framework_name, rank): @@ -55,7 +57,9 @@ class GitRepository(object): self.name = name self.framework_name = framework_name self.rank = rank - self.dirname = name.replace("-", "--").replace("/", "-") + dirname = name.replace("/", "-") + "-" + str(int(time.time())) + self.path = os.path.join(GIT_CLONE_DIR, dirname) + self.repo = None class GitIndexer(threading.Thread): """ @@ -124,20 +128,18 @@ class GitIndexer(threading.Thread): `git clone` the Git repository located at **repo.url**, call `_insert_repository_codelets()`, then remove said repository. - :param repo_url: The metadata of the repository to be indexed. - - :type repo_url: :class:`GitRepository` + :param repo: The metadata of the repository to be indexed. + :type repo: :class:`GitRepository` """ self._logger.info(u"Indexing repo: %s", repo.name) - with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): - try: - self._insert_repository_codelets(repo) - except Exception: - self._logger.exception("Exception raised while indexing:") - finally: - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname)) + try: + self._insert_repository_codelets(repo) + except Exception: + self._logger.exception("Exception raised while indexing:") + finally: + if os.path.isdir(repo.path): + shutil.rmtree(repo.path) def _insert_repository_codelets(self, repo): """ @@ -152,27 +154,18 @@ class GitIndexer(threading.Thread): :type repo_url: :class:`GitRepository` """ - commits_meta = self._get_commits_metadata() + commits_meta = self._get_commits_metadata(repo) if commits_meta is None: return - for filename in commits_meta.keys(): - try: - with open(filename) as source_file: - source = self._decode(source_file.read()) - if source is None: - continue - except IOError: - continue - - authors = [(self._decode(author), None) for author in - commits_meta[filename]["authors"]] - url = self._generate_file_url(filename, repo.url, repo.framework_name) + for filename, data in commits_meta.iteritems(): + authors = [(author, None) for author in data["authors"]] + encoded_source = data["blob"].data_stream.read() + source = UnicodeDammmit(encoded_source).unicode_markup + url = self._generate_file_url(filename, repo) codelet = Codelet("%s: %s" % (repo.name, filename), source, - filename, None, authors, url, - commits_meta[filename]["time_created"], - commits_meta[filename]["time_last_modified"], - repo.rank) + filename, None, authors, url, data["time_created"], + data["time_last_modified"], repo.rank) self._logger.debug("Indexing file: %s", codelet.name) try: parse(codelet) @@ -180,163 +173,103 @@ class GitIndexer(threading.Thread): continue self.database.insert(codelet) - def _generate_file_url(self, filename, repo_url, framework_name): + def _generate_file_url(self, filename, repo): """ Return a url for a filename from a Git wrapper framework. :param filename: The path of the file. - :param repo_url: The url of the file's parent repository. - :param framework_name: The name of the framework the repository is from. + :param repo: The git repo. :type filename: str - :type repo_url: str - :type framework_name: str + :type repo: :class:`GitRepository` :return: The file's full url on the given framework, if successfully derived. :rtype: str, or None - - .. warning:: - Various Git subprocesses will occasionally fail, and, seeing as the - information they provide is a crucial component of some repository - file urls, None may be returned. - """ - - try: - if framework_name == "GitHub": - default_branch = subprocess.check_output("git branch" - " --no-color", shell=True)[2:-1] - parts = [repo_url, "blob", default_branch, filename] - elif framework_name == "Bitbucket": - commit_hash = subprocess.check_output("git rev-parse HEAD", - shell=True).replace("\n", "") - parts = [repo_url, "src", commit_hash, filename] - return "/".join(s.strip("/") for s in parts) - except subprocess.CalledProcessError: - return None - - def _get_git_commits(self): - """ - Return the current working directory's formatted commit data. - - Uses `git log` to generate metadata about every single file in the - repository's commit history. - - :return: The author, timestamp, and names of all modified files of every - commit. - .. code-block:: python - sample_returned_array = [ - { - "author" : (str) "author" - "timestamp" : (`datetime.datetime`) , - "filenames" : (str array) ["file1", "file2"] - } - ] - :rtype: array of dictionaries - """ - - git_log = subprocess.check_output(("git --no-pager log --name-only" - " --pretty=format:'%n%n%an%n%at' -z"), shell=True) - - commits = [] - for commit in git_log.split("\n\n"): - fields = commit.split("\n") - if len(fields) > 2: - commits.append({ - "author" : fields[0], - "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), - "filenames" : fields[2].split("\x00")[:-2] - }) - - return commits - - def _get_tracked_files(self): - """ - Return a list of the filenames of all valuable files in the Git repository. - - Get a list of the filenames of the non-binary (Perl heuristics used for - filetype identification) files currently inside the current working - directory's Git repository. Then, weed out any boilerplate/non-code files - that match the regex rules in GIT_IGNORE_FILES. - - :return: The filenames of all index-worthy non-binary files. - :rtype: str array """ - files = [] - for dirname, subdir_names, filenames in os.walk("."): - for filename in filenames: - path = os.path.join(dirname, filename) - if self._is_ascii(path): - files.append(path[2:]) + if framework_name == "GitHub": + default_branch = repo.repo.active_branch + parts = [repo_url, "blob", default_branch, filename] + elif framework_name == "Bitbucket": + try: + commit_hash = repo.repo.head.commit.hexsha + except ValueError: # No commits + return None + parts = [repo_url, "src", commit_hash, filename] + return "/".join(s.strip("/") for s in parts) + + def _walk_history(self, files, head): + """Walk a repository's history for metadata.""" + def update_entry(commit, entry, new_file): + entry.add(commit.author.name) + commit_ts = datetime.utcfromtimestamp(commit.committed_date) + if commit_ts > entry["time_last_modified"]: + entry["time_last_modified"] = commit_ts + if new_file: + entry["time_created"] = commit_ts + + def handle_commit(commit, paths): + if not commit.parents: + for item in commit.tree.traverse(): + if item.type == "blob" and item.path in paths: + update_entry(commit, files[paths[item.path]], True) + return + + for parent in commit.parents: + for diff in parent.diff(commit, create_patch=True): + pth = diff.renamed_to if diff.renamed else diff.b_blob.path + if pth not in paths: + continue + update_entry(commit, files[paths[pth]], diff.new_file) + if diff.renamed: + paths[diff.renamed_from] = paths[pth] + del paths[pth] - return files + pending = [(head, {path: path for path in files})] + while pending: + commit, paths = pending.pop() + handle_commit(commit, paths) + for parent in commit.parents: + new_paths = paths.copy() if len(commit.parents) > 1 else paths + pending.append((parent, new_paths)) - def _get_commits_metadata(self): + def _get_commits_metadata(self, repo): """ Return a dictionary containing every valuable tracked file's metadata. - :return: A dictionary with author names, time of creation, and time of last - modification for every filename key. + :return: A dictionary with author names, time of creation, and time of + last modification for every filename key. .. code-block:: python - sample_returned_dict = { - "my_file" : { - "authors" : (str array) ["author1", "author2"], - "time_created" : (`datetime.datetime`) , - "time_last_modified" : (`datetime.datetime`) - } - } - :rtype: dictionary of dictionaries - """ - - commits = self._get_git_commits() - tracked_files = self._get_tracked_files() - - files_meta = {} - for commit in commits: - for filename in commit["filenames"]: - if filename not in tracked_files: - continue - - if filename not in files_meta.keys(): - files_meta[filename] = { - "authors" : [commit["author"]], - "time_last_modified" : commit["timestamp"], - "time_created" : commit["timestamp"] + sample_returned_dict = { + "my_file" : { + "blob": (GitPython Blob) , + "authors" : (str set) {"author1", "author2"}, + "time_created" : (`datetime.datetime`) , + "time_last_modified" : (`datetime.datetime`) } - else: - if commit["author"] not in files_meta[filename]["authors"]: - files_meta[filename]["authors"].append(commit["author"]) - files_meta[filename]["time_created"] = commit["timestamp"] - - return files_meta - - def _decode(self, raw): - """ - Return a decoded a raw string. - - :param raw: The string to string. - - :type raw: (str) - - :return: If the original encoding is successfully inferenced, return the - decoded string. - :rtype: str, or None - - .. warning:: - The raw string's original encoding is identified by heuristics which - can, and occasionally will, fail. Decoding will then fail, and None - will be returned. + } + :rtype: dictionary of dictionaries """ - try: - encoding = bs4.BeautifulSoup(raw).original_encoding - return raw.decode(encoding) if encoding is not None else None - - except (LookupError, UnicodeDecodeError, UserWarning) as exception: - return None + tree = repo.repo.head.commit.tree + except ValueError: # No commits + return {} + + files = {} + for item in tree.traverse(): + if item.type == "blob" and self._is_ascii(item.data_stream): + files[item.path] = { + "blob": item, + "authors" : set(), + "time_last_modified": datetime.utcfromtimestamp(0), + "time_created": datetime.utcfromtimestamp(0) + } + + self._walk_history(files, repo.repo.head.commit) + return files - def _is_ascii(self, filename): + def _is_ascii(self, fp): """ Heuristically determine whether a file is ASCII text or binary. @@ -346,34 +279,29 @@ class GitIndexer(threading.Thread): operator, and is the de-facto method for in : passdetermining whether a file is ASCII. - :param filename: The path of the file to test. + :param fp: The file object to test. - :type filename: str + :type fp: `file` :return: Whether the file is probably ASCII. :rtype: Boolean """ - try: - with open(filename) as source: - file_snippet = source.read(512) - - if not file_snippet: - return True - - ascii_characters = "".join(map(chr, range(32, 127)) + - list("\n\r\t\b")) - null_trans = string.maketrans("", "") + file_snippet = source.read(512) - if "\0" in file_snippet: - return False + if not file_snippet: + return True - non_ascii = file_snippet.translate(null_trans, ascii_characters) - return not float(len(non_ascii)) / len(file_snippet) > 0.30 + ascii_characters = "".join(map(chr, range(32, 127)) + + list("\n\r\t\b")) + null_trans = string.maketrans("", "") - except IOError: + if "\0" in file_snippet: return False + non_ascii = file_snippet.translate(null_trans, ascii_characters) + return not float(len(non_ascii)) / len(file_snippet) > 0.30 + class _GitCloner(threading.Thread): """ A singleton Git repository cloner. @@ -428,7 +356,7 @@ class _GitCloner(threading.Thread): try: self._clone_repository(repo) except Exception: - pass + self._logger.exception("Exception raised while cloning:") def _clone_repository(self, repo): """ @@ -439,57 +367,10 @@ class _GitCloner(threading.Thread): :type repo: :class:`GitRepository` """ - GIT_CLONE_TIMEOUT = 500 - queue_percent_full = (float(self.index_queue.qsize()) / - self.index_queue.maxsize) * 100 - - command = ["perl", "-e", "alarm shift @ARGV; exec @ARGV", - str(GIT_CLONE_TIMEOUT), "git", "clone", "--single-branch", - repo.url, GIT_CLONE_DIR + "/" + repo.dirname] - if subprocess.call(command) != 0: - subprocess.call(["pkill", "-f", "git"]) # This makes Ben K upset - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname)) - return - - while self.index_queue.full(): + self._logger.info("Cloning repo: %s", repo.url) + repo.repo = git.Repo.clone_from(repo.url, to_path=repo.path, bare=True, + single_branch=True) + while self.index_queue.full() and self.run_event.is_set(): time.sleep(THREAD_QUEUE_SLEEP) - self.index_queue.put(repo) - -class _ChangeDir(object): - """ - A wrapper class for os.chdir(), to map onto `with` and handle exceptions. - - :ivar new_path: (str) The path to change the current directory to. - :ivar old_path: (str) The path of the directory to return to. - """ - - def __init__(self, new_path): - """ - Create a _ChangeDir instance. - - :param new_path: The directory to enter. - - :type new_path: str - """ - - self.new_path = new_path - - def __enter__(self): - """ - Change the current working-directory to **new_path**. - """ - - self.old_path = os.getcwd() - os.chdir(self.new_path) - - def __exit__(self, *exception): - """ - Change the current working-directory to **old_path**. - - :param exception: Various exception arguments passed by `with`. - - :type exception: varargs - """ - - os.chdir(self.old_path) + if self.run_event.is_set(): + self.index_queue.put(repo) diff --git a/setup.py b/setup.py index 869c896..cc97bc9 100644 --- a/setup.py +++ b/setup.py @@ -6,8 +6,8 @@ setup( packages = find_packages(), install_requires = [ "Flask>=0.10.1", "gunicorn>=18.0", "pygments>=1.6", "requests>=2.2.0", - "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3", - "PyYAML>=3.11", "python-dateutil>=2.2"], + "GitPython>=0.3.2.RC1", "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", + "mmh3>=2.3", "PyYAML>=3.11", "python-dateutil>=2.2", "cchardet>=0.3.5"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift" From a1a5252aa76858a0e730308208c4ae6a6acedceb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jun 2014 22:44:20 -0400 Subject: [PATCH 02/10] Typo. --- bitshift/crawler/indexer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 635e2b8..ba6d2be 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -13,7 +13,7 @@ import subprocess import time import threading -from bs4 import UnicodeDammmit +from bs4 import UnicodeDammit import git from ..database import Database @@ -161,7 +161,7 @@ class GitIndexer(threading.Thread): for filename, data in commits_meta.iteritems(): authors = [(author, None) for author in data["authors"]] encoded_source = data["blob"].data_stream.read() - source = UnicodeDammmit(encoded_source).unicode_markup + source = UnicodeDammit(encoded_source).unicode_markup url = self._generate_file_url(filename, repo) codelet = Codelet("%s: %s" % (repo.name, filename), source, filename, None, authors, url, data["time_created"], From be091dff9b76bb3754af019156da8eb4f3a43fd6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jun 2014 22:46:50 -0400 Subject: [PATCH 03/10] Assorted bugfixes. --- bitshift/crawler/indexer.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index ba6d2be..fc146f2 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -9,7 +9,6 @@ import os import Queue import shutil import string -import subprocess import time import threading @@ -18,11 +17,11 @@ import git from ..database import Database from ..parser import parse, UnsupportedFileError -from ..languages import LANGS from ..codelet import Codelet GIT_CLONE_DIR = "/tmp/bitshift" THREAD_QUEUE_SLEEP = 0.5 +MAX_INDEX_QUEUE_SIZE = 10 class GitRepository(object): """ @@ -85,8 +84,6 @@ class GitIndexer(threading.Thread): :type index_queue: see :attr:`self.index_queue` """ - MAX_INDEX_QUEUE_SIZE = 10 - self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.run_event = run_event self.git_cloner = _GitCloner(clone_queue, self.index_queue, run_event) @@ -188,15 +185,15 @@ class GitIndexer(threading.Thread): :rtype: str, or None """ - if framework_name == "GitHub": + if repo.framework_name == "GitHub": default_branch = repo.repo.active_branch - parts = [repo_url, "blob", default_branch, filename] - elif framework_name == "Bitbucket": + parts = [repo.url, "blob", default_branch, filename] + elif repo.framework_name == "Bitbucket": try: commit_hash = repo.repo.head.commit.hexsha except ValueError: # No commits return None - parts = [repo_url, "src", commit_hash, filename] + parts = [repo.url, "src", commit_hash, filename] return "/".join(s.strip("/") for s in parts) def _walk_history(self, files, head): @@ -269,7 +266,7 @@ class GitIndexer(threading.Thread): self._walk_history(files, repo.repo.head.commit) return files - def _is_ascii(self, fp): + def _is_ascii(self, source): """ Heuristically determine whether a file is ASCII text or binary. @@ -279,9 +276,9 @@ class GitIndexer(threading.Thread): operator, and is the de-facto method for in : passdetermining whether a file is ASCII. - :param fp: The file object to test. + :param source: The file object to test. - :type fp: `file` + :type source: `file` :return: Whether the file is probably ASCII. :rtype: Boolean From 4c34055849a079ab6da5cec67f0f23b08a5db2e2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jun 2014 22:48:07 -0400 Subject: [PATCH 04/10] Another bugfix. --- bitshift/crawler/indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index fc146f2..5f726c5 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -199,7 +199,7 @@ class GitIndexer(threading.Thread): def _walk_history(self, files, head): """Walk a repository's history for metadata.""" def update_entry(commit, entry, new_file): - entry.add(commit.author.name) + entry["authors"].add(commit.author.name) commit_ts = datetime.utcfromtimestamp(commit.committed_date) if commit_ts > entry["time_last_modified"]: entry["time_last_modified"] = commit_ts From 2987ae27cb87fde7f4605ebadf90a5dc75c18cf5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jun 2014 22:48:53 -0400 Subject: [PATCH 05/10] Sadface. --- bitshift/crawler/indexer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 5f726c5..529a964 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -215,12 +215,12 @@ class GitIndexer(threading.Thread): for parent in commit.parents: for diff in parent.diff(commit, create_patch=True): - pth = diff.renamed_to if diff.renamed else diff.b_blob.path + pth = diff.rename_to if diff.renamed else diff.b_blob.path if pth not in paths: continue update_entry(commit, files[paths[pth]], diff.new_file) if diff.renamed: - paths[diff.renamed_from] = paths[pth] + paths[diff.rename_from] = paths[pth] del paths[pth] pending = [(head, {path: path for path in files})] From 782b9b9faf460ddf0f41f98afcdf1e1901b53d21 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 8 Jun 2014 22:51:17 -0400 Subject: [PATCH 06/10] Use the branch name. --- bitshift/crawler/indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 529a964..6c51a47 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -186,7 +186,7 @@ class GitIndexer(threading.Thread): """ if repo.framework_name == "GitHub": - default_branch = repo.repo.active_branch + default_branch = repo.repo.active_branch.name parts = [repo.url, "blob", default_branch, filename] elif repo.framework_name == "Bitbucket": try: From 627deadc861d64db5f4e131283c45a22b1f74d73 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 9 Jun 2014 03:08:50 -0400 Subject: [PATCH 07/10] Improve metadata retrieval. --- bitshift/crawler/indexer.py | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 6c51a47..1b209fa 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -151,11 +151,11 @@ class GitIndexer(threading.Thread): :type repo_url: :class:`GitRepository` """ - commits_meta = self._get_commits_metadata(repo) - if commits_meta is None: + file_meta = self._get_file_metadata(repo) + if file_meta is None: return - for filename, data in commits_meta.iteritems(): + for filename, data in file_meta.iteritems(): authors = [(author, None) for author in data["authors"]] encoded_source = data["blob"].data_stream.read() source = UnicodeDammit(encoded_source).unicode_markup @@ -199,13 +199,24 @@ class GitIndexer(threading.Thread): def _walk_history(self, files, head): """Walk a repository's history for metadata.""" def update_entry(commit, entry, new_file): - entry["authors"].add(commit.author.name) + if commit.author.name not in entry["authors"]: + entry["authors"].append(commit.author.name) commit_ts = datetime.utcfromtimestamp(commit.committed_date) if commit_ts > entry["time_last_modified"]: entry["time_last_modified"] = commit_ts if new_file: entry["time_created"] = commit_ts + def get_diffs(commit, parent): + cache_key = parent.binsha + commit.binsha + if cache_key in diff_cache: + return diff_cache[cache_key] + diffs = parent.diff(commit, create_patch=True) + for diff in diffs: + del diff.diff + diff_cache[cache_key] = diffs + return diffs + def handle_commit(commit, paths): if not commit.parents: for item in commit.tree.traverse(): @@ -214,7 +225,9 @@ class GitIndexer(threading.Thread): return for parent in commit.parents: - for diff in parent.diff(commit, create_patch=True): + for diff in get_diffs(commit, parent): + if not diff.b_blob: # Happens when file modes are changed + continue pth = diff.rename_to if diff.renamed else diff.b_blob.path if pth not in paths: continue @@ -224,14 +237,23 @@ class GitIndexer(threading.Thread): del paths[pth] pending = [(head, {path: path for path in files})] + diff_cache = {} + processed = {} while pending: commit, paths = pending.pop() handle_commit(commit, paths) + hash_key = hash(frozenset(paths.items())) for parent in commit.parents: new_paths = paths.copy() if len(commit.parents) > 1 else paths - pending.append((parent, new_paths)) - - def _get_commits_metadata(self, repo): + if parent.binsha in processed: + if hash_key not in processed[parent.binsha]: + pending.append((parent, new_paths)) + processed[parent.binsha].append(hash_key) + else: + pending.append((parent, new_paths)) + processed[parent.binsha] = [hash_key] + + def _get_file_metadata(self, repo): """ Return a dictionary containing every valuable tracked file's metadata. @@ -241,7 +263,7 @@ class GitIndexer(threading.Thread): sample_returned_dict = { "my_file" : { "blob": (GitPython Blob) , - "authors" : (str set) {"author1", "author2"}, + "authors" : (str list) ["author1", "author2"], "time_created" : (`datetime.datetime`) , "time_last_modified" : (`datetime.datetime`) } @@ -258,11 +280,12 @@ class GitIndexer(threading.Thread): if item.type == "blob" and self._is_ascii(item.data_stream): files[item.path] = { "blob": item, - "authors" : set(), + "authors" : [], "time_last_modified": datetime.utcfromtimestamp(0), "time_created": datetime.utcfromtimestamp(0) } + self._logger.debug("Building file metadata") self._walk_history(files, repo.repo.head.commit) return files From 1c1eb6009bcdb305edb9fd2c98c7a124babc9dbc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 9 Jun 2014 03:23:51 -0400 Subject: [PATCH 08/10] utf8mb4 --- bitshift/database/migration.py | 7 ++++++- bitshift/database/schema.sql | 8 +++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index 730790f..5592dfe 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -3,7 +3,7 @@ Contains information about database schema versions, and SQL queries to update between them. """ -VERSION = 10 +VERSION = 11 MIGRATIONS = [ # 1 -> 2 @@ -122,6 +122,11 @@ MIGRATIONS = [ MODIFY COLUMN `sloc_col` INT UNSIGNED DEFAULT NULL, MODIFY COLUMN `sloc_end_row` INT UNSIGNED DEFAULT NULL, MODIFY COLUMN `sloc_end_col` INT UNSIGNED DEFAULT NULL""" + ], + # 10 -> 11 + [ + """ALTER DATABASE `bitshift` + CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 6102fe8..c4d8792 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,12 +1,14 @@ --- Schema version 10 +-- Schema version 11 -CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; +CREATE DATABASE `bitshift` + DEFAULT CHARACTER SET utf8mb4 + COLLATE utf8mb4_unicode_ci; USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; -INSERT INTO `version` VALUES (10); +INSERT INTO `version` VALUES (11); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, From 55980f33fd7ae5ca3ac921a3e4182073612962f3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 9 Jun 2014 12:48:20 -0400 Subject: [PATCH 09/10] Better method for determining commit history. --- bitshift/crawler/indexer.py | 74 +++++++-------------------------------------- 1 file changed, 11 insertions(+), 63 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 1b209fa..4bf6b76 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -151,7 +151,7 @@ class GitIndexer(threading.Thread): :type repo_url: :class:`GitRepository` """ - file_meta = self._get_file_metadata(repo) + file_meta = self._get_file_metadata(repo.repo) if file_meta is None: return @@ -196,63 +196,6 @@ class GitIndexer(threading.Thread): parts = [repo.url, "src", commit_hash, filename] return "/".join(s.strip("/") for s in parts) - def _walk_history(self, files, head): - """Walk a repository's history for metadata.""" - def update_entry(commit, entry, new_file): - if commit.author.name not in entry["authors"]: - entry["authors"].append(commit.author.name) - commit_ts = datetime.utcfromtimestamp(commit.committed_date) - if commit_ts > entry["time_last_modified"]: - entry["time_last_modified"] = commit_ts - if new_file: - entry["time_created"] = commit_ts - - def get_diffs(commit, parent): - cache_key = parent.binsha + commit.binsha - if cache_key in diff_cache: - return diff_cache[cache_key] - diffs = parent.diff(commit, create_patch=True) - for diff in diffs: - del diff.diff - diff_cache[cache_key] = diffs - return diffs - - def handle_commit(commit, paths): - if not commit.parents: - for item in commit.tree.traverse(): - if item.type == "blob" and item.path in paths: - update_entry(commit, files[paths[item.path]], True) - return - - for parent in commit.parents: - for diff in get_diffs(commit, parent): - if not diff.b_blob: # Happens when file modes are changed - continue - pth = diff.rename_to if diff.renamed else diff.b_blob.path - if pth not in paths: - continue - update_entry(commit, files[paths[pth]], diff.new_file) - if diff.renamed: - paths[diff.rename_from] = paths[pth] - del paths[pth] - - pending = [(head, {path: path for path in files})] - diff_cache = {} - processed = {} - while pending: - commit, paths = pending.pop() - handle_commit(commit, paths) - hash_key = hash(frozenset(paths.items())) - for parent in commit.parents: - new_paths = paths.copy() if len(commit.parents) > 1 else paths - if parent.binsha in processed: - if hash_key not in processed[parent.binsha]: - pending.append((parent, new_paths)) - processed[parent.binsha].append(hash_key) - else: - pending.append((parent, new_paths)) - processed[parent.binsha] = [hash_key] - def _get_file_metadata(self, repo): """ Return a dictionary containing every valuable tracked file's metadata. @@ -271,22 +214,27 @@ class GitIndexer(threading.Thread): :rtype: dictionary of dictionaries """ try: - tree = repo.repo.head.commit.tree + tree = repo.head.commit.tree except ValueError: # No commits return {} files = {} for item in tree.traverse(): if item.type == "blob" and self._is_ascii(item.data_stream): + log = repo.git.log("--follow", '--format=%an %ct', item.path) + lines = log.splitlines() + authors = {line.rsplit(" ", 1)[0] for line in lines} + last_mod = int(lines[0].rsplit(" ", 1)[1]) + created = int(lines[-1].rsplit(" ", 1)[1]) + files[item.path] = { "blob": item, - "authors" : [], - "time_last_modified": datetime.utcfromtimestamp(0), - "time_created": datetime.utcfromtimestamp(0) + "authors" : authors, + "time_last_modified": datetime.fromtimestamp(last_mod), + "time_created": datetime.fromtimestamp(created) } self._logger.debug("Building file metadata") - self._walk_history(files, repo.repo.head.commit) return files def _is_ascii(self, source): From e4ddd3ec5f223c2d9da782c724177385c9ec0337 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 9 Jun 2014 13:07:01 -0400 Subject: [PATCH 10/10] Fix for repo.git.log(). --- bitshift/crawler/indexer.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 4bf6b76..cbbf5d4 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -219,22 +219,23 @@ class GitIndexer(threading.Thread): return {} files = {} + self._logger.debug("Building file metadata") for item in tree.traverse(): - if item.type == "blob" and self._is_ascii(item.data_stream): - log = repo.git.log("--follow", '--format=%an %ct', item.path) - lines = log.splitlines() - authors = {line.rsplit(" ", 1)[0] for line in lines} - last_mod = int(lines[0].rsplit(" ", 1)[1]) - created = int(lines[-1].rsplit(" ", 1)[1]) - - files[item.path] = { - "blob": item, - "authors" : authors, - "time_last_modified": datetime.fromtimestamp(last_mod), - "time_created": datetime.fromtimestamp(created) - } + if item.type != "blob" or not self._is_ascii(item.data_stream): + continue + log = repo.git.log("--follow", '--format=%an %ct', "--", item.path) + lines = log.splitlines() + authors = {line.rsplit(" ", 1)[0] for line in lines} + last_mod = int(lines[0].rsplit(" ", 1)[1]) + created = int(lines[-1].rsplit(" ", 1)[1]) + + files[item.path] = { + "blob": item, + "authors" : authors, + "time_last_modified": datetime.fromtimestamp(last_mod), + "time_created": datetime.fromtimestamp(created) + } - self._logger.debug("Building file metadata") return files def _is_ascii(self, source):