Browse Source

Better method for determining commit history.

tags/v1.0^2
Ben Kurtovic 10 years ago
parent
commit
55980f33fd
1 changed files with 11 additions and 63 deletions
  1. +11
    -63
      bitshift/crawler/indexer.py

+ 11
- 63
bitshift/crawler/indexer.py View File

@@ -151,7 +151,7 @@ class GitIndexer(threading.Thread):
:type repo_url: :class:`GitRepository`
"""

file_meta = self._get_file_metadata(repo)
file_meta = self._get_file_metadata(repo.repo)
if file_meta is None:
return

@@ -196,63 +196,6 @@ class GitIndexer(threading.Thread):
parts = [repo.url, "src", commit_hash, filename]
return "/".join(s.strip("/") for s in parts)

def _walk_history(self, files, head):
"""Walk a repository's history for metadata."""
def update_entry(commit, entry, new_file):
if commit.author.name not in entry["authors"]:
entry["authors"].append(commit.author.name)
commit_ts = datetime.utcfromtimestamp(commit.committed_date)
if commit_ts > entry["time_last_modified"]:
entry["time_last_modified"] = commit_ts
if new_file:
entry["time_created"] = commit_ts

def get_diffs(commit, parent):
cache_key = parent.binsha + commit.binsha
if cache_key in diff_cache:
return diff_cache[cache_key]
diffs = parent.diff(commit, create_patch=True)
for diff in diffs:
del diff.diff
diff_cache[cache_key] = diffs
return diffs

def handle_commit(commit, paths):
if not commit.parents:
for item in commit.tree.traverse():
if item.type == "blob" and item.path in paths:
update_entry(commit, files[paths[item.path]], True)
return

for parent in commit.parents:
for diff in get_diffs(commit, parent):
if not diff.b_blob: # Happens when file modes are changed
continue
pth = diff.rename_to if diff.renamed else diff.b_blob.path
if pth not in paths:
continue
update_entry(commit, files[paths[pth]], diff.new_file)
if diff.renamed:
paths[diff.rename_from] = paths[pth]
del paths[pth]

pending = [(head, {path: path for path in files})]
diff_cache = {}
processed = {}
while pending:
commit, paths = pending.pop()
handle_commit(commit, paths)
hash_key = hash(frozenset(paths.items()))
for parent in commit.parents:
new_paths = paths.copy() if len(commit.parents) > 1 else paths
if parent.binsha in processed:
if hash_key not in processed[parent.binsha]:
pending.append((parent, new_paths))
processed[parent.binsha].append(hash_key)
else:
pending.append((parent, new_paths))
processed[parent.binsha] = [hash_key]

def _get_file_metadata(self, repo):
"""
Return a dictionary containing every valuable tracked file's metadata.
@@ -271,22 +214,27 @@ class GitIndexer(threading.Thread):
:rtype: dictionary of dictionaries
"""
try:
tree = repo.repo.head.commit.tree
tree = repo.head.commit.tree
except ValueError: # No commits
return {}

files = {}
for item in tree.traverse():
if item.type == "blob" and self._is_ascii(item.data_stream):
log = repo.git.log("--follow", '--format=%an %ct', item.path)
lines = log.splitlines()
authors = {line.rsplit(" ", 1)[0] for line in lines}
last_mod = int(lines[0].rsplit(" ", 1)[1])
created = int(lines[-1].rsplit(" ", 1)[1])

files[item.path] = {
"blob": item,
"authors" : [],
"time_last_modified": datetime.utcfromtimestamp(0),
"time_created": datetime.utcfromtimestamp(0)
"authors" : authors,
"time_last_modified": datetime.fromtimestamp(last_mod),
"time_created": datetime.fromtimestamp(created)
}

self._logger.debug("Building file metadata")
self._walk_history(files, repo.repo.head.commit)
return files

def _is_ascii(self, source):


Loading…
Cancel
Save