Browse Source

Rewrite much of the indexer to use GitPython.

tags/v1.0^2
Ben Kurtovic 10 years ago
parent
commit
afc5980683
2 changed files with 121 additions and 240 deletions
  1. +119
    -238
      bitshift/crawler/indexer.py
  2. +2
    -2
      setup.py

+ 119
- 238
bitshift/crawler/indexer.py View File

@@ -3,7 +3,7 @@
repositories.
"""

import datetime
from datetime import datetime
import logging
import os
import Queue
@@ -13,7 +13,8 @@ import subprocess
import time
import threading

import bs4
from bs4 import UnicodeDammmit
import git

from ..database import Database
from ..parser import parse, UnsupportedFileError
@@ -33,7 +34,8 @@ class GitRepository(object):
repository belongs to (eg, GitHub, BitBucket).
:ivar rank: (float) The rank of the repository, as assigned by
:class:`crawler.GitHubCrawler`.
:ivar dirname: (str) The repository's on-disk directory name.
:ivar path: (str) The repository's on-disk directory path.
:ivar repo: (git.Repo) A git.Repo representation of the repository.
"""

def __init__(self, url, name, framework_name, rank):
@@ -55,7 +57,9 @@ class GitRepository(object):
self.name = name
self.framework_name = framework_name
self.rank = rank
self.dirname = name.replace("-", "--").replace("/", "-")
dirname = name.replace("/", "-") + "-" + str(int(time.time()))
self.path = os.path.join(GIT_CLONE_DIR, dirname)
self.repo = None

class GitIndexer(threading.Thread):
"""
@@ -124,20 +128,18 @@ class GitIndexer(threading.Thread):
`git clone` the Git repository located at **repo.url**, call
`_insert_repository_codelets()`, then remove said repository.

:param repo_url: The metadata of the repository to be indexed.

:type repo_url: :class:`GitRepository`
:param repo: The metadata of the repository to be indexed.
:type repo: :class:`GitRepository`
"""

self._logger.info(u"Indexing repo: %s", repo.name)
with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
try:
self._insert_repository_codelets(repo)
except Exception:
self._logger.exception("Exception raised while indexing:")
finally:
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname))
try:
self._insert_repository_codelets(repo)
except Exception:
self._logger.exception("Exception raised while indexing:")
finally:
if os.path.isdir(repo.path):
shutil.rmtree(repo.path)

def _insert_repository_codelets(self, repo):
"""
@@ -152,27 +154,18 @@ class GitIndexer(threading.Thread):
:type repo_url: :class:`GitRepository`
"""

commits_meta = self._get_commits_metadata()
commits_meta = self._get_commits_metadata(repo)
if commits_meta is None:
return

for filename in commits_meta.keys():
try:
with open(filename) as source_file:
source = self._decode(source_file.read())
if source is None:
continue
except IOError:
continue

authors = [(self._decode(author), None) for author in
commits_meta[filename]["authors"]]
url = self._generate_file_url(filename, repo.url, repo.framework_name)
for filename, data in commits_meta.iteritems():
authors = [(author, None) for author in data["authors"]]
encoded_source = data["blob"].data_stream.read()
source = UnicodeDammmit(encoded_source).unicode_markup
url = self._generate_file_url(filename, repo)
codelet = Codelet("%s: %s" % (repo.name, filename), source,
filename, None, authors, url,
commits_meta[filename]["time_created"],
commits_meta[filename]["time_last_modified"],
repo.rank)
filename, None, authors, url, data["time_created"],
data["time_last_modified"], repo.rank)
self._logger.debug("Indexing file: %s", codelet.name)
try:
parse(codelet)
@@ -180,163 +173,103 @@ class GitIndexer(threading.Thread):
continue
self.database.insert(codelet)

def _generate_file_url(self, filename, repo_url, framework_name):
def _generate_file_url(self, filename, repo):
"""
Return a url for a filename from a Git wrapper framework.

:param filename: The path of the file.
:param repo_url: The url of the file's parent repository.
:param framework_name: The name of the framework the repository is from.
:param repo: The git repo.

:type filename: str
:type repo_url: str
:type framework_name: str
:type repo: :class:`GitRepository`

:return: The file's full url on the given framework, if successfully
derived.
:rtype: str, or None

.. warning::
Various Git subprocesses will occasionally fail, and, seeing as the
information they provide is a crucial component of some repository
file urls, None may be returned.
"""

try:
if framework_name == "GitHub":
default_branch = subprocess.check_output("git branch"
" --no-color", shell=True)[2:-1]
parts = [repo_url, "blob", default_branch, filename]
elif framework_name == "Bitbucket":
commit_hash = subprocess.check_output("git rev-parse HEAD",
shell=True).replace("\n", "")
parts = [repo_url, "src", commit_hash, filename]
return "/".join(s.strip("/") for s in parts)
except subprocess.CalledProcessError:
return None

def _get_git_commits(self):
"""
Return the current working directory's formatted commit data.

Uses `git log` to generate metadata about every single file in the
repository's commit history.

:return: The author, timestamp, and names of all modified files of every
commit.
.. code-block:: python
sample_returned_array = [
{
"author" : (str) "author"
"timestamp" : (`datetime.datetime`) <object>,
"filenames" : (str array) ["file1", "file2"]
}
]
:rtype: array of dictionaries
"""

git_log = subprocess.check_output(("git --no-pager log --name-only"
" --pretty=format:'%n%n%an%n%at' -z"), shell=True)

commits = []
for commit in git_log.split("\n\n"):
fields = commit.split("\n")
if len(fields) > 2:
commits.append({
"author" : fields[0],
"timestamp" : datetime.datetime.fromtimestamp(int(fields[1])),
"filenames" : fields[2].split("\x00")[:-2]
})

return commits

def _get_tracked_files(self):
"""
Return a list of the filenames of all valuable files in the Git repository.

Get a list of the filenames of the non-binary (Perl heuristics used for
filetype identification) files currently inside the current working
directory's Git repository. Then, weed out any boilerplate/non-code files
that match the regex rules in GIT_IGNORE_FILES.

:return: The filenames of all index-worthy non-binary files.
:rtype: str array
"""

files = []
for dirname, subdir_names, filenames in os.walk("."):
for filename in filenames:
path = os.path.join(dirname, filename)
if self._is_ascii(path):
files.append(path[2:])
if framework_name == "GitHub":
default_branch = repo.repo.active_branch
parts = [repo_url, "blob", default_branch, filename]
elif framework_name == "Bitbucket":
try:
commit_hash = repo.repo.head.commit.hexsha
except ValueError: # No commits
return None
parts = [repo_url, "src", commit_hash, filename]
return "/".join(s.strip("/") for s in parts)

def _walk_history(self, files, head):
"""Walk a repository's history for metadata."""
def update_entry(commit, entry, new_file):
entry.add(commit.author.name)
commit_ts = datetime.utcfromtimestamp(commit.committed_date)
if commit_ts > entry["time_last_modified"]:
entry["time_last_modified"] = commit_ts
if new_file:
entry["time_created"] = commit_ts

def handle_commit(commit, paths):
if not commit.parents:
for item in commit.tree.traverse():
if item.type == "blob" and item.path in paths:
update_entry(commit, files[paths[item.path]], True)
return

for parent in commit.parents:
for diff in parent.diff(commit, create_patch=True):
pth = diff.renamed_to if diff.renamed else diff.b_blob.path
if pth not in paths:
continue
update_entry(commit, files[paths[pth]], diff.new_file)
if diff.renamed:
paths[diff.renamed_from] = paths[pth]
del paths[pth]

return files
pending = [(head, {path: path for path in files})]
while pending:
commit, paths = pending.pop()
handle_commit(commit, paths)
for parent in commit.parents:
new_paths = paths.copy() if len(commit.parents) > 1 else paths
pending.append((parent, new_paths))

def _get_commits_metadata(self):
def _get_commits_metadata(self, repo):
"""
Return a dictionary containing every valuable tracked file's metadata.

:return: A dictionary with author names, time of creation, and time of last
modification for every filename key.
:return: A dictionary with author names, time of creation, and time of
last modification for every filename key.
.. code-block:: python
sample_returned_dict = {
"my_file" : {
"authors" : (str array) ["author1", "author2"],
"time_created" : (`datetime.datetime`) <object>,
"time_last_modified" : (`datetime.datetime`) <object>
}
}
:rtype: dictionary of dictionaries
"""

commits = self._get_git_commits()
tracked_files = self._get_tracked_files()

files_meta = {}
for commit in commits:
for filename in commit["filenames"]:
if filename not in tracked_files:
continue

if filename not in files_meta.keys():
files_meta[filename] = {
"authors" : [commit["author"]],
"time_last_modified" : commit["timestamp"],
"time_created" : commit["timestamp"]
sample_returned_dict = {
"my_file" : {
"blob": (GitPython Blob) <object>,
"authors" : (str set) {"author1", "author2"},
"time_created" : (`datetime.datetime`) <object>,
"time_last_modified" : (`datetime.datetime`) <object>
}
else:
if commit["author"] not in files_meta[filename]["authors"]:
files_meta[filename]["authors"].append(commit["author"])
files_meta[filename]["time_created"] = commit["timestamp"]

return files_meta

def _decode(self, raw):
"""
Return a decoded raw string.

:param raw: The string to decode.

:type raw: (str)

:return: If the original encoding is successfully inferred, return the
decoded string.
:rtype: str, or None

.. warning::
The raw string's original encoding is identified by heuristics which
can, and occasionally will, fail. Decoding will then fail, and None
will be returned.
}
:rtype: dictionary of dictionaries
"""

try:
encoding = bs4.BeautifulSoup(raw).original_encoding
return raw.decode(encoding) if encoding is not None else None

except (LookupError, UnicodeDecodeError, UserWarning) as exception:
return None
tree = repo.repo.head.commit.tree
except ValueError: # No commits
return {}

files = {}
for item in tree.traverse():
if item.type == "blob" and self._is_ascii(item.data_stream):
files[item.path] = {
"blob": item,
"authors" : set(),
"time_last_modified": datetime.utcfromtimestamp(0),
"time_created": datetime.utcfromtimestamp(0)
}

self._walk_history(files, repo.repo.head.commit)
return files

def _is_ascii(self, filename):
def _is_ascii(self, fp):
"""
Heuristically determine whether a file is ASCII text or binary.

@@ -346,34 +279,29 @@ class GitIndexer(threading.Thread):
operator, and is the de-facto method for determining whether a
file is ASCII.

:param filename: The path of the file to test.
:param fp: The file object to test.

:type filename: str
:type fp: `file`

:return: Whether the file is probably ASCII.
:rtype: Boolean
"""

try:
with open(filename) as source:
file_snippet = source.read(512)

if not file_snippet:
return True

ascii_characters = "".join(map(chr, range(32, 127)) +
list("\n\r\t\b"))
null_trans = string.maketrans("", "")
file_snippet = source.read(512)

if "\0" in file_snippet:
return False
if not file_snippet:
return True

non_ascii = file_snippet.translate(null_trans, ascii_characters)
return not float(len(non_ascii)) / len(file_snippet) > 0.30
ascii_characters = "".join(map(chr, range(32, 127)) +
list("\n\r\t\b"))
null_trans = string.maketrans("", "")

except IOError:
if "\0" in file_snippet:
return False

non_ascii = file_snippet.translate(null_trans, ascii_characters)
return not float(len(non_ascii)) / len(file_snippet) > 0.30

class _GitCloner(threading.Thread):
"""
A singleton Git repository cloner.
@@ -428,7 +356,7 @@ class _GitCloner(threading.Thread):
try:
self._clone_repository(repo)
except Exception:
pass
self._logger.exception("Exception raised while cloning:")

def _clone_repository(self, repo):
"""
@@ -439,57 +367,10 @@ class _GitCloner(threading.Thread):
:type repo: :class:`GitRepository`
"""

GIT_CLONE_TIMEOUT = 500
queue_percent_full = (float(self.index_queue.qsize()) /
self.index_queue.maxsize) * 100

command = ["perl", "-e", "alarm shift @ARGV; exec @ARGV",
str(GIT_CLONE_TIMEOUT), "git", "clone", "--single-branch",
repo.url, GIT_CLONE_DIR + "/" + repo.dirname]
if subprocess.call(command) != 0:
subprocess.call(["pkill", "-f", "git"]) # This makes Ben K upset
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname))
return

while self.index_queue.full():
self._logger.info("Cloning repo: %s", repo.url)
repo.repo = git.Repo.clone_from(repo.url, to_path=repo.path, bare=True,
single_branch=True)
while self.index_queue.full() and self.run_event.is_set():
time.sleep(THREAD_QUEUE_SLEEP)
self.index_queue.put(repo)

class _ChangeDir(object):
"""
A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

:ivar new_path: (str) The path to change the current directory to.
:ivar old_path: (str) The path of the directory to return to.
"""

def __init__(self, new_path):
"""
Create a _ChangeDir instance.

:param new_path: The directory to enter.

:type new_path: str
"""

self.new_path = new_path

def __enter__(self):
"""
Change the current working-directory to **new_path**.
"""

self.old_path = os.getcwd()
os.chdir(self.new_path)

def __exit__(self, *exception):
"""
Change the current working-directory to **old_path**.

:param exception: Various exception arguments passed by `with`.

:type exception: varargs
"""

os.chdir(self.old_path)
if self.run_event.is_set():
self.index_queue.put(repo)

+ 2
- 2
setup.py View File

@@ -6,8 +6,8 @@ setup(
packages = find_packages(),
install_requires = [
"Flask>=0.10.1", "gunicorn>=18.0", "pygments>=1.6", "requests>=2.2.0",
"beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3",
"PyYAML>=3.11", "python-dateutil>=2.2"],
"GitPython>=0.3.2.RC1", "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1",
"mmh3>=2.3", "PyYAML>=3.11", "python-dateutil>=2.2", "cchardet>=0.3.5"],
author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
license = "MIT",
url = "https://github.com/earwig/bitshift"


Loading…
Cancel
Save