
Rewrite much of the indexer to use GitPython.

tags/v1.0^2
Ben Kurtovic 10 years ago
commit afc5980683
2 changed files with 121 additions and 240 deletions
  1. bitshift/crawler/indexer.py (+119, -238)
  2. setup.py (+2, -2)
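As background, a minimal sketch (Python 2, like the rest of the codebase; not the committed code, and the URL and path below are placeholders) of the GitPython calls the rewritten indexer builds on: clone a repository bare, then read blob contents straight from the object database instead of shelling out to `git` and walking a checkout.

.. code-block:: python

    import git

    # Clone without a working tree; extra kwargs are passed through to `git clone`.
    repo = git.Repo.clone_from("https://github.com/earwig/bitshift",
                               to_path="/tmp/bitshift.git", bare=True,
                               single_branch=True)

    head = repo.head.commit                # tip of the cloned branch
    for item in head.tree.traverse():      # every tree and blob reachable from HEAD
        if item.type == "blob":            # blobs hold file contents
            raw = item.data_stream.read()  # raw bytes, no checkout needed
            print item.path, len(raw)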

bitshift/crawler/indexer.py (+119, -238)

@@ -3,7 +3,7 @@
repositories.
"""


import datetime
from datetime import datetime
import logging
import os
import Queue
@@ -13,7 +13,8 @@ import subprocess
import time
import threading


import bs4
from bs4 import UnicodeDammit
import git


from ..database import Database
from ..parser import parse, UnsupportedFileError
@@ -33,7 +34,8 @@ class GitRepository(object):
repository belongs to (eg, GitHub, BitBucket).
:ivar rank: (float) The rank of the repository, as assigned by
:class:`crawler.GitHubCrawler`.
:ivar dirname: (str) The repository's on-disk directory name.
:ivar path: (str) The repository's on-disk directory path.
:ivar repo: (git.Repo) A git.Repo representation of the repository.
""" """


def __init__(self, url, name, framework_name, rank):
@@ -55,7 +57,9 @@ class GitRepository(object):
self.name = name
self.framework_name = framework_name
self.rank = rank
self.dirname = name.replace("-", "--").replace("/", "-")
dirname = name.replace("/", "-") + "-" + str(int(time.time()))
self.path = os.path.join(GIT_CLONE_DIR, dirname)
self.repo = None


class GitIndexer(threading.Thread):
"""
@@ -124,20 +128,18 @@ class GitIndexer(threading.Thread):
`git clone` the Git repository located at **repo.url**, call
`_insert_repository_codelets()`, then remove said repository.


:param repo_url: The metadata of the repository to be indexed.

:type repo_url: :class:`GitRepository`
:param repo: The metadata of the repository to be indexed.
:type repo: :class:`GitRepository`
""" """


self._logger.info(u"Indexing repo: %s", repo.name) self._logger.info(u"Indexing repo: %s", repo.name)
with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
try:
self._insert_repository_codelets(repo)
except Exception:
self._logger.exception("Exception raised while indexing:")
finally:
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname))
try:
self._insert_repository_codelets(repo)
except Exception:
self._logger.exception("Exception raised while indexing:")
finally:
if os.path.isdir(repo.path):
shutil.rmtree(repo.path)


def _insert_repository_codelets(self, repo):
"""
@@ -152,27 +154,18 @@ class GitIndexer(threading.Thread):
:type repo_url: :class:`GitRepository`
"""


commits_meta = self._get_commits_metadata()
commits_meta = self._get_commits_metadata(repo)
if commits_meta is None:
return


for filename in commits_meta.keys():
try:
with open(filename) as source_file:
source = self._decode(source_file.read())
if source is None:
continue
except IOError:
continue

authors = [(self._decode(author), None) for author in
commits_meta[filename]["authors"]]
url = self._generate_file_url(filename, repo.url, repo.framework_name)
for filename, data in commits_meta.iteritems():
authors = [(author, None) for author in data["authors"]]
encoded_source = data["blob"].data_stream.read()
source = UnicodeDammit(encoded_source).unicode_markup
url = self._generate_file_url(filename, repo)
codelet = Codelet("%s: %s" % (repo.name, filename), source,
filename, None, authors, url,
commits_meta[filename]["time_created"],
commits_meta[filename]["time_last_modified"],
repo.rank)
filename, None, authors, url, data["time_created"],
data["time_last_modified"], repo.rank)
self._logger.debug("Indexing file: %s", codelet.name)
try:
parse(codelet)
@@ -180,163 +173,103 @@ class GitIndexer(threading.Thread):
continue
self.database.insert(codelet)


def _generate_file_url(self, filename, repo_url, framework_name):
def _generate_file_url(self, filename, repo):
""" """
Return a url for a filename from a Git wrapper framework. Return a url for a filename from a Git wrapper framework.


:param filename: The path of the file.
:param repo_url: The url of the file's parent repository.
:param framework_name: The name of the framework the repository is from.
:param repo: The git repo.


:type filename: str
:type repo_url: str
:type framework_name: str
:type repo: :class:`GitRepository`


:return: The file's full url on the given framework, if successfully
derived.
:rtype: str, or None

.. warning::
Various Git subprocesses will occasionally fail, and, seeing as the
information they provide is a crucial component of some repository
file urls, None may be returned.
"""

try:
if framework_name == "GitHub":
default_branch = subprocess.check_output("git branch"
" --no-color", shell=True)[2:-1]
parts = [repo_url, "blob", default_branch, filename]
elif framework_name == "Bitbucket":
commit_hash = subprocess.check_output("git rev-parse HEAD",
shell=True).replace("\n", "")
parts = [repo_url, "src", commit_hash, filename]
return "/".join(s.strip("/") for s in parts)
except subprocess.CalledProcessError:
return None

def _get_git_commits(self):
"""
Return the current working directory's formatted commit data.

Uses `git log` to generate metadata about every single file in the
repository's commit history.

:return: The author, timestamp, and names of all modified files of every
commit.
.. code-block:: python
sample_returned_array = [
{
"author" : (str) "author"
"timestamp" : (`datetime.datetime`) <object>,
"filenames" : (str array) ["file1", "file2"]
}
]
:rtype: array of dictionaries
"""

git_log = subprocess.check_output(("git --no-pager log --name-only"
" --pretty=format:'%n%n%an%n%at' -z"), shell=True)

commits = []
for commit in git_log.split("\n\n"):
fields = commit.split("\n")
if len(fields) > 2:
commits.append({
"author" : fields[0],
"timestamp" : datetime.datetime.fromtimestamp(int(fields[1])),
"filenames" : fields[2].split("\x00")[:-2]
})

return commits

def _get_tracked_files(self):
"""
Return a list of the filenames of all valuable files in the Git repository.

Get a list of the filenames of the non-binary (Perl heuristics used for
filetype identification) files currently inside the current working
directory's Git repository. Then, weed out any boilerplate/non-code files
that match the regex rules in GIT_IGNORE_FILES.

:return: The filenames of all index-worthy non-binary files.
:rtype: str array
""" """


files = []
for dirname, subdir_names, filenames in os.walk("."):
for filename in filenames:
path = os.path.join(dirname, filename)
if self._is_ascii(path):
files.append(path[2:])
if repo.framework_name == "GitHub":
default_branch = repo.repo.active_branch
parts = [repo.url, "blob", default_branch, filename]
elif repo.framework_name == "Bitbucket":
try:
commit_hash = repo.repo.head.commit.hexsha
except ValueError: # No commits
return None
parts = [repo_url, "src", commit_hash, filename]
return "/".join(s.strip("/") for s in parts)

def _walk_history(self, files, head):
"""Walk a repository's history for metadata."""
def update_entry(commit, entry, new_file):
entry["authors"].add(commit.author.name)
commit_ts = datetime.utcfromtimestamp(commit.committed_date)
if commit_ts > entry["time_last_modified"]:
entry["time_last_modified"] = commit_ts
if new_file:
entry["time_created"] = commit_ts

def handle_commit(commit, paths):
if not commit.parents:
for item in commit.tree.traverse():
if item.type == "blob" and item.path in paths:
update_entry(commit, files[paths[item.path]], True)
return

for parent in commit.parents:
for diff in parent.diff(commit, create_patch=True):
pth = diff.renamed_to if diff.renamed else diff.b_blob.path
if pth not in paths:
continue
update_entry(commit, files[paths[pth]], diff.new_file)
if diff.renamed:
paths[diff.renamed_from] = paths[pth]
del paths[pth]


return files
pending = [(head, {path: path for path in files})]
while pending:
commit, paths = pending.pop()
handle_commit(commit, paths)
for parent in commit.parents:
new_paths = paths.copy() if len(commit.parents) > 1 else paths
pending.append((parent, new_paths))


def _get_commits_metadata(self):
def _get_commits_metadata(self, repo):
""" """
Return a dictionary containing every valuable tracked file's metadata. Return a dictionary containing every valuable tracked file's metadata.


:return: A dictionary with author names, time of creation, and time of last
modification for every filename key.
:return: A dictionary with author names, time of creation, and time of
last modification for every filename key.
.. code-block:: python
sample_returned_dict = {
"my_file" : {
"authors" : (str array) ["author1", "author2"],
"time_created" : (`datetime.datetime`) <object>,
"time_last_modified" : (`datetime.datetime`) <object>
}
}
:rtype: dictionary of dictionaries
"""

commits = self._get_git_commits()
tracked_files = self._get_tracked_files()

files_meta = {}
for commit in commits:
for filename in commit["filenames"]:
if filename not in tracked_files:
continue

if filename not in files_meta.keys():
files_meta[filename] = {
"authors" : [commit["author"]],
"time_last_modified" : commit["timestamp"],
"time_created" : commit["timestamp"]
sample_returned_dict = {
"my_file" : {
"blob": (GitPython Blob) <object>,
"authors" : (str set) {"author1", "author2"},
"time_created" : (`datetime.datetime`) <object>,
"time_last_modified" : (`datetime.datetime`) <object>
}
else:
if commit["author"] not in files_meta[filename]["authors"]:
files_meta[filename]["authors"].append(commit["author"])
files_meta[filename]["time_created"] = commit["timestamp"]

return files_meta

def _decode(self, raw):
"""
Return a decoded a raw string.

:param raw: The string to string.

:type raw: (str)

:return: If the original encoding is successfully inferenced, return the
decoded string.
:rtype: str, or None

.. warning::
The raw string's original encoding is identified by heuristics which
can, and occasionally will, fail. Decoding will then fail, and None
will be returned.
}
:rtype: dictionary of dictionaries
""" """

try:
encoding = bs4.BeautifulSoup(raw).original_encoding
return raw.decode(encoding) if encoding is not None else None

except (LookupError, UnicodeDecodeError, UserWarning) as exception:
return None
tree = repo.repo.head.commit.tree
except ValueError: # No commits
return {}

files = {}
for item in tree.traverse():
if item.type == "blob" and self._is_ascii(item.data_stream):
files[item.path] = {
"blob": item,
"authors" : set(),
"time_last_modified": datetime.utcfromtimestamp(0),
"time_created": datetime.utcfromtimestamp(0)
}

self._walk_history(files, repo.repo.head.commit)
return files


def _is_ascii(self, filename):
def _is_ascii(self, fp):
""" """
Heuristically determine whether a file is ASCII text or binary. Heuristically determine whether a file is ASCII text or binary.


@@ -346,34 +279,29 @@ class GitIndexer(threading.Thread):
operator, and is the de-facto method for determining whether a
file is ASCII.


:param filename: The path of the file to test.
:param fp: The file object to test.


:type filename: str
:type fp: `file`


:return: Whether the file is probably ASCII.
:rtype: Boolean
"""


try:
with open(filename) as source:
file_snippet = source.read(512)

if not file_snippet:
return True

ascii_characters = "".join(map(chr, range(32, 127)) +
list("\n\r\t\b"))
null_trans = string.maketrans("", "")
file_snippet = fp.read(512)


if "\0" in file_snippet:
return False
if not file_snippet:
return True


non_ascii = file_snippet.translate(null_trans, ascii_characters)
return not float(len(non_ascii)) / len(file_snippet) > 0.30
ascii_characters = "".join(map(chr, range(32, 127)) +
list("\n\r\t\b"))
null_trans = string.maketrans("", "")


except IOError:
if "\0" in file_snippet:
return False


non_ascii = file_snippet.translate(null_trans, ascii_characters)
return not float(len(non_ascii)) / len(file_snippet) > 0.30

class _GitCloner(threading.Thread):
"""
A singleton Git repository cloner.
@@ -428,7 +356,7 @@ class _GitCloner(threading.Thread):
try:
self._clone_repository(repo)
except Exception:
pass
self._logger.exception("Exception raised while cloning:")


def _clone_repository(self, repo):
"""
@@ -439,57 +367,10 @@ class _GitCloner(threading.Thread):
:type repo: :class:`GitRepository`
"""


GIT_CLONE_TIMEOUT = 500
queue_percent_full = (float(self.index_queue.qsize()) /
self.index_queue.maxsize) * 100

command = ["perl", "-e", "alarm shift @ARGV; exec @ARGV",
str(GIT_CLONE_TIMEOUT), "git", "clone", "--single-branch",
repo.url, GIT_CLONE_DIR + "/" + repo.dirname]
if subprocess.call(command) != 0:
subprocess.call(["pkill", "-f", "git"]) # This makes Ben K upset
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname))
return

while self.index_queue.full():
self._logger.info("Cloning repo: %s", repo.url)
repo.repo = git.Repo.clone_from(repo.url, to_path=repo.path, bare=True,
single_branch=True)
while self.index_queue.full() and self.run_event.is_set():
time.sleep(THREAD_QUEUE_SLEEP)
self.index_queue.put(repo)

class _ChangeDir(object):
"""
A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

:ivar new_path: (str) The path to change the current directory to.
:ivar old_path: (str) The path of the directory to return to.
"""

def __init__(self, new_path):
"""
Create a _ChangeDir instance.

:param new_path: The directory to enter.

:type new_path: str
"""

self.new_path = new_path

def __enter__(self):
"""
Change the current working-directory to **new_path**.
"""

self.old_path = os.getcwd()
os.chdir(self.new_path)

def __exit__(self, *exception):
"""
Change the current working-directory to **old_path**.

:param exception: Various exception arguments passed by `with`.

:type exception: varargs
"""

os.chdir(self.old_path)
if self.run_event.is_set():
self.index_queue.put(repo)
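The heart of the rewrite is `_walk_history()`, which replaces the old `git log --name-only` parsing: each commit is diffed against its parents to find which tracked paths it touched, and the commit's author and timestamp are folded into those paths' metadata. A simplified, illustrative sketch of the same idea (it skips the rename tracking, creation times, and per-branch path bookkeeping the real method handles; the repository path is a placeholder):

.. code-block:: python

    from datetime import datetime

    import git

    repo = git.Repo("/tmp/bitshift.git")
    meta = {}  # path -> {"authors", "time_last_modified"}

    for commit in repo.iter_commits():              # newest to oldest from HEAD
        commit_ts = datetime.utcfromtimestamp(commit.committed_date)
        for parent in commit.parents:               # root commits are skipped here
            for diff in parent.diff(commit):        # changes introduced by `commit`
                blob = diff.b_blob or diff.a_blob   # b_blob is None for deletions
                entry = meta.setdefault(blob.path, {
                    "authors": set(),
                    "time_last_modified": commit_ts,
                })
                entry["authors"].add(commit.author.name)
                if commit_ts > entry["time_last_modified"]:
                    entry["time_last_modified"] = commit_ts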

setup.py (+2, -2)

@@ -6,8 +6,8 @@ setup(
packages = find_packages(),
install_requires = [
"Flask>=0.10.1", "gunicorn>=18.0", "pygments>=1.6", "requests>=2.2.0",
"beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3",
"PyYAML>=3.11", "python-dateutil>=2.2"],
"GitPython>=0.3.2.RC1", "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1",
"mmh3>=2.3", "PyYAML>=3.11", "python-dateutil>=2.2", "cchardet>=0.3.5"],
author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
license = "MIT", license = "MIT",
url = "https://github.com/earwig/bitshift" url = "https://github.com/earwig/bitshift"

