
Merge branch 'feature/gitpython' into develop

tags/v1.0^2
Ben Kurtovic 10 years ago
commit 0496f09c2c
4 changed files with 106 additions and 248 deletions
  1. +93 -243 bitshift/crawler/indexer.py
  2. +6 -1 bitshift/database/migration.py
  3. +5 -2 bitshift/database/schema.sql
  4. +2 -2 setup.py

+93 -243 bitshift/crawler/indexer.py

@@ -3,25 +3,25 @@
repositories.
"""

import datetime
from datetime import datetime
import logging
import os
import Queue
import shutil
import string
import subprocess
import time
import threading

import bs4
from bs4 import UnicodeDammit
import git

from ..database import Database
from ..parser import parse, UnsupportedFileError
from ..languages import LANGS
from ..codelet import Codelet

GIT_CLONE_DIR = "/tmp/bitshift"
THREAD_QUEUE_SLEEP = 0.5
MAX_INDEX_QUEUE_SIZE = 10

class GitRepository(object):
"""
@@ -33,7 +33,8 @@ class GitRepository(object):
repository belongs to (eg, GitHub, BitBucket).
:ivar rank: (float) The rank of the repository, as assigned by
:class:`crawler.GitHubCrawler`.
:ivar dirname: (str) The repository's on-disk directory name.
:ivar path: (str) The repository's on-disk directory path.
:ivar repo: (git.Repo) A git.Repo representation of the repository.
"""

def __init__(self, url, name, framework_name, rank):
@@ -55,7 +56,9 @@ class GitRepository(object):
self.name = name
self.framework_name = framework_name
self.rank = rank
self.dirname = name.replace("-", "--").replace("/", "-")
dirname = name.replace("/", "-") + "-" + str(int(time.time()))
self.path = os.path.join(GIT_CLONE_DIR, dirname)
self.repo = None

class GitIndexer(threading.Thread):
"""
@@ -81,8 +84,6 @@ class GitIndexer(threading.Thread):
:type index_queue: see :attr:`self.index_queue`
"""

MAX_INDEX_QUEUE_SIZE = 10

self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
self.run_event = run_event
self.git_cloner = _GitCloner(clone_queue, self.index_queue, run_event)
@@ -124,20 +125,18 @@ class GitIndexer(threading.Thread):
`git clone` the Git repository located at **repo.url**, call
`_insert_repository_codelets()`, then remove said repository.

:param repo_url: The metadata of the repository to be indexed.

:type repo_url: :class:`GitRepository`
:param repo: The metadata of the repository to be indexed.
:type repo: :class:`GitRepository`
"""

self._logger.info(u"Indexing repo: %s", repo.name)
with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
try:
self._insert_repository_codelets(repo)
except Exception:
self._logger.exception("Exception raised while indexing:")
finally:
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname))
try:
self._insert_repository_codelets(repo)
except Exception:
self._logger.exception("Exception raised while indexing:")
finally:
if os.path.isdir(repo.path):
shutil.rmtree(repo.path)

def _insert_repository_codelets(self, repo):
"""
@@ -152,27 +151,18 @@ class GitIndexer(threading.Thread):
:type repo_url: :class:`GitRepository`
"""

commits_meta = self._get_commits_metadata()
if commits_meta is None:
file_meta = self._get_file_metadata(repo.repo)
if file_meta is None:
return

for filename in commits_meta.keys():
try:
with open(filename) as source_file:
source = self._decode(source_file.read())
if source is None:
continue
except IOError:
continue

authors = [(self._decode(author), None) for author in
commits_meta[filename]["authors"]]
url = self._generate_file_url(filename, repo.url, repo.framework_name)
for filename, data in file_meta.iteritems():
authors = [(author, None) for author in data["authors"]]
encoded_source = data["blob"].data_stream.read()
source = UnicodeDammit(encoded_source).unicode_markup
url = self._generate_file_url(filename, repo)
codelet = Codelet("%s: %s" % (repo.name, filename), source,
filename, None, authors, url,
commits_meta[filename]["time_created"],
commits_meta[filename]["time_last_modified"],
repo.rank)
filename, None, authors, url, data["time_created"],
data["time_last_modified"], repo.rank)
self._logger.debug("Indexing file: %s", codelet.name)
try:
parse(codelet)
@@ -180,163 +170,75 @@ class GitIndexer(threading.Thread):
continue
self.database.insert(codelet)

def _generate_file_url(self, filename, repo_url, framework_name):
def _generate_file_url(self, filename, repo):
"""
Return a url for a filename from a Git wrapper framework.

:param filename: The path of the file.
:param repo_url: The url of the file's parent repository.
:param framework_name: The name of the framework the repository is from.
:param repo: The git repo.

:type filename: str
:type repo_url: str
:type framework_name: str
:type repo: :class:`GitRepository`

:return: The file's full url on the given framework, if successfully
derived.
:rtype: str, or None

.. warning::
Various Git subprocesses will occasionally fail, and, seeing as the
information they provide is a crucial component of some repository
file urls, None may be returned.
"""

try:
if framework_name == "GitHub":
default_branch = subprocess.check_output("git branch"
" --no-color", shell=True)[2:-1]
parts = [repo_url, "blob", default_branch, filename]
elif framework_name == "Bitbucket":
commit_hash = subprocess.check_output("git rev-parse HEAD",
shell=True).replace("\n", "")
parts = [repo_url, "src", commit_hash, filename]
return "/".join(s.strip("/") for s in parts)
except subprocess.CalledProcessError:
return None

def _get_git_commits(self):
"""
Return the current working directory's formatted commit data.

Uses `git log` to generate metadata about every single file in the
repository's commit history.

:return: The author, timestamp, and names of all modified files of every
commit.
.. code-block:: python
sample_returned_array = [
{
"author" : (str) "author"
"timestamp" : (`datetime.datetime`) <object>,
"filenames" : (str array) ["file1", "file2"]
}
]
:rtype: array of dictionaries
"""

git_log = subprocess.check_output(("git --no-pager log --name-only"
" --pretty=format:'%n%n%an%n%at' -z"), shell=True)

commits = []
for commit in git_log.split("\n\n"):
fields = commit.split("\n")
if len(fields) > 2:
commits.append({
"author" : fields[0],
"timestamp" : datetime.datetime.fromtimestamp(int(fields[1])),
"filenames" : fields[2].split("\x00")[:-2]
})

return commits

def _get_tracked_files(self):
"""
Return a list of the filenames of all valuable files in the Git repository.

Get a list of the filenames of the non-binary (Perl heuristics used for
filetype identification) files currently inside the current working
directory's Git repository. Then, weed out any boilerplate/non-code files
that match the regex rules in GIT_IGNORE_FILES.

:return: The filenames of all index-worthy non-binary files.
:rtype: str array
"""

files = []
for dirname, subdir_names, filenames in os.walk("."):
for filename in filenames:
path = os.path.join(dirname, filename)
if self._is_ascii(path):
files.append(path[2:])

return files
if repo.framework_name == "GitHub":
default_branch = repo.repo.active_branch.name
parts = [repo.url, "blob", default_branch, filename]
elif repo.framework_name == "Bitbucket":
try:
commit_hash = repo.repo.head.commit.hexsha
except ValueError: # No commits
return None
parts = [repo.url, "src", commit_hash, filename]
return "/".join(s.strip("/") for s in parts)

def _get_commits_metadata(self):
def _get_file_metadata(self, repo):
"""
Return a dictionary containing every valuable tracked file's metadata.

:return: A dictionary with author names, time of creation, and time of last
modification for every filename key.
:return: A dictionary with author names, time of creation, and time of
last modification for every filename key.
.. code-block:: python
sample_returned_dict = {
"my_file" : {
"authors" : (str array) ["author1", "author2"],
"time_created" : (`datetime.datetime`) <object>,
"time_last_modified" : (`datetime.datetime`) <object>
}
}
:rtype: dictionary of dictionaries
"""

commits = self._get_git_commits()
tracked_files = self._get_tracked_files()

files_meta = {}
for commit in commits:
for filename in commit["filenames"]:
if filename not in tracked_files:
continue

if filename not in files_meta.keys():
files_meta[filename] = {
"authors" : [commit["author"]],
"time_last_modified" : commit["timestamp"],
"time_created" : commit["timestamp"]
sample_returned_dict = {
"my_file" : {
"blob": (GitPython Blob) <object>,
"authors" : (str list) ["author1", "author2"],
"time_created" : (`datetime.datetime`) <object>,
"time_last_modified" : (`datetime.datetime`) <object>
}
else:
if commit["author"] not in files_meta[filename]["authors"]:
files_meta[filename]["authors"].append(commit["author"])
files_meta[filename]["time_created"] = commit["timestamp"]

return files_meta

def _decode(self, raw):
"""
Return a decoded a raw string.

:param raw: The string to string.

:type raw: (str)

:return: If the original encoding is successfully inferenced, return the
decoded string.
:rtype: str, or None

.. warning::
The raw string's original encoding is identified by heuristics which
can, and occasionally will, fail. Decoding will then fail, and None
will be returned.
}
:rtype: dictionary of dictionaries
"""

try:
encoding = bs4.BeautifulSoup(raw).original_encoding
return raw.decode(encoding) if encoding is not None else None
tree = repo.head.commit.tree
except ValueError: # No commits
return {}

files = {}
self._logger.debug("Building file metadata")
for item in tree.traverse():
if item.type != "blob" or not self._is_ascii(item.data_stream):
continue
log = repo.git.log("--follow", '--format=%an %ct', "--", item.path)
lines = log.splitlines()
authors = {line.rsplit(" ", 1)[0] for line in lines}
last_mod = int(lines[0].rsplit(" ", 1)[1])
created = int(lines[-1].rsplit(" ", 1)[1])

files[item.path] = {
"blob": item,
"authors" : authors,
"time_last_modified": datetime.fromtimestamp(last_mod),
"time_created": datetime.fromtimestamp(created)
}

except (LookupError, UnicodeDecodeError, UserWarning) as exception:
return None
return files

def _is_ascii(self, filename):
def _is_ascii(self, source):
"""
Heuristically determine whether a file is ASCII text or binary.

@@ -346,34 +248,29 @@ class GitIndexer(threading.Thread):
operator, and is the de-facto method for determining whether a
file is ASCII.

:param filename: The path of the file to test.
:param source: The file object to test.

:type filename: str
:type source: `file`

:return: Whether the file is probably ASCII.
:rtype: Boolean
"""

try:
with open(filename) as source:
file_snippet = source.read(512)

if not file_snippet:
return True

ascii_characters = "".join(map(chr, range(32, 127)) +
list("\n\r\t\b"))
null_trans = string.maketrans("", "")
file_snippet = source.read(512)

if "\0" in file_snippet:
return False
if not file_snippet:
return True

non_ascii = file_snippet.translate(null_trans, ascii_characters)
return not float(len(non_ascii)) / len(file_snippet) > 0.30
ascii_characters = "".join(map(chr, range(32, 127)) +
list("\n\r\t\b"))
null_trans = string.maketrans("", "")

except IOError:
if "\0" in file_snippet:
return False

non_ascii = file_snippet.translate(null_trans, ascii_characters)
return not float(len(non_ascii)) / len(file_snippet) > 0.30

class _GitCloner(threading.Thread):
"""
A singleton Git repository cloner.
@@ -428,7 +325,7 @@ class _GitCloner(threading.Thread):
try:
self._clone_repository(repo)
except Exception:
pass
self._logger.exception("Exception raised while cloning:")

def _clone_repository(self, repo):
"""
@@ -439,57 +336,10 @@ class _GitCloner(threading.Thread):
:type repo: :class:`GitRepository`
"""

GIT_CLONE_TIMEOUT = 500
queue_percent_full = (float(self.index_queue.qsize()) /
self.index_queue.maxsize) * 100

command = ["perl", "-e", "alarm shift @ARGV; exec @ARGV",
str(GIT_CLONE_TIMEOUT), "git", "clone", "--single-branch",
repo.url, GIT_CLONE_DIR + "/" + repo.dirname]
if subprocess.call(command) != 0:
subprocess.call(["pkill", "-f", "git"]) # This makes Ben K upset
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.dirname)):
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.dirname))
return

while self.index_queue.full():
self._logger.info("Cloning repo: %s", repo.url)
repo.repo = git.Repo.clone_from(repo.url, to_path=repo.path, bare=True,
single_branch=True)
while self.index_queue.full() and self.run_event.is_set():
time.sleep(THREAD_QUEUE_SLEEP)
self.index_queue.put(repo)

class _ChangeDir(object):
"""
A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

:ivar new_path: (str) The path to change the current directory to.
:ivar old_path: (str) The path of the directory to return to.
"""

def __init__(self, new_path):
"""
Create a _ChangeDir instance.

:param new_path: The directory to enter.

:type new_path: str
"""

self.new_path = new_path

def __enter__(self):
"""
Change the current working-directory to **new_path**.
"""

self.old_path = os.getcwd()
os.chdir(self.new_path)

def __exit__(self, *exception):
"""
Change the current working-directory to **old_path**.

:param exception: Various exception arguments passed by `with`.

:type exception: varargs
"""

os.chdir(self.old_path)
if self.run_event.is_set():
self.index_queue.put(repo)
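
The net effect of this file's changes: cloning, log parsing, and file enumeration now go through GitPython (bare clones under a timestamped path) instead of subprocess calls, `_ChangeDir`, and `pkill`. A minimal sketch of that flow outside the GitIndexer/_GitCloner threading machinery follows; the repository URL and clone path are placeholders, not values from this commit.

# Sketch only: mirrors the GitPython flow introduced above; the URL and path
# are placeholders, and the real indexer's threading and error handling are
# omitted.
from datetime import datetime
import shutil

import git

url = "https://github.com/example/project"   # placeholder repository URL
path = "/tmp/bitshift/example-clone"         # placeholder clone target

repo = git.Repo.clone_from(url, to_path=path, bare=True, single_branch=True)
try:
    tree = repo.head.commit.tree             # raises ValueError when the repo has no commits
    for item in tree.traverse():
        if item.type != "blob":
            continue                         # only regular file blobs are indexed
        # One "<author name> <unix timestamp>" line per commit touching the file:
        lines = repo.git.log("--follow", "--format=%an %ct", "--", item.path).splitlines()
        authors = {line.rsplit(" ", 1)[0] for line in lines}
        last_modified = datetime.fromtimestamp(int(lines[0].rsplit(" ", 1)[1]))
        created = datetime.fromtimestamp(int(lines[-1].rsplit(" ", 1)[1]))
        source = item.data_stream.read()      # raw bytes; UnicodeDammit decodes them in the indexer
        # GitHub-style file URL, as built by _generate_file_url():
        file_url = "/".join(s.strip("/") for s in
                            [url, "blob", repo.active_branch.name, item.path])
finally:
    shutil.rmtree(path)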

+6 -1 bitshift/database/migration.py

@@ -3,7 +3,7 @@ Contains information about database schema versions, and SQL queries to update
between them.
"""

VERSION = 10
VERSION = 11

MIGRATIONS = [
# 1 -> 2
@@ -122,6 +122,11 @@ MIGRATIONS = [
MODIFY COLUMN `sloc_col` INT UNSIGNED DEFAULT NULL,
MODIFY COLUMN `sloc_end_row` INT UNSIGNED DEFAULT NULL,
MODIFY COLUMN `sloc_end_col` INT UNSIGNED DEFAULT NULL"""
],
# 10 -> 11
[
"""ALTER DATABASE `bitshift`
CHARACTER SET = utf8mb4 COLLATE = utf8mb4_unicode_ci"""
]
]
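
For context, MIGRATIONS is positional: MIGRATIONS[N - 1] holds the queries that take the schema from version N to N + 1, so this new 10 -> 11 entry is picked up by whatever code walks the list. A hedged sketch of such a runner follows; the real one lives elsewhere in bitshift.database and is not part of this diff, so the function name and cursor handling are illustrative only.

def _upgrade_schema(cursor, current_version):
    # Illustrative only: apply each per-version query list in order, then
    # record the new schema version (oursql uses qmark-style parameters).
    for version in xrange(current_version, VERSION):
        for query in MIGRATIONS[version - 1]:
            cursor.execute(query)
    cursor.execute("UPDATE `version` SET `version` = ?", (VERSION,))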



+5 -2 bitshift/database/schema.sql

@@ -1,12 +1,14 @@
-- Schema version 11

CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci;
CREATE DATABASE `bitshift`
DEFAULT CHARACTER SET utf8mb4
COLLATE utf8mb4_unicode_ci;
USE `bitshift`;

CREATE TABLE `version` (
`version` INT UNSIGNED NOT NULL
) ENGINE=InnoDB;
INSERT INTO `version` VALUES (10);
INSERT INTO `version` VALUES (11);

CREATE TABLE `origins` (
`origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT,
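
The switch from utf8 to utf8mb4 (here and in the 10 -> 11 migration above) matters because MySQL's legacy utf8 charset stores at most three bytes per character, so indexed source containing supplementary-plane characters (emoji, rare CJK) would be rejected or mangled. A quick sanity check in Python of why a fourth byte is needed:

# U+1F40D encodes to four UTF-8 bytes; MySQL's legacy "utf8" cannot store it,
# while utf8mb4 can.
assert len(u"\U0001F40D".encode("utf-8")) == 4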


+2 -2 setup.py

@@ -6,8 +6,8 @@ setup(
packages = find_packages(),
install_requires = [
"Flask>=0.10.1", "gunicorn>=18.0", "pygments>=1.6", "requests>=2.2.0",
"beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3",
"PyYAML>=3.11", "python-dateutil>=2.2"],
"GitPython>=0.3.2.RC1", "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1",
"mmh3>=2.3", "PyYAML>=3.11", "python-dateutil>=2.2", "cchardet>=0.3.5"],
author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
license = "MIT",
url = "https://github.com/earwig/bitshift"

