Browse Source

Merge branch 'develop' into feature/query_parser

tags/v1.0^2
Ben Kurtovic 10 years ago
parent
commit
c4816c2bb8
17 changed files with 1270 additions and 41 deletions
  1. +1
    -0
      .gitignore
  2. +9
    -1
      README.md
  3. +7
    -3
      app.py
  4. +1
    -1
      bitshift/__init__.py
  5. +4
    -2
      bitshift/assets.py
  6. +52
    -8
      bitshift/codelet.py
  7. +55
    -0
      bitshift/crawler/__init__.py
  8. +240
    -0
      bitshift/crawler/crawler.py
  9. +489
    -0
      bitshift/crawler/indexer.py
  10. +0
    -18
      bitshift/database.py
  11. +153
    -0
      bitshift/database/__init__.py
  12. +97
    -0
      bitshift/database/migration.py
  13. +114
    -0
      bitshift/database/schema.sql
  14. +2
    -0
      bitshift/query/__init__.py
  15. +11
    -0
      docs/source/api/bitshift.query.rst
  16. +32
    -7
      docs/source/api/bitshift.rst
  17. +3
    -1
      setup.py

+ 1
- 0
.gitignore View File

@@ -1,5 +1,6 @@
.sass-cache
.DS_Store
.my.cnf

# github premade rules
*.py[cod]


+ 9
- 1
README.md View File

@@ -1,7 +1,8 @@
bitshift
========

bitshift is a semantic search engine for source code.
bitshift is a semantic search engine for source code developed by Benjamin
Attal, Ben Kurtovic, and Severyn Kozak.

Branches
--------
@@ -13,6 +14,11 @@ Branches
- `feature/*`: individual components of the project with untested, likely
horribly broken code - branch off from and merge into `develop` when done

Style
-----
bitshift uses [SASS][SASS] for styling; compile the stylesheets to CSS with
`sass --watch static/sass/:static/css`.

Documentation
-------------

@@ -24,3 +30,5 @@ new modules or packages, but *not* when adding functions or changing
docstrings), run `sphinx-apidoc -fo docs/source/api bitshift` from the project
root. Note that this will revert any custom changes made to the files in
`docs/source/api`, so you might want to update them by hand instead.

[SASS]: http://sass-lang.com/guide

+ 7
- 3
app.py View File

@@ -5,6 +5,8 @@ Module to contain all the project's Flask server plumbing.
from flask import Flask
from flask import render_template, session

from bitshift import assets
from bitshift.database import Database
from bitshift.query import parse_query

app = Flask(__name__)
@@ -12,7 +14,9 @@ app.config.from_object("bitshift.config")

app_env = app.jinja_env
app_env.line_statement_prefix = "="
app_env.globals.update(assets = assets)
app_env.globals.update(assets=assets)

database = Database()

@app.route("/")
def index():
@@ -20,8 +24,8 @@ def index():

@app.route("/search/<query>")
def search(query):
## tree = parse_query(query)
## database.search(tree)
tree = parse_query(query)
database.search(tree)
pass

if __name__ == "__main__":


+ 1
- 1
bitshift/__init__.py View File

@@ -1 +1 @@
from . import assets, codelet, config, database, parser, query
from . import assets, codelet, config, database, parser, query, crawler

+ 4
- 2
bitshift/assets.py View File

@@ -1,6 +1,5 @@
"""
.. module:: assets
:synopsis: Helper functions for use inside the project's Jinja templates.
:synopsis: Helper functions for use inside the project's Jinja templates.
"""

from flask import Markup
@@ -16,8 +15,11 @@ def tag(filename):

:param filename: The filename of the asset to create a tag for.

:type filename: str

:return: A string containing a `<source>` tag for JS files, and a `<link>`
for CSS files.
:rtype: str
"""

file_ext = filename.split(".")[-1]


+ 52
- 8
bitshift/codelet.py View File

@@ -1,13 +1,57 @@
__all__ = ["Codelet"]

class Codelet(object):
    """
    A source-code object with code metadata and composition analysis.

    :ivar name: (str) A suitable name for the codelet.
    :ivar code: (str) A string containing the raw source code.
    :ivar filename: (str, or None) The filename of the snippet.
    :ivar language: (int, or None) The inferred language of `code`.
    :ivar authors: (array of tuples (str, str or None)) An array of tuples
        containing an author's name and profile URL (on the service the code
        was pulled from).
    :ivar code_url: (str) The url of the (page containing the) source code.
    :ivar date_created: (:class:`datetime.datetime`, or None) The date the code
        was published.
    :ivar date_modified: (:class:`datetime.datetime`, or None) The date the
        code was last modified.
    :ivar rank: (float) A quantification of the source code's quality, as
        per available ratings (stars, forks, upvotes, etc.).
    """

    # Earlier design note, kept for reference (not stored by this class):
    # association data maps an association type to a dictionary of names and
    # their (row, column) place in the file, e.g.
    # {"functions": {"foo": (12, 13), "bar": (53, 3)}}

    def __init__(self, name, code, filename, language, authors, code_url,
                 date_created, date_modified, rank):
        """
        Create a Codelet instance.

        Every argument is stored unchanged on the attribute of the same name;
        see the class docstring for the meaning and type of each one.

        :param name: see :attr:`self.name`
        :param code: see :attr:`self.code`
        :param filename: see :attr:`self.filename`
        :param language: see :attr:`self.language`
        :param authors: see :attr:`self.authors`
        :param code_url: see :attr:`self.code_url`
        :param date_created: see :attr:`self.date_created`
        :param date_modified: see :attr:`self.date_modified`
        :param rank: see :attr:`self.rank`
        """

        # What the snippet is.
        self.name = name
        self.filename = filename
        self.language = language
        self.code = code

        # Where it came from and who wrote it.
        self.code_url = code_url
        self.authors = authors

        # When it changed and how it is rated.
        self.date_created = date_created
        self.date_modified = date_modified
        self.rank = rank

+ 55
- 0
bitshift/crawler/__init__.py View File

@@ -0,0 +1,55 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import logging, logging.handlers, os, Queue

from bitshift.crawler import crawler, indexer

__all__ = ["crawl"]

def crawl():
    """
    Launch every crawler and indexer thread.

    Configures logging, creates the bounded queue shared by all workers, then
    constructs and starts, in this order:

    1. the GitHub crawler, :class:`crawler.GitHubCrawler`;
    2. the Bitbucket crawler, :class:`crawler.BitbucketCrawler`;
    3. the Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
    """

    _configure_logging()

    MAX_URL_QUEUE_SIZE = 5e3

    repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
    worker_classes = (crawler.GitHubCrawler, crawler.BitbucketCrawler,
                      indexer.GitIndexer)

    # Build every worker before starting any of them.
    workers = [worker_class(repo_clone_queue)
               for worker_class in worker_classes]
    for worker in workers:
        worker.start()

def _configure_logging():
    """
    Route all log records to an hourly-rotated file under ``log/``.

    Creates the ``log`` directory (relative to the current working directory)
    if it does not exist, raises the level of the "requests" and "urllib3"
    loggers to WARNING, and attaches a
    :class:`logging.handlers.TimedRotatingFileHandler` writing ``log/app.log``
    (rotated every hour, 20 backups kept) to the root logger.
    """

    LOG_FILE_DIR = "log"

    if not os.path.exists(LOG_FILE_DIR):
        os.mkdir(LOG_FILE_DIR)

    for library_name in ("requests", "urllib3"):
        logging.getLogger(library_name).setLevel(logging.WARNING)

    record_format = ("%(asctime)s %(levelname)s %(name)s %(funcName)s"
                     " %(message)s")
    file_handler = logging.handlers.TimedRotatingFileHandler(
        "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
        backupCount=20)
    file_handler.setFormatter(
        logging.Formatter(fmt=record_format, datefmt="%y-%m-%d %H:%M:%S"))

    root_logger = logging.getLogger()
    root_logger.addHandler(file_handler)
    root_logger.setLevel(logging.NOTSET)

+ 240
- 0
bitshift/crawler/crawler.py View File

@@ -0,0 +1,240 @@
"""
:synopsis: Main crawler module, to oversee all site-specific crawlers.

Contains all website/framework-specific Class crawlers.
"""

import logging, requests, time, threading

from bitshift.crawler import indexer

from ..codelet import Codelet
from ..database import Database

class GitHubCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of GitHub's public repositories.

    GitHubCrawler is a threaded singleton that queries GitHub's API for urls
    to its public repositories, which it inserts into a :class:`Queue.Queue`
    shared with :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
        with repository metadata retrieved by :class:`GitHubCrawler`, and other
        Git crawlers, to be processed by :class:`indexer.GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    # NOTE(review): API credentials are committed in source. They should be
    # loaded from configuration and rotated; kept here unchanged so existing
    # behavior is preserved.
    AUTHENTICATION = {
        "client_id" : "436cb884ae09be7f2a4e",
        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
    }

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitHubCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the GitHub API for data about every public repository.

        Pull all of GitHub's repositories by making calls to its API in a loop,
        accessing a subsequent page of results via the "next" URL returned in
        an API response header. Uses Severyn Kozak's (sevko) authentication
        credentials. For every new repository, a :class:`GitRepository` is
        inserted into :attr:`self.clone_queue`. The loop ends when a response
        carries no "next" link.
        """

        next_api_url = "https://api.github.com/repositories"
        # NOTE(review): evaluates to ~1.39 seconds between page requests;
        # confirm whether 60 ** 2 / 5e3 (5000 requests per hour) was intended.
        api_request_interval = 5e3 / 60 ** 2

        while next_api_url:
            start_time = time.time()

            try:
                resp = requests.get(next_api_url, params=self.AUTHENTICATION)
            except requests.ConnectionError as excep:
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                time.sleep(0.5)
                continue

            queue_size = self.clone_queue.qsize()
            queue_percent_full = (float(queue_size) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%.",
                    queue_size, self.clone_queue.maxsize, queue_percent_full)

            # Parse the response body once and reuse it.
            repos = resp.json()
            repo_names = [repo["full_name"] for repo in repos]
            repo_stars = self._get_repositories_stars(repo_names)

            for repo in repos:
                # Wait for the cloner to drain the shared queue.
                while self.clone_queue.full():
                    time.sleep(1)

                self.clone_queue.put(indexer.GitRepository(
                        repo["html_url"], repo["full_name"].replace("/", ""),
                        "GitHub", repo_stars[repo["full_name"]]))

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                # The reset time may already have passed; never sleep for a
                # negative duration.
                time.sleep(max(0, int(resp.headers["x-ratelimit-reset"]) -
                        time.time()))

            next_api_url = self._next_page_url(resp.headers.get("link", ""))

            sleep_time = api_request_interval - (time.time() - start_time)
            if sleep_time > 0:
                time.sleep(sleep_time)

    @staticmethod
    def _next_page_url(link_header):
        """
        Extract the ``rel="next"`` target from an HTTP ``Link`` header value.

        :param link_header: The raw header value, e.g.
            ``'<https://host/a?since=1>; rel="next", <https://host/a>;
            rel="first"'``.

        :type link_header: str

        :return: The url of the next page, or an empty string if the header
            has no ``rel="next"`` entry.
        :rtype: str
        """

        # Entries are comma-separated; a url that itself contains a comma
        # would not be handled by this simple split.
        for entry in link_header.split(","):
            target, _, params = entry.partition(";")
            if 'rel="next"' in params:
                return target.strip().strip("<>")
        return ""

    def _get_repositories_stars(self, repo_names):
        """
        Return the rank, derived from stargazers, of several repositories.

        Queries the GitHub search API, `REPOS_PER_QUERY` repositories at a
        time, for the number of stargazers of the given repositories, and
        blocks if the query limit is exceeded.

        :param repo_names: An array of repository names, in
            `username/repository_name` format.

        :type repo_names: str array

        :return: A dictionary with repository name keys and rank values: the
            stargazer count divided by 1000 and capped at 1.0, or 0.5 for any
            repository the API returned no data for.

            Example dictionary:

            .. code-block:: python

                {
                    "user/repository" : 0.1
                }

        :rtype: dictionary
        """

        API_URL = "https://api.github.com/search/repositories"
        REPOS_PER_QUERY = 25

        repo_stars = {}
        for start in range(0, len(repo_names), REPOS_PER_QUERY):
            names = repo_names[start:start + REPOS_PER_QUERY]
            query_url = "%s?q=%s" % (API_URL,
                    "+".join("repo:%s" % name for name in names))

            resp = requests.get(query_url,
                    params=self.AUTHENTICATION,
                    headers={
                        "Accept" : "application/vnd.github.preview"
                    })

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time() + 1
                if sleep_time > 0:
                    self._logger.info("API quota exceeded. Sleep time: %d.",
                            sleep_time)
                    time.sleep(sleep_time)

            for repo in resp.json()["items"]:
                rank = float(repo["stargazers_count"]) / 1000
                repo_stars[repo["full_name"]] = min(rank, 1.0)

        # Repositories missing from the search results get a neutral rank.
        for name in repo_names:
            repo_stars.setdefault(name, 0.5)

        return repo_stars

class BitbucketCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of Bitbucket's public repositories.

    BitbucketCrawler is a threaded singleton that queries Bitbucket's API for
    urls to its public repositories, and inserts them as
    :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with
    :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
        :class:`indexer.GitRepository` repository urls into.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `BitbucketCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the Bitbucket API for data about every public repository.

        Query the Bitbucket API's "/repositories" endpoint and read its
        paginated responses in a loop; any "git" repositories have their
        clone-urls, names and ranks inserted as a
        :class:`indexer.GitRepository` into :attr:`self.clone_queue`. The loop
        ends when a response carries no "next" url.
        """

        next_api_url = "https://api.bitbucket.org/2.0/repositories"

        while next_api_url:
            try:
                response = requests.get(next_api_url).json()
            except requests.ConnectionError as excep:
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                time.sleep(0.5)
                continue

            queue_size = self.clone_queue.qsize()
            queue_percent_full = (float(queue_size) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%.",
                    queue_size, self.clone_queue.maxsize, queue_percent_full)

            for repo in response["values"]:
                if repo["scm"] != "git":
                    continue

                # Wait for the cloner to drain the shared queue.
                while self.clone_queue.full():
                    time.sleep(1)

                clone_url = self._https_clone_url(repo["links"]["clone"])
                watchers_url = repo["links"]["watchers"]["href"]

                try:
                    watchers = requests.get(watchers_url)
                    # NOTE(review): only the entries in this one response are
                    # counted; confirm whether the endpoint paginates.
                    rank = self._rank_from_watchers(
                            len(watchers.json()["values"]))
                except requests.ConnectionError as excep:
                    self._logger.warning("API %s call failed: %s: %s",
                            watchers_url, excep.__class__.__name__, excep)
                    time.sleep(0.5)
                    # Skip this repository; it has no rank.
                    continue

                self.clone_queue.put(indexer.GitRepository(
                        clone_url, repo["full_name"], "Bitbucket", rank))

            # The last page has no "next" key; stop instead of crashing.
            next_api_url = response.get("next")
            time.sleep(0.2)

    @staticmethod
    def _https_clone_url(clone_links):
        """
        Pick the HTTPS clone url out of a repository's clone links.

        :param clone_links: The two clone-link dictionaries of a repository,
            each with "name" and "href" keys.

        :type clone_links: array of dictionaries

        :return: The "href" of the first entry if it is named "https",
            otherwise the "href" of the second entry.
        :rtype: str
        """

        if clone_links[0]["name"] == "https":
            return clone_links[0]["href"]
        return clone_links[1]["href"]

    @staticmethod
    def _rank_from_watchers(watcher_count):
        """
        Convert a watcher count into a rank between 0.0 and 1.0.

        :param watcher_count: The number of watchers of a repository.

        :type watcher_count: int

        :return: The count divided by 100 (true division), capped at 1.0.
        :rtype: float
        """

        return min(float(watcher_count) / 100, 1.0)

+ 489
- 0
bitshift/crawler/indexer.py View File

@@ -0,0 +1,489 @@
"""
:synopsis: Contains a singleton GitIndexer class, which clones and indexes git
repositories.
"""

import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\
threading

from ..database import Database
from ..codelet import Codelet

GIT_CLONE_DIR = "/tmp/bitshift"
THREAD_QUEUE_SLEEP = 0.5

class GitRepository(object):
    """
    A plain record of a Git repository's metadata.

    :ivar url: (str) The repository's url.
    :ivar name: (str) The name of the repository.
    :ivar framework_name: (str) The name of the online Git framework that the
        repository belongs to (eg, GitHub, BitBucket).
    :ivar rank: (float) The rank of the repository, as assigned by the crawler
        that found it.
    """

    def __init__(self, url, name, framework_name, rank):
        """
        Create a GitRepository instance.

        Each argument is stored unchanged on the attribute of the same name.

        :param url: see :attr:`GitRepository.url`
        :param name: see :attr:`GitRepository.name`
        :param framework_name: see :attr:`GitRepository.framework_name`
        :param rank: see :attr:`GitRepository.rank`

        :type url: str
        :type name: str
        :type framework_name: str
        :type rank: float
        """

        self.name = name
        self.url = url
        self.rank = rank
        self.framework_name = framework_name

class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    :class:`GitIndexer` indexes the repositories cloned by the
    :class:`_GitCloner` singleton.

    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
        :class:`GitRepository` objects for every new repository successfully
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository
        cloner, which feeds :class:`GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    # Byte values :meth:`_is_ascii` counts as text: printable ASCII plus
    # backspace, tab, newline and carriage return.
    _TEXT_BYTES = frozenset(list(range(32, 127)) + [8, 9, 10, 13])

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitIndexer`.

        Also creates and starts the :class:`_GitCloner` that feeds it, and
        makes sure the clone directory exists.

        :param clone_queue: The queue of crawled repositories, handed to the
            :class:`_GitCloner`.

        :type clone_queue: :class:`Queue.Queue`
        """

        MAX_INDEX_QUEUE_SIZE = 10

        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")

        # Make sure the clone target exists before the cloner starts working.
        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)

        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        self.git_cloner = _GitCloner(clone_queue, self.index_queue)
        self.git_cloner.start()

        super(GitIndexer, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly cloned repositories and index them.

        Blocks until new repositories appear in :attr:`self.index_queue`, then
        retrieves one, and attempts indexing it. Should any errors occur, the
        new repository will be discarded and the indexer will index the next in
        the queue.
        """

        while True:
            while self.index_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.index_queue.get()
            self.index_queue.task_done()
            try:
                self._index_repository(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__, excep)

    def _index_repository(self, repo):
        """
        Index (create Codelets for) an already cloned Git repository.

        Enter the repository's clone directory, call
        :meth:`_insert_repository_codelets`, then remove the clone.

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        repo_dir = "%s/%s" % (GIT_CLONE_DIR, repo.name)

        # os.chdir() is process-wide, so the working directory changes for
        # every thread while this block runs.
        with _ChangeDir(repo_dir):
            try:
                self._insert_repository_codelets(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__, excep)

        if os.path.isdir(repo_dir):
            shutil.rmtree(repo_dir)

    def _insert_repository_codelets(self, repo):
        """
        Create a Codelet for the files inside a Git repository.

        Create a new Codelet for every readable, decodable file that both
        appears in the commit history and is present in the current working
        directory.

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        commits_meta = self._get_commits_metadata()
        if commits_meta is None:
            return

        for filename, file_meta in commits_meta.items():
            try:
                with open(filename) as source_file:
                    source = self._decode(source_file.read())
            except IOError:
                continue

            if source is None:
                continue

            authors = [(self._decode(author), None)
                       for author in file_meta["authors"]]
            # NOTE(review): the codelet is built but not stored anywhere in
            # this block; presumably a Database insert is still to be added.
            codelet = Codelet("%s:%s" % (repo.name, filename), source, filename,
                    None, authors, self._generate_file_url(filename,
                    repo.url, repo.framework_name),
                    file_meta["time_created"],
                    file_meta["time_last_modified"],
                    repo.rank)

    @staticmethod
    def _join_url(base, *parts):
        """
        Join url components with single slashes, leaving the scheme intact.

        :param base: The leading part of the url, including its scheme.
        :param parts: Path components to append.

        :type base: str
        :type parts: str varargs

        :return: The components joined by exactly one "/" each.
        :rtype: str
        """

        return "/".join([base.rstrip("/")] +
                        [part.strip("/") for part in parts])

    def _generate_file_url(self, filename, repo_url, framework_name):
        """
        Return a url for a filename from a Git wrapper framework.

        :param filename: The path of the file.
        :param repo_url: The url of the file's parent repository.
        :param framework_name: The name of the framework the repository is
            from.

        :type filename: str
        :type repo_url: str
        :type framework_name: str

        :return: The file's full url on the given framework, if successfully
            derived.
        :rtype: str, or None

        .. warning::
            Various Git subprocesses will occasionally fail, and, seeing as the
            information they provide is a crucial component of some repository
            file urls, None may be returned. None is also returned for an
            unrecognised framework name.
        """

        try:
            if framework_name == "GitHub":
                # Output looks like "* <branch>\n"; drop the marker and the
                # trailing newline.
                default_branch = subprocess.check_output("git branch"
                        " --no-color", shell=True)[2:-1]
                return self._join_url(repo_url, "blob", default_branch,
                        filename)
            elif framework_name == "Bitbucket":
                commit_hash = subprocess.check_output("git rev-parse HEAD",
                        shell=True).replace("\n", "")
                return self._join_url(repo_url, "src", commit_hash, filename)
        except subprocess.CalledProcessError:
            return None

        return None

    def _get_git_commits(self):
        """
        Return the current working directory's formatted commit data.

        Uses `git log` to generate metadata about every single file in the
        repository's commit history.

        :return: The author, timestamp, and names of all modified files of
            every commit.

            .. code-block:: python

                sample_returned_array = [
                    {
                        "author" : (str) "author"
                        "timestamp" : (`datetime.datetime`) <object>,
                        "filenames" : (str array) ["file1", "file2"]
                    }
                ]

        :rtype: array of dictionaries
        """

        git_log = subprocess.check_output(("git --no-pager log --name-only"
                " --pretty=format:'%n%n%an%n%at' -z"), shell=True)

        commits = []
        for commit in git_log.split("\n\n"):
            fields = commit.split("\n")
            if len(fields) > 2:
                commits.append({
                    "author" : fields[0],
                    "timestamp" : datetime.datetime.fromtimestamp(
                            int(fields[1])),
                    "filenames" : fields[2].split("\x00")[:-2]
                })

        return commits

    def _get_tracked_files(self):
        """
        Return a list of the filenames of all text files in the repository.

        Walk the current working directory and collect the path, relative to
        it, of every file that :meth:`_is_ascii` considers to be text.

        :return: The filenames of all non-binary files.
        :rtype: str array
        """

        files = []
        for dirname, subdir_names, filenames in os.walk("."):
            for filename in filenames:
                path = os.path.join(dirname, filename)
                if self._is_ascii(path):
                    # Strip the leading "./" that os.walk(".") adds.
                    files.append(path[2:])

        return files

    def _get_commits_metadata(self):
        """
        Return a dictionary containing every valuable tracked file's metadata.

        :return: A dictionary with author names, time of creation, and time of
            last modification for every filename key.

            .. code-block:: python

                sample_returned_dict = {
                    "my_file" : {
                        "authors" : (str array) ["author1", "author2"],
                        "time_created" : (`datetime.datetime`) <object>,
                        "time_last_modified" : (`datetime.datetime`) <object>
                    }
                }

        :rtype: dictionary of dictionaries
        """

        commits = self._get_git_commits()
        # A set keeps the membership test below constant-time.
        tracked_files = set(self._get_tracked_files())

        files_meta = {}
        for commit in commits:
            for filename in commit["filenames"]:
                if filename not in tracked_files:
                    continue

                if filename not in files_meta:
                    files_meta[filename] = {
                        "authors" : [commit["author"]],
                        "time_last_modified" : commit["timestamp"],
                        "time_created" : commit["timestamp"]
                    }
                    continue

                file_meta = files_meta[filename]
                if commit["author"] not in file_meta["authors"]:
                    file_meta["authors"].append(commit["author"])
                # Each later commit in the list that touches the file
                # overwrites the creation time.
                file_meta["time_created"] = commit["timestamp"]

        return files_meta

    def _decode(self, raw):
        """
        Return a decoded version of a raw string.

        :param raw: The string to decode.

        :type raw: str

        :return: If the original encoding is successfully inferred, return the
            decoded string.
        :rtype: str, or None

        .. warning::
            The raw string's original encoding is identified by heuristics
            which can, and occasionally will, fail. Decoding will then fail,
            and None will be returned.
        """

        try:
            encoding = bs4.BeautifulSoup(raw).original_encoding
            return raw.decode(encoding) if encoding is not None else None

        except (LookupError, UnicodeDecodeError, UserWarning):
            return None

    def _is_ascii(self, filename):
        """
        Heuristically determine whether a file is ASCII text or binary.

        If the first 512 bytes of the file contain a null byte, or more than
        30% of them are not ASCII text, the file is concluded to be binary.
        This heuristic is used by the `file` utility and Perl's inbuilt `-T`
        operator, and is the de-facto method for determining whether a file
        is ASCII.

        :param filename: The path of the file to test.

        :type filename: str

        :return: Whether the file is probably ASCII. Empty files count as
            ASCII; unreadable files do not.
        :rtype: Boolean
        """

        try:
            with open(filename, "rb") as source:
                file_snippet = bytearray(source.read(512))
        except IOError:
            return False

        if not file_snippet:
            return True

        if 0 in file_snippet:
            return False

        non_ascii = sum(1 for byte in file_snippet
                        if byte not in self._TEXT_BYTES)
        return float(non_ascii) / len(file_snippet) <= 0.30

class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, index_queue):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        :type index_queue: see :attr:`self.index_queue`
        """

        self.clone_queue = clone_queue
        self.index_queue = index_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Blocks until new :class:`GitRepository` appear in
        :attr:`self.clone_queue`, then attempts cloning them. If
        successful, the cloned repository is added to :attr:`self.index_queue`
        for the `GitIndexer` to index; otherwise, it is discarded. Unexpected
        errors are logged and the next repository is processed.
        """

        while True:
            while self.clone_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)
            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception as excep:
                # Keep the thread alive, but leave a trace of the failure.
                self._logger.warning("%s: %s.", excep.__class__.__name__, excep)

    def _clone_repository(self, repo):
        """
        Attempt cloning a Git repository.

        Runs `git clone` under a `GIT_CLONE_TIMEOUT`-second alarm. If the
        command cannot be launched it is retried up to `MAX_ATTEMPTS` times,
        one second apart. On a non-zero exit code any partial clone is removed
        and the repository is dropped; on success the repository is put on
        :attr:`self.index_queue` once there is room.

        :param repo: Metadata about the repository to clone.

        :type repo: :class:`GitRepository`
        """

        GIT_CLONE_TIMEOUT = 500
        MAX_ATTEMPTS = 20

        # NOTE(review): repo.url and repo.name are interpolated into a shell
        # command unquoted, and `pkill -f git` runs whenever the clone fails;
        # confirm both are acceptable on the host this runs on.
        command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
                " --single-branch %s %s/%s || pkill -f git")
        clone_dir = "%s/%s" % (GIT_CLONE_DIR, repo.name)

        exit_code = None
        for _ in range(MAX_ATTEMPTS):
            try:
                exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
                        repo.url, GIT_CLONE_DIR, repo.name), shell=True)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__, excep)
                time.sleep(1)
            else:
                break

        # exit_code is still None when every launch attempt failed.
        if exit_code != 0:
            if os.path.isdir(clone_dir):
                shutil.rmtree(clone_dir)
            return

        while self.index_queue.full():
            time.sleep(THREAD_QUEUE_SLEEP)

        self.index_queue.put(repo)

class _ChangeDir(object):
    """
    A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

    The working directory is changed on entry and restored on exit, whether
    or not the body of the `with` statement raised.

    :ivar new_path: (str) The path to change the current directory to.
    :ivar old_path: (str) The path of the directory to return to; set on
        entry.
    """

    def __init__(self, new_path):
        """
        Create a _ChangeDir instance.

        :param new_path: The directory to enter.

        :type new_path: str
        """

        self.new_path = new_path

    def __enter__(self):
        """
        Change the current working-directory to **new_path**.

        :return: The directory that was entered, so that
            `with _ChangeDir(path) as entered:` binds something useful.
        :rtype: str
        """

        self.old_path = os.getcwd()
        os.chdir(self.new_path)
        return self.new_path

    def __exit__(self, *exception):
        """
        Change the current working-directory to **old_path**.

        Returns nothing, so an exception raised inside the `with` body
        propagates after the directory is restored.

        :param exception: Various exception arguments passed by `with`.

        :type exception: varargs
        """

        os.chdir(self.old_path)

+ 0
- 18
bitshift/database.py View File

@@ -1,18 +0,0 @@
"""
Module with classes and functions to handle communication with the MySQL
database backend, which manages the search index.
"""

import oursql

class Database(object):
    """Represents the MySQL database."""

    def __init__(self):
        """Create the wrapper; currently a placeholder that does nothing."""
        return None

    def _connect(self):
        """Placeholder for opening the database connection; does nothing."""
        return None

    def _create(self):
        """Placeholder for creating the database; does nothing."""
        return None

+ 153
- 0
bitshift/database/__init__.py View File

@@ -0,0 +1,153 @@
"""
Subpackage with classes and functions to handle communication with the MySQL
database backend, which manages the search index.
"""

import os

import mmh3
import oursql

from .migration import VERSION, MIGRATIONS

__all__ = ["Database"]

class Database(object):
"""Represents the MySQL database."""

def __init__(self, migrate=False):
self._conn = self._connect()
self._check_version(migrate)

def _connect(self):
"""Establish a connection to the database."""
root = os.path.dirname(os.path.abspath(__file__))
default_file = os.path.join(root, ".my.cnf")
return oursql.connect(db="bitshift", read_default_file=default_file,
autoping=True, autoreconnect=True)

def _migrate(self, cursor, current):
"""Migrate the database to the latest schema version."""
for version in xrange(current, VERSION):
print "Migrating to %d..." % version + 1
for query in MIGRATIONS[version - 1]:
cursor.execute(query)
cursor.execute("UPDATE version SET version = ?", (version + 1,))

def _check_version(self, migrate):
"""Check the database schema version and respond accordingly.

If the schema is out of date, migrate if *migrate* is True, else raise
an exception.
"""
with self._conn.cursor() as cursor:
cursor.execute("SELECT version FROM version")
version = cursor.fetchone()[0]
if version < VERSION:
if migrate:
self._migrate(cursor, version)
else:
err = "Database schema out of date. " \
"Run `python -m bitshift.database.migration`."
raise RuntimeError(err)

def _get_codelets_from_ids(self, cursor, ids):
"""Return a list of Codelet objects given a list of codelet IDs."""
raise NotImplementedError() ## TODO

def _decompose_url(self, cursor, url):
"""Break up a URL into an origin (with a URL base) and a suffix."""
query = """SELECT origin_id, SUBSTR(?, LENGTH(origin_url_base))
FROM origins
WHERE origin_url_base IS NOT NULL
AND ? LIKE CONCAT(origin_url_base, "%")"""

cursor.execute(query, (url, url))
result = cursor.fetchone()
return result if result else (1, url)

def _insert_symbols(self, cursor, code_id, sym_type, symbols):
"""Insert a list of symbols of a given type into the database."""
sym_types = ["functions", "classes", "variables"]
query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)"
query2 = """INSERT INTO symbol_locations VALUES
(DEFAULT, ?, ?, ?, ?, ?, ?)"""

for (name, decls, uses) in symbols:
cursor.execute(query1, (code_id, sym_types.index(sym_type), name))
sym_id = cursor.lastrowid
params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] +
[tuple([sym_id, 1] + list(loc)) for loc in uses])
cursor.executemany(query2, params)

def close(self):
"""Disconnect from the database."""
self._conn.close()

def search(self, query, page=1):
"""
Search the database for a query and return the *n*\ th page of results.

:param query: The query to search for.
:type query: :py:class:`~.query.tree.Tree`
:param page: The result page to display.
:type page: int

:return: The total number of results, and the *n*\ th page of results.
:rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s)
"""
query1 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp
FROM cache
INNER JOIN cache_data ON cache_id = cdata_cache
WHERE cache_id = ?"""
query2 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)"
query3 = "INSERT INTO cache_data VALUES (?, ?)"

cache_id = mmh3.hash64(str(page) + ":" + query.serialize())[0]

with self._conn.cursor() as cursor:
cursor.execute(query1, (cache_id,))
results = cursor.fetchall()
if results: # Cache hit
num_results = results[0][1] * (10 ** results[0][2])
ids = [res[0] for res in results]
else: # Cache miss
## TODO: build and execute search query
results = cursor.fetchall()
ids = NotImplemented ## TODO: extract ids from results
num_results = NotImplemented ## TODO: num if results else 0
num_exp = max(len(str(num_results)) - 3, 0)
num_results = int(round(num_results, -num_exp))
num_mnt = num_results / (10 ** num_exp)
cursor.execute(query2, (cache_id, num_mnt, num_exp))
cursor.executemany(query3, [(cache_id, c_id) for c_id in ids])
return (num_results, self._get_codelets_from_ids(cursor, ids))

def insert(self, codelet):
    """
    Store a codelet together with its code, symbols, and authors.

    The code row is keyed by a 64-bit hash of the language and the UTF-8
    encoded source text. Symbols are inserted only when the code insert
    reports exactly one affected row; the codelet row and its author rows
    are inserted on every call.

    :param codelet: The codelet to insert.
    :type codelet: :py:class:`.Codelet`
    """
    insert_code = ("INSERT INTO code VALUES (?, ?, ?)\n"
                   "ON DUPLICATE KEY UPDATE code_id=code_id")
    insert_codelet = ("INSERT INTO codelets VALUES\n"
                      "(DEFAULT, ?, ?, ?, ?, ?, ?, ?)")
    insert_author = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)"

    language_tag = str(codelet.language)
    hash_key = language_tag + ":" + codelet.code.encode("utf8")
    code_id = mmh3.hash64(hash_key)[0]

    with self._conn.cursor() as cursor:
        code_row = (code_id, codelet.language, codelet.code)
        cursor.execute(insert_code, code_row)
        is_new_code = cursor.rowcount == 1
        if is_new_code:
            for sym_type, symbols in codelet.symbols.iteritems():
                self._insert_symbols(cursor, code_id, sym_type, symbols)

        origin, url = self._decompose_url(cursor, codelet.url)
        codelet_row = (codelet.name, code_id, origin, url, codelet.rank,
                       codelet.date_created, codelet.date_modified)
        cursor.execute(insert_codelet, codelet_row)

        codelet_id = cursor.lastrowid
        author_rows = []
        for author in codelet.authors:
            author_rows.append((codelet_id, author[0], author[1]))
        cursor.executemany(insert_author, author_rows)

+ 97
- 0
bitshift/database/migration.py View File

@@ -0,0 +1,97 @@
"""
Contains information about database schema versions, and SQL queries to update
between them.
"""

# Current schema version; matches the value inserted into the `version` table
# by schema.sql.
VERSION = 6

# One entry per schema upgrade, in order. The entry at index i is the list of
# SQL statements that takes a database from version i + 1 to version i + 2, so
# there is always one fewer entry than VERSION.
MIGRATIONS = [
# 1 -> 2
[
# The foreign key is dropped first so that `code_id` can change type, then
# re-created once both columns are BIGINT.
"""ALTER TABLE `codelets`
DROP FOREIGN KEY `codelets_ibfk_1`""",
"""ALTER TABLE `code`
DROP KEY `code_hash`,
DROP COLUMN `code_hash`,
MODIFY COLUMN `code_id` BIGINT NOT NULL""",
"""ALTER TABLE `codelets`
MODIFY COLUMN `codelet_code_id` BIGINT NOT NULL,
ADD KEY (`codelet_lang`),
ADD CONSTRAINT `codelets_ibfk_1` FOREIGN KEY (`codelet_code_id`)
REFERENCES `code` (`code_id`)
ON DELETE RESTRICT ON UPDATE CASCADE""",
"""ALTER TABLE `symbols`
ADD COLUMN `symbol_end_row` INT UNSIGNED NOT NULL,
ADD COLUMN `symbol_end_col` INT UNSIGNED NOT NULL"""
],
# 2 -> 3
[
# Symbols now belong to a `code` row rather than to a codelet.
"""ALTER TABLE `symbols`
DROP FOREIGN KEY `symbols_ibfk_1`,
CHANGE COLUMN `symbol_codelet` `symbol_code` BIGINT NOT NULL,
ADD CONSTRAINT `symbols_ibfk_1` FOREIGN KEY (`symbol_code`)
REFERENCES `code` (`code_id`)
ON DELETE CASCADE ON UPDATE CASCADE"""
],
# 3 -> 4
[
# Symbol positions move out of `symbols` into their own table.
"""ALTER TABLE `symbols`
DROP COLUMN `symbol_row`,
DROP COLUMN `symbol_col`,
DROP COLUMN `symbol_end_row`,
DROP COLUMN `symbol_end_col`""",
"""CREATE TABLE `symbol_locations` (
`sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`sloc_symbol` BIGINT UNSIGNED NOT NULL,
`sloc_type` TINYINT UNSIGNED NOT NULL,
`sloc_row` INT UNSIGNED NOT NULL,
`sloc_col` INT UNSIGNED NOT NULL,
`sloc_end_row` INT UNSIGNED NOT NULL,
`sloc_end_col` INT UNSIGNED NOT NULL,
PRIMARY KEY (`sloc_id`),
FOREIGN KEY (`sloc_symbol`)
REFERENCES `symbols` (`symbol_id`)
ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB"""
],
# 4 -> 5
[
"""ALTER TABLE `origins`
MODIFY COLUMN `origin_name` VARCHAR(64) DEFAULT NULL,
MODIFY COLUMN `origin_url` VARCHAR(512) DEFAULT NULL,
MODIFY COLUMN `origin_url_base` VARCHAR(512) DEFAULT NULL"""
],
# 5 -> 6
[
"""ALTER TABLE `code`
ADD COLUMN `code_lang` SMALLINT UNSIGNED DEFAULT NULL
AFTER `code_id`,
ADD KEY (`code_lang`)""",
"""ALTER TABLE `codelets`
DROP KEY `codelet_lang`,
DROP COLUMN `codelet_lang`""",
# The `cdata_cache` -> `cache` foreign key has to go before `cache_id` can
# change type; it is restored in the last ALTER TABLE of this step.
# NOTE(review): assumes `cache_data_ibfk_1` is the auto-generated name of the
# `cdata_cache` key (it is the first foreign key declared on the table).
"""ALTER TABLE `cache_data`
DROP FOREIGN KEY `cache_data_ibfk_1`""",
"""ALTER TABLE `cache`
MODIFY COLUMN `cache_id` BIGINT NOT NULL,
DROP COLUMN `cache_hash`,
DROP COLUMN `cache_last_used`,
MODIFY COLUMN `cache_count_mnt` SMALLINT UNSIGNED NOT NULL""",
# Restore the foreign key dropped above on the column it was dropped from
# (`cdata_cache`), so that a migrated database matches schema.sql and rows in
# `cache_data` are removed when their `cache` row is deleted.
"""ALTER TABLE `cache_data`
MODIFY COLUMN `cdata_cache` BIGINT NOT NULL,
ADD PRIMARY KEY (`cdata_cache`, `cdata_codelet`),
ADD CONSTRAINT `cache_data_ibfk_1` FOREIGN KEY (`cdata_cache`)
REFERENCES `cache` (`cache_id`)
ON DELETE CASCADE ON UPDATE CASCADE""",
"""CREATE EVENT `flush_cache`
ON SCHEDULE EVERY 1 HOUR
DO
DELETE FROM `cache`
WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);"""
]
]

# Script entry point: opens the database with migrate=True and closes it again
# straight away (presumably this applies any pending MIGRATIONS — confirm
# against Database.__init__).
# NOTE(review): the relative import below only resolves when this module is run
# as part of its package, e.g. `python -m bitshift.database.migration`, not as
# a bare script path — confirm how this is meant to be invoked.
if __name__ == "__main__":
from . import Database

Database(migrate=True).close()

+ 114
- 0
bitshift/database/schema.sql View File

@@ -0,0 +1,114 @@
-- Schema version 6
-- The number above, the row inserted into `version` below, and VERSION in
-- migration.py all describe the same schema version.

CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci;
USE `bitshift`;

-- Holds the schema version this database was created at or migrated to.
CREATE TABLE `version` (
`version` INT UNSIGNED NOT NULL
) ENGINE=InnoDB;
INSERT INTO `version` VALUES (6);

-- Sources that codelets come from; referenced by `codelets`.`codelet_origin`.
CREATE TABLE `origins` (
`origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT,
`origin_name` VARCHAR(64) DEFAULT NULL,
`origin_url` VARCHAR(512) DEFAULT NULL,
`origin_url_base` VARCHAR(512) DEFAULT NULL,
`origin_image` BLOB DEFAULT NULL,
PRIMARY KEY (`origin_id`)
) ENGINE=InnoDB;
-- Origin 1 has no name, URL or image; presumably the fallback for codelets
-- whose URL matches no known origin -- confirm against the URL-decomposition
-- code in the database module.
INSERT INTO `origins` VALUES (1, NULL, NULL, NULL, NULL);

-- Source text, stored once per (language, code) pair.
-- `code_id` is not auto-generated: the application supplies a 64-bit hash
-- (mmh3.hash64) of "<language>:<code>", hence the plain BIGINT key.
CREATE TABLE `code` (
`code_id` BIGINT NOT NULL,
`code_lang` SMALLINT UNSIGNED DEFAULT NULL,
`code_code` MEDIUMTEXT NOT NULL,
PRIMARY KEY (`code_id`),
KEY (`code_lang`),
FULLTEXT KEY (`code_code`)
) ENGINE=InnoDB;

-- One row per indexed codelet; several codelets may point at the same `code`
-- row. A `code` or `origins` row cannot be deleted while a codelet still
-- references it (ON DELETE RESTRICT).
CREATE TABLE `codelets` (
`codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`codelet_name` VARCHAR(300) NOT NULL,
`codelet_code_id` BIGINT NOT NULL,
`codelet_origin` TINYINT UNSIGNED NOT NULL,
`codelet_url` VARCHAR(512) NOT NULL,
`codelet_rank` FLOAT NOT NULL,
`codelet_date_created` DATETIME DEFAULT NULL,
`codelet_date_modified` DATETIME DEFAULT NULL,
PRIMARY KEY (`codelet_id`),
FULLTEXT KEY (`codelet_name`),
KEY (`codelet_rank`),
KEY (`codelet_date_created`),
KEY (`codelet_date_modified`),
FOREIGN KEY (`codelet_code_id`)
REFERENCES `code` (`code_id`)
ON DELETE RESTRICT ON UPDATE CASCADE,
FOREIGN KEY (`codelet_origin`)
REFERENCES `origins` (`origin_id`)
ON DELETE RESTRICT ON UPDATE CASCADE
) ENGINE=InnoDB;

-- Authors of a codelet; rows are deleted together with their codelet.
CREATE TABLE `authors` (
`author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`author_codelet` BIGINT UNSIGNED NOT NULL,
`author_name` VARCHAR(128) NOT NULL,
`author_url` VARCHAR(512) DEFAULT NULL,
PRIMARY KEY (`author_id`),
FULLTEXT KEY (`author_name`),
FOREIGN KEY (`author_codelet`)
REFERENCES `codelets` (`codelet_id`)
ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;

-- Named symbols found in a `code` row.
-- `symbol_type` is written by the application as an index into
-- ["functions", "classes", "variables"], i.e. 0, 1 or 2.
-- Only the first 32 characters of `symbol_name` are indexed.
CREATE TABLE `symbols` (
`symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`symbol_code` BIGINT NOT NULL,
`symbol_type` TINYINT UNSIGNED NOT NULL,
`symbol_name` VARCHAR(512) NOT NULL,
PRIMARY KEY (`symbol_id`),
KEY (`symbol_type`, `symbol_name`(32)),
FOREIGN KEY (`symbol_code`)
REFERENCES `code` (`code_id`)
ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;

-- Start/end positions at which a symbol occurs.
-- `sloc_type` is written by the application as 0 for a declaration and 1 for
-- a use.
CREATE TABLE `symbol_locations` (
`sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`sloc_symbol` BIGINT UNSIGNED NOT NULL,
`sloc_type` TINYINT UNSIGNED NOT NULL,
`sloc_row` INT UNSIGNED NOT NULL,
`sloc_col` INT UNSIGNED NOT NULL,
`sloc_end_row` INT UNSIGNED NOT NULL,
`sloc_end_col` INT UNSIGNED NOT NULL,
PRIMARY KEY (`sloc_id`),
FOREIGN KEY (`sloc_symbol`)
REFERENCES `symbols` (`symbol_id`)
ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;

-- Cached search results, one row per (query, page).
-- `cache_id` is supplied by the application: a 64-bit hash (mmh3.hash64) of
-- "<page>:<serialized query>".
-- The total result count is stored approximately, as
-- cache_count_mnt * 10^cache_count_exp.
CREATE TABLE `cache` (
`cache_id` BIGINT NOT NULL,
`cache_count_mnt` SMALLINT UNSIGNED NOT NULL,
`cache_count_exp` TINYINT UNSIGNED NOT NULL,
`cache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (`cache_id`)
) ENGINE=InnoDB;

-- The codelets that make up a cached result page. Rows disappear with their
-- `cache` entry or their codelet (ON DELETE CASCADE on both keys).
CREATE TABLE `cache_data` (
`cdata_cache` BIGINT NOT NULL,
`cdata_codelet` BIGINT UNSIGNED NOT NULL,
PRIMARY KEY (`cdata_cache`, `cdata_codelet`),
FOREIGN KEY (`cdata_cache`)
REFERENCES `cache` (`cache_id`)
ON DELETE CASCADE ON UPDATE CASCADE,
FOREIGN KEY (`cdata_codelet`)
REFERENCES `codelets` (`codelet_id`)
ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;

-- Once an hour, drop cache entries created more than a day ago.
-- NOTE: MySQL only runs events while its event scheduler is enabled.
CREATE EVENT `flush_cache`
ON SCHEDULE EVERY 1 HOUR
DO
DELETE FROM `cache`
WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);

+ 2
- 0
bitshift/query/__init__.py View File

@@ -22,4 +22,6 @@ def parse_query(query):


# gets a string, returns a Tree
# TODO: note: resultant Trees should be normalized so that "foo OR bar"
# and "bar OR foo" result in equivalent trees
pass

+ 11
- 0
docs/source/api/bitshift.query.rst View File

@@ -0,0 +1,11 @@
query Package
=============

:mod:`query` Package
--------------------

.. automodule:: bitshift.query
:members:
:undoc-members:
:show-inheritance:


+ 32
- 7
docs/source/api/bitshift.rst View File

@@ -1,30 +1,51 @@
bitshift package
bitshift Package
================

Submodules
:mod:`bitshift` Package
-----------------------

bitshift.assets module
.. automodule:: bitshift.__init__
:members:
:undoc-members:
:show-inheritance:

:mod:`assets` Module
--------------------

.. automodule:: bitshift.assets
:members:
:undoc-members:
:show-inheritance:

bitshift.config module
:mod:`codelet` Module
---------------------

.. automodule:: bitshift.config
.. automodule:: bitshift.codelet
:members:
:undoc-members:
:show-inheritance:

:mod:`config` Module
--------------------

Module contents
.. automodule:: bitshift.config
:members:
:undoc-members:
:show-inheritance:

:mod:`database` Module
----------------------

.. automodule:: bitshift
.. automodule:: bitshift.database
:members:
:undoc-members:
:show-inheritance:

Subpackages
-----------

.. toctree::

bitshift.parser
bitshift.query


+ 3
- 1
setup.py View File

@@ -4,7 +4,9 @@ setup(
name = "bitshift",
version = "0.1",
packages = find_packages(),
install_requires = ["Flask>=0.10.1", "pygments>=1.6"],
install_requires = [
"Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0",
"beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
license = "MIT",
url = "https://github.com/earwig/bitshift"


Loading…
Cancel
Save