Merge branch 'develop' into feature/query_parser

10 years ago · c4816c2bb8
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 .sass-cache
 .DS_Store
 .my.cnf

 # github premade rules
 *.py[cod]
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
 bitshift
 ========

 bitshift is a semantic search engine for source code.
 bitshift is a semantic search engine for source code developed by Benjamin
 Attal, Ben Kurtovic, and Severyn Kozak.

 Branches
 --------
@@ -13,6 +14,11 @@ Branches
 - `feature/*`: individual components of the project with untested, likely
  horribly broken code - branch off from and merge into `develop` when done

 Style
 -----
 bitshift uses [SASS][SASS] for styling; compile the stylesheets to CSS with
 `sass --watch static/sass/:static/css`.

 Documentation
 -------------

@@ -24,3 +30,5 @@ new modules or packages, but *not* when adding functions or changing
 docstrings), run `sphinx-apidoc -fo docs/source/api bitshift` from the project
 root. Note that this will revert any custom changes made to the files in
 `docs/source/api`, so you might want to update them by hand instead.

 [SASS]: http://sass-lang.com/guide
--- a/app.py
+++ b/app.py
@@ -5,6 +5,8 @@ Module to contain all the project's Flask server plumbing.
 from flask import Flask
 from flask import render_template, session

 from bitshift import assets
 from bitshift.database import Database
 from bitshift.query import parse_query

 app = Flask(__name__)
@@ -12,7 +14,9 @@ app.config.from_object("bitshift.config")

 app_env = app.jinja_env
 app_env.line_statement_prefix = "="
 app_env.globals.update(assets = assets)
 app_env.globals.update(assets=assets)

 database = Database()

@app.route("/")
 def index():
@@ -20,8 +24,8 @@ def index():

@app.route("/search/<query>")
 def search(query):
    """
    Parse the query taken from the URL and run it against the database.

    :param query: The text captured by the `<query>` part of the
        "/search/<query>" route.

    NOTE(review): the function ends with `pass`, so it returns None and the
    value returned by `database.search()` is discarded -- presumably a
    response still has to be rendered here; confirm.
    """
    ## tree = parse_query(query)
    ## database.search(tree)
    tree = parse_query(query)
    database.search(tree)
    pass

 if __name__ == "__main__":
--- a/bitshift/init.py
+++ b/bitshift/init.py
@@ -1 +1 @@
 from . import assets, codelet, config, database, parser, query
 from . import assets, codelet, config, database, parser, query, crawler
--- a/bitshift/assets.py
+++ b/bitshift/assets.py
@@ -1,6 +1,5 @@
 """
 .. module:: assets
   :synopsis: Helper functions for use inside the project's Jinja templates.
 :synopsis: Helper functions for use inside the project's Jinja templates.
 """

 from flask import Markup
@@ -16,8 +15,11 @@ def tag(filename):

    :param filename: The filename of the asset to create a tag for.

    :type filename: str

    :return: A string containing a `<source>` tag for JS files, and a `<link>`
        for CSS files.
    :rtype: str
    """

    file_ext = filename.split(".")[-1]
--- a/bitshift/codelet.py
+++ b/bitshift/codelet.py
@@ -1,13 +1,57 @@
 __all__ = ["Codelet"]

 class Codelet(object):
    """
    A piece of source code together with the metadata describing it.

    The class is a plain value holder: it stores what it is given and performs
    no processing of its own.

    :ivar name: (str) A suitable name for the codelet.
    :ivar code: (str) A string containing the raw source code.
    :ivar filename: (str, or None) The filename of the snippet.
    :ivar language: (int, or None) The inferred language of `code`.
    :ivar authors: (array of tuples (str, str or None)) An array of tuples
        containing an author's name and profile URL (on the service the code
        was pulled from).
    :ivar code_url: (str) The url of the (page containing the) source code.
    :ivar date_created: (:class:`datetime.datetime`, or None) The date the code
        was published.
    :ivar date_modified: (:class:`datetime.datetime`, or None) The date the
        code was last modified.
    :ivar rank: (float) A quantification of the source code's quality, as
        per available ratings (stars, forks, upvotes, etc.).
    """

    ## Association data (not stored by this class): a dictionary mapping
    ## strings representing an association type to dictionaries mapping
    ## association names to (row, column) tuples giving their place in the
    ## file, e.g. {"functions": {"foo": (12, 13), "bar": (53, 3)}}

    def __init__(self, name, code, filename, language, authors, code_url,
            date_created, date_modified, rank):
        """
        Create a Codelet instance.

        Every argument is stored, unmodified, on the attribute of the same
        name; see the class docstring for the meaning and type of each.

        :param name: see :attr:`self.name`
        :param code: see :attr:`self.code`
        :param filename: see :attr:`self.filename`
        :param language: see :attr:`self.language`
        :param authors: see :attr:`self.authors`
        :param code_url: see :attr:`self.code_url`
        :param date_created: see :attr:`self.date_created`
        :param date_modified: see :attr:`self.date_modified`
        :param rank: see :attr:`self.rank`
        """

        # The code itself and how it is identified.
        self.name, self.code = name, code
        self.filename, self.language = filename, language

        # Where the code came from.
        self.authors, self.code_url = authors, code_url

        # When it was written, and how highly it is rated.
        self.date_created, self.date_modified = date_created, date_modified
        self.rank = rank
--- a/bitshift/crawler/init.py
+++ b/bitshift/crawler/init.py
@@ -0,0 +1,55 @@
 """
 :synopsis: Parent crawler module, which supervises all crawlers.

 Contains functions for initializing all subsidiary, threaded crawlers.
 """

 import logging, logging.handlers, os, Queue

 from bitshift.crawler import crawler, indexer

 __all__ = ["crawl"]

 def crawl():
    """
    Configure logging, then create and start every crawler and indexer.

    All workers share one bounded :class:`Queue.Queue`. They are created, and
    afterwards started, in this order:

    1. GitHub crawler, :class:`crawler.GitHubCrawler`.
    2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
    3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
    """

    _configure_logging()

    MAX_URL_QUEUE_SIZE = 5e3
    shared_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)

    # Build every worker before starting any of them.
    worker_classes = (crawler.GitHubCrawler, crawler.BitbucketCrawler,
            indexer.GitIndexer)
    workers = [worker_class(shared_queue) for worker_class in worker_classes]

    for worker in workers:
        worker.start()

 def _configure_logging():
    """
    Send all log records to an hourly-rotated file, "log/app.log".

    Creates the "log" directory when it is missing, raises the level of the
    "requests" and "urllib3" loggers to WARNING, and attaches a
    :class:`logging.handlers.TimedRotatingFileHandler` (rotating every hour,
    keeping 20 backups) to the root logger, whose level is set to NOTSET.
    """

    LOG_FILE_DIR = "log"
    log_path = "%s/%s" % (LOG_FILE_DIR, "app.log")
    line_format = ("%(asctime)s %(levelname)s %(name)s %(funcName)s"
            " %(message)s")

    if not os.path.exists(LOG_FILE_DIR):
        os.mkdir(LOG_FILE_DIR)

    for library_logger in ("requests", "urllib3"):
        logging.getLogger(library_logger).setLevel(logging.WARNING)

    file_handler = logging.handlers.TimedRotatingFileHandler(
            log_path, when="H", interval=1, backupCount=20)
    file_handler.setFormatter(logging.Formatter(
            fmt=line_format, datefmt="%y-%m-%d %H:%M:%S"))

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.NOTSET)
    root_logger.addHandler(file_handler)
--- a/bitshift/crawler/crawler.py
+++ b/bitshift/crawler/crawler.py
@@ -0,0 +1,240 @@
 """
 :synopsis: Main crawler module, to oversee all site-specific crawlers.

 Contains all website/framework-specific Class crawlers.
 """

 import logging, requests, time, threading

 from bitshift.crawler import indexer

 from ..codelet import Codelet
 from ..database import Database

 class GitHubCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of GitHub's public repositories.

    GitHubCrawler is a thread that queries GitHub's API for urls to its public
    repositories, which it inserts into a :class:`Queue.Queue` shared with
    :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) Contains
        :class:`indexer.GitRepository` objects with repository metadata
        retrieved by :class:`GitHubCrawler`, and other Git crawlers, to be
        processed by :class:`indexer.GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    # NOTE(review): these API credentials are committed in plain text. They
    # should be rotated and loaded from configuration or the environment; the
    # values are left untouched here so that behavior does not change.
    AUTHENTICATION = {
        "client_id" : "436cb884ae09be7f2a4e",
        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
    }

    def __init__(self, clone_queue):
        """
        Create an instance of `GitHubCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the GitHub API for data about every public repository.

        Pull all of GitHub's repositories by making calls to its API in a loop,
        accessing a subsequent page of results via the "next" URL returned in
        the "Link" API response header, and stopping once no such URL is
        returned. Uses Severyn Kozak's (sevko) authentication credentials. For
        every new repository, a :class:`indexer.GitRepository` is inserted into
        :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.github.com/repositories"
        # NOTE(review): 5e3 / 60 ** 2 is a rate (requests per second) although
        # it is used as a delay in seconds; the intended value may be
        # 60 ** 2 / 5e3. Left unchanged because it is the more conservative of
        # the two -- confirm.
        api_request_interval = 5e3 / 60 ** 2

        while next_api_url:
            start_time = time.time()

            try:
                resp = requests.get(next_api_url, params=self.AUTHENTICATION)
            except requests.exceptions.ConnectionError as excep:
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                time.sleep(0.5)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%.",
                    self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full)

            # Decode the response body once and reuse it.
            repos = resp.json()
            repo_names = [repo["full_name"] for repo in repos]
            repo_stars = self._get_repositories_stars(repo_names)

            for repo in repos:
                # `put()` blocks while the queue is full.
                self.clone_queue.put(indexer.GitRepository(
                        repo["html_url"], repo["full_name"].replace("/", ""),
                        "GitHub", repo_stars[repo["full_name"]]))

            self._wait_for_quota(resp.headers)

            next_api_url = self._next_page_url(resp.headers.get("link"))

            sleep_time = api_request_interval - (time.time() - start_time)
            if sleep_time > 0:
                time.sleep(sleep_time)

    @staticmethod
    def _next_page_url(link_header):
        """
        Return the `rel="next"` URL of a "Link" header value.

        :param link_header: The raw header value, in
            `<url>; rel="next", <url>; rel="first"` form, or None.

        :type link_header: str, or None

        :return: The URL of the next page, or None if the header is missing
            or contains no "next" link.
        :rtype: str, or None
        """

        for link in (link_header or "").split(","):
            segments = link.split(";")
            url = segments[0].strip()
            if not (url.startswith("<") and url.endswith(">")):
                continue

            for param in segments[1:]:
                if param.replace(" ", "") in ('rel="next"', "rel=next"):
                    return url[1:-1]

        return None

    def _wait_for_quota(self, headers):
        """
        Sleep until the API quota resets, if a response reports it is used up.

        :param headers: The headers of the latest API response.

        :type headers: dictionary-like
        """

        if int(headers.get("x-ratelimit-remaining", 1)) != 0:
            return

        # One extra second so that the wake-up does not precede the reset.
        sleep_time = int(headers.get("x-ratelimit-reset", 0)) - \
                time.time() + 1
        # The reset time may already be in the past; never sleep a negative
        # amount of time.
        if sleep_time > 0:
            self._logger.info("API quota exceeded. Sleep time: %d.",
                    sleep_time)
            time.sleep(sleep_time)

    def _get_repositories_stars(self, repo_names):
        """
        Return the rank, derived from stargazers, of several repositories.

        Queries the GitHub search API for the number of stargazers of the
        given repositories, 25 repositories per request, and blocks if the
        query limit is exceeded. The rank of a repository is its stargazer
        count divided by 1000, capped at 1.0; repositories for which no count
        could be retrieved are given a rank of 0.5.

        :param repo_names: An array of repository names, in
            `username/repository_name` format.

        :type repo_names: array of str

        :return: A dictionary with repository name keys, and corresponding
            rank values.

            Example dictionary:
            .. code-block:: python
                {
                    "user/repository" : 0.1
                }

        :rtype: dictionary
        """

        API_URL = "https://api.github.com/search/repositories"
        REPOS_PER_QUERY = 25

        repo_stars = {}
        for start in range(0, len(repo_names), REPOS_PER_QUERY):
            names = repo_names[start:start + REPOS_PER_QUERY]
            query_url = "%s?q=%s" % (API_URL,
                "+".join("repo:%s" % name for name in names))

            try:
                resp = requests.get(query_url,
                        params=self.AUTHENTICATION,
                        headers={
                            "Accept" : "application/vnd.github.preview"
                        })
            except requests.exceptions.ConnectionError as excep:
                # The affected repositories receive the default rank below.
                self._logger.warning("API %s call failed: %s: %s",
                        query_url, excep.__class__.__name__, excep)
                continue

            self._wait_for_quota(resp.headers)

            # An error payload carries no "items" key.
            for repo in resp.json().get("items", []):
                rank = float(repo["stargazers_count"]) / 1000
                repo_stars[repo["full_name"]] = min(rank, 1.0)

        for name in repo_names:
            if name not in repo_stars:
                repo_stars[name] = 0.5

        return repo_stars

 class BitbucketCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of Bitbucket's public repositories.

    BitbucketCrawler is a thread that queries Bitbucket's API for urls to its
    public repositories, and inserts them as :class:`indexer.GitRepository`
    into a :class:`Queue.Queue` shared with :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
        :class:`indexer.GitRepository` repository urls into.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of `BitbucketCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the Bitbucket API for data about every public repository.

        Query the Bitbucket API's "/repositories" endpoint and read its
        paginated responses in a loop, until a response carries no "next"
        page; any "git" repositories have their clone-urls, names and ranks
        inserted as a :class:`indexer.GitRepository` into
        :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.bitbucket.org/2.0/repositories"

        while next_api_url:
            try:
                response = requests.get(next_api_url).json()
            except requests.exceptions.ConnectionError as excep:
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                time.sleep(0.5)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%.",
                    self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full)

            for repo in response.get("values", []):
                if repo["scm"] != "git":
                    continue

                clone_url = self._https_clone_url(repo["links"]["clone"])
                if clone_url is None:
                    continue

                watchers_url = repo["links"]["watchers"]["href"]
                try:
                    watchers = requests.get(watchers_url)
                    # Float division: with integer division every repository
                    # having fewer than 100 watchers would be ranked 0.
                    # NOTE(review): only the watchers listed in this single
                    # response are counted -- confirm whether the endpoint
                    # is paginated.
                    rank = len(watchers.json()["values"]) / 100.0
                except requests.exceptions.ConnectionError as excep:
                    self._logger.warning("API %s call failed: %s: %s",
                            watchers_url, excep.__class__.__name__, excep)
                    time.sleep(0.5)
                    continue

                # `put()` blocks while the queue is full.
                self.clone_queue.put(indexer.GitRepository(
                        clone_url, repo["full_name"], "Bitbucket",
                        min(rank, 1.0)))

            # The last page carries no "next" key, which ends the loop.
            next_api_url = response.get("next")
            time.sleep(0.2)

    @staticmethod
    def _https_clone_url(clone_links):
        """
        Return the "https" clone URL among a repository's clone links.

        :param clone_links: The `links.clone` array of a repository, whose
            items are dictionaries with "name" and "href" keys.

        :type clone_links: array of dictionaries

        :return: The "href" of the link named "https"; if there is none, the
            "href" of the last link; None if the array is empty.
        :rtype: str, or None
        """

        if not clone_links:
            return None

        for link in clone_links:
            if link["name"] == "https":
                return link["href"]

        return clone_links[-1]["href"]
--- a/bitshift/crawler/indexer.py
+++ b/bitshift/crawler/indexer.py
@@ -0,0 +1,489 @@
 """
 :synopsis: Contains a singleton GitIndexer class, which clones and indexes git
    repositories.
 """

 import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\
        threading

 from ..database import Database
 from ..codelet import Codelet

 GIT_CLONE_DIR = "/tmp/bitshift"
 THREAD_QUEUE_SLEEP = 0.5

 class GitRepository(object):
    """
    The metadata describing one Git repository.

    :ivar url: (str) The repository's url.
    :ivar name: (str) The name of the repository.
    :ivar framework_name: (str) The name of the online Git framework that the
        repository belongs to (eg, GitHub, BitBucket).
    :ivar rank: (float) The rank of the repository, as assigned by the
        crawler that found it.
    """

    def __init__(self, url, name, framework_name, rank):
        """
        Create a GitRepository instance.

        Each argument is stored, unmodified, on the attribute of the same
        name.

        :param url: see :attr:`GitRepository.url`
        :param name: see :attr:`GitRepository.name`
        :param framework_name: see :attr:`GitRepository.framework_name`
        :param rank: see :attr:`GitRepository.rank`

        :type url: str
        :type name: str
        :type framework_name: str
        :type rank: float
        """

        self.url, self.name = url, name
        self.framework_name, self.rank = framework_name, rank

 class GitIndexer(threading.Thread):
    """
    A Git repository indexer.

    :class:`GitIndexer` indexes the repositories cloned by its
    :class:`_GitCloner`.

    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
        :class:`GitRepository` objects for every new repository successfully
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner,
        which feeds :class:`GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    # Byte values counted as text by `_is_ascii()`: printable ASCII (32-126)
    # plus backspace (8), tab (9), line feed (10) and carriage return (13).
    _TEXT_BYTES = frozenset(list(range(32, 127)) + [8, 9, 10, 13])

    def __init__(self, clone_queue):
        """
        Create an instance of `GitIndexer`, and start its `_GitCloner`.

        :param clone_queue: The queue of crawled repositories, handed to the
            :class:`_GitCloner` that feeds :attr:`self.index_queue`.

        :type clone_queue: :class:`Queue.Queue`
        """

        MAX_INDEX_QUEUE_SIZE = 10

        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")

        # Make sure the clone directory exists before the cloner thread, which
        # writes into it, is started.
        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)

        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        self.git_cloner = _GitCloner(clone_queue, self.index_queue)
        self.git_cloner.start()

        super(GitIndexer, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly cloned repositories and index them.

        Blocks until new repositories appear in :attr:`self.index_queue`, then
        retrieves one, and attempts indexing it. Should any errors occur, the
        new repository will be discarded and the indexer will index the next in
        the queue.
        """

        while True:
            # `get()` blocks until a repository is available.
            repo = self.index_queue.get()
            try:
                self._index_repository(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__, excep)
            finally:
                self.index_queue.task_done()

    def _index_repository(self, repo):
        """
        Index (create Codelets for) a cloned Git repository, then delete it.

        Enter the directory the repository was cloned into, call
        `_insert_repository_codelets()`, then remove the directory whether or
        not indexing succeeded. Errors are logged, not raised.

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        repo_dir = "%s/%s" % (GIT_CLONE_DIR, repo.name)

        try:
            with _ChangeDir(repo_dir):
                self._insert_repository_codelets(repo)
        except Exception as excep:
            self._logger.warning("%s: %s.", excep.__class__.__name__, excep)
        finally:
            if os.path.isdir(repo_dir):
                shutil.rmtree(repo_dir)

    def _insert_repository_codelets(self, repo):
        """
        Create a Codelet for the files inside a Git repository.

        Create a new Codelet for every text file, found in the commit history,
        that is present in the current working directory. Files that cannot be
        read or decoded are skipped.

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        commits_meta = self._get_commits_metadata()
        if commits_meta is None:
            return

        for filename, file_meta in commits_meta.items():
            try:
                with open(filename) as source_file:
                    source = self._decode(source_file.read())
            except IOError:
                continue

            if source is None:
                continue

            authors = [(self._decode(author), None) for author in
                    file_meta["authors"]]
            # NOTE(review): the codelet is built but never stored anywhere;
            # presumably insertion into the database is still to be added.
            codelet = Codelet("%s:%s" % (repo.name, filename), source, filename,
                            None, authors, self._generate_file_url(filename,
                                    repo.url, repo.framework_name),
                            file_meta["time_created"],
                            file_meta["time_last_modified"],
                            repo.rank)

    @staticmethod
    def _join_url(base, *segments):
        """
        Join a base url and path segments with exactly one "/" between them.

        Only the slashes at the joints are normalized, so the "//" following
        the url's scheme is left intact.

        :param base: The url to append to.
        :param segments: The path segments to append, in order.

        :type base: str
        :type segments: varargs of str

        :return: The joined url.
        :rtype: str
        """

        parts = [base.rstrip("/")]
        parts.extend(segment.strip("/") for segment in segments)
        return "/".join(parts)

    def _generate_file_url(self, filename, repo_url, framework_name):
        """
        Return a url for a filename from a Git wrapper framework.

        :param filename: The path of the file.
        :param repo_url: The url of the file's parent repository.
        :param framework_name: The name of the framework the repository is from.

        :type filename: str
        :type repo_url: str
        :type framework_name: str

        :return: The file's full url on the given framework, if successfully
            derived.
        :rtype: str, or None

        .. warning::
            Various Git subprocesses will occasionally fail, and, seeing as the
            information they provide is a crucial component of some repository file
            urls, None may be returned. None is also returned for a framework
            other than "GitHub" and "Bitbucket".
        """

        try:
            if framework_name == "GitHub":
                # The output is sliced as "* <branch>\n": the two-character
                # marker and the trailing newline are dropped.
                default_branch = subprocess.check_output("git branch"
                        " --no-color", shell=True)[2:-1]
                return self._join_url(repo_url, "blob", default_branch,
                        filename)
            elif framework_name == "Bitbucket":
                commit_hash = subprocess.check_output("git rev-parse HEAD",
                        shell=True).replace("\n", "")
                return self._join_url(repo_url, "src", commit_hash, filename)
        except subprocess.CalledProcessError:
            return None

        return None

    def _get_git_commits(self):
        """
        Return the current working directory's formatted commit data.

        Uses `git log` to generate metadata about every single file in the
        repository's commit history.

        :return: The author, timestamp, and names of all modified files of every
            commit.
            .. code-block:: python
               sample_returned_array = [
                   {
                       "author" : (str) "author"
                       "timestamp" : (`datetime.datetime`) <object>,
                       "filenames" : (str array) ["file1", "file2"]
                   }
               ]
        :rtype: array of dictionaries
        """

        git_log = subprocess.check_output(("git --no-pager log --name-only"
                " --pretty=format:'%n%n%an%n%at' -z"), shell=True)

        commits = []
        # Records are separated by the blank lines that the leading "%n%n" of
        # the format emits; each holds the author, the timestamp, and a
        # NUL-separated (`-z`) list of filenames.
        for commit in git_log.split("\n\n"):
            fields = commit.split("\n")
            if len(fields) > 2:
                commits.append({
                    "author" : fields[0],
                    "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])),
                    "filenames" : fields[2].split("\x00")[:-2]
                })

        return commits

    def _get_tracked_files(self):
        """
        Return a list of the filenames of all text files in the repository.

        Walk the current working directory and collect the paths, relative to
        it, of every file that `_is_ascii()` judges to be text.

        :return: The filenames of all non-binary files.
        :rtype: str array
        """

        files = []
        for dirname, subdir_names, filenames in os.walk("."):
            # Git's own bookkeeping files never appear in the commit history
            # that the result is matched against, so don't scan them.
            if ".git" in subdir_names:
                subdir_names.remove(".git")

            for filename in filenames:
                path = os.path.join(dirname, filename)
                if self._is_ascii(path):
                    # Strip the leading "./".
                    files.append(path[2:])

        return files

    def _get_commits_metadata(self):
        """
        Return a dictionary containing every valuable tracked file's metadata.

        :return: A dictionary with author names, time of creation, and time of last
            modification for every filename key.
            .. code-block:: python
                   sample_returned_dict = {
                       "my_file" : {
                           "authors" : (str array) ["author1", "author2"],
                           "time_created" : (`datetime.datetime`) <object>,
                           "time_last_modified" : (`datetime.datetime`) <object>
                       }
                   }
        :rtype: dictionary of dictionaries
        """

        commits = self._get_git_commits()
        # A set makes the per-file membership test below constant-time.
        tracked_files = set(self._get_tracked_files())

        files_meta = {}
        for commit in commits:
            for filename in commit["filenames"]:
                if filename not in tracked_files:
                    continue

                if filename not in files_meta:
                    # The first commit seen for a file supplies its
                    # modification time...
                    files_meta[filename] = {
                        "authors" : [commit["author"]],
                        "time_last_modified" : commit["timestamp"],
                        "time_created" : commit["timestamp"]
                    }
                else:
                    # ...and every later one overwrites its creation time
                    # (this assumes `git log` lists newest commits first).
                    if commit["author"] not in files_meta[filename]["authors"]:
                        files_meta[filename]["authors"].append(commit["author"])
                    files_meta[filename]["time_created"] = commit["timestamp"]

        return files_meta

    def _decode(self, raw):
        """
        Return a decoded raw string.

        :param raw: The string to decode.

        :type raw: (str)

        :return: If the original encoding is successfully inferred, return the
            decoded string.
        :rtype: str, or None

        .. warning::
            The raw string's original encoding is identified by heuristics which
            can, and occasionally will, fail. Decoding will then fail, and None
            will be returned.
        """

        try:
            encoding = bs4.BeautifulSoup(raw).original_encoding
            return raw.decode(encoding) if encoding is not None else None

        except (LookupError, UnicodeDecodeError, UserWarning):
            return None

    def _is_ascii(self, filename):
        """
        Heuristically determine whether a file is ASCII text or binary.

        Only the first 512 bytes of the file are inspected. If they contain a
        null byte, or the percentage of bytes that aren't ASCII text is greater
        than 30%, then the file is concluded to be binary. An empty file is
        considered text; a file that cannot be read is considered binary.

        :param filename: The path of the file to test.

        :type filename: str

        :return: Whether the file is probably ASCII.
        :rtype: Boolean
        """

        try:
            # Read raw bytes, so that no newline or encoding translation
            # can alter what is inspected.
            with open(filename, "rb") as source:
                file_snippet = source.read(512)
        except IOError:
            return False

        if not file_snippet:
            return True

        if b"\0" in file_snippet:
            return False

        # `bytearray` yields integers on both Python 2 and Python 3.
        non_ascii = sum(1 for byte in bytearray(file_snippet)
                if byte not in self._TEXT_BYTES)
        return not float(non_ascii) / len(file_snippet) > 0.30

 class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, index_queue):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        :type index_queue: see :attr:`self.index_queue`
        """

        self.clone_queue = clone_queue
        self.index_queue = index_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Blocks until new :class:`GitRepository` appear in
        :attr:`self.clone_queue`, then attempts cloning them. If
        succcessful, the cloned repository is added to :attr:`self.index_queue`
        for the `GitIndexer` to clone; otherwise, it is discarded.
        """

        while True:
            while self.clone_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)
            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception as exception:
                pass

    def _clone_repository(self, repo):
        """
        Attempt cloning a Git repository.

        :param repo: Metadata about the repository to clone.

        :type repo: :class:`GitRepository`
        """

        GIT_CLONE_TIMEOUT = 500

        queue_percent_full = (float(self.index_queue.qsize()) /
                self.index_queue.maxsize) * 100

        exit_code = None
        command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
        " --single-branch %s %s/%s || pkill -f git")

        command_attempt = 0
        while exit_code is None:
            try:
                exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
                        repo.url, GIT_CLONE_DIR, repo.name), shell=True)
            except Exception as exception:
                time.sleep(1)
                command_attempt += 1
                if command_attempt == 20:
                    break
                else:
                    continue
            else:
                break

        if exit_code != 0:
            if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
                shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
            return

        while self.index_queue.full():
            time.sleep(THREAD_QUEUE_SLEEP)

        self.index_queue.put(repo)

 class _ChangeDir(object):
    """
    A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

    :ivar new_path: (str) The path to change the current directory to.
    :ivar old_path: (str) The path of the directory to return to.
    """

    def __init__(self, new_path):
        """
        Create a _ChangeDir instance.

        :param new_path: The directory to enter.

        :type new_path: str
        """

        self.new_path = new_path

    def __enter__(self):
        """
        Change the current working-directory to **new_path**.
        """

        self.old_path = os.getcwd()
        os.chdir(self.new_path)

    def __exit__(self, *exception):
        """
        Change the current working-directory to **old_path**.

        :param exception: Various exception arguments passed by `with`.

        :type exception: varargs
        """

        os.chdir(self.old_path)
--- a/bitshift/database.py
+++ b/bitshift/database.py
@@ -1,18 +0,0 @@
 """
 Module with classes and functions to handle communication with the MySQL
 database backend, which manages the search index.
 """

 import oursql

 class Database(object):
    """Represents the MySQL database."""

    def __init__(self):
        """Create a Database instance; currently a placeholder that does nothing."""
        pass

    def _connect(self):
        """Placeholder for opening the database connection; does nothing yet."""
        pass

    def _create(self):
        """Placeholder for creating the database; does nothing yet."""
        pass
--- a/bitshift/database/init.py
+++ b/bitshift/database/init.py
@@ -0,0 +1,153 @@
 """
 Subpackage with classes and functions to handle communication with the MySQL
 database backend, which manages the search index.
 """

 import os

 import mmh3
 import oursql

 from .migration import VERSION, MIGRATIONS

 __all__ = ["Database"]

 class Database(object):
    """Represents the MySQL database.

    The connection is opened and the schema version is checked as soon as an
    instance is created.

    :ivar _conn: The open connection returned by :meth:`_connect`.
    """

    def __init__(self, migrate=False):
        """
        Connect to the database and verify its schema version.

        :param migrate: Whether to upgrade an out-of-date schema instead of
            raising an exception.
        :type migrate: bool

        :raises RuntimeError: If the schema is out of date and *migrate* is
            False.
        """
        self._conn = self._connect()
        self._check_version(migrate)

    def _connect(self):
        """Establish a connection to the database.

        Connection options are read from the ``.my.cnf`` file located in the
        same directory as this module.

        :return: The new connection to the ``bitshift`` database.
        """
        root = os.path.dirname(os.path.abspath(__file__))
        default_file = os.path.join(root, ".my.cnf")
        return oursql.connect(db="bitshift", read_default_file=default_file,
                              autoping=True, autoreconnect=True)

    def _migrate(self, cursor, current):
        """Migrate the database to the latest schema version.

        Runs every migration step from *current* up to ``VERSION`` in order,
        recording the new version number after each step.

        :param cursor: An open cursor used to execute the migration queries.
        :param current: The schema version the database is currently at.
        :type current: int
        """
        # ``range`` exists on both Python 2 and 3; the span is only a handful
        # of schema versions, so building a list costs nothing.
        for version in range(current, VERSION):
            # "%" binds tighter than "+", so the target version must be
            # parenthesised; otherwise this is ``str + int`` and raises.
            print("Migrating to %d..." % (version + 1))
            # MIGRATIONS[0] upgrades version 1 to 2, hence the offset.
            for query in MIGRATIONS[version - 1]:
                cursor.execute(query)
            cursor.execute("UPDATE version SET version = ?", (version + 1,))

    def _check_version(self, migrate):
        """Check the database schema version and respond accordingly.

        If the schema is out of date, migrate if *migrate* is True, else raise
        an exception.

        :param migrate: Whether an outdated schema should be upgraded.
        :type migrate: bool

        :raises RuntimeError: If the schema is older than ``VERSION`` and
            *migrate* is False.
        """
        with self._conn.cursor() as cursor:
            cursor.execute("SELECT version FROM version")
            version = cursor.fetchone()[0]
            if version < VERSION:
                if migrate:
                    self._migrate(cursor, version)
                else:
                    err = "Database schema out of date. " \
                          "Run `python -m bitshift.database.migration`."
                    raise RuntimeError(err)

    def _get_codelets_from_ids(self, cursor, ids):
        """Return a list of Codelet objects given a list of codelet IDs.

        Not implemented yet; always raises :py:exc:`NotImplementedError`.
        """
        raise NotImplementedError()  ## TODO

    def _decompose_url(self, cursor, url):
        """Break up a URL into an origin (with a URL base) and a suffix.

        :param cursor: An open cursor used to look up the origin.
        :param url: The full URL to decompose.
        :type url: str

        :return: ``(origin_id, suffix)`` for the first origin whose URL base
            is a prefix of *url*, or ``(1, url)`` when no origin matches.
        """
        # SUBSTR() is 1-indexed, so the suffix begins one character *after*
        # the base; without the "+ 1" it would still contain the base's final
        # character.
        query = """SELECT origin_id, SUBSTR(?, LENGTH(origin_url_base) + 1)
                   FROM origins
                   WHERE origin_url_base IS NOT NULL
                   AND ? LIKE CONCAT(origin_url_base, "%")"""

        cursor.execute(query, (url, url))
        result = cursor.fetchone()
        # Origin 1 is the all-NULL catch-all row seeded by schema.sql.
        return result if result else (1, url)

    def _insert_symbols(self, cursor, code_id, sym_type, symbols):
        """Insert a list of symbols of a given type into the database.

        :param cursor: An open cursor used to execute the inserts.
        :param code_id: ID of the ``code`` row the symbols belong to.
        :param sym_type: One of ``"functions"``, ``"classes"`` or
            ``"variables"``.
        :type sym_type: str
        :param symbols: Iterable of ``(name, declarations, uses)`` triples,
            where declarations and uses are sequences of four-item locations.

        :raises ValueError: If *sym_type* is not a known symbol type.
        """
        # The position in this list is the integer stored in symbol_type.
        sym_types = ["functions", "classes", "variables"]
        query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)"
        query2 = """INSERT INTO symbol_locations VALUES
                    (DEFAULT, ?, ?, ?, ?, ?, ?)"""

        for (name, decls, uses) in symbols:
            cursor.execute(query1, (code_id, sym_types.index(sym_type), name))
            sym_id = cursor.lastrowid
            # Location type 0 marks a declaration, 1 marks a use.
            params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] +
                      [tuple([sym_id, 1] + list(loc)) for loc in uses])
            cursor.executemany(query2, params)

    def close(self):
        """Disconnect from the database."""
        self._conn.close()

    def search(self, query, page=1):
        r"""
        Search the database for a query and return the *n*\ th page of results.

        Results are looked up in the cache first, keyed by a hash of the page
        number and the serialized query. The cache-miss path is unfinished.

        :param query: The query to search for.
        :type query: :py:class:`~.query.tree.Tree`
        :param page: The result page to display.
        :type page: int

        :return: The total number of results, and the *n*\ th page of results.
        :rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s)
        """
        query1 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp
                    FROM cache
                    INNER JOIN cache_data ON cache_id = cdata_cache
                    WHERE cache_id = ?"""
        query2 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)"
        query3 = "INSERT INTO cache_data VALUES (?, ?)"

        cache_id = mmh3.hash64(str(page) + ":" + query.serialize())[0]

        with self._conn.cursor() as cursor:
            cursor.execute(query1, (cache_id,))
            results = cursor.fetchall()
            if results:  # Cache hit
                # The count is stored as mantissa * 10 ** exponent.
                num_results = results[0][1] * (10 ** results[0][2])
                ids = [res[0] for res in results]
            else:  # Cache miss
                ## TODO: build and execute search query
                results = cursor.fetchall()
                ids = NotImplemented  ## TODO: extract ids from results
                num_results = NotImplemented  ## TODO: num if results else 0
                # Keep three significant digits so the count fits the
                # mantissa/exponent columns of the cache table.
                num_exp = max(len(str(num_results)) - 3, 0)
                num_results = int(round(num_results, -num_exp))
                # Floor division keeps the mantissa an integer on Python 3
                # as well; on Python 2 ints this is identical to "/".
                num_mnt = num_results // (10 ** num_exp)
                cursor.execute(query2, (cache_id, num_mnt, num_exp))
                cursor.executemany(query3, [(cache_id, c_id) for c_id in ids])
            return (num_results, self._get_codelets_from_ids(cursor, ids))

    def insert(self, codelet):
        """
        Insert a codelet into the database.

        The code body is stored once per distinct (language, code) pair, keyed
        by a hash of both; the codelet row and its authors are always added.

        :param codelet: The codelet to insert.
        :type codelet: :py:class:`.Codelet`
        """
        query1 = """INSERT INTO code VALUES (?, ?, ?)
                    ON DUPLICATE KEY UPDATE code_id=code_id"""
        query2 = """INSERT INTO codelets VALUES
                    (DEFAULT, ?, ?, ?, ?, ?, ?, ?)"""
        query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)"

        hash_key = str(codelet.language) + ":" + codelet.code.encode("utf8")
        code_id = mmh3.hash64(hash_key)[0]

        with self._conn.cursor() as cursor:
            cursor.execute(query1, (code_id, codelet.language, codelet.code))
            # Only add symbols when the code row was newly inserted; the
            # no-op duplicate-key update presumably reports a different
            # rowcount (verify against MySQL's affected-rows rules).
            if cursor.rowcount == 1:
                for sym_type, symbols in codelet.symbols.iteritems():
                    self._insert_symbols(cursor, code_id, sym_type, symbols)
            origin, url = self._decompose_url(cursor, codelet.url)
            cursor.execute(query2, (codelet.name, code_id, origin, url,
                                    codelet.rank, codelet.date_created,
                                    codelet.date_modified))
            codelet_id = cursor.lastrowid
            authors = [(codelet_id, a[0], a[1]) for a in codelet.authors]
            cursor.executemany(query3, authors)
--- a/bitshift/database/migration.py
+++ b/bitshift/database/migration.py
@@ -0,0 +1,97 @@
 """
 Contains information about database schema versions, and SQL queries to update
 between them.
 """

 # Latest schema version; a database reporting a lower number is out of date.
 VERSION = 6

 # One entry per upgrade step, in order: MIGRATIONS[0] upgrades schema
 # version 1 to 2, MIGRATIONS[1] upgrades 2 to 3, and so on. Each entry is the
 # list of SQL statements to execute for that step.
 MIGRATIONS = [
    # 1 -> 2: drop `code_hash`, widen code IDs to BIGINT (temporarily dropping
    # the foreign key that points at them) and add end positions to symbols.
    [
        """ALTER TABLE `codelets`
           DROP FOREIGN KEY `codelets_ibfk_1`""",
        """ALTER TABLE `code`
           DROP KEY `code_hash`,
           DROP COLUMN `code_hash`,
           MODIFY COLUMN `code_id` BIGINT NOT NULL""",
        """ALTER TABLE `codelets`
           MODIFY COLUMN `codelet_code_id` BIGINT NOT NULL,
           ADD KEY (`codelet_lang`),
           ADD CONSTRAINT `codelets_ibfk_1` FOREIGN KEY (`codelet_code_id`)
               REFERENCES `code` (`code_id`)
               ON DELETE RESTRICT ON UPDATE CASCADE""",
        """ALTER TABLE `symbols`
           ADD COLUMN `symbol_end_row` INT UNSIGNED NOT NULL,
           ADD COLUMN `symbol_end_col` INT UNSIGNED NOT NULL"""
    ],
    # 2 -> 3: symbols now reference a `code` row instead of a codelet.
    [
        """ALTER TABLE `symbols`
           DROP FOREIGN KEY `symbols_ibfk_1`,
           CHANGE COLUMN `symbol_codelet` `symbol_code` BIGINT NOT NULL,
           ADD CONSTRAINT `symbols_ibfk_1` FOREIGN KEY (`symbol_code`)
               REFERENCES `code` (`code_id`)
               ON DELETE CASCADE ON UPDATE CASCADE"""
    ],
    # 3 -> 4: move position columns out of `symbols` into the new
    # `symbol_locations` table.
    [
        """ALTER TABLE `symbols`
           DROP COLUMN `symbol_row`,
           DROP COLUMN `symbol_col`,
           DROP COLUMN `symbol_end_row`,
           DROP COLUMN `symbol_end_col`""",
        """CREATE TABLE `symbol_locations` (
           `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
           `sloc_symbol` BIGINT UNSIGNED NOT NULL,
           `sloc_type` TINYINT UNSIGNED NOT NULL,
           `sloc_row` INT UNSIGNED NOT NULL,
           `sloc_col` INT UNSIGNED NOT NULL,
           `sloc_end_row` INT UNSIGNED NOT NULL,
           `sloc_end_col` INT UNSIGNED NOT NULL,
           PRIMARY KEY (`sloc_id`),
           FOREIGN KEY (`sloc_symbol`)
               REFERENCES `symbols` (`symbol_id`)
               ON DELETE CASCADE ON UPDATE CASCADE
           ) ENGINE=InnoDB"""
    ],
    # 4 -> 5: redefine the origin name/URL columns with DEFAULT NULL.
    [
        """ALTER TABLE `origins`
           MODIFY COLUMN `origin_name` VARCHAR(64) DEFAULT NULL,
           MODIFY COLUMN `origin_url` VARCHAR(512) DEFAULT NULL,
           MODIFY COLUMN `origin_url_base` VARCHAR(512) DEFAULT NULL"""
    ],
    # 5 -> 6: move the language column from `codelets` to `code`, key the
    # cache by a BIGINT ID, and add the hourly `flush_cache` event.
    # NOTE(review): `cache_data_ibfk_1` is dropped and then re-added on
    # `cdata_codelet`, whereas schema.sql declares its first foreign key on
    # `cdata_cache` -> `cache`. Confirm against the version 5 schema that a
    # migrated database ends up with the same constraints as a fresh one.
    [
        """ALTER TABLE `code`
           ADD COLUMN `code_lang` SMALLINT UNSIGNED DEFAULT NULL
               AFTER `code_id`,
           ADD KEY (`code_lang`)""",
        """ALTER TABLE `codelets`
           DROP KEY `codelet_lang`,
           DROP COLUMN `codelet_lang`""",
        """ALTER TABLE `cache_data`
           DROP FOREIGN KEY `cache_data_ibfk_1`""",
        """ALTER TABLE `cache`
           MODIFY COLUMN `cache_id` BIGINT NOT NULL,
           DROP COLUMN `cache_hash`,
           DROP COLUMN `cache_last_used`,
           MODIFY COLUMN `cache_count_mnt` SMALLINT UNSIGNED NOT NULL""",
        """ALTER TABLE `cache_data`
           MODIFY COLUMN `cdata_cache` BIGINT NOT NULL,
           ADD PRIMARY KEY (`cdata_cache`, `cdata_codelet`),
           ADD CONSTRAINT `cache_data_ibfk_1` FOREIGN KEY (`cdata_codelet`)
               REFERENCES `codelets` (`codelet_id`)
               ON DELETE CASCADE ON UPDATE CASCADE""",
        """CREATE EVENT `flush_cache`
           ON SCHEDULE EVERY 1 HOUR
           DO
               DELETE FROM `cache`
                   WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);"""
    ]
 ]

 # Running this module as a script opens the database with migrate=True and
 # then closes the connection again.
 if __name__ == "__main__":
    from . import Database

    Database(migrate=True).close()
--- a/bitshift/database/schema.sql
+++ b/bitshift/database/schema.sql
@@ -0,0 +1,114 @@
 -- Schema version 6
 -- Creates the `bitshift` database from scratch at the schema version above.

 CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci;
 USE `bitshift`;

 -- Records which schema version this database is at.
 CREATE TABLE `version` (
    `version` INT UNSIGNED NOT NULL
 ) ENGINE=InnoDB;
 INSERT INTO `version` VALUES (6);

 -- Sources that codelets come from; `origin_url_base` is the URL prefix
 -- shared by codelets from that origin.
 CREATE TABLE `origins` (
    `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `origin_name` VARCHAR(64) DEFAULT NULL,
    `origin_url` VARCHAR(512) DEFAULT NULL,
    `origin_url_base` VARCHAR(512) DEFAULT NULL,
    `origin_image` BLOB DEFAULT NULL,
    PRIMARY KEY (`origin_id`)
 ) ENGINE=InnoDB;
 -- Origin 1 has no name or URL: it is the fallback used for codelets whose
 -- URL matches no known origin.
 INSERT INTO `origins` VALUES (1, NULL, NULL, NULL, NULL);

 -- Source text plus its language. `code_id` has no AUTO_INCREMENT, so the
 -- application supplies it.
 CREATE TABLE `code` (
    `code_id` BIGINT NOT NULL,
    `code_lang` SMALLINT UNSIGNED DEFAULT NULL,
    `code_code` MEDIUMTEXT NOT NULL,
    PRIMARY KEY (`code_id`),
    KEY (`code_lang`),
    FULLTEXT KEY (`code_code`)
 ) ENGINE=InnoDB;

 -- One row per codelet: name, origin, URL, rank and dates, pointing at the
 -- `code` row that holds its text.
 CREATE TABLE `codelets` (
    `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `codelet_name` VARCHAR(300) NOT NULL,
    `codelet_code_id` BIGINT NOT NULL,
    `codelet_origin` TINYINT UNSIGNED NOT NULL,
    `codelet_url` VARCHAR(512) NOT NULL,
    `codelet_rank` FLOAT NOT NULL,
    `codelet_date_created` DATETIME DEFAULT NULL,
    `codelet_date_modified` DATETIME DEFAULT NULL,
    PRIMARY KEY (`codelet_id`),
    FULLTEXT KEY (`codelet_name`),
    KEY (`codelet_rank`),
    KEY (`codelet_date_created`),
    KEY (`codelet_date_modified`),
    FOREIGN KEY (`codelet_code_id`)
        REFERENCES `code` (`code_id`)
        ON DELETE RESTRICT ON UPDATE CASCADE,
    FOREIGN KEY (`codelet_origin`)
        REFERENCES `origins` (`origin_id`)
        ON DELETE RESTRICT ON UPDATE CASCADE
 ) ENGINE=InnoDB;

 -- Authors of a codelet; rows are removed together with their codelet.
 CREATE TABLE `authors` (
    `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `author_codelet` BIGINT UNSIGNED NOT NULL,
    `author_name` VARCHAR(128) NOT NULL,
    `author_url` VARCHAR(512) DEFAULT NULL,
    PRIMARY KEY (`author_id`),
    FULLTEXT KEY (`author_name`),
    FOREIGN KEY (`author_codelet`)
        REFERENCES `codelets` (`codelet_id`)
        ON DELETE CASCADE ON UPDATE CASCADE
 ) ENGINE=InnoDB;

 -- Named symbols found in a piece of code; the index covers only the first
 -- 32 characters of the name.
 CREATE TABLE `symbols` (
    `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `symbol_code` BIGINT NOT NULL,
    `symbol_type` TINYINT UNSIGNED NOT NULL,
    `symbol_name` VARCHAR(512) NOT NULL,
    PRIMARY KEY (`symbol_id`),
    KEY (`symbol_type`, `symbol_name`(32)),
    FOREIGN KEY (`symbol_code`)
        REFERENCES `code` (`code_id`)
        ON DELETE CASCADE ON UPDATE CASCADE
 ) ENGINE=InnoDB;

 -- Start and end positions (row/column) at which a symbol appears;
 -- `sloc_type` distinguishes the kind of occurrence.
 CREATE TABLE `symbol_locations` (
    `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `sloc_symbol` BIGINT UNSIGNED NOT NULL,
    `sloc_type` TINYINT UNSIGNED NOT NULL,
    `sloc_row` INT UNSIGNED NOT NULL,
    `sloc_col` INT UNSIGNED NOT NULL,
    `sloc_end_row` INT UNSIGNED NOT NULL,
    `sloc_end_col` INT UNSIGNED NOT NULL,
    PRIMARY KEY (`sloc_id`),
    FOREIGN KEY (`sloc_symbol`)
        REFERENCES `symbols` (`symbol_id`)
        ON DELETE CASCADE ON UPDATE CASCADE
 ) ENGINE=InnoDB;

 -- Cached search results: the result count is stored as a mantissa and a
 -- power-of-ten exponent, along with the time the entry was created.
 CREATE TABLE `cache` (
    `cache_id` BIGINT NOT NULL,
    `cache_count_mnt` SMALLINT UNSIGNED NOT NULL,
    `cache_count_exp` TINYINT UNSIGNED NOT NULL,
    `cache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (`cache_id`)
 ) ENGINE=InnoDB;

 -- Codelets belonging to each cache entry; rows disappear when either the
 -- cache entry or the codelet is deleted.
 CREATE TABLE `cache_data` (
    `cdata_cache` BIGINT NOT NULL,
    `cdata_codelet` BIGINT UNSIGNED NOT NULL,
    PRIMARY KEY (`cdata_cache`, `cdata_codelet`),
    FOREIGN KEY (`cdata_cache`)
        REFERENCES `cache` (`cache_id`)
        ON DELETE CASCADE ON UPDATE CASCADE,
    FOREIGN KEY (`cdata_codelet`)
        REFERENCES `codelets` (`codelet_id`)
        ON DELETE CASCADE ON UPDATE CASCADE
 ) ENGINE=InnoDB;

 -- Every hour, delete cache entries that are more than one day old.
 -- NOTE(review): scheduled events only fire when the server's event
 -- scheduler is enabled -- confirm the deployment configuration.
 CREATE EVENT `flush_cache`
    ON SCHEDULE EVERY 1 HOUR
    DO
        DELETE FROM `cache`
            WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);
--- a/bitshift/query/init.py
+++ b/bitshift/query/init.py
@@ -22,4 +22,6 @@ def parse_query(query):


    # gets a string, returns a Tree
    # TODO: note: resultant Trees should be normalized so that "foo OR bar"
    # and "bar OR foo" result in equivalent trees
    pass
--- a/docs/source/api/bitshift.query.rst
+++ b/docs/source/api/bitshift.query.rst
@@ -0,0 +1,11 @@
 query Package
 =============

 :mod:`query` Package
 --------------------

 .. automodule:: bitshift.query
    :members:
    :undoc-members:
    :show-inheritance:

--- a/docs/source/api/bitshift.rst
+++ b/docs/source/api/bitshift.rst
@@ -1,30 +1,51 @@
 bitshift package
 bitshift Package
 ================

 Submodules
 :mod:`bitshift` Package
 -----------------------

 bitshift.assets module
 .. automodule:: bitshift.__init__
    :members:
    :undoc-members:
    :show-inheritance:

 :mod:`assets` Module
 --------------------

 .. automodule:: bitshift.assets
    :members:
    :undoc-members:
    :show-inheritance:

 bitshift.config module
 :mod:`codelet` Module
 ---------------------

 .. automodule:: bitshift.config
 .. automodule:: bitshift.codelet
    :members:
    :undoc-members:
    :show-inheritance:

 :mod:`config` Module
 --------------------

 Module contents
 .. automodule:: bitshift.config
    :members:
    :undoc-members:
    :show-inheritance:

 :mod:`database` Module
 ----------------------

 .. automodule:: bitshift
 .. automodule:: bitshift.database
    :members:
    :undoc-members:
    :show-inheritance:

 Subpackages
 -----------

 .. toctree::

    bitshift.parser
    bitshift.query

--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,9 @@ setup(
    name = "bitshift",
    version = "0.1",
    packages = find_packages(),
    install_requires = ["Flask>=0.10.1", "pygments>=1.6"],
    install_requires = [
        "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0",
        "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
    author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
    license = "MIT",
    url = "https://github.com/earwig/bitshift"