
Merge branch 'develop' of github.com:earwig/bitshift into develop

Conflicts:
	app.py
	setup.py
tags/v1.0^2
Ben Kurtovic, 10 years ago · commit 2cf98df3e2
8 changed files with 825 additions and 25 deletions
  1. app.py (+1, -0)
  2. bitshift/__init__.py (+1, -1)
  3. bitshift/assets.py (+3, -0)
  4. bitshift/codelet.py (+35, -23)
  5. bitshift/crawler/__init__.py (+55, -0)
  6. bitshift/crawler/crawler.py (+240, -0)
  7. bitshift/crawler/indexer.py (+489, -0)
  8. setup.py (+1, -1)

app.py (+1, -0)

@@ -5,6 +5,7 @@ Module to contain all the project's Flask server plumbing.
 from flask import Flask
 from flask import render_template, session

+from bitshift import assets
 from bitshift.database import Database
 from bitshift.query import parse_query




bitshift/__init__.py (+1, -1)

@@ -1 +1 @@
-from . import assets, codelet, config, database, parser, query
+from . import assets, codelet, config, database, parser, query, crawler

bitshift/assets.py (+3, -0)

@@ -15,8 +15,11 @@ def tag(filename):

     :param filename: The filename of the asset to create a tag for.

     :type filename: str

+    :return: A string containing a `<source>` tag for JS files, and a `<link>`
+        for CSS files.
+    :rtype: str
     """

     file_ext = filename.split(".")[-1]

bitshift/codelet.py (+35, -23)

@@ -4,42 +4,54 @@ class Codelet(object):
     """
     A source-code object with code metadata and composition analysis.

+    :ivar name: (str) A suitable name for the codelet.
     :ivar code: (str) A string containing the raw source code.
     :ivar filename: (str, or None) The filename of the snippet.
-    :ivar language: (str, or None) The inferred language of `code`.
-    :ivar author: (str, or None) The name of the code's author.
-    :ivar url: (str) The url of the (page containing the) source code.
-    :ivar date_created: (str, or None) The date the code was published.
-    :ivar date_modified: (str, or None) The date the code was last modified.
+    :ivar language: (int, or None) The inferred language of `code`.
+    :ivar authors: (array of tuples (str, str or None)) An array of tuples
+        containing an author's name and profile URL (on the service the code
+        was pulled from).
+    :ivar code_url: (str) The url of the (page containing the) source code.
+    :ivar date_created: (:class:`datetime.datetime`, or None) The date the code
+        was published.
+    :ivar date_modified: (:class:`datetime.datetime`, or None) The date the
+        code was last modified.
+    :ivar rank: (float) A quantification of the source code's quality, as
+        per available ratings (stars, forks, upvotes, etc.).
     """

-    def __init__(self, code, filename, author, language, code_url, author_url,
-            date_created, date_modified):
+    def __init__(self, name, code, filename, language, authors, code_url,
+            date_created, date_modified, rank):
         """
         Create a Codelet instance.

-        :param code: The raw source code.
-        :param filename: The filename of the code, if any.
-        :param author: The author of the code.
-        :param language: The inferred language.
-        :param code_url: The url of the (page containing the) source code.
-        :param date_created: The date the code was published.
-        :param date_modified: The date the code was last modified.
+        :param name: see :attr:`self.name`
+        :param code: see :attr:`self.code`
+        :param filename: see :attr:`self.filename`
+        :param language: see :attr:`self.language`
+        :param authors: see :attr:`self.authors`
+        :param code_url: see :attr:`self.code_url`
+        :param date_created: see :attr:`self.date_created`
+        :param date_modified: see :attr:`self.date_modified`
+        :param rank: see :attr:`self.rank`

-        :type code: str
-        :type filename: str, or None
-        :type language: str, or None
-        :type author: str, or None
-        :type url: str
-        :type date_created: str, or None
-        :type date_modified: str, or None
+        :type name: see :attr:`self.name`
+        :type code: see :attr:`self.code`
+        :type filename: see :attr:`self.filename`
+        :type language: see :attr:`self.language`
+        :type authors: see :attr:`self.authors`
+        :type code_url: see :attr:`self.code_url`
+        :type date_created: see :attr:`self.date_created`
+        :type date_modified: see :attr:`self.date_modified`
+        :type rank: see :attr:`self.rank`
         """

+        self.name = name
         self.code = code
         self.filename = filename
-        self.author = author
         self.language = language
+        self.authors = authors
         self.code_url = code_url
-        self.author_url = author_url
         self.date_created = date_created
         self.date_modified = date_modified
+        self.rank = rank
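
For orientation, a minimal construction under the new signature might look like this (all values hypothetical; language inference and ranking normally happen upstream in the crawlers):

    from datetime import datetime
    from bitshift.codelet import Codelet

    codelet = Codelet(
            "user/repo:app.py",                 # name
            "print 'hello'",                    # code
            "app.py",                           # filename
            None,                               # language: not yet inferred
            [("Ben Kurtovic", None)],           # authors: (name, profile url)
            "https://github.com/user/repo/blob/master/app.py",
            datetime(2014, 4, 1),               # date_created
            datetime(2014, 4, 2),               # date_modified
            0.5)                                # rank, normalized to [0, 1]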

bitshift/crawler/__init__.py (+55, -0)

@@ -0,0 +1,55 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import logging, logging.handlers, os, Queue

from bitshift.crawler import crawler, indexer

__all__ = ["crawl"]

def crawl():
"""
Initialize all crawlers (and indexers).

Start the:
1. GitHub crawler, :class:`crawler.GitHubCrawler`.
2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
"""

_configure_logging()

MAX_URL_QUEUE_SIZE = 5e3

repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
threads = [crawler.GitHubCrawler(repo_clone_queue),
crawler.BitbucketCrawler(repo_clone_queue),
indexer.GitIndexer(repo_clone_queue)]

for thread in threads:
thread.start()

def _configure_logging():
LOG_FILE_DIR = "log"

if not os.path.exists(LOG_FILE_DIR):
os.mkdir(LOG_FILE_DIR)

logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

formatter = logging.Formatter(
fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
" %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

handler = logging.handlers.TimedRotatingFileHandler(
"%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
backupCount=20)
handler.setFormatter(formatter)

root_logger = logging.getLogger()
root_logger.addHandler(handler)
root_logger.setLevel(logging.NOTSET)
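
As a usage sketch, the package's entry point can be driven from a one-off script (hypothetical; the crawler and indexer threads run indefinitely once started):

    from bitshift.crawler import crawl

    if __name__ == "__main__":
        # Spawns GitHubCrawler, BitbucketCrawler, and GitIndexer, sharing one
        # bounded Queue.Queue of at most 5000 repository urls.
        crawl()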

bitshift/crawler/crawler.py (+240, -0)

@@ -0,0 +1,240 @@
"""
:synopsis: Main crawler module, to oversee all site-specific crawlers.

Contains all website/framework-specific Class crawlers.
"""

import logging, requests, time, threading

from bitshift.crawler import indexer

from ..codelet import Codelet
from ..database import Database

class GitHubCrawler(threading.Thread):
"""
Crawler that retrieves links to all of GitHub's public repositories.

GitHubCrawler is a threaded singleton that queries GitHub's API for urls
to its public repositories, which it inserts into a :class:`Queue.Queue`
shared with :class:`indexer.GitIndexer`.

:ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
with repository metadata retrieved by :class:`GitHubCrawler`, and other Git
crawlers, to be processed by :class:`indexer.GitIndexer`.
:ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
"""

AUTHENTICATION = {
"client_id" : "436cb884ae09be7f2a4e",
"client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
}

def __init__(self, clone_queue):
"""
Create an instance of the singleton `GitHubCrawler`.

:param clone_queue: see :attr:`self.clone_queue`

:type clone_queue: see :attr:`self.clone_queue`
"""

self.clone_queue = clone_queue
self._logger = logging.getLogger("%s.%s" %
(__name__, self.__class__.__name__))
self._logger.info("Starting.")
super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

def run(self):
"""
Query the GitHub API for data about every public repository.

Pull all of GitHub's repositories by making calls to its API in a loop,
accessing a subsequent page of results via the "next" URL returned in an
API response header. Uses Severyn Kozak's (sevko) authentication
credentials. For every new repository, a :class:`GitRepository` is
inserted into :attr:`self.clone_queue`.
"""

next_api_url = "https://api.github.com/repositories"
api_request_interval = 5e3 / 60 ** 2

while len(next_api_url) > 0:
start_time = time.time()

try:
resp = requests.get(next_api_url, params=self.AUTHENTICATION)
except ConnectionError as excep:
self._logger.warning("API %s call failed: %s: %s",
next_api_url, excep.__class__.__name__, excep)
time.sleep(0.5)
continue

queue_percent_full = (float(self.clone_queue.qsize()) /
self.clone_queue.maxsize) * 100
self._logger.info("API call made. Queue size: %d/%d, %d%%." %
((self.clone_queue.qsize(), self.clone_queue.maxsize,
queue_percent_full)))

repo_names = [repo["full_name"] for repo in resp.json()]
repo_stars = self._get_repositories_stars(repo_names)

for repo in resp.json():
while self.clone_queue.full():
time.sleep(1)

self.clone_queue.put(indexer.GitRepository(
repo["html_url"], repo["full_name"].replace("/", ""),
"GitHub", repo_stars[repo["full_name"]]))

if int(resp.headers["x-ratelimit-remaining"]) == 0:
time.sleep(int(resp.headers["x-ratelimit-reset"]) -
time.time())

next_api_url = resp.headers["link"].split(">")[0][1:]

sleep_time = api_request_interval - (time.time() - start_time)
if sleep_time > 0:
time.sleep(sleep_time)
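
The pagination above leans on GitHub's `Link` response header, which has the form `<url>; rel="next", ...`; the slice extracts the url between the angle brackets. A small sketch of that parsing on a fabricated header value:

    link_header = '<https://api.github.com/repositories?since=369>; rel="next"'
    next_api_url = link_header.split(">")[0][1:]
    # -> "https://api.github.com/repositories?since=369"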

    def _get_repositories_stars(self, repo_names):
        """
        Return the star-based ranks for several repositories.

        Queries the GitHub API for the number of stargazers for any given
        repositories, and blocks if the query limit is exceeded.

        :param repo_names: An array of repository names, in
            `username/repository_name` format.

        :type repo_names: array of str

        :return: A dictionary with repository name keys, and corresponding
            star-based rank values (stargazer count divided by 1000, capped
            at 1.0; 0.5 for repositories the search API did not return).

            Example dictionary:
            .. code-block:: python
                {
                    "user/repository" : 0.1
                }

        :rtype: dictionary
        """

        API_URL = "https://api.github.com/search/repositories"
        REPOS_PER_QUERY = 25

        repo_stars = {}
        for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
                xrange(0, len(repo_names), REPOS_PER_QUERY)]:
            query_url = "%s?q=%s" % (API_URL,
                    "+".join("repo:%s" % name for name in names))

            params = self.AUTHENTICATION
            resp = requests.get(query_url,
                    params=params,
                    headers={
                        "Accept" : "application/vnd.github.preview"
                    })

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time() + 1
                if sleep_time > 0:
                    logging.info("API quota exceeded. Sleep time: %d." %
                            sleep_time)
                    time.sleep(sleep_time)

            for repo in resp.json()["items"]:
                rank = float(repo["stargazers_count"]) / 1000
                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0

        for name in repo_names:
            if name not in repo_stars:
                repo_stars[name] = 0.5

        return repo_stars
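
The star-to-rank normalization is easy to sanity-check in isolation (a hypothetical helper mirroring the two lines in the loop above):

    def _star_rank(stargazers_count):
        rank = float(stargazers_count) / 1000
        return rank if rank < 1.0 else 1.0

    assert _star_rank(120) == 0.12     # 120 stars -> 0.12
    assert _star_rank(2500) == 1.0     # capped at 1.0
    # Repositories missing from the search results default to 0.5.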

class BitbucketCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of Bitbucket's public repositories.

    BitbucketCrawler is a threaded singleton that queries Bitbucket's API for
    urls to its public repositories, and inserts them as
    :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with
    :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
        :class:`indexer.GitRepository` repository urls into.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `BitbucketCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the Bitbucket API for data about every public repository.

        Query the Bitbucket API's "/repositories" endpoint and read its
        paginated responses in a loop; any "git" repositories have their
        clone-urls and names inserted into a :class:`indexer.GitRepository` in
        :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.bitbucket.org/2.0/repositories"

        while True:
            try:
                response = requests.get(next_api_url).json()
            except requests.ConnectionError as excep:
                # As in GitHubCrawler, catch the requests exception and log
                # it under a name that actually exists in this scope.
                time.sleep(0.5)
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    (self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full))

            for repo in response["values"]:
                if repo["scm"] == "git":
                    while self.clone_queue.full():
                        time.sleep(1)

                    clone_links = repo["links"]["clone"]
                    clone_url = (clone_links[0]["href"] if
                            clone_links[0]["name"] == "https" else
                            clone_links[1]["href"])

                    try:
                        watchers = requests.get(
                                repo["links"]["watchers"]["href"])
                        # Use float division so the rank is a fraction in
                        # [0, 1] rather than a truncated integer.
                        rank = len(watchers.json()["values"]) / 100.0
                    except requests.ConnectionError as excep:
                        time.sleep(0.5)
                        self._logger.warning("API %s call failed: %s: %s",
                                next_api_url, excep.__class__.__name__, excep)
                        continue

                    self.clone_queue.put(indexer.GitRepository(
                            clone_url, repo["full_name"], "Bitbucket",
                            rank if rank < 1.0 else 1.0))

            next_api_url = response["next"]
            time.sleep(0.2)

bitshift/crawler/indexer.py (+489, -0)

@@ -0,0 +1,489 @@
"""
:synopsis: Contains a singleton GitIndexer class, which clones and indexes git
repositories.
"""

import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\
threading

from ..database import Database
from ..codelet import Codelet

GIT_CLONE_DIR = "/tmp/bitshift"
THREAD_QUEUE_SLEEP = 0.5

class GitRepository(object):
"""
A representation of a Git repository's metadata.

:ivar url: (str) The repository's url.
:ivar name: (str) The name of the repository.
:ivar framework_name: (str) The name of the online Git framework that the
repository belongs to (eg, GitHub, BitBucket).
:ivar rank: (float) The rank of the repository, as assigned by
:class:`crawler.GitHubCrawler`.
"""

def __init__(self, url, name, framework_name, rank):
"""
Create a GitRepository instance.

:param url: see :attr:`GitRepository.url`
:param name: see :attr:`GitRepository.name`
:param framework_name: see :attr:`GitRepository.framework_name`
:param rank: see :attr:`GitRepository.rank`

:type url: str
:type name: str
:type framework_name: str
:type rank: float
"""

self.url = url
self.name = name
self.framework_name = framework_name
self.rank = rank

class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    :class:`GitIndexer` indexes the repositories cloned by the
    :class:`_GitCloner` singleton.

    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
        :class:`GitRepository` objects for every new repository successfully
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository
        cloner, which feeds :class:`GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitIndexer`.

        :param clone_queue: the queue of repositories to clone, which feeds
            :attr:`self.git_cloner`

        :type clone_queue: :class:`Queue.Queue`
        """

        MAX_INDEX_QUEUE_SIZE = 10

        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        self.git_cloner = _GitCloner(clone_queue, self.index_queue)
        self.git_cloner.start()
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")

        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)

        super(GitIndexer, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly cloned repositories and index them.

        Blocks until new repositories appear in :attr:`self.index_queue`, then
        retrieves one and attempts to index it. Should any errors occur, the
        new repository will be discarded and the indexer will move on to the
        next in the queue.
        """

        while True:
            while self.index_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.index_queue.get()
            self.index_queue.task_done()
            try:
                self._index_repository(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__,
                        excep)

    def _index_repository(self, repo):
        """
        Index (create and insert Codelets for) a cloned Git repository.

        Enter the repository cloned to **GIT_CLONE_DIR/repo.name**, call
        `_insert_repository_codelets()`, then remove said repository.

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
            try:
                self._insert_repository_codelets(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__,
                        excep)

        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))

    def _insert_repository_codelets(self, repo):
        """
        Create and insert a Codelet for the files inside a Git repository.

        Create a new Codelet, and insert it into the Database singleton, for
        every file inside the current working directory's default branch
        (usually *master*).

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        commits_meta = self._get_commits_metadata()
        if commits_meta is None:
            return

        for filename in commits_meta.keys():
            try:
                with open(filename) as source_file:
                    source = self._decode(source_file.read())
                    if source is None:
                        continue
            except IOError:
                continue

            authors = [(self._decode(author), None) for author in
                    commits_meta[filename]["authors"]]
            codelet = Codelet("%s:%s" % (repo.name, filename), source,
                    filename, None, authors,
                    self._generate_file_url(filename, repo.url,
                            repo.framework_name),
                    commits_meta[filename]["time_created"],
                    commits_meta[filename]["time_last_modified"],
                    repo.rank)

    def _generate_file_url(self, filename, repo_url, framework_name):
        """
        Return a url for a filename from a Git wrapper framework.

        :param filename: The path of the file.
        :param repo_url: The url of the file's parent repository.
        :param framework_name: The name of the framework the repository is
            from.

        :type filename: str
        :type repo_url: str
        :type framework_name: str

        :return: The file's full url on the given framework, if successfully
            derived.
        :rtype: str, or None

        .. warning::
            Various Git subprocesses will occasionally fail, and, seeing as
            the information they provide is a crucial component of some
            repository file urls, None may be returned.
        """

        try:
            if framework_name == "GitHub":
                default_branch = subprocess.check_output("git branch"
                        " --no-color", shell=True)[2:-1]
                # Strip any trailing slash from the repository url instead of
                # collapsing "//" afterwards, which would also mangle the
                # "https://" scheme.
                return "%s/blob/%s/%s" % (repo_url.rstrip("/"),
                        default_branch, filename)
            elif framework_name == "Bitbucket":
                commit_hash = subprocess.check_output("git rev-parse HEAD",
                        shell=True).replace("\n", "")
                return "%s/src/%s/%s" % (repo_url.rstrip("/"), commit_hash,
                        filename)
        except subprocess.CalledProcessError:
            return None
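
Hypothetical inputs and outputs, assuming a clone whose default branch is "master" and whose HEAD is at commit abc123 (`indexer` stands in for a :class:`GitIndexer` instance):

    url = indexer._generate_file_url("app.py",
            "https://github.com/user/repo", "GitHub")
    # -> "https://github.com/user/repo/blob/master/app.py"

    url = indexer._generate_file_url("app.py",
            "https://bitbucket.org/user/repo", "Bitbucket")
    # -> "https://bitbucket.org/user/repo/src/abc123/app.py"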

    def _get_git_commits(self):
        """
        Return the current working directory's formatted commit data.

        Uses `git log` to generate metadata about every single file in the
        repository's commit history.

        :return: The author, timestamp, and names of all modified files of
            every commit.
            .. code-block:: python
                sample_returned_array = [
                    {
                        "author" : (str) "author",
                        "timestamp" : (`datetime.datetime`) <object>,
                        "filenames" : (str array) ["file1", "file2"]
                    }
                ]
        :rtype: array of dictionaries
        """

        git_log = subprocess.check_output(("git --no-pager log --name-only"
                " --pretty=format:'%n%n%an%n%at' -z"), shell=True)

        commits = []
        for commit in git_log.split("\n\n"):
            fields = commit.split("\n")
            if len(fields) > 2:
                commits.append({
                    "author" : fields[0],
                    "timestamp" : datetime.datetime.fromtimestamp(
                            int(fields[1])),
                    "filenames" : fields[2].split("\x00")[:-2]
                })

        return commits
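
For a fabricated repository with a single commit touching two files, the parsed output would look roughly like:

    commits = [{
        "author" : "Ben Kurtovic",
        "timestamp" : datetime.datetime.fromtimestamp(1396116000),
        "filenames" : ["app.py", "setup.py"]
    }]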

    def _get_tracked_files(self):
        """
        Return a list of the filenames of all valuable files in the Git
        repository.

        Get a list of the filenames of the non-binary (Perl heuristics used
        for filetype identification) files currently inside the current
        working directory's Git repository.

        :return: The filenames of all index-worthy non-binary files.
        :rtype: str array
        """

        files = []
        for dirname, subdir_names, filenames in os.walk("."):
            for filename in filenames:
                path = os.path.join(dirname, filename)
                if self._is_ascii(path):
                    files.append(path[2:])

        return files

    def _get_commits_metadata(self):
        """
        Return a dictionary containing every valuable tracked file's metadata.

        :return: A dictionary with author names, time of creation, and time of
            last modification for every filename key.
            .. code-block:: python
                sample_returned_dict = {
                    "my_file" : {
                        "authors" : (str array) ["author1", "author2"],
                        "time_created" : (`datetime.datetime`) <object>,
                        "time_last_modified" : (`datetime.datetime`) <object>
                    }
                }
        :rtype: dictionary of dictionaries
        """

        commits = self._get_git_commits()
        tracked_files = self._get_tracked_files()

        files_meta = {}
        for commit in commits:
            for filename in commit["filenames"]:
                if filename not in tracked_files:
                    continue

                if filename not in files_meta.keys():
                    files_meta[filename] = {
                        "authors" : [commit["author"]],
                        "time_last_modified" : commit["timestamp"],
                        "time_created" : commit["timestamp"]
                    }
                else:
                    if commit["author"] not in files_meta[filename]["authors"]:
                        files_meta[filename]["authors"].append(
                                commit["author"])
                    files_meta[filename]["time_created"] = commit["timestamp"]

        return files_meta

    def _decode(self, raw):
        """
        Return a decoded raw string.

        :param raw: The string to decode.

        :type raw: (str)

        :return: If the original encoding is successfully inferred, return the
            decoded string.
        :rtype: str, or None

        .. warning::
            The raw string's original encoding is identified by heuristics
            which can, and occasionally will, fail. Decoding will then fail,
            and None will be returned.
        """

        try:
            encoding = bs4.BeautifulSoup(raw).original_encoding
            return raw.decode(encoding) if encoding is not None else None

        except (LookupError, UnicodeDecodeError, UserWarning):
            return None

    def _is_ascii(self, filename):
        """
        Heuristically determine whether a file is ASCII text or binary.

        If a portion of the file contains null bytes, or the percentage of
        bytes that aren't ASCII is greater than 30%, then the file is
        concluded to be binary. This heuristic is used by the `file` utility,
        Perl's inbuilt `-T` operator, and is the de-facto method for
        determining whether a file is ASCII.

        :param filename: The path of the file to test.

        :type filename: str

        :return: Whether the file is probably ASCII.
        :rtype: Boolean
        """

        try:
            with open(filename) as source:
                file_snippet = source.read(512)

                if not file_snippet:
                    return True

                ascii_characters = "".join(map(chr, range(32, 127)) +
                        list("\n\r\t\b"))
                null_trans = string.maketrans("", "")

                if "\0" in file_snippet:
                    return False

                non_ascii = file_snippet.translate(null_trans,
                        ascii_characters)
                return not float(len(non_ascii)) / len(file_snippet) > 0.30

        except IOError:
            return False
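
A quick check of the heuristic against throwaway files (hypothetical usage, with `indexer` as a :class:`GitIndexer` instance):

    with open("/tmp/sample.txt", "w") as f:
        f.write("plain ASCII text\n")
    with open("/tmp/sample.bin", "wb") as f:
        f.write("\x00\x01\x02" * 200)

    indexer._is_ascii("/tmp/sample.txt")    # True: printable, no null bytes
    indexer._is_ascii("/tmp/sample.bin")    # False: contains null bytes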

class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, index_queue):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        :type index_queue: see :attr:`self.index_queue`
        """

        self.clone_queue = clone_queue
        self.index_queue = index_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Blocks until new :class:`GitRepository` instances appear in
        :attr:`self.clone_queue`, then attempts to clone them. If successful,
        the cloned repository is added to :attr:`self.index_queue` for the
        :class:`GitIndexer` to index; otherwise, it is discarded.
        """

        while True:
            while self.clone_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)
            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception:
                pass

    def _clone_repository(self, repo):
        """
        Attempt cloning a Git repository.

        :param repo: Metadata about the repository to clone.

        :type repo: :class:`GitRepository`
        """

        GIT_CLONE_TIMEOUT = 500

        exit_code = None
        command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
                " --single-branch %s %s/%s || pkill -f git")

        command_attempt = 0
        while exit_code is None:
            try:
                exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
                        repo.url, GIT_CLONE_DIR, repo.name), shell=True)
            except Exception:
                time.sleep(1)
                command_attempt += 1
                if command_attempt == 20:
                    break
                else:
                    continue
            else:
                break

        if exit_code != 0:
            if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
                shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
            return

        while self.index_queue.full():
            time.sleep(THREAD_QUEUE_SLEEP)

        self.index_queue.put(repo)
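
For reference, the rendered shell command for a hypothetical repository looks like this; the `perl -e 'alarm ...'` wrapper kills the clone if it runs longer than GIT_CLONE_TIMEOUT seconds:

    print command % (500, "https://github.com/user/repo",
            "/tmp/bitshift", "userrepo")
    # perl -e 'alarm shift @ARGV; exec @ARGV' 500 git clone --single-branch
    #     https://github.com/user/repo /tmp/bitshift/userrepo || pkill -f git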

class _ChangeDir(object):
    """
    A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

    :ivar new_path: (str) The path to change the current directory to.
    :ivar old_path: (str) The path of the directory to return to.
    """

    def __init__(self, new_path):
        """
        Create a _ChangeDir instance.

        :param new_path: The directory to enter.

        :type new_path: str
        """

        self.new_path = new_path

    def __enter__(self):
        """
        Change the current working-directory to **new_path**.
        """

        self.old_path = os.getcwd()
        os.chdir(self.new_path)

    def __exit__(self, *exception):
        """
        Change the current working-directory to **old_path**.

        :param exception: Various exception arguments passed by `with`.

        :type exception: varargs
        """

        os.chdir(self.old_path)
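
A usage sketch (hypothetical path): run a Git command from inside a cloned repository, then return to the original working directory automatically, even if the body raises:

    with _ChangeDir("/tmp/bitshift/userrepo"):
        head = subprocess.check_output("git rev-parse HEAD", shell=True)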

setup.py (+1, -1)

@@ -6,7 +6,7 @@ setup(
     packages = find_packages(),
     install_requires = [
         "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0",
-        "BeautifulSoup>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
+        "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
     author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
     license = "MIT",
     url = "https://github.com/earwig/bitshift"
