""" :synopsis: Contains a singleton GitIndexer class, which clones and indexes git repositories. """ from datetime import datetime import logging import os import Queue import shutil import string import time import threading from bs4 import UnicodeDammit import git from ..database import Database from ..parser import parse, UnsupportedFileError from ..codelet import Codelet GIT_CLONE_DIR = "/tmp/bitshift" THREAD_QUEUE_SLEEP = 0.5 MAX_INDEX_QUEUE_SIZE = 10 class GitRepository(object): """ A representation of a Git repository's metadata. :ivar url: (str) The repository's url. :ivar name: (str) The name of the repository. :ivar framework_name: (str) The name of the online Git framework that the repository belongs to (eg, GitHub, BitBucket). :ivar rank: (float) The rank of the repository, as assigned by :class:`crawler.GitHubCrawler`. :ivar path: (str) The repository's on-disk directory path. :ivar repo: (git.Repo) A git.Repo representation of the repository. """ def __init__(self, url, name, framework_name, rank): """ Create a GitRepository instance. :param url: see :attr:`GitRepository.url` :param name: see :attr:`GitRepository.name` :param framework_name: see :attr:`GitRepository.framework_name` :param rank: see :attr:`GitRepository.rank` :type url: str :type name: str :type framework_name: str :type rank: float """ self.url = url self.name = name self.framework_name = framework_name self.rank = rank dirname = name.replace("/", "-") + "-" + str(int(time.time())) self.path = os.path.join(GIT_CLONE_DIR, dirname) self.repo = None class GitIndexer(threading.Thread): """ A singleton Git repository indexer. :class:`GitIndexer` indexes the repositories cloned by the :class:`_GitCloner` singleton. :ivar index_queue: (:class:`Queue.Queue`) A queue containing :class:`GitRepository` objects for every new repository succesfully cloned by :class:`_GitCloner`, which are to be indexed. :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner, which feeds :class:`GitIndexer`. :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. """ def __init__(self, clone_queue, run_event): """ Create an instance of the singleton `GitIndexer`. :param clone_queue: see :attr:`self.index_queue` :type index_queue: see :attr:`self.index_queue` """ self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.run_event = run_event self.git_cloner = _GitCloner(clone_queue, self.index_queue, run_event) self.git_cloner.start() self.database = Database() self._logger = logging.getLogger("%s.%s" % (__name__, self.__class__.__name__)) self._logger.info("Starting.") if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) super(GitIndexer, self).__init__(name=self.__class__.__name__) def run(self): """ Retrieve metadata about newly cloned repositories and index them. Blocks until new repositories appear in :attr:`self.index_queue`, then retrieves one, and attempts indexing it. Should any errors occur, the new repository will be discarded and the indexer will index the next in the queue. """ while True: while self.index_queue.empty() and self.run_event.is_set(): time.sleep(THREAD_QUEUE_SLEEP) if not self.run_event.is_set(): break repo = self.index_queue.get() self.index_queue.task_done() self._index_repository(repo) def _index_repository(self, repo): """ Clone and index (create and insert Codeletes for) a Git repository. `git clone` the Git repository located at **repo.url**, call `_insert_repository_codelets()`, then remove said repository. 

        :param repo: The metadata of the repository to be indexed.
        :type repo: :class:`GitRepository`
        """
        self._logger.info(u"Indexing repo: %s", repo.name)
        try:
            self._insert_repository_codelets(repo)
        except Exception:
            self._logger.exception("Exception raised while indexing:")
        finally:
            if os.path.isdir(repo.path):
                shutil.rmtree(repo.path)

    def _insert_repository_codelets(self, repo):
        """
        Create and insert a Codelet for every file inside a Git repository.

        Create a new Codelet, and insert it into the Database singleton, for
        every tracked file on the repository's default branch (usually
        *master*).

        :param repo: The metadata of the repository to be indexed.
        :type repo: :class:`GitRepository`
        """
        file_meta = self._get_file_metadata(repo.repo)
        if file_meta is None:
            return

        for filename, data in file_meta.iteritems():
            name = ("%s: %s" % (repo.name, filename)).encode("utf8")
            authors = [(author, None) for author in data["authors"]]
            encoded_source = data["blob"].data_stream.read()
            source = UnicodeDammit(encoded_source).unicode_markup
            url = self._generate_file_url(filename, repo)

            codelet = Codelet(name, source, filename, None, authors, url,
                              data["time_created"],
                              data["time_last_modified"], repo.rank)
            self._logger.debug("Indexing file: %s", codelet.name)
            try:
                parse(codelet)
            except UnsupportedFileError:
                continue
            except Exception:
                self._logger.exception("Exception raised while parsing:")
            self.database.insert(codelet)

    def _generate_file_url(self, filename, repo):
        """
        Return a URL for a file hosted on a Git hosting framework.

        :param filename: The path of the file.
        :param repo: The git repo.

        :type filename: str
        :type repo: :class:`GitRepository`

        :return: The file's full URL on the given framework, if successfully
            derived.
        :rtype: str, or None
        """
        if repo.framework_name == "GitHub":
            default_branch = repo.repo.active_branch.name
            parts = [repo.url, "blob", default_branch, filename]
        elif repo.framework_name == "Bitbucket":
            try:
                commit_hash = repo.repo.head.commit.hexsha
            except ValueError:  # No commits
                return None
            parts = [repo.url, "src", commit_hash, filename]
        else:  # Unknown framework; no URL scheme to build from.
            return None
        return "/".join(s.strip("/") for s in parts)

    def _get_file_metadata(self, repo):
        """
        Return a dictionary containing every tracked text file's metadata.

        :return: A dictionary with author names, time of creation, and time
            of last modification for every filename key.

            .. code-block:: python

                sample_returned_dict = {
                    "my_file": {
                        "blob": (GitPython Blob),
                        "authors": (str list) ["author1", "author2"],
                        "time_created": (`datetime.datetime`),
                        "time_last_modified": (`datetime.datetime`)
                    }
                }

        :rtype: dictionary of dictionaries
        """
        try:
            tree = repo.head.commit.tree
        except ValueError:  # No commits
            return {}

        files = {}
        self._logger.debug("Building file metadata")
        for item in tree.traverse():
            if item.type != "blob" or not self._is_ascii(item.data_stream):
                continue
            log = repo.git.log("--follow", '--format=%an %ct', "--",
                               item.path)
            lines = log.splitlines()
            authors = {line.rsplit(" ", 1)[0].decode("utf8")
                       for line in lines}
            last_mod = int(lines[0].rsplit(" ", 1)[1])
            created = int(lines[-1].rsplit(" ", 1)[1])
            files[item.path] = {
                "blob": item,
                "authors": authors,
                "time_last_modified": datetime.fromtimestamp(last_mod),
                "time_created": datetime.fromtimestamp(created)
            }
        return files

    def _is_ascii(self, source):
        """
        Heuristically determine whether a file is ASCII text or binary.

        If a portion of the file contains null bytes, or the percentage of
        bytes that aren't ASCII is greater than 30%, then the file is
        concluded to be binary.
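
        A hedged, illustrative example (``indexer`` stands for a
        :class:`GitIndexer` instance, the filenames are hypothetical, and
        only the first 512 bytes are examined):

        .. code-block:: python

            indexer._is_ascii(open("parser.py"))       # True: plain text
            indexer._is_ascii(open("logo.png", "rb"))  # False: binary data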

        This heuristic is used by the `file` utility, Perl's inbuilt `-T`
        operator, and is the de-facto method for determining whether a file
        is ASCII.

        :param source: The file object to test.
        :type source: `file`

        :return: Whether the file is probably ASCII.
        :rtype: Boolean
        """
        file_snippet = source.read(512)
        if not file_snippet:
            return True

        ascii_characters = "".join(map(chr, range(32, 127)) +
                                   list("\n\r\t\b"))
        null_trans = string.maketrans("", "")

        if "\0" in file_snippet:
            return False

        non_ascii = file_snippet.translate(null_trans, ascii_characters)
        return not float(len(non_ascii)) / len(file_snippet) > 0.30


class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, index_queue, run_event):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`
        :param run_event: An event which, when cleared, signals this thread
            to terminate.

        :type clone_queue: :class:`Queue.Queue`
        :type index_queue: :class:`Queue.Queue`
        :type run_event: :class:`threading.Event`
        """
        self.clone_queue = clone_queue
        self.index_queue = index_queue
        self.run_event = run_event
        self._logger = logging.getLogger("%s.%s" %
                                         (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Blocks until new :class:`GitRepository` objects appear in
        :attr:`self.clone_queue`, then attempts to clone them. If successful,
        the cloned repository is added to :attr:`self.index_queue` for the
        :class:`GitIndexer` to index; otherwise, it is discarded.
        """
        while True:
            while self.clone_queue.empty() and self.run_event.is_set():
                time.sleep(THREAD_QUEUE_SLEEP)
            if not self.run_event.is_set():
                break
            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception:
                self._logger.exception("Exception raised while cloning:")

    def _clone_repository(self, repo):
        """
        Attempt to clone a Git repository.

        :param repo: Metadata about the repository to clone.
        :type repo: :class:`GitRepository`
        """
        self._logger.info("Cloning repo: %s", repo.url)
        repo.repo = git.Repo.clone_from(repo.url, to_path=repo.path,
                                        bare=True, single_branch=True)
        while self.index_queue.full() and self.run_event.is_set():
            time.sleep(THREAD_QUEUE_SLEEP)
        if self.run_event.is_set():
            self.index_queue.put(repo)
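

# A hedged usage sketch (not part of the library's public API): roughly how a
# crawler such as crawler.GitHubCrawler drives this module. The repository
# URL, name, rank, and run duration below are illustrative, and the sketch
# assumes the Database backend is configured. It is meant to be run with
# ``python -m`` from the package root so the relative imports above resolve.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    clone_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
    run_event = threading.Event()
    run_event.set()

    # GitIndexer starts its own _GitCloner, which consumes clone_queue and
    # feeds GitIndexer.index_queue.
    indexer = GitIndexer(clone_queue, run_event)
    indexer.start()

    # A crawler would normally enqueue many of these.
    clone_queue.put(GitRepository("https://github.com/user/project",
                                  "user/project", "GitHub", 0.5))

    try:
        time.sleep(60)  # Let the pipeline clone and index for a while.
    finally:
        run_event.clear()  # Both threads exit their loops once this clears.
        indexer.git_cloner.join()
        indexer.join()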