|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495 |
- """
- :synopsis: Contains a singleton GitIndexer class, which clones and indexes git
- repositories.
- """
-
- import datetime
- import logging
- import os
- import Queue
- import shutil
- import string
- import subprocess
- import time
- import threading
-
- import bs4
-
- from ..database import Database
- from ..parser import parse, UnsupportedFileError
- from ..languages import LANGS
- from ..codelet import Codelet
-
# Parent directory on disk; each repository is cloned into (and later removed
# from) its own subdirectory of this path.
GIT_CLONE_DIR = "/tmp/bitshift"
# Seconds a worker thread sleeps between polls while waiting on a queue.
THREAD_QUEUE_SLEEP = 0.5
-
class GitRepository(object):
    """
    Plain holder for the metadata describing one Git repository.

    :ivar url: (str) The repository's url.
    :ivar name: (str) The name of the repository.
    :ivar framework_name: (str) The name of the online Git framework that the
        repository belongs to (eg, GitHub, BitBucket).
    :ivar rank: (float) The rank of the repository, as assigned by
        :class:`crawler.GitHubCrawler`.
    :ivar dirname: (str) The repository's on-disk directory name, derived
        from :attr:`name`.
    """

    def __init__(self, url, name, framework_name, rank):
        """
        Store the repository metadata and derive its directory name.

        :param url: see :attr:`GitRepository.url`
        :param name: see :attr:`GitRepository.name`
        :param framework_name: see :attr:`GitRepository.framework_name`
        :param rank: see :attr:`GitRepository.rank`

        :type url: str
        :type name: str
        :type framework_name: str
        :type rank: float
        """

        self.url, self.name = url, name
        self.framework_name, self.rank = framework_name, rank

        # Double existing dashes first, then turn path separators into single
        # dashes, so "a/b-c" becomes "a-b--c".
        escaped_dashes = name.replace("-", "--")
        self.dirname = escaped_dashes.replace("/", "-")
-
class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    :class:`GitIndexer` indexes the repositories cloned by the
    :class:`_GitCloner` singleton.

    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
        :class:`GitRepository` objects for every new repository successfully
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar run_event: An event-like object exposing `is_set()` (presumably a
        :class:`threading.Event`); :meth:`run` exits once it is cleared.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner,
        which feeds :class:`GitIndexer`.
    :ivar database: (:class:`Database`) The database that parsed codelets are
        inserted into.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    # Bytes that _is_ascii() counts as text: printable ASCII plus "\n\r\t\b".
    _TEXT_BYTES = bytes(bytearray(range(32, 127)) + bytearray(b"\n\r\t\b"))

    def __init__(self, clone_queue, run_event):
        """
        Create an instance of the singleton `GitIndexer`.

        :param clone_queue: The queue of repositories to clone; it is handed to
            the :class:`_GitCloner` created here.
        :param run_event: see :attr:`self.run_event`

        :type clone_queue: :class:`Queue.Queue`
        """

        MAX_INDEX_QUEUE_SIZE = 10

        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        self.run_event = run_event
        self.git_cloner = _GitCloner(clone_queue, self.index_queue, run_event)
        self.database = Database()
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")

        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)

        super(GitIndexer, self).__init__(name=self.__class__.__name__)

        # Start the cloner last: the clone directory now exists (no race with
        # the makedirs() above) and no thread is left running if anything
        # earlier in this constructor raised.
        self.git_cloner.start()

    def run(self):
        """
        Retrieve metadata about newly cloned repositories and index them.

        Polls :attr:`self.index_queue` until a repository appears, then
        retrieves one and attempts indexing it. Should any errors occur, the
        new repository will be discarded and the indexer will index the next in
        the queue. Returns once :attr:`self.run_event` is cleared.
        """

        while True:
            while self.index_queue.empty() and self.run_event.is_set():
                time.sleep(THREAD_QUEUE_SLEEP)
            if not self.run_event.is_set():
                break

            repo = self.index_queue.get()
            # The item is marked done as soon as it is dequeued, so
            # Queue.join() tracks hand-off rather than indexing completion.
            self.index_queue.task_done()
            self._index_repository(repo)

    def _index_repository(self, repo):
        """
        Index (create and insert Codelets for) an already cloned repository.

        Enter the repository's clone directory, call
        `_insert_repository_codelets()`, then remove the clone from disk. Any
        exception raised on the way (including a missing clone directory) is
        logged and the repository is discarded.

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        repo_path = "%s/%s" % (GIT_CLONE_DIR, repo.dirname)
        self._logger.info(u"Indexing repo: %s", repo.name)
        try:
            # Entering the directory happens inside the try so that a failed
            # chdir is logged instead of killing the indexer thread.
            with _ChangeDir(repo_path):
                self._insert_repository_codelets(repo)
        except Exception:
            self._logger.exception("Exception raised while indexing:")
        finally:
            # By now the previous working directory has been restored, so the
            # clone is not deleted from underneath the process.
            if os.path.isdir(repo_path):
                try:
                    shutil.rmtree(repo_path)
                except OSError:
                    self._logger.exception("Could not remove clone: %s",
                            repo_path)

    def _insert_repository_codelets(self, repo):
        """
        Create and insert a Codelet for the files inside a Git repository.

        Create a new Codelet, and insert it into the database, for every file
        reported by `_get_commits_metadata()` for the repository in the
        current working directory. Files that cannot be read, decoded, or
        parsed are skipped.

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        commits_meta = self._get_commits_metadata()
        if not commits_meta:
            return

        # The branch/commit part of every file url is the same for the whole
        # repository, so ask git for it once instead of once per file.
        url_ref = self._get_url_ref(repo.framework_name)

        for filename, file_meta in commits_meta.items():
            try:
                with open(filename) as source_file:
                    source = self._decode(source_file.read())
            except IOError:
                continue
            if source is None:
                continue

            authors = [(self._decode(author), None) for author in
                       file_meta["authors"]]
            url = None
            if url_ref is not None:
                url = self._generate_file_url(filename, repo.url,
                        repo.framework_name, url_ref)
            codelet = Codelet("%s: %s" % (repo.name, filename), source,
                              filename, None, authors, url,
                              file_meta["time_created"],
                              file_meta["time_last_modified"],
                              repo.rank)
            self._logger.debug("Indexing file: %s", codelet.name)
            try:
                parse(codelet)
            except UnsupportedFileError:
                continue
            self.database.insert(codelet)

    def _get_url_ref(self, framework_name):
        """
        Return the ref that file urls on the given framework are built from.

        For GitHub this is the name of the currently checked out branch; for
        Bitbucket it is the hash of the HEAD commit. Both are read from the
        repository in the current working directory.

        :param framework_name: The name of the framework the repository is from.

        :type framework_name: str

        :return: The branch name or commit hash, or None if the framework is
            unknown or git could not provide it.
        :rtype: str, or None
        """

        try:
            if framework_name == "GitHub":
                listing = subprocess.check_output(
                        ["git", "branch", "--no-color"],
                        universal_newlines=True)
                # The checked out branch is the line marked with "* ".
                for line in listing.splitlines():
                    if line.startswith("* "):
                        return line[2:].strip() or None
                return None
            if framework_name == "Bitbucket":
                commit_hash = subprocess.check_output(
                        ["git", "rev-parse", "HEAD"],
                        universal_newlines=True)
                return commit_hash.strip() or None
        except subprocess.CalledProcessError:
            return None
        return None

    def _generate_file_url(self, filename, repo_url, framework_name, ref=None):
        """
        Return a url for a filename from a Git wrapper framework.

        :param filename: The path of the file.
        :param repo_url: The url of the file's parent repository.
        :param framework_name: The name of the framework the repository is from.
        :param ref: The branch name (GitHub) or commit hash (Bitbucket) to
            link to. When omitted it is looked up with git in the current
            working directory.

        :type filename: str
        :type repo_url: str
        :type framework_name: str
        :type ref: str, or None

        :return: The file's full url on the given framework, if successfully
            derived.
        :rtype: str, or None

        .. warning::
            Various Git subprocesses will occasionally fail, and, seeing as the
            information they provide is a crucial component of some repository
            file urls, None may be returned. None is also returned for a
            framework other than GitHub or Bitbucket.
        """

        if framework_name == "GitHub":
            view = "blob"
        elif framework_name == "Bitbucket":
            view = "src"
        else:
            return None

        if ref is None:
            ref = self._get_url_ref(framework_name)
            if ref is None:
                return None

        parts = [repo_url, view, ref, filename]
        return "/".join(part.strip("/") for part in parts)

    def _get_git_commits(self):
        """
        Return the current working directory's formatted commit data.

        Uses `git log --name-only -z` with a pretty format that prints, for
        each commit, two blank lines followed by the author name (`%an`) and
        the author timestamp (`%at`), to generate metadata about every file in
        the repository's commit history.

        :return: The author, timestamp, and names of all modified files of every
            commit.
            .. code-block:: python
               sample_returned_array = [
                   {
                       "author" : (str) "author"
                       "timestamp" : (`datetime.datetime`) <object>,
                       "filenames" : (str array) ["file1", "file2"]
                   }
               ]
        :rtype: array of dictionaries
        """

        git_log = subprocess.check_output(["git", "--no-pager", "log",
                "--name-only", "--pretty=format:%n%n%an%n%at", "-z"])

        commits = []
        for chunk in git_log.split("\n\n"):
            fields = chunk.split("\n")
            # Chunks without an author, a timestamp and a file list are not
            # commit records.
            if len(fields) <= 2:
                continue
            commits.append({
                "author" : fields[0],
                # fromtimestamp() without a tz yields a naive local datetime.
                "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])),
                # File names are NUL-separated; the last two pieces of the
                # split are dropped (presumably trailing separators of the -z
                # layout -- verify against git's output before changing).
                "filenames" : fields[2].split("\x00")[:-2]
            })

        return commits

    def _get_tracked_files(self):
        """
        Return a list of the filenames of all valuable files in the Git repository.

        Walk the current working directory and collect, relative to it, the
        path of every file that `_is_ascii()` considers to be text. `.git`
        directories are skipped, since their contents never appear as file
        names in the commit history.

        :return: The filenames of all index-worthy non-binary files.
        :rtype: str array
        """

        files = []
        for dirname, subdir_names, filenames in os.walk("."):
            # Pruning in place stops os.walk() from descending into .git.
            subdir_names[:] = [name for name in subdir_names if name != ".git"]
            for filename in filenames:
                path = os.path.join(dirname, filename)
                if self._is_ascii(path):
                    # Drop the leading "./" that walking "." puts on each path.
                    files.append(path[2:])

        return files

    def _get_commits_metadata(self):
        """
        Return a dictionary containing every valuable tracked file's metadata.

        Commits are processed in the order `_get_git_commits()` returns them:
        the first commit seen for a file sets both timestamps, and every later
        one overwrites "time_created" and may add an author.

        :return: A dictionary with author names, time of creation, and time of last
            modification for every filename key.
            .. code-block:: python
               sample_returned_dict = {
                   "my_file" : {
                       "authors" : (str array) ["author1", "author2"],
                       "time_created" : (`datetime.datetime`) <object>,
                       "time_last_modified" : (`datetime.datetime`) <object>
                   }
               }
        :rtype: dictionary of dictionaries
        """

        commits = self._get_git_commits()
        # A set keeps the per-file membership test constant-time.
        tracked_files = set(self._get_tracked_files())

        files_meta = {}
        for commit in commits:
            for filename in commit["filenames"]:
                if filename not in tracked_files:
                    continue

                meta = files_meta.get(filename)
                if meta is None:
                    files_meta[filename] = {
                        "authors" : [commit["author"]],
                        "time_last_modified" : commit["timestamp"],
                        "time_created" : commit["timestamp"]
                    }
                    continue

                if commit["author"] not in meta["authors"]:
                    meta["authors"].append(commit["author"])
                meta["time_created"] = commit["timestamp"]

        return files_meta

    def _decode(self, raw):
        """
        Return a decoded raw string.

        :param raw: The string to decode.

        :type raw: (str)

        :return: If the original encoding is successfully inferred, return the
            decoded string.
        :rtype: str, or None

        .. warning::
            The raw string's original encoding is identified by heuristics which
            can, and occasionally will, fail. Decoding will then fail, and None
            will be returned.
        """

        try:
            encoding = bs4.BeautifulSoup(raw).original_encoding
            if encoding is None:
                return None
            return raw.decode(encoding)
        except (LookupError, UnicodeDecodeError, UserWarning):
            return None

    def _is_ascii(self, filename):
        """
        Heuristically determine whether a file is ASCII text or binary.

        Only the first 512 bytes of the file are inspected. If they contain a
        null byte, or more than 30% of them are not ASCII text, then the file
        is concluded to be binary. This heuristic is used by the `file`
        utility, Perl's inbuilt `-T` operator, and is the de-facto method for
        determining whether a file is ASCII. An empty file counts as ASCII; a
        file that cannot be read does not.

        :param filename: The path of the file to test.

        :type filename: str

        :return: Whether the file is probably ASCII.
        :rtype: Boolean
        """

        try:
            # Binary mode: the bytes must be inspected exactly as stored.
            with open(filename, "rb") as source:
                file_snippet = source.read(512)
        except IOError:
            return False

        if not file_snippet:
            return True
        if b"\0" in file_snippet:
            return False

        # Deleting every text byte leaves only the non-text ones.
        non_ascii = file_snippet.translate(None, self._TEXT_BYTES)
        return float(len(non_ascii)) / len(file_snippet) <= 0.30
-
class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    :ivar run_event: An event-like object exposing `is_set()` (presumably a
        :class:`threading.Event`); :meth:`run` exits once it is cleared.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, index_queue, run_event):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`
        :param run_event: see :attr:`self.run_event`

        :type clone_queue: see :attr:`self.clone_queue`
        :type index_queue: see :attr:`self.index_queue`
        """

        self.clone_queue = clone_queue
        self.index_queue = index_queue
        self.run_event = run_event
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Polls :attr:`self.clone_queue` until a :class:`GitRepository` appears,
        then attempts cloning it. If successful, the cloned repository is
        added to :attr:`self.index_queue` for the `GitIndexer` to index;
        otherwise, it is discarded. Returns once :attr:`self.run_event` is
        cleared.
        """

        while True:
            # Wait for work on the *clone* queue. Waiting on the index queue
            # instead would never end: this thread is what fills that queue.
            while self.clone_queue.empty() and self.run_event.is_set():
                time.sleep(THREAD_QUEUE_SLEEP)
            if not self.run_event.is_set():
                break
            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception:
                # One bad repository must not stop the cloner, but the failure
                # should still be visible in the logs.
                self._logger.exception("Exception raised while cloning:")

    def _clone_repository(self, repo):
        """
        Attempt cloning a Git repository.

        Runs `git clone --single-branch` under a Perl `alarm` timeout. On
        failure the partial clone is removed and the repository is dropped; on
        success the repository is put on :attr:`self.index_queue`, waiting for
        free room if the queue is full.

        :param repo: Metadata about the repository to clone.

        :type repo: :class:`GitRepository`
        """

        GIT_CLONE_TIMEOUT = 500
        clone_path = "%s/%s" % (GIT_CLONE_DIR, repo.dirname)

        command = ["perl", "-e", "alarm shift @ARGV; exec @ARGV",
                   str(GIT_CLONE_TIMEOUT), "git", "clone", "--single-branch",
                   repo.url, clone_path]
        if subprocess.call(command) != 0:
            # NOTE(review): `pkill -f git` matches every process whose command
            # line contains "git", not only the clone started above.
            subprocess.call(["pkill", "-f", "git"]) # This makes Ben K upset
            if os.path.isdir(clone_path):
                shutil.rmtree(clone_path)
            return

        while self.index_queue.full():
            if not self.run_event.is_set():
                # Shutting down: GitIndexer.run() stops consuming once the
                # event is cleared, so drop this clone instead of waiting
                # forever for room in the queue.
                if os.path.isdir(clone_path):
                    shutil.rmtree(clone_path)
                return
            time.sleep(THREAD_QUEUE_SLEEP)
        self.index_queue.put(repo)
-
class _ChangeDir(object):
    """
    Context manager around os.chdir() that restores the previous directory.

    :ivar new_path: (str) The path to change the current directory to.
    :ivar old_path: (str) The path of the directory to return to; set when the
        `with` block is entered.
    """

    def __init__(self, new_path):
        """
        Remember the directory to enter; nothing changes until `with` runs.

        :param new_path: The directory to enter.

        :type new_path: str
        """

        self.new_path = new_path

    def __enter__(self):
        """
        Record the current working-directory, then switch to **new_path**.
        """

        previous_directory = os.getcwd()
        self.old_path = previous_directory
        os.chdir(self.new_path)

    def __exit__(self, *exception):
        """
        Switch back to **old_path**; a pending exception is not suppressed.

        :param exception: Various exception arguments passed by `with`.

        :type exception: varargs
        """

        os.chdir(self.old_path)
|