
Add tested indexer.

Add:
    bitshift/crawler/indexer.py
        -add _debug().
        -add content to the module docstring; add documentation to GitIndexer
         and the functions that were lacking it.
        -add another Perl one-liner to supplement the `git clone` subprocess
         call, which terminates it after a set amount of time (should it have
         frozen) -- fixes a major bug that caused the entire indexer to hang
         (see the sketch below).
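The one-liner works because perl's alarm() schedules a SIGALRM, and pending alarms survive exec(), so the signal still reaches the git process if the clone outlives the limit. A minimal sketch of the pattern, with a hypothetical repository URL (the real call appears in _index_repository below):

    import subprocess

    GIT_CLONE_TIMEOUT = 60  # seconds; the value used later in the diff

    # perl consumes the first argument as the alarm() delay, then exec()s the
    # rest ("git clone <url>"); SIGALRM kills the clone if it hangs, and
    # call() then returns nonzero.
    exit_code = subprocess.call(
        "perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone %s"
        % (GIT_CLONE_TIMEOUT, "https://github.com/user/repo"), shell=True)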
tags/v1.0^2
Severyn Kozak committed 10 years ago · commit 627c848f20
3 changed files, with 120 additions and 63 deletions:
  1. bitshift/crawler/__init__.py (+4, -5)
  2. bitshift/crawler/crawler.py (+7, -18)
  3. bitshift/crawler/indexer.py (+109, -40)

bitshift/crawler/__init__.py (+4, -5)

@@ -6,8 +6,7 @@ Contains functions for initializing all subsidiary, threaded crawlers.

 import Queue
 
-from bitshift.crawler import crawler
-from bitshift.crawler import git_indexer
+from bitshift.crawler import crawler, indexer
 
 __all__ = ["crawl"]

@@ -19,12 +18,12 @@ def crawl():

     Start the:
         1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`
-        2. Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer`
+        2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`
     """
 
     repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
-    indexer = git_indexer.GitIndexer(repository_queue)
+    git_indexer = indexer.GitIndexer(repository_queue)
 
-    for thread in [github_crawler, indexer]:
+    for thread in [github_crawler, git_indexer]:
         thread.start()
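For reference, each item that flows through repository_queue is a plain dict; the keys below come from GitHubCrawler.run() in the crawler.py diff that follows, while the values are hypothetical:

    sample_repository = {
        "url" : "https://github.com/user/repo",  # repo["html_url"]
        "name" : "repo",                         # repo["name"], as of this commit
        "framework_name" : "GitHub"
    }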

bitshift/crawler/crawler.py (+7, -18)

@@ -1,12 +1,12 @@
"""
:synopsis: Main crawler module, to oversee all site-specific crawlers.

...more info soon...
Contains all website/framework-specific Class crawlers.
"""

import requests, time, threading

import bitshift.crawler.git_indexer
import bitshift.crawler.indexer

from ..codelet import Codelet
from ..database import Database
@@ -17,12 +17,12 @@ class GitHubCrawler(threading.Thread):

     GitHubCrawler is a threaded singleton that queries GitHub's API for URLs
     to its public repositories, which it inserts into a :class:`Queue.Queue`
-    shared with :class:`bitshift.crawler.git_indexer.GitIndexer`.
+    shared with :class:`bitshift.crawler.indexer.GitIndexer`.
 
     :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with
         repository information retrieved by `GitHubCrawler`, and other Git
         crawlers, to be processed by
-        :class:`bitshift.crawler.git_indexer.GitIndexer`.
+        :class:`bitshift.crawler.indexer.GitIndexer`.
     """
 
     def __init__(self, repository_queue):
@@ -31,7 +31,7 @@ class GitHubCrawler(threading.Thread):

         :param repository_queue: A queue containing dictionaries of repository
             metadata retrieved by `GitHubCrawler`, meant to be processed by an
-            instance of :class:`bitshift.crawler.git_indexer.GitIndexer`.
+            instance of :class:`bitshift.crawler.indexer.GitIndexer`.
 
             .. code-block:: python
                 sample_dict = {
@@ -43,7 +43,6 @@ class GitHubCrawler(threading.Thread):
         :type repository_queue: :class:`Queue.Queue`
         """
 
-
         self.repository_queue = repository_queue
         super(GitHubCrawler, self).__init__()

@@ -65,26 +64,16 @@ class GitHubCrawler(threading.Thread):
         api_request_interval = 5e3 / 60 ** 2
 
         while len(next_api_url) > 0:
-            # DEBUG
-            db.log.insert({
-                "time" : str(time.time()).split(".")[0][-4:],
-                "qsize" : self.repository_queue.qsize()
-            })
-
             start_time = time.time()
             response = requests.get(next_api_url, params=authentication_params)
 
             for repo in response.json():
-                logging.basicConfig(filename="crawler.log", level=logging.DEBUG)
-                logging.debug("crawler: %-20s: %-5s: %-5s: %s",
-                        str(time.time()).split(".")[0],
-                        self.repository_queue.qsize(), repo["id"],
-                        repo["name"])
                 while self.repository_queue.full():
                     pass
 
                 self.repository_queue.put({
                     "url" : repo["html_url"],
-                    "name" : repo["html_url"].split("/")[-1],
+                    "name" : repo["name"],
                     "framework_name" : "GitHub"
                 })
-
-
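Two notes on this hunk. The rate-limit arithmetic: api_request_interval = 5e3 / 60 ** 2 works out to 5000 / 3600 ≈ 1.39 seconds between requests, matching GitHub's 5,000-requests-per-hour authenticated limit. Also, the `while ... full(): pass` spin loop burns CPU needlessly; Queue.put() blocks until a slot frees up, so a plain blocking put would behave the same. A minimal sketch, with a hypothetical queue and item:

    import Queue

    queue = Queue.Queue(maxsize=10)

    # block=True is the default: put() sleeps until the consumer frees a
    # slot, so no spin loop is required.
    queue.put("https://github.com/user/repo")  # hypothetical item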
bitshift/crawler/indexer.py (+109, -40)

@@ -1,28 +1,60 @@
"""
:synopsis: Index all the files in a Git repository.

.. todo::
Add documentation, threaded Indexer class.
:synopsis: Contains a singleton GitIndexer class, which clones and indexes git
repositories.
"""

import os, shutil, subprocess, threading
import bs4, os, re, shutil, subprocess, threading

from ..database import Database
from ..codelet import Codelet

GIT_CLONE_DIR = "/tmp"
GIT_CLONE_DIR = "/tmp/bitshift"

class GitIndexer(threading.Thread):
"""
A singleton Git repository indexer.

`GitIndexer` clones and indexes the repositories at urls found by the
:mod:`bitshift.crawler.crawler` Git crawlers.

:ivar repository_queue: (:class:`Queue.Queue`) A queue containing urls found
by the :mod:`bitshift.crawler.crawler` Git crawlers.
"""

def __init__(self, repository_queue):
"""
Create an instance of the singleton `GitIndexer`.

:param repository_queue: see :attr:`GitIndexer.repository_queue`

:type repository_queue: see :attr:`GitIndexer.repository_queue`
"""

self.repository_queue = repository_queue
super(GitIndexer, self).__init__()

def run(self):
"""
Retrieve new repository urls, clone, and index them.

Blocks until new urls appear in :attr:`GitIndexer.repository_queue`,
then retrieves one, and attempts cloning/indexing it. Should any errors
occur, the new repository will be discarded and the crawler will
index the next in the queue.
"""

while True:
while self.repository_queue.empty():
pass
new_repo = self.repository_queue.get()
_index_repository(new_repo["url"], new_repo["framework_name"])

repo = self.repository_queue.get()
self.repository_queue.task_done()

try:
_index_repository(repo["url"], repo["name"],
repo["framework_name"])
except: # desperate times -- will be modified later
pass

class _ChangeDir(object):
"""
@@ -62,7 +94,7 @@ class _ChangeDir(object):

         os.chdir(self.old_path)
 
-def _index_repository(repo_url, framework_name):
+def _index_repository(repo_url, repo_name, framework_name):
     """
     Clone and index (create and insert Codelets for) a Git repository.

@@ -70,32 +102,30 @@ def _index_repository(repo_url, framework_name):
     _insert_repository_codelets, then remove said repository.
 
     :param repo_url: The url the Git repository was cloned from.
+    :param repo_name: The name of the repository.
     :param framework_name: The name of the framework the repository is from.
 
     :type repo_url: str
+    :type repo_name: str
     :type framework_name: str
-
-    :return: Temporary: the new codelets, for testing purposes.
-    :rtype: Codelet array
     """
 
-    repo_name = repo_url.split("/")[-1]
-    codelets = []
+    GIT_CLONE_TIMEOUT = 60
 
     with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
-        subprocess.call("git clone %s" % repo_url, shell=True)
+        if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \
+                clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0:
+            return
+
         with _ChangeDir(repo_name) as repository_dir:
-            codelets = _insert_repository_codelets(repo_url, repo_name,
-                    framework_name)
+            _insert_repository_codelets(repo_url, repo_name, framework_name)
         shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
-
-    return codelets
 
 def _insert_repository_codelets(repo_url, repo_name, framework_name):
     """
-    Create a Codelet for the files inside a Git repository.
+    Create and insert a Codelet for the files inside a Git repository.
 
-    Create a new Codelet, and insert it into the Database singlet, for every
+    Create a new Codelet, and insert it into the Database singleton, for every
     file inside the current working directory's default branch (usually
     *master*).
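Since this codebase targets Python 2 (subprocess gained a timeout parameter only in Python 3.3), the Perl alarm wrapper above is a pragmatic workaround. A pure-Python alternative is to kill the child from a threading.Timer; the helper below is a hypothetical sketch, not part of the module:

    import shlex, subprocess, threading

    def _call_with_timeout(command, timeout):
        """Run `command`, killing it after `timeout` seconds if necessary."""
        process = subprocess.Popen(shlex.split(command))
        timer = threading.Timer(timeout, process.kill)
        timer.start()
        try:
            return process.wait()  # negative on death-by-signal, like call()
        finally:
            timer.cancel()

    # e.g., mirroring the diff's failure check:
    # if _call_with_timeout("git clone %s" % repo_url, 60) != 0:
    #     return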

@@ -108,21 +138,27 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name):
     :type framework_name: str
     """
 
-    codelets = []
     commits_meta = _get_commits_metadata()
     for filename in commits_meta.keys():
         with open(filename, "r") as source_file:
-            source = source_file.read()
+            source = _decode(source_file.read())
+            if source is None:
+                return
 
-        authors = [(author,) for author in commits_meta[filename]["authors"]]
-        codelets.append(
-                Codelet("%s:%s" % (repo_name, filename), source, filename,
+        authors = [(_decode(author),) for author in \
+                commits_meta[filename]["authors"]]
+        codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
                         None, authors, _generate_file_url(filename, repo_url,
-                        framework_name),
+                                framework_name),
                         commits_meta[filename]["time_created"],
-                        commits_meta[filename]["time_last_modified"]))
+                        commits_meta[filename]["time_last_modified"])
 
-    return codelets
+        db.codelets.insert({
+            "name" : codelet.name,
+            "authors" : codelet.authors
+        })
+
+        # Database.insert(codelet)
 
 def _generate_file_url(filename, repo_url, framework_name):
     """
@@ -142,7 +178,7 @@ def _generate_file_url(filename, repo_url, framework_name):

if framework_name == "GitHub":
default_branch = subprocess.check_output("git branch --no-color",
shell=True)[2:-1]
shell=True)[2:-1]
return "%s/blob/%s/%s" % (repo_url, default_branch, filename)

def _get_git_commits():
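For concreteness: `git branch --no-color` prints lines like "* master", so the [2:-1] slice drops the "* " prefix and the trailing newline. With hypothetical inputs, the URL assembled above looks like:

    repo_url = "https://github.com/user/repo"  # hypothetical
    default_branch = "* master\n"[2:-1]        # -> "master"
    filename = "src/module.py"                 # hypothetical

    print "%s/blob/%s/%s" % (repo_url, default_branch, filename)
    # https://github.com/user/repo/blob/master/src/module.py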
@@ -165,8 +201,7 @@ def _get_git_commits():
     :rtype: dictionary
     """
 
-    git_log = subprocess.check_output(
-            ("git --no-pager log --name-only"
+    git_log = subprocess.check_output(("git --no-pager log --name-only"
             " --pretty=format:'%n%n%an%n%at' -z"), shell=True)
 
     commits = []
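In the format string, %an is the author name and %at the author date as a Unix timestamp; -z NUL-terminates entries so filenames containing newlines survive parsing. A rough sketch of extracting those fields from one record -- an illustration only, since the module's actual parsing code falls outside this hunk:

    # One hypothetical record: two leading newlines, author, timestamp, a
    # blank line, then the NUL-terminated --name-only file list.
    raw = "\n\nJohn Doe\n1398000000\n\nfile_a.py\x00file_b.py\x00"

    lines = raw.strip("\x00").split("\n")
    author, timestamp = lines[2], int(lines[3])
    filenames = lines[5].split("\x00")
    # -> "John Doe", 1398000000, ["file_a.py", "file_b.py"]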
@@ -183,24 +218,34 @@ def _get_git_commits():

 def _get_tracked_files():
     """
-    Return a list of the filenames of all files in the Git repository.
+    Return a list of the filenames of all valuable files in the Git repository.
 
     Get a list of the filenames of the non-binary (Perl heuristics used for
     filetype identification) files currently inside the current working
-    directory's Git repository.
+    directory's Git repository. Then, weed out any boilerplate/non-code files
+    that match the regex rules in GIT_IGNORE_FILES.
 
-    :return: The filenames of all non-binary files.
+    :return: The filenames of all index-worthy non-binary files.
     :rtype: str array
     """
 
-    tracked_files = subprocess.check_output(
-            ("perl -le 'for (@ARGV){ print if -f && -T }'"
-            " $(find . -type d -name .git -prune -o -print)"), shell=True)
-    return [filename[2:] for filename in tracked_files.split("\n")[:-1]]
+    GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]
+
+    tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \
+            -f && -T }' $(find . -type d -name .git -prune -o -print)"),
+            shell=True).split("\n")[:-1]
+
+    valuable_files = []
+    for filename in tracked_files:
+        filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
+                for pattern in GIT_IGNORE_FILES])
+        if not filename_match:
+            valuable_files.append(filename[2:])
+    return valuable_files
 
 def _get_commits_metadata():
     """
-    Return a dictionary containing every tracked file's metadata.
+    Return a dictionary containing every valuable tracked file's metadata.
 
     :return: A dictionary with author names, time of creation, and time of last
         modification for every filename key.
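Perl's -T file test, used by _get_tracked_files above, is a content heuristic: it samples the start of the file and reports "text" when the bytes are mostly printable and contain no NULs. (Note also that each GIT_IGNORE_FILES pattern begins with .*, so re.match effectively searches the whole filename.) A rough Python 2 analogue of the heuristic, offered only as a hypothetical sketch:

    def _probably_text(path, sample_size=512):
        """Crude stand-in for perl's -T file-test heuristic."""
        with open(path, "rb") as source_file:
            sample = source_file.read(sample_size)
        if "\0" in sample:  # NUL bytes almost always mean binary
            return False
        printable = sum(1 for char in sample
                        if 32 <= ord(char) <= 126 or char in "\t\n\r")
        return not sample or printable / float(len(sample)) > 0.7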
@@ -236,3 +281,27 @@ def _get_commits_metadata():
files_meta[filename]["time_created"] = commit["timestamp"]

return files_meta
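Putting the docstring and the assignments above together, the returned dictionary is shaped roughly like this (all values hypothetical):

    files_meta = {
        "src/module.py" : {
            "authors" : ["John Doe", "Jane Doe"],
            "time_created" : 1398000000,
            "time_last_modified" : 1398100000
        }
    }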

+def _decode(raw):
+    """
+    Return a decoded raw string.
+
+    :param raw: The string to decode.
+
+    :type raw: (str)
+
+    :return: If the original encoding is successfully inferred, return the
+        decoded string.
+    :rtype: str, or None
+
+    .. warning::
+        The raw string's original encoding is identified by heuristics which
+        can, and occasionally will, fail. Decoding will then fail, and None
+        will be returned.
+    """
+
+    try:
+        return raw.decode(bs4.BeautifulSoup(raw).original_encoding)
+
+    except (UnicodeDecodeError, UserWarning):
+        return None
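bs4 exposes the same charset-detection machinery directly as UnicodeDammit, which BeautifulSoup(raw).original_encoding consults under the hood; a short usage sketch:

    from bs4 import UnicodeDammit

    dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")  # raw UTF-8 bytes
    print dammit.original_encoding                # "utf-8"
    print dammit.unicode_markup                   # u"Sacré bleu!"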
