From 627c848f208d65d62389482b3467e47279200ce0 Mon Sep 17 00:00:00 2001
From: Severyn Kozak
Date: Wed, 16 Apr 2014 16:41:14 -0400
Subject: [PATCH] Add tested indexer.

Add:
    bitshift/crawler/indexer.py
        -add _debug().
        -add content to the module docstring; add documentation to GitIndexer
            and the functions that were lacking it.
        -add another Perl one-liner wrapping the `git clone` subprocess call,
            which terminates the clone after a set amount of time if it has
            frozen -- fixes a major bug that caused the entire indexer to hang.
---
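NOTE (review, not part of the commit message): the clone timeout added below
relies on a classic Perl idiom: alarm() arms a SIGALRM timer that survives
exec(), so the exec'd `git clone` is killed by the signal if it outlives the
timeout. A rough standalone equivalent, sketched for reference (the function
name clone_with_timeout is illustrative and not taken from this patch):

    import subprocess

    GIT_CLONE_TIMEOUT = 60  # seconds; mirrors the constant added below

    def clone_with_timeout(repo_url):
        """Clone repo_url, killing the clone if it exceeds the timeout.

        Perl arms a SIGALRM timer and then exec()s `git clone`; the timer
        survives the exec, so a wedged clone is terminated after
        GIT_CLONE_TIMEOUT seconds. Returns True if the clone succeeded.
        """
        command = "perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone %s" \
                % (GIT_CLONE_TIMEOUT, repo_url)
        return subprocess.call(command, shell=True) == 0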
 bitshift/crawler/__init__.py |   9 ++-
 bitshift/crawler/crawler.py  |  25 ++------
 bitshift/crawler/indexer.py  | 149 +++++++++++++++++++++++++++++++------------
 3 files changed, 120 insertions(+), 63 deletions(-)

diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py
index 6c13be9..4875712 100644
--- a/bitshift/crawler/__init__.py
+++ b/bitshift/crawler/__init__.py
@@ -6,8 +6,7 @@ Contains functions for initializing all subsidiary, threaded crawlers.
 
 import Queue
 
-from bitshift.crawler import crawler
-from bitshift.crawler import git_indexer
+from bitshift.crawler import crawler, indexer
 
 __all__ = ["crawl"]
 
@@ -19,12 +18,12 @@ def crawl():
 
     Start the:
     1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`
-    2. Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer`
+    2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`
     """
 
     repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
-    indexer = git_indexer.GitIndexer(repository_queue)
+    git_indexer = indexer.GitIndexer(repository_queue)
 
-    for thread in [github_crawler, indexer]:
+    for thread in [github_crawler, git_indexer]:
         thread.start()

diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py
index 5b0f600..8b9576d 100644
--- a/bitshift/crawler/crawler.py
+++ b/bitshift/crawler/crawler.py
@@ -1,12 +1,12 @@
 """
 :synopsis: Main crawler module, to oversee all site-specific crawlers.
 
-...more info soon...
+Contains all website/framework-specific crawler classes.
 """
 
 import requests, time, threading
 
-import bitshift.crawler.git_indexer
+import bitshift.crawler.indexer
 
 from ..codelet import Codelet
 from ..database import Database
@@ -17,12 +17,12 @@ class GitHubCrawler(threading.Thread):
 
     GitHubCrawler is a threaded singleton that queries GitHub's API for URLs
     to its public repositories, which it inserts into a :class:`Queue.Queue`
-    shared with :class:`bitshift.crawler.git_indexer.GitIndexer`.
+    shared with :class:`bitshift.crawler.indexer.GitIndexer`.
 
     :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with
         repository information retrieved by `GitHubCrawler`, and other Git
         crawlers, to be processed by
-        :class:`bitshift.crawler.git_indexer.GitIndexer`.
+        :class:`bitshift.crawler.indexer.GitIndexer`.
     """
 
     def __init__(self, repository_queue):
@@ -31,7 +31,7 @@ class GitHubCrawler(threading.Thread):
 
         :param repository_queue: A queue containing dictionaries of repository
             metadata retrieved by `GitHubCrawler`, meant to be processed by an
-            instance of :class:`bitshift.crawler.git_indexer.GitIndexer`.
+            instance of :class:`bitshift.crawler.indexer.GitIndexer`.
 
             .. code-block:: python
                 sample_dict = {
@@ -43,7 +43,6 @@ class GitHubCrawler(threading.Thread):
 
         :type repository_queue: :class:`Queue.Queue`
         """
-
         self.repository_queue = repository_queue
         super(GitHubCrawler, self).__init__()
 
@@ -65,26 +64,16 @@
         api_request_interval = 5e3 / 60 ** 2
 
         while len(next_api_url) > 0:
-            # DEBUG
-            db.log.insert({
-                "time" : str(time.time()).split(".")[0][-4:],
-                "qsize" : self.repository_queue.qsize()
-            })
-
             start_time = time.time()
             response = requests.get(next_api_url, params=authentication_params)
 
             for repo in response.json():
-                logging.basicConfig(filename="crawler.log", level=logging.DEBUG)
-                logging.debug("crawler: %-20s: %-5s: %-5s: %s",
-                        str(time.time()).split(".")[0],
-                        self.repository_queue.qsize(), repo["id"],
-                        repo["name"])
                 while self.repository_queue.full():
                     pass
+
                 self.repository_queue.put({
                     "url" : repo["html_url"],
-                    "name" : repo["html_url"].split("/")[-1],
+                    "name" : repo["name"],
                     "framework_name" : "GitHub"
                 })
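NOTE (review): both threads poll their shared queue with spin loops
(`while queue.full(): pass` above, `while queue.empty(): pass` below), which
burns a core while idle. Queue.Queue.put() and .get() already block, so a
possible follow-up is to rely on that instead; a sketch against the
GitIndexer.run() added below (illustrative, not part of this patch):

    def run(self):
        while True:
            # get() blocks until an item is available, so no spin loop is
            # needed; the crawler's put() call blocks symmetrically when
            # the queue is full.
            repo = self.repository_queue.get()
            try:
                _index_repository(repo["url"], repo["name"],
                        repo["framework_name"])
            except Exception:
                pass
            finally:
                # mark the item processed only after indexing finishes
                self.repository_queue.task_done()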
diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py
index 2268895..f2a8bbf 100644
--- a/bitshift/crawler/indexer.py
+++ b/bitshift/crawler/indexer.py
@@ -1,28 +1,60 @@
 """
-:synopsis: Index all the files in a Git repository.
-
-.. todo::
-    Add documentation, threaded Indexer class.
+:synopsis: Contains a singleton GitIndexer class, which clones and indexes Git
+    repositories.
 """
 
-import os, shutil, subprocess, threading
+import bs4, os, re, shutil, subprocess, threading
 
 from ..database import Database
 from ..codelet import Codelet
 
-GIT_CLONE_DIR = "/tmp"
+GIT_CLONE_DIR = "/tmp/bitshift"
 
 class GitIndexer(threading.Thread):
+    """
+    A singleton Git repository indexer.
+
+    `GitIndexer` clones and indexes the repositories at URLs found by the
+    :mod:`bitshift.crawler.crawler` Git crawlers.
+
+    :ivar repository_queue: (:class:`Queue.Queue`) A queue containing URLs
+        found by the :mod:`bitshift.crawler.crawler` Git crawlers.
+    """
+
     def __init__(self, repository_queue):
+        """
+        Create an instance of the singleton `GitIndexer`.
+
+        :param repository_queue: see :attr:`GitIndexer.repository_queue`
+
+        :type repository_queue: see :attr:`GitIndexer.repository_queue`
+        """
+
         self.repository_queue = repository_queue
         super(GitIndexer, self).__init__()
 
     def run(self):
+        """
+        Retrieve new repository URLs, then clone and index them.
+
+        Blocks until new URLs appear in :attr:`GitIndexer.repository_queue`,
+        then retrieves one, and attempts to clone and index the corresponding
+        repository. Should any errors occur, the repository is discarded, and
+        the indexer moves on to the next URL in the queue.
+        """
+
         while True:
             while self.repository_queue.empty():
                 pass
-            new_repo = self.repository_queue.get()
-            _index_repository(new_repo["url"], new_repo["framework_name"])
+
+            repo = self.repository_queue.get()
+            self.repository_queue.task_done()
+
+            try:
+                _index_repository(repo["url"], repo["name"],
+                        repo["framework_name"])
+            except Exception: # desperate times -- will be modified later
+                pass
 
 class _ChangeDir(object):
     """
@@ -62,7 +94,7 @@ class _ChangeDir(object):
         os.chdir(self.old_path)
 
-def _index_repository(repo_url, framework_name):
+def _index_repository(repo_url, repo_name, framework_name):
     """
     Clone and index (create and insert Codelets for) a Git repository.
 
     Clone the repository pointed to by repo_url, call
@@ -70,32 +102,30 @@
     _insert_repository_codelets, then remove said repository.
 
     :param repo_url: The url the Git repository was cloned from.
+    :param repo_name: The name of the repository.
     :param framework_name: The name of the framework the repository is from.
 
     :type repo_url: str
+    :type repo_name: str
     :type framework_name: str
-
-    :return: Temporary: the new codelets, for testing purposes.
-    :rtype: Codelet array
     """
 
-    repo_name = repo_url.split("/")[-1]
-    codelets = []
+    GIT_CLONE_TIMEOUT = 60
 
     with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
-        subprocess.call("git clone %s" % repo_url, shell=True)
+        if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \
+                clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0:
+            return
+
         with _ChangeDir(repo_name) as repository_dir:
-            codelets = _insert_repository_codelets(repo_url, repo_name,
-                    framework_name)
+            _insert_repository_codelets(repo_url, repo_name, framework_name)
         shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
 
-    return codelets
-
 def _insert_repository_codelets(repo_url, repo_name, framework_name):
     """
-    Create a Codelet for the files inside a Git repository.
+    Create and insert a Codelet for the files inside a Git repository.
 
-    Create a new Codelet, and insert it into the Database singlet, for every
+    Create a new Codelet, and insert it into the Database singleton, for every
     file inside the current working directory's default branch (usually
     *master*).
 
@@ -108,21 +138,27 @@
     :type framework_name: str
     """
 
-    codelets = []
     commits_meta = _get_commits_metadata()
     for filename in commits_meta.keys():
         with open(filename, "r") as source_file:
-            source = source_file.read()
+            source = _decode(source_file.read())
+            if source is None:
+                continue
 
-        authors = [(author,) for author in commits_meta[filename]["authors"]]
-        codelets.append(
-            Codelet("%s:%s" % (repo_name, filename), source, filename,
+        authors = [(_decode(author),) for author in \
+                commits_meta[filename]["authors"]]
+        codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
                 None, authors, _generate_file_url(filename, repo_url,
-                framework_name),
+                        framework_name),
                 commits_meta[filename]["time_created"],
-                commits_meta[filename]["time_last_modified"]))
+                commits_meta[filename]["time_last_modified"])
 
-    return codelets
+        db.codelets.insert({
+            "name" : codelet.name,
+            "authors" : codelet.authors
+        })
+
+        # Database.insert(codelet)
 
 def _generate_file_url(filename, repo_url, framework_name):
     """
@@ -142,7 +178,7 @@
 
     if framework_name == "GitHub":
         default_branch = subprocess.check_output("git branch --no-color",
-                shell=True)[2:-1]
+            shell=True)[2:-1]
         return "%s/blob/%s/%s" % (repo_url, default_branch, filename)
 
 def _get_git_commits():
@@ -165,8 +201,7 @@
     :rtype: dictionary
     """
 
-    git_log = subprocess.check_output(
-            ("git --no-pager log --name-only"
+    git_log = subprocess.check_output(("git --no-pager log --name-only"
             " --pretty=format:'%n%n%an%n%at' -z"), shell=True)
 
     commits = []
@@ -183,24 +218,34 @@
 def _get_tracked_files():
     """
-    Return a list of the filenames of all files in the Git repository.
+    Return a list of the filenames of all valuable files in the Git repository.
 
     Get a list of the filenames of the non-binary (Perl heuristics used for
     filetype identification) files currently inside the current working
-    directory's Git repository.
+    directory's Git repository. Then, weed out any boilerplate/non-code files
+    that match the regex rules in GIT_IGNORE_FILES.
 
-    :return: The filenames of all non-binary files.
+    :return: The filenames of all index-worthy non-binary files.
     :rtype: str array
     """
 
-    tracked_files = subprocess.check_output(
-            ("perl -le 'for (@ARGV){ print if -f && -T }'"
-            " $(find . -type d -name .git -prune -o -print)"), shell=True)
-    return [filename[2:] for filename in tracked_files.split("\n")[:-1]]
+    GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]
+
+    tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \
+            -f && -T }' $(find . -type d -name .git -prune -o -print)"),
+            shell=True).split("\n")[:-1]
+
+    valuable_files = []
+    for filename in tracked_files:
+        filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
+                for pattern in GIT_IGNORE_FILES])
+        if not filename_match:
+            valuable_files.append(filename[2:])
+    return valuable_files
 
 def _get_commits_metadata():
     """
-    Return a dictionary containing every tracked file's metadata.
+    Return a dictionary containing every valuable tracked file's metadata.
 
     :return: A dictionary with author names, time of creation, and time of last
         modification for every filename key.
@@ -236,3 +281,27 @@
             files_meta[filename]["time_created"] = commit["timestamp"]
 
     return files_meta
+
+def _decode(raw):
+    """
+    Return a decoded raw string.
+
+    :param raw: The string to decode.
+
+    :type raw: str
+
+    :return: If the original encoding is successfully inferred, return the
+        decoded string.
+    :rtype: str, or None
+
+    .. warning::
+        The raw string's original encoding is identified by heuristics which
+        can, and occasionally will, fail. Decoding will then fail, and None
+        will be returned.
+    """
+
+    try:
+        return raw.decode(bs4.BeautifulSoup(raw).original_encoding)
+
+    except (UnicodeDecodeError, UserWarning):
+        return None
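NOTE (review): two smaller observations on indexer.py as added above. First,
`db.codelets.insert(...)` references a `db` handle that is never defined in
this module; it reads like a temporary database handle left over from testing,
to be replaced by `Database.insert(codelet)` as the inline comment suggests.
Second, under Python's default warning filters a UserWarning is printed rather
than raised, so the `except (UnicodeDecodeError, UserWarning)` clause in
_decode() will rarely catch anything beyond the decode error. _decode() could
also call bs4's charset detector directly instead of building a parse tree; a
sketch (illustrative, not part of this patch):

    from bs4 import UnicodeDammit

    def _decode(raw):
        """Decode raw to unicode, or return None if no encoding fits."""
        # UnicodeDammit is the same charset detector that backs
        # BeautifulSoup(raw).original_encoding, minus the HTML parsing;
        # unicode_markup is None when detection fails.
        return UnicodeDammit(raw).unicode_markup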