From f38772760b6dbe46410ca87407c7dab919079c3f Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 19 Apr 2014 15:33:21 -0400 Subject: [PATCH] Remove some subprocesses, comment out logging. Add: bitshift/crawler/ (crawler, indexer).py -comment out all logging statements, as they may be causing a memory leak (the crawler is meant to run perpetually, meaning that, depending on how the `logging` module is implemented, it may be accumulating logged strings in memory.) bitshift/crawler/indexer.py -make `_index_repository()` and `_index_repository_codelets()` functions of the `GitIndexer` class. -replace `_get_tracked_files()` subprocess call, which found the files in a Git repository and removed any that were non-ASCII, with a pure Python solution. -add `_is_ascii()`. --- bitshift/crawler/crawler.py | 18 +-- bitshift/crawler/indexer.py | 269 ++++++++++++++++++++++++++++---------------- 2 files changed, 181 insertions(+), 106 deletions(-) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 347fd9a..10dd961 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -34,7 +34,7 @@ class GitHubCrawler(threading.Thread): """ self.clone_queue = clone_queue - logging.info("Starting %s." % self.__class__.__name__) + # logging.info("Starting %s." % self.__class__.__name__) super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -61,10 +61,10 @@ class GitHubCrawler(threading.Thread): queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 - logging.info("API call made. Limit remaining: %s. Queue-size: (%d" - "%%) %d/%d" % (response.headers["x-ratelimit-remaining"], - queue_percent_full, self.clone_queue.qsize(), - self.clone_queue.maxsize)) + # logging.info("API call made. Limit remaining: %s. 
Queue-size: (%d" + # "%%) %d/%d" % (response.headers["x-ratelimit-remaining"], + # queue_percent_full, self.clone_queue.qsize(), + # self.clone_queue.maxsize)) for repo in response.json(): while self.clone_queue.full(): @@ -107,7 +107,7 @@ class BitbucketCrawler(threading.Thread): """ self.clone_queue = clone_queue - logging.info("Starting %s." % self.__class__.__name__) + # logging.info("Starting %s." % self.__class__.__name__) super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -127,9 +127,9 @@ class BitbucketCrawler(threading.Thread): queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 - logging.info("API call made. Queue-size: (%d%%) %d/%d" % ( - queue_percent_full, self.clone_queue.qsize(), - self.clone_queue.maxsize)) + # logging.info("API call made. Queue-size: (%d%%) %d/%d" % ( + # queue_percent_full, self.clone_queue.qsize(), + # self.clone_queue.maxsize)) for repo in response["values"]: if repo["scm"] == "git": diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 563f369..3bff3e7 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,7 @@ repositories. 
""" -import bs4, logging, os, Queue, re, shutil, subprocess, time, threading +import bs4, logging, os, Queue, re, shutil, string, subprocess, time, threading from ..database import Database from ..codelet import Codelet @@ -63,10 +63,12 @@ class GitIndexer(threading.Thread): MAX_INDEX_QUEUE_SIZE = 10 - logging.info("Starting.") + # logging.info("Starting.") + self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.git_cloner = _GitCloner(clone_queue, self.index_queue) self.git_cloner.start() + self.codelet_count = 0 #debug if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) @@ -89,14 +91,91 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - _index_repository(repo.url, repo.name, repo.framework_name) + self._index_repository(repo.url, repo.name, repo.framework_name) + + def _index_repository(self, repo_url, repo_name, framework_name): + """ + Clone and index (create and insert Codeletes for) a Git repository. + + `git clone` the Git repository located at **repo_url**, call + _insert_repository_codelets, then remove said repository. + + :param repo_url: The url the Git repository was cloned from. + :param repo_name: The name of the repository. + :param framework_name: The name of the framework the repository is from. + + :type repo_url: str + :type repo_name: str + :type framework_name: str + """ + + # logging.info("Indexing repository %s." 
% repo_url) + with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: + try: + self._insert_repository_codelets(repo_url, repo_name, + framework_name) + except Exception as exception: + # logging.warning( + # "_insert_repository_codelets() failed: %s: %s: %s" % + # (exception.__class__.__name__, exception, repo_url)) + pass + + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + + def _insert_repository_codelets(self, repo_url, repo_name, framework_name): + """ + Create and insert a Codelet for the files inside a Git repository. + + Create a new Codelet, and insert it into the Database singleton, for every + file inside the current working directory's default branch (usually + *master*). + + :param repo_url: The url the Git repository was cloned from. + :param repo_name: The name of the repository. + :param framework_name: The name of the framework the repository is from. + + :type repo_url: str + :type repo_name: str + :type framework_name: str + """ + + commits_meta = _get_commits_metadata() + for filename in commits_meta.keys(): + try: + with open(filename, "r") as source_file: + source = _decode(source_file.read()) + if source is None: + return + except IOError as exception: + # logging.warning( + # "_insert_repository_codelets() failed: %s: %s: %s" % + # (exception.__class__.__name__, exception, repo_url)) + pass + + authors = [(_decode(author),) for author in \ + commits_meta[filename]["authors"]] + codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, + None, authors, _generate_file_url(filename, repo_url, + framework_name), + commits_meta[filename]["time_created"], + commits_meta[filename]["time_last_modified"]) + + self.codelet_count += 1 #debug + if self.codelet_count % 500 == 0: #debug + logging.info("Number of codelets indexed: %d.", self.codelet_count) #debug + + # Database.insert(codelet) class _GitCloner(threading.Thread): """ A singleton Git repository 
cloner. + Clones the repositories crawled by :class:`crawler.GitHubCrawler` for + :class:`GitIndexer` to index. + :ivar clone_queue: (:class:`Queue.Queue`) see - :attr:`bitshift.crawler.crawler.GitHubCrawler.clone_queue`. + :attr:`crawler.GitHubCrawler.clone_queue`. :ivar index_queue: (:class:`Queue.Queue`) see :attr:`GitIndexer.index_queue`. """ @@ -112,6 +191,8 @@ class _GitCloner(threading.Thread): :type index_queue: see :attr:`self.index_queue` """ + # logging.info("Starting.") + self.clone_queue = clone_queue self.index_queue = index_queue super(_GitCloner, self).__init__(name=self.__class__.__name__) @@ -146,16 +227,29 @@ class _GitCloner(threading.Thread): queue_percent_full = (float(self.index_queue.qsize()) / self.index_queue.maxsize) * 100 - logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url, - queue_percent_full, self.index_queue.qsize(), - self.index_queue.maxsize)) + # logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url, + # queue_percent_full, self.index_queue.qsize(), + # self.index_queue.maxsize)) + exit_code = None command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone" " --single-branch %s %s/%s || pkill -f git") - if subprocess.call(command % (GIT_CLONE_TIMEOUT, repo.url, - GIT_CLONE_DIR, repo.name), shell=True) != 0: - logging.warning("_clone_repository(): Cloning %s failed." % - repo.url) + + while exit_code is None: + try: + exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT, + repo.url, GIT_CLONE_DIR, repo.name), shell=True) + except: + # logging.warning("_clone_repository() failed: %s: %s", + # exception.__class__.__name__, exception) + time.sleep(1) + continue + else: + break + + if exit_code != 0: + # logging.warning("_clone_repository(): Cloning %s failed." 
% + # repo.url) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) return @@ -203,74 +297,6 @@ class _ChangeDir(object): os.chdir(self.old_path) -def _index_repository(repo_url, repo_name, framework_name): - """ - Clone and index (create and insert Codeletes for) a Git repository. - - `git clone` the Git repository located at **repo_url**, call - _insert_repository_codelets, then remove said repository. - - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. - - :type repo_url: str - :type repo_name: str - :type framework_name: str - """ - - logging.info("Indexing repository %s." % repo_url) - with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: - try: - _insert_repository_codelets(repo_url, repo_name, - framework_name) - except Exception as exception: - logging.warning( - "_insert_repository_codelets() failed: %s: %s: %s" % - (exception.__class__.__name__, exception, repo_url)) - - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) - -def _insert_repository_codelets(repo_url, repo_name, framework_name): - """ - Create and insert a Codelet for the files inside a Git repository. - - Create a new Codelet, and insert it into the Database singleton, for every - file inside the current working directory's default branch (usually - *master*). - - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. 
- - :type repo_url: str - :type repo_name: str - :type framework_name: str - """ - - commits_meta = _get_commits_metadata() - for filename in commits_meta.keys(): - try: - with open(filename, "r") as source_file: - source = _decode(source_file.read()) - if source is None: - return - except IOError as exception: - logging.warning( - "_insert_repository_codelets() failed: %s: %s: %s" % - (exception.__class__.__name__, exception, repo_url)) - - authors = [(_decode(author),) for author in \ - commits_meta[filename]["authors"]] - codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, - None, authors, _generate_file_url(filename, repo_url, - framework_name), - commits_meta[filename]["time_created"], - commits_meta[filename]["time_last_modified"]) - - # Database.insert(codelet) - def _generate_file_url(filename, repo_url, framework_name): """ Return a url for a filename from a Git wrapper framework. @@ -288,19 +314,25 @@ def _generate_file_url(filename, repo_url, framework_name): :rtype: str, or None .. warning:: - `git branch` will occasionally fail, and, seeing as its a crucial - component of GitHub's repository file urls, None will be returned. + Various Git subprocesses will occasionally fail, and, seeing as the + information they provide is a crucial component of some repository file + urls, None may be returned. 
""" - if framework_name == "GitHub": - try: - default_branch = subprocess.check_output("git branch --no-color", - shell=True)[2:-1] - return "%s/blob/%s/%s" % (repo_url, default_branch, filename) - except CalledProcessError as exception: - logging.warning("_generate_file_url(): %s: %s", - exception.__class__.name, exception) - return None + try: + if framework_name == "GitHub": + default_branch = subprocess.check_output("git branch" + " --no-color", shell=True)[2:-1] + return ("%s/blob/%s/%s" % (repo_url, default_branch, + filename)).replace("//", "/") + elif framework_name == "Bitbucket": + commit_hash = subprocess.check_output("git rev-parse HEAD", + shell=True).replace("\n", "") + return ("%s/src/%s/%s" % (repo_url, commit_hash, + filename)).replace("//", "/") + except subprocess.CalledProcessError as exception: + # logging.warning("_generate_file_url() failed: %s", exception) + return None def _get_git_commits(): """ @@ -354,12 +386,15 @@ def _get_tracked_files(): GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?", "md(wn|t[e]?xt)?", "rst"] - tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \ - -f && -T }' $(find . 
-type d -name .git -prune -o -print)"),
-        shell=True).split("\n")[:-1]
+    files = []
+    for dirname, subdir_names, filenames in os.walk("."):
+        for filename in filenames:
+            path = os.path.join(dirname, filename)
+            if _is_ascii(path):
+                files.append(path)
 
     valuable_files = []
-    for filename in tracked_files:
+    for filename in files:
         filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
                 for pattern in GIT_IGNORE_FILES])
         extension = filename.split(".")[-1]
@@ -431,7 +466,47 @@
             encoding = bs4.BeautifulSoup(raw).original_encoding
             return raw.decode(encoding) if encoding is not None else None
 
-    except Exception as exception:
-        logging.warning("_decode(): %s: %s", exception.__class__.__name__,
-                exception)
+    except (LookupError, UnicodeDecodeError, UserWarning) as exception:
+        # logging.warning("_decode() failed: %s: %s",
+        #         exception.__class__.__name__, exception)
         return None
+
+def _is_ascii(filename):
+    """
+    Heuristically determine whether a file is ASCII text or binary.
+
+    If a portion of the file contains null bytes, or the percentage of bytes
+    that aren't ASCII is greater than 30%, then the file is concluded to be
+    binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T`
+    operator, and is the de-facto method for determining whether a
+    file is ASCII.
+
+    :param filename: The path of the file to test.
+
+    :type filename: str
+
+    :return: Whether the file is probably ASCII. 
+ :rtype: Boolean + """ + + try: + with open(filename) as source: + file_snippet = source.read(512) + + if not file_snippet: + return True + + ascii_characters = "".join(map(chr, range(32, 127)) + + list("\n\r\t\b")) + null_trans = string.maketrans("", "") + + if "\0" in file_snippet: + return False + + non_ascii = file_snippet.translate(null_trans, ascii_characters) + return not float(len(non_ascii)) / len(file_snippet) > 0.30 + + except IOError as exception: + # logging.warning("_is_ascii() failed: %s: %s", + # exception.__class__.__name__, exception) + return False