From d142f1fd55dc180900dd564810e94464f8debbb0 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 3 May 2014 15:22:29 -0400 Subject: [PATCH] Complete Crawler. Close #15, #14, #11, #8. Several of the closed issues were addressed partly in previous commits; definitively close them with this, for the moment, final update to the crawler package. Ref: bitshift/crawler/indexer.py -move all `GitIndexer` specific functions (eg, `_decode`, `_is_ascii()`)from the global scope to the class definition. --- bitshift/codelet.py | 53 +++--- bitshift/crawler/indexer.py | 417 ++++++++++++++++++++++---------------------- 2 files changed, 236 insertions(+), 234 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 9568a4d..453ace0 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -7,37 +7,43 @@ class Codelet(object): :ivar name: (str) A suitable name for the codelet. :ivar code: (str) A containing the raw source code. :ivar filename: (str, or None) The filename of the snippet. - :ivar language: (str, or None) The inferred language of `code`. - :ivar authors: (array of str tuples) An array of tuples containing an - author's name and profile URL (on the service the code was pulled from). + :ivar language: (int, or None) The inferred language of `code`. + :ivar authors: (array of tuples (str, str or None)) An array of tuples + containing an author's name and profile URL (on the service the code + was pulled from). :ivar code_url: (str) The url of the (page containing the) source code. - :ivar date_created: (str, or None) The date the code was published. - :ivar date_modified: (str, or None) The date the code was last modified. + :ivar date_created: (:class:`datetime.datetime`, or None) The date the code + was published. + :ivar date_modified: (:class:`datetime.datetime`, or None) The date the + code was last modified. + :ivar rank: (float) A quanitification of the source code's quality, as + per available ratings (stars, forks, upvotes, etc.). """ def __init__(self, name, code, filename, language, authors, code_url, - date_created, date_modified): + date_created, date_modified, rank): """ Create a Codelet instance. - :param name: The name of the codelet. - :param code: The raw source code. - :param filename: The filename of the code, if any. - :param language: The inferred language. - :param authors: An array of tuples containing an author's name and - profile URL (on the service the code was pulled from). - :param code_url: The url of the (page containing the) source code. - :param date_created: The date the code was published. - :param date_modified: The date the code was last modified. + :param name: see :attr:`self.name` + :param code: see :attr:`self.code` + :param filename: see :attr:`self.filename` + :param language: see :attr:`self.language` + :param authors: see :attr:`self.authors` + :param code_url: see :attr:`self.code_url` + :param date_created: see :attr:`self.date_created` + :param date_modified: see :attr:`self.date_modified` + :param rank: see :attr:`self.rank` - :type name: str - :type code: str - :type filename: str, or None - :type language: str, or None - :type authors: array of str tuples, or None - :type code_url: str - :type date_created: str, or None - :type date_modified: str, or None + :type name: see :attr:`self.name` + :type code: see :attr:`self.code` + :type filename: see :attr:`self.filename` + :type language: see :attr:`self.language` + :type authors: see :attr:`self.authors` + :type code_url: see :attr:`self.code_url` + :type date_created: see :attr:`self.date_created` + :type date_modified: see :attr:`self.date_modified` + :type rank: see :attr:`self.rank` """ self.name = name @@ -48,3 +54,4 @@ class Codelet(object): self.code_url = code_url self.date_created = date_created self.date_modified = date_modified + self.rank = rank diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 69c579c..c1c77ad 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -12,9 +12,6 @@ from ..codelet import Codelet GIT_CLONE_DIR = "/tmp/bitshift" THREAD_QUEUE_SLEEP = 0.5 -import pymongo #debug -db = pymongo.MongoClient().bitshift #debug - class GitRepository(object): """ A representation of a Git repository's metadata. @@ -101,10 +98,10 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - # try: - self._index_repository(repo) - # except Exception as excep: - # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) + try: + self._index_repository(repo) + except Exception as excep: + self._logger.warning("%s: %s.", excep.__class__.__name__, excep) def _index_repository(self, repo): """ @@ -119,10 +116,10 @@ class GitIndexer(threading.Thread): """ with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)) as repository_dir: - # try: - self._insert_repository_codelets(repo) - # except Exception as excep: - # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) + try: + self._insert_repository_codelets(repo) + except Exception as excep: + self._logger.warning("%s: %s.", excep.__class__.__name__, excep) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) @@ -140,29 +137,222 @@ class GitIndexer(threading.Thread): :type repo_url: :class:`GitRepository` """ - commits_meta = _get_commits_metadata() + commits_meta = self._get_commits_metadata() if commits_meta is None: return for filename in commits_meta.keys(): try: with open(filename) as source_file: - source = _decode(source_file.read()) + source = self._decode(source_file.read()) if source is None: continue except IOError as exception: continue - authors = [(_decode(author),) for author in \ + authors = [(self._decode(author), None) for author in \ commits_meta[filename]["authors"]] codelet = Codelet("%s:%s" % (repo.name, filename), source, filename, - None, authors, _generate_file_url(filename, + None, authors, self._generate_file_url(filename, repo.url, repo.framework_name), commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"], repo.rank) - db.codelets.insert(codelet.__dict__) #debug + def _generate_file_url(self, filename, repo_url, framework_name): + """ + Return a url for a filename from a Git wrapper framework. + + :param filename: The path of the file. + :param repo_url: The url of the file's parent repository. + :param framework_name: The name of the framework the repository is from. + + :type filename: str + :type repo_url: str + :type framework_name: str + + :return: The file's full url on the given framework, if successfully + derived. + :rtype: str, or None + + .. warning:: + Various Git subprocesses will occasionally fail, and, seeing as the + information they provide is a crucial component of some repository file + urls, None may be returned. + """ + + try: + if framework_name == "GitHub": + default_branch = subprocess.check_output("git branch" + " --no-color", shell=True)[2:-1] + return ("%s/blob/%s/%s" % (repo_url, default_branch, + filename)).replace("//", "/") + elif framework_name == "Bitbucket": + commit_hash = subprocess.check_output("git rev-parse HEAD", + shell=True).replace("\n", "") + return ("%s/src/%s/%s" % (repo_url, commit_hash, + filename)).replace("//", "/") + except subprocess.CalledProcessError as exception: + return None + + def _get_git_commits(self): + """ + Return the current working directory's formatted commit data. + + Uses `git log` to generate metadata about every single file in the + repository's commit history. + + :return: The author, timestamp, and names of all modified files of every + commit. + .. code-block:: python + sample_returned_array = [ + { + "author" : (str) "author" + "timestamp" : (`datetime.datetime`) , + "filenames" : (str array) ["file1", "file2"] + } + ] + :rtype: array of dictionaries + """ + + git_log = subprocess.check_output(("git --no-pager log --name-only" + " --pretty=format:'%n%n%an%n%at' -z"), shell=True) + + commits = [] + for commit in git_log.split("\n\n"): + fields = commit.split("\n") + if len(fields) > 2: + commits.append({ + "author" : fields[0], + "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), + "filenames" : fields[2].split("\x00")[:-2] + }) + + return commits + + def _get_tracked_files(self): + """ + Return a list of the filenames of all valuable files in the Git repository. + + Get a list of the filenames of the non-binary (Perl heuristics used for + filetype identification) files currently inside the current working + directory's Git repository. Then, weed out any boilerplate/non-code files + that match the regex rules in GIT_IGNORE_FILES. + + :return: The filenames of all index-worthy non-binary files. + :rtype: str array + """ + + files = [] + for dirname, subdir_names, filenames in os.walk("."): + for filename in filenames: + path = os.path.join(dirname, filename) + if self._is_ascii(path): + files.append(path[2:]) + + return files + + def _get_commits_metadata(self): + """ + Return a dictionary containing every valuable tracked file's metadata. + + :return: A dictionary with author names, time of creation, and time of last + modification for every filename key. + .. code-block:: python + sample_returned_dict = { + "my_file" : { + "authors" : (str array) ["author1", "author2"], + "time_created" : (`datetime.datetime`) , + "time_last_modified" : (`datetime.datetime`) + } + } + :rtype: dictionary of dictionaries + """ + + commits = self._get_git_commits() + tracked_files = self._get_tracked_files() + + files_meta = {} + for commit in commits: + for filename in commit["filenames"]: + if filename not in tracked_files: + continue + + if filename not in files_meta.keys(): + files_meta[filename] = { + "authors" : [commit["author"]], + "time_last_modified" : commit["timestamp"], + "time_created" : commit["timestamp"] + } + else: + if commit["author"] not in files_meta[filename]["authors"]: + files_meta[filename]["authors"].append(commit["author"]) + files_meta[filename]["time_created"] = commit["timestamp"] + + return files_meta + + def _decode(self, raw): + """ + Return a decoded a raw string. + + :param raw: The string to string. + + :type raw: (str) + + :return: If the original encoding is successfully inferenced, return the + decoded string. + :rtype: str, or None + + .. warning:: + The raw string's original encoding is identified by heuristics which + can, and occasionally will, fail. Decoding will then fail, and None + will be returned. + """ + + try: + encoding = bs4.BeautifulSoup(raw).original_encoding + return raw.decode(encoding) if encoding is not None else None + + except (LookupError, UnicodeDecodeError, UserWarning) as exception: + return None + + def _is_ascii(self, filename): + """ + Heuristically determine whether a file is ASCII text or binary. + + If a portion of the file contains null bytes, or the percentage of bytes + that aren't ASCII is greater than 30%, then the file is concluded to be + binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T` + operator, and is the de-facto method for in : passdetermining whether a + file is ASCII. + + :param filename: The path of the file to test. + + :type filename: str + + :return: Whether the file is probably ASCII. + :rtype: Boolean + """ + + try: + with open(filename) as source: + file_snippet = source.read(512) + + if not file_snippet: + return True + + ascii_characters = "".join(map(chr, range(32, 127)) + + list("\n\r\t\b")) + null_trans = string.maketrans("", "") + + if "\0" in file_snippet: + return False + + non_ascii = file_snippet.translate(null_trans, ascii_characters) + return not float(len(non_ascii)) / len(file_snippet) > 0.30 + + except IOError as exception: + return False class _GitCloner(threading.Thread): """ @@ -297,198 +487,3 @@ class _ChangeDir(object): """ os.chdir(self.old_path) - -def _generate_file_url(filename, repo_url, framework_name): - """ - Return a url for a filename from a Git wrapper framework. - - :param filename: The path of the file. - :param repo_url: The url of the file's parent repository. - :param framework_name: The name of the framework the repository is from. - - :type filename: str - :type repo_url: str - :type framework_name: str - - :return: The file's full url on the given framework, if successfully - derived. - :rtype: str, or None - - .. warning:: - Various Git subprocesses will occasionally fail, and, seeing as the - information they provide is a crucial component of some repository file - urls, None may be returned. - """ - - try: - if framework_name == "GitHub": - default_branch = subprocess.check_output("git branch" - " --no-color", shell=True)[2:-1] - return ("%s/blob/%s/%s" % (repo_url, default_branch, - filename)).replace("//", "/") - elif framework_name == "Bitbucket": - commit_hash = subprocess.check_output("git rev-parse HEAD", - shell=True).replace("\n", "") - return ("%s/src/%s/%s" % (repo_url, commit_hash, - filename)).replace("//", "/") - except subprocess.CalledProcessError as exception: - return None - -def _get_git_commits(): - """ - Return the current working directory's formatted commit data. - - Uses `git log` to generate metadata about every single file in the - repository's commit history. - - :return: The author, timestamp, and names of all modified files of every - commit. - .. code-block:: python - sample_returned_array = [ - { - "author" : (str) "author" - "timestamp" : (`datetime.datetime`) , - "filenames" : (str array) ["file1", "file2"] - } - ] - :rtype: array of dictionaries - """ - - git_log = subprocess.check_output(("git --no-pager log --name-only" - " --pretty=format:'%n%n%an%n%at' -z"), shell=True) - - commits = [] - for commit in git_log.split("\n\n"): - fields = commit.split("\n") - if len(fields) > 2: - commits.append({ - "author" : fields[0], - "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), - "filenames" : fields[2].split("\x00")[:-2] - }) - - return commits - -def _get_tracked_files(): - """ - Return a list of the filenames of all valuable files in the Git repository. - - Get a list of the filenames of the non-binary (Perl heuristics used for - filetype identification) files currently inside the current working - directory's Git repository. Then, weed out any boilerplate/non-code files - that match the regex rules in GIT_IGNORE_FILES. - - :return: The filenames of all index-worthy non-binary files. - :rtype: str array - """ - - files = [] - for dirname, subdir_names, filenames in os.walk("."): - for filename in filenames: - path = os.path.join(dirname, filename) - if _is_ascii(path): - files.append(path[2:]) - - return files - -def _get_commits_metadata(): - """ - Return a dictionary containing every valuable tracked file's metadata. - - :return: A dictionary with author names, time of creation, and time of last - modification for every filename key. - .. code-block:: python - sample_returned_dict = { - "my_file" : { - "authors" : (str array) ["author1", "author2"], - "time_created" : (`datetime.datetime`) , - "time_last_modified" : (`datetime.datetime`) - } - } - :rtype: dictionary of dictionaries - """ - - commits = _get_git_commits() - tracked_files = _get_tracked_files() - - files_meta = {} - for commit in commits: - for filename in commit["filenames"]: - if filename not in tracked_files: - continue - - if filename not in files_meta.keys(): - files_meta[filename] = { - "authors" : [commit["author"]], - "time_last_modified" : commit["timestamp"], - "time_created" : commit["timestamp"] - } - else: - if commit["author"] not in files_meta[filename]["authors"]: - files_meta[filename]["authors"].append(commit["author"]) - files_meta[filename]["time_created"] = commit["timestamp"] - - return files_meta - -def _decode(raw): - """ - Return a decoded a raw string. - - :param raw: The string to string. - - :type raw: (str) - - :return: If the original encoding is successfully inferenced, return the - decoded string. - :rtype: str, or None - - .. warning:: - The raw string's original encoding is identified by heuristics which - can, and occasionally will, fail. Decoding will then fail, and None - will be returned. - """ - - try: - encoding = bs4.BeautifulSoup(raw).original_encoding - return raw.decode(encoding) if encoding is not None else None - - except (LookupError, UnicodeDecodeError, UserWarning) as exception: - return None - -def _is_ascii(filename): - """ - Heuristically determine whether a file is ASCII text or binary. - - If a portion of the file contains null bytes, or the percentage of bytes - that aren't ASCII is greater than 30%, then the file is concluded to be - binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T` - operator, and is the de-facto method for in : passdetermining whether a - file is ASCII. - - :param filename: The path of the file to test. - - :type filename: str - - :return: Whether the file is probably ASCII. - :rtype: Boolean - """ - - try: - with open(filename) as source: - file_snippet = source.read(512) - - if not file_snippet: - return True - - ascii_characters = "".join(map(chr, range(32, 127)) + - list("\n\r\t\b")) - null_trans = string.maketrans("", "") - - if "\0" in file_snippet: - return False - - non_ascii = file_snippet.translate(null_trans, ascii_characters) - return not float(len(non_ascii)) / len(file_snippet) > 0.30 - - except IOError as exception: - return False