From ef9c0609fed4c432f475a3bdd89b4b1ab062a3e3 Mon Sep 17 00:00:00 2001
From: Severyn Kozak
Date: Mon, 14 Apr 2014 13:02:59 -0400
Subject: [PATCH] Mov author_files > git_indexer, heavily refactor.

Add:
    bitshift/crawler/crawler.py
        -add base crawler module
        -add github(), to index Github.

Mod:
    bitshift/crawler/
        -add package subdirectory for the crawler module, and any subsidiary
        modules (eg, git_indexer).

    bitshift/author_files.py > bitshift/crawler/git_indexer.py
        -rename the module to "git_indexer", to better reflect its use.
        -convert from stand-alone script to a module whose functions integrate
        cleanly with the rest of the application.
        -add all necessary, tested functions, with Sphinx documentation.
---
 bitshift/author_files.py        |  53 ------------
 bitshift/crawler/crawler.py     |  56 +++++++++++++
 bitshift/crawler/git_indexer.py | 180 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 236 insertions(+), 53 deletions(-)
 delete mode 100644 bitshift/author_files.py
 create mode 100644 bitshift/crawler/crawler.py
 create mode 100644 bitshift/crawler/git_indexer.py

diff --git a/bitshift/author_files.py b/bitshift/author_files.py
deleted file mode 100644
index ed9f2c8..0000000
--- a/bitshift/author_files.py
+++ /dev/null
@@ -1,53 +0,0 @@
-"""
-Output author/date information about the latest files in a Git repository.
-
-When executed inside a Git archive, prints a single line of metadata for every
-file in the work tree. A given line contains the file's filename, authors,
-and Unix timestamps for the file's time of creation and last modification; the
-separate entries are null-delimited.
-
-Sample output:
-    socket_io.c\x00John Doe Jane Doe\x001384488690\x001384534626
-    # filename: socket_io.c
-    # Author Names:
-"""
-
-import fileinput, subprocess
-
-git_log = subprocess.check_output("git --no-pager log --name-only \
-    --pretty=format:'%n%n%an%n%at' -z", shell=True)
-
-commits = []
-for commit in git_log.split("\n\n"):
-    fields = commit.split("\n")
-    if len(fields) > 2:
-        commits.append({
-            "author" : fields[0],
-            "timestamp" : int(fields[1]),
-            "filenames" : fields[2].split("\0")[:-2]
-        })
-
-
-tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if -f && \
-    T }' $(find . -type d -name .git -prune -o -print)", shell=True)
-tracked_files = [filename[2:] for filename in tracked_files.split("\n")[:-1]]
-
-file_authors = {}
-for commit in commits:
-    for filename in commit["filenames"]:
-        if filename in tracked_files:
-            if filename not in file_authors.keys():
-                file_authors[filename] = {
-                    "authors" : [commit["author"]],
-                    "timestamps" : [commit["timestamp"]]
-                }
-            else:
-                if commit["author"] not in file_authors[filename]["authors"]:
-                    file_authors[filename]["authors"].append(commit["author"])
-                file_authors[filename]["timestamps"].append(commit["timestamp"])
-
-for filename in file_authors.keys():
-    authors = "\0".join(file_authors[filename]["authors"])
-    time_created = min(file_authors[filename]["timestamps"])
-    time_last_modified = max(file_authors[filename]["timestamps"])
-    print "%s\0%s\0%d\0%d" % (filename, authors, time_created, time_last_modified)
diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py
new file mode 100644
index 0000000..46cd54e
--- /dev/null
+++ b/bitshift/crawler/crawler.py
@@ -0,0 +1,56 @@
+"""
+:synopsis: Base crawler module, which discovers repositories to index.
+
+Contains `github()`, which walks GitHub's public repository listing and passes
+every repository to the Git indexer.
+"""
+
+import time
+
+import requests
+
+import git_indexer
+
+# from .codelet import Codelet
+# from .database import Database
+
+def github():
+    """
+    Query the GitHub API for data about every public repository.
+
+    Pull all of GitHub's repositories by making calls to its API in a loop,
+    accessing a subsequent page of results via the "next" URL returned in an
+    API response header. Uses Severyn Kozak's (sevko) authentication
+    credentials.
+
+    :raise requests.exceptions.HTTPError: If the API answers with an error
+        status code.
+    """
+
+    next_api_url = "https://api.github.com/repositories"
+    # NOTE(review): credentials should not live in version control; load them
+    # from the environment or an untracked configuration file instead.
+    authentication_params = {
+        "client_id" : "436cb884ae09be7f2a4e",
+        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
+    }
+
+    while next_api_url:
+        response = requests.get(next_api_url, params=authentication_params)
+        # An error payload is a JSON object rather than a list of
+        # repositories, so fail loudly instead of iterating over its keys.
+        response.raise_for_status()
+
+        for repo in response.json():
+            # TODO: keep the returned codelets once storage is implemented.
+            git_indexer.index_repository(repo["html_url"])
+
+        if int(response.headers["x-ratelimit-remaining"]) == 0:
+            # The reset time may already have passed; never sleep for a
+            # negative duration.
+            reset_time = int(response.headers["x-ratelimit-reset"])
+            time.sleep(max(0, reset_time - time.time()))
+
+        # `response.links` is the parsed "Link" header; the last page has no
+        # "next" relation, which ends the loop.
+        next_api_url = response.links.get("next", {}).get("url")
diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py
new file mode 100644
index 0000000..a98c600
--- /dev/null
+++ b/bitshift/crawler/git_indexer.py
@@ -0,0 +1,180 @@
+"""
+:synopsis: Index all the files in a Git repository.
+
+Clone a Git repository, and retrieve the following information about each file:
+filename, contributor names, dates of creation and last modification, and the
+file text.
+"""
+
+import fileinput
+import os
+import shutil
+import subprocess
+
+def index_repository(repo_url):
+    """
+    Generate metadata for every file in a Git repository.
+
+    `git clone` the Git repository located at **repo_url**, and return metadata
+    about every one of the non-binary (text) files in its main branch (usually
+    *master*). The clone is deleted again before returning.
+
+    :param repo_url: The URL of the repository to clone.
+    :type repo_url: str
+
+    :return: An array of metadata dictionaries.
+        .. code-block:: python
+            sample_returned_array = [
+                {
+                    "filename" : (str) "myfile",
+                    "authors" : (str array) ["author1", "author2"],
+                    "time_created" : (int) 1395939566,
+                    "time_last_modified" : (int) 1396920409,
+                    "source" : (str) "The source code of the file."
+                }
+            ]
+    :rtype: array of dictionaries
+
+    :raise subprocess.CalledProcessError: If `git clone` fails.
+    """
+
+    # `git clone` names the directory after the last URL component, minus any
+    # trailing slash or ".git" suffix.
+    repo_name = repo_url.rstrip("/").split("/")[-1]
+    if repo_name.endswith(".git"):
+        repo_name = repo_name[:-len(".git")]
+
+    # An argument list (no shell) keeps a hostile URL from injecting commands;
+    # "--" stops it from being parsed as an option.
+    subprocess.check_call(["git", "clone", "--", repo_url])
+
+    original_dir = os.getcwd()
+    files_meta = []
+    try:
+        os.chdir(repo_name)
+        commits_meta = _get_commits_metadata()
+        for filename, file_meta in commits_meta.items():
+            file_meta["filename"] = filename
+            with open(filename, "r") as source_file:
+                file_meta["source"] = source_file.read()
+            files_meta.append(file_meta)
+    finally:
+        # Always restore the working directory and delete the clone, even when
+        # indexing fails part-way through.
+        os.chdir(original_dir)
+        shutil.rmtree(repo_name, ignore_errors=True)
+
+    return files_meta
+
+def _get_git_commits():
+    """
+    Return the current working directory's formatted commit data.
+
+    Uses `git log` to generate metadata about every single file in the
+    repository's commit history.
+
+    :return: The author, timestamp, and names of all modified files of every
+        commit.
+        .. code-block:: python
+            sample_returned_array = [
+                {
+                    "author" : (str) "author",
+                    "timestamp" : (int) 1396919293,
+                    "filenames" : (str array) ["file1", "file2"]
+                }
+            ]
+    :rtype: array of dictionaries
+    """
+
+    git_log = subprocess.check_output([
+        "git", "--no-pager", "log", "--name-only",
+        "--pretty=format:%n%n%an%n%at", "-z"
+    ])
+
+    commits = []
+    for commit in git_log.split("\n\n"):
+        fields = commit.split("\n")
+        if len(fields) > 2:
+            commits.append({
+                "author" : fields[0],
+                "timestamp" : int(fields[1]),
+                # With `-z` the filenames are NUL-delimited; drop the empty
+                # strings the split leaves behind instead of slicing off a
+                # fixed number of entries, which loses real filenames when
+                # the chunk has fewer trailing NULs than expected.
+                "filenames" : [
+                    name for name in fields[2].split("\0") if name
+                ]
+            })
+
+    return commits
+
+def _get_tracked_files():
+    """
+    Return a list of the filenames of all files in the Git repository.
+
+    Get a list of the filenames of the non-binary (Perl heuristics used for
+    filetype identification) files currently inside the current working
+    directory's Git repository.
+
+    :return: The filenames of all non-binary files.
+    :rtype: str array
+    """
+
+    # NUL-delimited `find | xargs` keeps filenames containing spaces intact
+    # and avoids overflowing the command line in large repositories.
+    tracked_files = subprocess.check_output(
+        "find . -type d -name .git -prune -o -type f -print0 | "
+        "xargs -0 perl -le 'for (@ARGV){ print if -f && -T }'", shell=True)
+    # Strip the leading "./" that `find` prepends to every path.
+    return [name[2:] for name in tracked_files.split("\n") if name]
+
+def _get_commits_metadata():
+    """
+    Return a dictionary containing every tracked file's metadata.
+
+    :return: A dictionary with author names, time of creation, and time of last
+        modification for every filename key.
+        .. code-block:: python
+            sample_returned_dict = {
+                "my_file" : {
+                    "authors" : (str array) ["author1", "author2"],
+                    "time_created" : (int) 1395939566,
+                    "time_last_modified" : (int) 1396920409
+                }
+            }
+    :rtype: dictionary
+    """
+
+    commits = _get_git_commits()
+    # A set makes the per-filename membership test constant-time.
+    tracked_files = set(_get_tracked_files())
+
+    files_meta = {}
+    for commit in commits:
+        author = commit["author"]
+        timestamp = commit["timestamp"]
+        for filename in commit["filenames"]:
+            if filename not in tracked_files:
+                continue
+
+            if filename not in files_meta:
+                # Set both timestamps right away so that a file touched by a
+                # single commit still has a "time_created" entry.
+                files_meta[filename] = {
+                    "authors" : [author],
+                    "time_created" : timestamp,
+                    "time_last_modified" : timestamp
+                }
+                continue
+
+            file_meta = files_meta[filename]
+            if author not in file_meta["authors"]:
+                file_meta["authors"].append(author)
+            # Take the extremes rather than relying on `git log` ordering.
+            file_meta["time_created"] = min(
+                file_meta["time_created"], timestamp)
+            file_meta["time_last_modified"] = max(
+                file_meta["time_last_modified"], timestamp)
+
+    return files_meta