From 77b448c3deaf980f1cddcee8986cf0c417a62a2c Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Mon, 14 Apr 2014 18:41:00 -0400 Subject: [PATCH] Mod Codelet, move codelet creation from crawler. Add: bitshift/crawler/(crawler, git_indexer).py -move Codelet creation from the crawler to the git_indexer, in preparation for making crawling/indexing independent, threaded processes. Mod: bitshift/codelet.py -modify documentation for the author instance variable. --- bitshift/codelet.py | 18 ++++++----- bitshift/crawler/crawler.py | 8 +++-- bitshift/crawler/git_indexer.py | 66 +++++++++++++++++++++++++---------------- 3 files changed, 55 insertions(+), 37 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 08b0d36..87025e0 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -7,39 +7,41 @@ class Codelet(object): :ivar code: (str) A containing the raw source code. :ivar filename: (str, or None) The filename of the snippet. :ivar language: (str, or None) The inferred language of `code`. - :ivar author: (str, or None) The name of the code's author. - :ivar url: (str) The url of the (page containing the) source code. + :ivar authors: (array of str tuple) An array of tuples containing an + author's name and profile URL (on the service the code was pulled from). + :ivar code_url: (str) The url of the (page containing the) source code. :ivar date_created: (str, or None) The date the code was published. :ivar date_modified: (str, or None) The date the code was last modified. """ - def __init__(self, code, filename, author, language, code_url, author_url, + def __init__(self, name, code, filename, language, authors, code_url, date_created, date_modified): """ Create a Codelet instance. :param code: The raw source code. :param filename: The filename of the code, if any. - :param author: The author of the code. :param language: The inferred language. 
+ :param authors: An array of tuples containing an author's name and + profile URL (on the service the code was pulled from). :param code_url: The url of the (page containing the) source code. :param date_created: The date the code was published. :param date_modified: The date the code was last modified. :type code: str :type filename: str, or None + :type authors: array of str tuples, or None :type language: str, or None - :type author: str, or None - :type url: str + :type code_url: str + :type author_urls: str array, or none :type date_created: str, or None :type date_modified: str, or None """ self.code = code self.filename = filename - self.author = author self.language = language + self.authors = authors self.code_url = code_url - self.author_url = author_url self.date_created = date_created self.date_modified = date_modified diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 46cd54e..1ca65d1 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -1,13 +1,15 @@ """ +:synopsis: Main crawler module, to oversee all site-specific crawlers. +...more info soon... """ import requests, time import git_indexer -# from .codelet import Codelet -# from .database import Database +from .codelet import Codelet +from .database import Database def github(): """ @@ -29,7 +31,7 @@ def github(): response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - codelets = git_indexer.index_repository(repo["html_url"]) + index_repository(repo["html_url"], framework) if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index a98c600..0c7ce75 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -1,48 +1,61 @@ """ :synopsis: Index all the files in a Git repository. 
-Clone a Git repository, and retrieve the following information about each file: -filename, contributor names, dates of creation and last modification, and the -file text. +...more info soon... """ import fileinput, subprocess, os -def index_repository(repo_url): +from .database import Database + +def index_repository(repo_url, framework_name): """ - Generate metadata for every file in a Git repository. + Insert a Codelet for every file in a Git repository. - `git clone` the Git repository located at **repo_url**, and return metadata - about every one of non-binary (text) files in its if main branch (usually + `git clone` the Git repository located at **repo_url**, and create a Codelet + for every one of the non-binary (text) files in its main branch (usually *master*). - - :return: An array of metadata dictionaries. - .. code-block:: python - sample_returned_array = [ - { - "filename" : (str) "myfile" - "time_created" : (int) 1395939566, - "time_last_modified" : (int) 1396920409, - "source" : (str) "The source code of the file." 
- } - ] """ repo_name = repo_url.split("/")[-1] subprocess.call("git clone %s" % repo_url, shell=True) os.chdir(repo_name) - files_meta = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): - commits_meta[filename]["filename"] = filename with open(filename, "r") as source_file: - commits_meta[filename]["source"] = source_file.read() - files_meta.append(commits_meta[filename]) + source = source_file.read() + + authors = [(author,) for author in commits_meta["authors"]] + codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, + None, authors, _generate_file_url(filename, repo_url), + framework_name, commits_meta["time_created"], + commits_meta["time_last_modified"]) + Database.insert(codelet) os.chdir("..") subprocess.call("rm -rf %s" % repo_name, shell=True) - return files_meta + +def _generate_file_url(filename, repo_url, framework_name): + """ + Return a url for a filename from a Git wrapper framework. + + :param filename: The path of the file. + :param repo_url: The url of the file's parent repository. + :param framework_name: The name of the framework the repository is from. + + :type filename: str + :type repo_url: str + :type framework_name: str + + :return: The file's full url on the given framework. 
+ :rtype: str + """ + + if framework_name == "github": + default branch = subprocess.check_output("git branch --no-color", \ + shell=True)[2:-1] + return "%s/blob/%s/%s" % (repo_url, default_branch, filename) def _get_git_commits(): """ @@ -58,14 +71,15 @@ def _get_git_commits(): { "author" : (str) "author" "timestamp" : (int) 1396919293, - "filename" : (str array) ["file1", "file2"] + "filenames" : (str array) ["file1", "file2"] } ] :rtype: dictionary """ - git_log = subprocess.check_output("git --no-pager log --name-only \ - --pretty=format:'%n%n%an%n%at' -z", shell=True) + git_log_cmd = ("git --no-pager --no-color log --name-only " + "--pretty=format:'%n%n%an%n%at' -z") + git_log = subprocess.check_output(git_log_cmd, shell=True) commits = [] for commit in git_log.split("\n\n"):