Add: bitshift/codelet.py - add name field to Codelet. bitshift/crawler/crawler.py - fix previously defunct code (which was committed while still incomplete) -- incorrect dictionary keys, etc. - reformat some function calls' argument alignment to fit PEP standards. bitshift/crawler.py - add sleep() to ensure that an API query is made at regular intervals (determined by the GitHub API limit). tags/v1.0^2
@@ -1 +1 @@ | |||||
from . import assets, codelet, config, database, parser, query | |||||
from . import assets, codelet, config, database, parser, query, crawler |
@@ -4,10 +4,11 @@ class Codelet(object): | |||||
""" | """ | ||||
A source-code object with code metadata and composition analysis. | A source-code object with code metadata and composition analysis. | ||||
:ivar name: (str) A suitable name for the codelet. | |||||
:ivar code: (str) A string containing the raw source code. | :ivar code: (str) A string containing the raw source code. | ||||
:ivar filename: (str, or None) The filename of the snippet. | :ivar filename: (str, or None) The filename of the snippet. | ||||
:ivar language: (str, or None) The inferred language of `code`. | :ivar language: (str, or None) The inferred language of `code`. | ||||
:ivar authors: (array of str tuple) An array of tuples containing an | |||||
:ivar authors: (array of str tuples) An array of tuples containing an | |||||
author's name and profile URL (on the service the code was pulled from). | author's name and profile URL (on the service the code was pulled from). | ||||
:ivar code_url: (str) The url of the (page containing the) source code. | :ivar code_url: (str) The url of the (page containing the) source code. | ||||
:ivar date_created: (str, or None) The date the code was published. | :ivar date_created: (str, or None) The date the code was published. | ||||
@@ -19,6 +20,7 @@ class Codelet(object): | |||||
""" | """ | ||||
Create a Codelet instance. | Create a Codelet instance. | ||||
:param name: The name of the codelet. | |||||
:param code: The raw source code. | :param code: The raw source code. | ||||
:param filename: The filename of the code, if any. | :param filename: The filename of the code, if any. | ||||
:param language: The inferred language. | :param language: The inferred language. | ||||
@@ -28,16 +30,17 @@ class Codelet(object): | |||||
:param date_created: The date the code was published. | :param date_created: The date the code was published. | ||||
:param date_modified: The date the code was last modified. | :param date_modified: The date the code was last modified. | ||||
:type name: str | |||||
:type code: str | :type code: str | ||||
:type filename: str, or None | :type filename: str, or None | ||||
:type authors: array of str tuples, or None | |||||
:type language: str, or None | :type language: str, or None | ||||
:type authors: array of str tuples, or None | |||||
:type code_url: str | :type code_url: str | ||||
:type author_urls: str array, or none | |||||
:type date_created: str, or None | :type date_created: str, or None | ||||
:type date_modified: str, or None | :type date_modified: str, or None | ||||
""" | """ | ||||
self.name = name | |||||
self.code = code | self.code = code | ||||
self.filename = filename | self.filename = filename | ||||
self.language = language | self.language = language | ||||
@@ -0,0 +1,6 @@ | |||||
import crawler | |||||
__all__ = ["crawl"] | |||||
def crawl(): | |||||
pass |
@@ -8,8 +8,8 @@ import requests, time | |||||
import git_indexer | import git_indexer | ||||
from .codelet import Codelet | |||||
from .database import Database | |||||
from ..codelet import Codelet | |||||
from ..database import Database | |||||
def github(): | def github(): | ||||
""" | """ | ||||
@@ -26,14 +26,20 @@ def github(): | |||||
"client_id" : "436cb884ae09be7f2a4e", | "client_id" : "436cb884ae09be7f2a4e", | ||||
"client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" | "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" | ||||
} | } | ||||
api_request_interval = 5e3 / 60 ** 2 | |||||
while len(next_api_url) > 0: | while len(next_api_url) > 0: | ||||
start_time = time.time() | |||||
response = requests.get(next_api_url, params=authentication_params) | response = requests.get(next_api_url, params=authentication_params) | ||||
for repo in response.json(): | for repo in response.json(): | ||||
index_repository(repo["html_url"], framework) | |||||
print repo["id"] | |||||
if int(response.headers["x-ratelimit-remaining"]) == 0: | if int(response.headers["x-ratelimit-remaining"]) == 0: | ||||
time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) | time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) | ||||
next_api_url = requests.headers["link"].split(">")[0][1:] | |||||
next_api_url = response.headers["link"].split(">")[0][1:] | |||||
sleep_time = api_request_interval - (time.time() - start_time) | |||||
if sleep_time > 0: | |||||
time.sleep(sleep_time) |
@@ -6,7 +6,8 @@ | |||||
import fileinput, subprocess, os | import fileinput, subprocess, os | ||||
from .database import Database | |||||
from ..database import Database | |||||
from ..codelet import Codelet | |||||
def index_repository(repo_url, framework_name): | def index_repository(repo_url, framework_name): | ||||
""" | """ | ||||
@@ -21,20 +22,25 @@ def index_repository(repo_url, framework_name): | |||||
subprocess.call("git clone %s" % repo_url, shell=True) | subprocess.call("git clone %s" % repo_url, shell=True) | ||||
os.chdir(repo_name) | os.chdir(repo_name) | ||||
codelets = [] | |||||
commits_meta = _get_commits_metadata() | commits_meta = _get_commits_metadata() | ||||
for filename in commits_meta.keys(): | for filename in commits_meta.keys(): | ||||
with open(filename, "r") as source_file: | with open(filename, "r") as source_file: | ||||
source = source_file.read() | source = source_file.read() | ||||
authors = [(author,) for author in commits_meta["authors"]] | |||||
codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, | |||||
None, authors, _generate_file_url(filename, repo_url), | |||||
framework_name, commits_meta["time_created"], | |||||
commits_meta["time_last_modified"]) | |||||
Database.insert(codelet) | |||||
authors = [(author,) for author in commits_meta[filename]["authors"]] | |||||
codelets.append( | |||||
Codelet("%s:%s" % (repo_name, filename), source, filename, | |||||
None, authors, _generate_file_url(filename, repo_url, | |||||
framework_name), | |||||
commits_meta[filename]["time_created"], | |||||
commits_meta[filename]["time_last_modified"])) | |||||
# Database.insert(codelet) | |||||
os.chdir("..") | os.chdir("..") | ||||
subprocess.call("rm -rf %s" % repo_name, shell=True) | subprocess.call("rm -rf %s" % repo_name, shell=True) | ||||
return codelets | |||||
def _generate_file_url(filename, repo_url, framework_name): | def _generate_file_url(filename, repo_url, framework_name): | ||||
""" | """ | ||||
@@ -53,7 +59,7 @@ def _generate_file_url(filename, repo_url, framework_name): | |||||
""" | """ | ||||
if framework_name == "github": | if framework_name == "github": | ||||
default branch = subprocess.check_output("git branch --no-color", \ | |||||
default_branch = subprocess.check_output("git branch --no-color", | |||||
shell=True)[2:-1] | shell=True)[2:-1] | ||||
return "%s/blob/%s/%s" % (repo_url, default_branch, filename) | return "%s/blob/%s/%s" % (repo_url, default_branch, filename) | ||||
@@ -77,9 +83,9 @@ def _get_git_commits(): | |||||
:rtype: dictionary | :rtype: dictionary | ||||
""" | """ | ||||
git_log_cmd = ("git --no-pager --no-color log --name-only " | |||||
"--pretty=format:'%n%n%an%n%at' -z") | |||||
git_log = subprocess.check_output(git_log_cmd, shell=True) | |||||
git_log = subprocess.check_output( | |||||
("git --no-pager log --name-only" | |||||
" --pretty=format:'%n%n%an%n%at' -z"), shell=True) | |||||
commits = [] | commits = [] | ||||
for commit in git_log.split("\n\n"): | for commit in git_log.split("\n\n"): | ||||
@@ -105,8 +111,9 @@ def _get_tracked_files(): | |||||
:rtype: str array | :rtype: str array | ||||
""" | """ | ||||
tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if \ | |||||
-f && -T }' $(find . -type d -name .git -prune -o -print)", shell=True) | |||||
tracked_files = subprocess.check_output( | |||||
("perl -le 'for (@ARGV){ print if -f && -T }'" | |||||
" $(find . -type d -name .git -prune -o -print)"), shell=True) | |||||
return [filename[2:] for filename in tracked_files.split("\n")[:-1]] | return [filename[2:] for filename in tracked_files.split("\n")[:-1]] | ||||
def _get_commits_metadata(): | def _get_commits_metadata(): | ||||
@@ -138,7 +145,8 @@ def _get_commits_metadata(): | |||||
if filename not in files_meta.keys(): | if filename not in files_meta.keys(): | ||||
files_meta[filename] = { | files_meta[filename] = { | ||||
"authors" : [commit["author"]], | "authors" : [commit["author"]], | ||||
"time_last_modified" : commit["timestamp"] | |||||
"time_last_modified" : commit["timestamp"], | |||||
"time_created" : commit["timestamp"] | |||||
} | } | ||||
else: | else: | ||||
if commit["author"] not in files_meta[filename]["authors"]: | if commit["author"] not in files_meta[filename]["authors"]: | ||||