From 9fc4598001264b58245b5c78ef21b792d7e3385c Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Mon, 14 Apr 2014 21:21:58 -0400 Subject: [PATCH] Clean up crawler/, fix minor bugs. Add: bitshift/codelet.py -add name field to Codelet. bitshift/crawler/crawler.py -fix previously defunct code (which was committed at a point of incompletion) -- incorrect dictionary keys, etc.. -reformat some function calls' argument alignment to fit PEP standards. bitshift/crawler.py -add sleep() to ensure that an API query is made at regular intervals (determined by the GitHub API limit). --- bitshift/__init__.py | 2 +- bitshift/codelet.py | 9 ++++++--- bitshift/crawler/__init__.py | 6 ++++++ bitshift/crawler/crawler.py | 14 ++++++++++---- bitshift/crawler/git_indexer.py | 36 ++++++++++++++++++++++-------------- 5 files changed, 45 insertions(+), 22 deletions(-) create mode 100644 bitshift/crawler/__init__.py diff --git a/bitshift/__init__.py b/bitshift/__init__.py index 9a18c9b..78ca5e9 100644 --- a/bitshift/__init__.py +++ b/bitshift/__init__.py @@ -1 +1 @@ -from . import assets, codelet, config, database, parser, query +from . import assets, codelet, config, database, parser, query, crawler diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 87025e0..9568a4d 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -4,10 +4,11 @@ class Codelet(object): """ A source-code object with code metadata and composition analysis. + :ivar name: (str) A suitable name for the codelet. :ivar code: (str) A containing the raw source code. :ivar filename: (str, or None) The filename of the snippet. :ivar language: (str, or None) The inferred language of `code`. - :ivar authors: (array of str tuple) An array of tuples containing an + :ivar authors: (array of str tuples) An array of tuples containing an author's name and profile URL (on the service the code was pulled from). :ivar code_url: (str) The url of the (page containing the) source code. :ivar date_created: (str, or None) The date the code was published. @@ -19,6 +20,7 @@ class Codelet(object): """ Create a Codelet instance. + :param name: The name of the codelet. :param code: The raw source code. :param filename: The filename of the code, if any. :param language: The inferred language. @@ -28,16 +30,17 @@ class Codelet(object): :param date_created: The date the code was published. :param date_modified: The date the code was last modified. + :type name: str :type code: str :type filename: str, or None - :type authors: array of str tuples, or None :type language: str, or None + :type authors: array of str tuples, or None :type code_url: str - :type author_urls: str array, or none :type date_created: str, or None :type date_modified: str, or None """ + self.name = name self.code = code self.filename = filename self.language = language diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py new file mode 100644 index 0000000..a518970 --- /dev/null +++ b/bitshift/crawler/__init__.py @@ -0,0 +1,6 @@ +import crawler + +__all__ = ["crawl"] + +def crawl(): + pass diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 1ca65d1..34f2819 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -8,8 +8,8 @@ import requests, time import git_indexer -from .codelet import Codelet -from .database import Database +from ..codelet import Codelet +from ..database import Database def github(): """ @@ -26,14 +26,20 @@ def github(): "client_id" : "436cb884ae09be7f2a4e", "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" } + api_request_interval = 5e3 / 60 ** 2 while len(next_api_url) > 0: + start_time = time.time() response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - index_repository(repo["html_url"], framework) + print repo["id"] if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) - next_api_url = requests.headers["link"].split(">")[0][1:] + next_api_url = response.headers["link"].split(">")[0][1:] + + sleep_time = api_request_interval - (time.time() - start_time) + if sleep_time > 0: + time.sleep(sleep_time) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index 0c7ce75..cc9082c 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -6,7 +6,8 @@ import fileinput, subprocess, os -from .database import Database +from ..database import Database +from ..codelet import Codelet def index_repository(repo_url, framework_name): """ @@ -21,20 +22,25 @@ def index_repository(repo_url, framework_name): subprocess.call("git clone %s" % repo_url, shell=True) os.chdir(repo_name) + codelets = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): with open(filename, "r") as source_file: source = source_file.read() - authors = [(author,) for author in commits_meta["authors"]] - codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, - None, authors, _generate_file_url(filename, repo_url), - framework_name, commits_meta["time_created"], - commits_meta["time_last_modified"]) - Database.insert(codelet) + authors = [(author,) for author in commits_meta[filename]["authors"]] + codelets.append( + Codelet("%s:%s" % (repo_name, filename), source, filename, + None, authors, _generate_file_url(filename, repo_url, + framework_name), + commits_meta[filename]["time_created"], + commits_meta[filename]["time_last_modified"])) + + # Database.insert(codelet) os.chdir("..") subprocess.call("rm -rf %s" % repo_name, shell=True) + return codelets def _generate_file_url(filename, repo_url, framework_name): """ @@ -53,7 +59,7 @@ def _generate_file_url(filename, repo_url, framework_name): """ if framework_name == "github": - default branch = subprocess.check_output("git branch --no-color", \ + default_branch = subprocess.check_output("git branch --no-color", shell=True)[2:-1] return "%s/blob/%s/%s" % (repo_url, default_branch, filename) @@ -77,9 +83,9 @@ def _get_git_commits(): :rtype: dictionary """ - git_log_cmd = ("git --no-pager --no-color log --name-only " - "--pretty=format:'%n%n%an%n%at' -z") - git_log = subprocess.check_output(git_log_cmd, shell=True) + git_log = subprocess.check_output( + ("git --no-pager log --name-only" + " --pretty=format:'%n%n%an%n%at' -z"), shell=True) commits = [] for commit in git_log.split("\n\n"): @@ -105,8 +111,9 @@ def _get_tracked_files(): :rtype: str array """ - tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if \ - -f && -T }' $(find . -type d -name .git -prune -o -print)", shell=True) + tracked_files = subprocess.check_output( + ("perl -le 'for (@ARGV){ print if -f && -T }'" + " $(find . -type d -name .git -prune -o -print)"), shell=True) return [filename[2:] for filename in tracked_files.split("\n")[:-1]] def _get_commits_metadata(): @@ -138,7 +145,8 @@ def _get_commits_metadata(): if filename not in files_meta.keys(): files_meta[filename] = { "authors" : [commit["author"]], - "time_last_modified" : commit["timestamp"] + "time_last_modified" : commit["timestamp"], + "time_created" : commit["timestamp"] } else: if commit["author"] not in files_meta[filename]["authors"]: