Add: bitshift/crawler/(crawler, git_indexer).py - move Codelet creation from the crawler to the git_indexer, in preparation for making crawling/indexing independent, threaded processes. Mod: bitshift/codelet.py - modify documentation for the author instance variable. tags/v1.0^2
@@ -7,39 +7,41 @@ class Codelet(object): | |||||
:ivar code: (str) A string containing the raw source code. | :ivar code: (str) A string containing the raw source code. | ||||
:ivar filename: (str, or None) The filename of the snippet. | :ivar filename: (str, or None) The filename of the snippet. | ||||
:ivar language: (str, or None) The inferred language of `code`. | :ivar language: (str, or None) The inferred language of `code`. | ||||
:ivar author: (str, or None) The name of the code's author. | |||||
:ivar url: (str) The url of the (page containing the) source code. | |||||
:ivar authors: (array of str tuple) An array of tuples containing an | |||||
author's name and profile URL (on the service the code was pulled from). | |||||
:ivar code_url: (str) The url of the (page containing the) source code. | |||||
:ivar date_created: (str, or None) The date the code was published. | :ivar date_created: (str, or None) The date the code was published. | ||||
:ivar date_modified: (str, or None) The date the code was last modified. | :ivar date_modified: (str, or None) The date the code was last modified. | ||||
""" | """ | ||||
def __init__(self, code, filename, author, language, code_url, author_url, | |||||
def __init__(self, name, code, filename, language, authors, code_url, | |||||
date_created, date_modified): | date_created, date_modified): | ||||
""" | """ | ||||
Create a Codelet instance. | Create a Codelet instance. | ||||
:param code: The raw source code. | :param code: The raw source code. | ||||
:param filename: The filename of the code, if any. | :param filename: The filename of the code, if any. | ||||
:param author: The author of the code. | |||||
:param language: The inferred language. | :param language: The inferred language. | ||||
:param authors: An array of tuples containing an author's name and | |||||
profile URL (on the service the code was pulled from). | |||||
:param code_url: The url of the (page containing the) source code. | :param code_url: The url of the (page containing the) source code. | ||||
:param date_created: The date the code was published. | :param date_created: The date the code was published. | ||||
:param date_modified: The date the code was last modified. | :param date_modified: The date the code was last modified. | ||||
:type code: str | :type code: str | ||||
:type filename: str, or None | :type filename: str, or None | ||||
:type authors: array of str tuples, or None | |||||
:type language: str, or None | :type language: str, or None | ||||
:type author: str, or None | |||||
:type url: str | |||||
:type code_url: str | |||||
:type author_urls: str array, or none | |||||
:type date_created: str, or None | :type date_created: str, or None | ||||
:type date_modified: str, or None | :type date_modified: str, or None | ||||
""" | """ | ||||
self.code = code | self.code = code | ||||
self.filename = filename | self.filename = filename | ||||
self.author = author | |||||
self.language = language | self.language = language | ||||
self.authors = authors | |||||
self.code_url = code_url | self.code_url = code_url | ||||
self.author_url = author_url | |||||
self.date_created = date_created | self.date_created = date_created | ||||
self.date_modified = date_modified | self.date_modified = date_modified |
@@ -1,13 +1,15 @@ | |||||
""" | """ | ||||
:synopsis: Main crawler module, to oversee all site-specific crawlers. | |||||
...more info soon... | |||||
""" | """ | ||||
import requests, time | import requests, time | ||||
import git_indexer | import git_indexer | ||||
# from .codelet import Codelet | |||||
# from .database import Database | |||||
from .codelet import Codelet | |||||
from .database import Database | |||||
def github(): | def github(): | ||||
""" | """ | ||||
@@ -29,7 +31,7 @@ def github(): | |||||
response = requests.get(next_api_url, params=authentication_params) | response = requests.get(next_api_url, params=authentication_params) | ||||
for repo in response.json(): | for repo in response.json(): | ||||
codelets = git_indexer.index_repository(repo["html_url"]) | |||||
index_repository(repo["html_url"], framework) | |||||
if int(response.headers["x-ratelimit-remaining"]) == 0: | if int(response.headers["x-ratelimit-remaining"]) == 0: | ||||
time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) | time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) | ||||
@@ -1,48 +1,61 @@ | |||||
""" | """ | ||||
:synopsis: Index all the files in a Git repository. | :synopsis: Index all the files in a Git repository. | ||||
Clone a Git repository, and retrieve the following information about each file: | |||||
filename, contributor names, dates of creation and last modification, and the | |||||
file text. | |||||
...more info soon... | |||||
""" | """ | ||||
import fileinput, subprocess, os | import fileinput, subprocess, os | ||||
def index_repository(repo_url): | |||||
from .database import Database | |||||
def index_repository(repo_url, framework_name): | |||||
""" | """ | ||||
Generate metadata for every file in a Git repository. | |||||
Insert a Codelet for every file in a Git repository. | |||||
`git clone` the Git repository located at **repo_url**, and return metadata | |||||
about every one of the non-binary (text) files in its main branch (usually | |||||
`git clone` the Git repository located at **repo_url**, and create a Codelet | |||||
for every one of the non-binary (text) files in its main branch (usually | |||||
*master*). | *master*). | ||||
:return: An array of metadata dictionaries. | |||||
.. code-block:: python | |||||
sample_returned_array = [ | |||||
{ | |||||
"filename" : (str) "myfile" | |||||
"time_created" : (int) 1395939566, | |||||
"time_last_modified" : (int) 1396920409, | |||||
"source" : (str) "The source code of the file." | |||||
} | |||||
] | |||||
""" | """ | ||||
repo_name = repo_url.split("/")[-1] | repo_name = repo_url.split("/")[-1] | ||||
subprocess.call("git clone %s" % repo_url, shell=True) | subprocess.call("git clone %s" % repo_url, shell=True) | ||||
os.chdir(repo_name) | os.chdir(repo_name) | ||||
files_meta = [] | |||||
commits_meta = _get_commits_metadata() | commits_meta = _get_commits_metadata() | ||||
for filename in commits_meta.keys(): | for filename in commits_meta.keys(): | ||||
commits_meta[filename]["filename"] = filename | |||||
with open(filename, "r") as source_file: | with open(filename, "r") as source_file: | ||||
commits_meta[filename]["source"] = source_file.read() | |||||
files_meta.append(commits_meta[filename]) | |||||
source = source_file.read() | |||||
authors = [(author,) for author in commits_meta["authors"]] | |||||
codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, | |||||
None, authors, _generate_file_url(filename, repo_url), | |||||
framework_name, commits_meta["time_created"], | |||||
commits_meta["time_last_modified"]) | |||||
Database.insert(codelet) | |||||
os.chdir("..") | os.chdir("..") | ||||
subprocess.call("rm -rf %s" % repo_name, shell=True) | subprocess.call("rm -rf %s" % repo_name, shell=True) | ||||
return files_meta | |||||
def _generate_file_url(filename, repo_url, framework_name):
    """
    Return a url for a filename from a Git wrapper framework.

    The branch name is read from the output of `git branch` run in the
    current working directory, so this must be called from inside the cloned
    repository.

    :param filename: The path of the file.
    :param repo_url: The url of the file's parent repository.
    :param framework_name: The name of the framework the repository is from.

    :type filename: str
    :type repo_url: str
    :type framework_name: str

    :return: The file's full url on the given framework, or None when the
        framework is not "github" (the only one handled here).
    :rtype: str, or None
    """

    if framework_name != "github":
        return None

    branch_listing = subprocess.check_output(["git", "branch", "--no-color"])
    # check_output returns bytes on Python 3; normalize so the url is text.
    if not isinstance(branch_listing, str):
        branch_listing = branch_listing.decode("utf-8")

    # Output looks like "* master\n": drop the leading "* " marker and the
    # trailing newline to get the bare branch name.
    default_branch = branch_listing[2:-1]
    return "%s/blob/%s/%s" % (repo_url, default_branch, filename)
def _get_git_commits(): | def _get_git_commits(): | ||||
""" | """ | ||||
@@ -58,14 +71,15 @@ def _get_git_commits(): | |||||
{ | { | ||||
"author" : (str) "author" | "author" : (str) "author" | ||||
"timestamp" : (int) 1396919293, | "timestamp" : (int) 1396919293, | ||||
"filename" : (str array) ["file1", "file2"] | |||||
"filenames" : (str array) ["file1", "file2"] | |||||
} | } | ||||
] | ] | ||||
:rtype: dictionary | :rtype: dictionary | ||||
""" | """ | ||||
git_log = subprocess.check_output("git --no-pager log --name-only \ | |||||
--pretty=format:'%n%n%an%n%at' -z", shell=True) | |||||
git_log_cmd = ("git --no-pager --no-color log --name-only " | |||||
"--pretty=format:'%n%n%an%n%at' -z") | |||||
git_log = subprocess.check_output(git_log_cmd, shell=True) | |||||
commits = [] | commits = [] | ||||
for commit in git_log.split("\n\n"): | for commit in git_log.split("\n\n"): | ||||