Explorar el Código

Move author_files > git_indexer, heavily refactor.

Add:
    bitshift/crawler/crawler.py
        -add base crawler module
        -add github(), to index Github.
Mod:
    bitshift/crawler/
        -add package subdirectory for the crawler module, and any subsidiary
        modules (eg, git_indexer).

    bitshift/author_files.py > bitshift/crawler/git_indexer.py
        -rename the module to "git_indexer", to better reflect its use.
        -convert from stand-alone script to a module whose functions integrate
        cleanly with the rest of the application.
        -add all necessary, tested functions, with Sphinx documentation.
tags/v1.0^2
Severyn Kozak hace 10 años
padre
commit
ef9c0609fe
Se han modificado 3 ficheros con 171 adiciones y 53 borrados
  1. +0
    -53
      bitshift/author_files.py
  2. +37
    -0
      bitshift/crawler/crawler.py
  3. +134
    -0
      bitshift/crawler/git_indexer.py

+ 0
- 53
bitshift/author_files.py Ver fichero

@@ -1,53 +0,0 @@
"""
Output author/date information about the latest files in a Git repository.

When executed inside a Git archive, prints a single line of metadata for every
file in the work tree. A given line contains the file's filename, authors,
and Unix timestamps for the file's time of creation and last modification; the
separate entries are null-delimited.

Sample output:
socket_io.c\x00John Doe Jane Doe\x001384488690\x001384534626
# filename: socket_io.c
# Author Names:
"""

import fileinput, subprocess

git_log = subprocess.check_output("git --no-pager log --name-only \
--pretty=format:'%n%n%an%n%at' -z", shell=True)

commits = []
for commit in git_log.split("\n\n"):
fields = commit.split("\n")
if len(fields) > 2:
commits.append({
"author" : fields[0],
"timestamp" : int(fields[1]),
"filenames" : fields[2].split("\0")[:-2]
})


tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if -f && \
T }' $(find . -type d -name .git -prune -o -print)", shell=True)
tracked_files = [filename[2:] for filename in tracked_files.split("\n")[:-1]]

file_authors = {}
for commit in commits:
for filename in commit["filenames"]:
if filename in tracked_files:
if filename not in file_authors.keys():
file_authors[filename] = {
"authors" : [commit["author"]],
"timestamps" : [commit["timestamp"]]
}
else:
if commit["author"] not in file_authors[filename]["authors"]:
file_authors[filename]["authors"].append(commit["author"])
file_authors[filename]["timestamps"].append(commit["timestamp"])

for filename in file_authors.keys():
authors = "\0".join(file_authors[filename]["authors"])
time_created = min(file_authors[filename]["timestamps"])
time_last_modified = max(file_authors[filename]["timestamps"])
print "%s\0%s\0%d\0%d" % (filename, authors, time_created, time_last_modified)

+ 37
- 0
bitshift/crawler/crawler.py Ver fichero

@@ -0,0 +1,37 @@
"""

"""

import requests, time

import git_indexer

# from .codelet import Codelet
# from .database import Database

def github():
    """
    Query the GitHub API for data about every public repository.

    Pull all of GitHub's repositories by making calls to its API in a loop,
    accessing a subsequent page of results via the "next" URL returned in an
    API response header. Uses Severyn Kozak's (sevko) authentication
    credentials.
    """

    next_api_url = "https://api.github.com/repositories"
    # NOTE(review): hard-coded OAuth credentials should be moved out of
    # source control and into configuration.
    authentication_params = {
        "client_id" : "436cb884ae09be7f2a4e",
        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
    }

    while len(next_api_url) > 0:
        response = requests.get(next_api_url, params=authentication_params)

        for repo in response.json():
            codelets = git_indexer.index_repository(repo["html_url"])

        if int(response.headers["x-ratelimit-remaining"]) == 0:
            # Sleep until the rate-limit window resets; the reset time may
            # already be in the past, so never sleep a negative duration.
            sleep_duration = int(response.headers["x-ratelimit-reset"]) - time.time()
            if sleep_duration > 0:
                time.sleep(sleep_duration)

        # Bug fix: the "link" header lives on the *response* object —
        # `requests.headers` raised AttributeError on the requests module.
        # Header format: '<url>; rel="next", <url>; rel="last"'; take the
        # first URL and strip its surrounding angle brackets.
        next_api_url = response.headers["link"].split(">")[0][1:]

+ 134
- 0
bitshift/crawler/git_indexer.py Ver fichero

@@ -0,0 +1,134 @@
"""
:synopsis: Index all the files in a Git repository.

Clone a Git repository, and retrieve the following information about each file:
filename, contributor names, dates of creation and last modification, and the
file text.
"""

import fileinput, subprocess, os

def index_repository(repo_url):
    """
    Generate metadata for every file in a Git repository.

    `git clone` the Git repository located at **repo_url**, and return
    metadata about every one of the non-binary (text) files in its main
    branch (usually *master*).

    :param repo_url: The URL of the Git repository to clone.

    :return: An array of metadata dictionaries.
        .. code-block:: python
            sample_returned_array = [
                {
                    "filename" : (str) "myfile",
                    "time_created" : (int) 1395939566,
                    "time_last_modified" : (int) 1396920409,
                    "source" : (str) "The source code of the file."
                }
            ]
    """

    repo_name = repo_url.split("/")[-1]
    subprocess.call("git clone %s" % repo_url, shell=True)
    os.chdir(repo_name)

    files_meta = []
    try:
        commits_meta = _get_commits_metadata()
        for filename in commits_meta:
            commits_meta[filename]["filename"] = filename
            with open(filename, "r") as source_file:
                commits_meta[filename]["source"] = source_file.read()
            files_meta.append(commits_meta[filename])
    finally:
        # Bug fix: always restore the working directory and delete the clone,
        # even when reading a file or parsing the history raises — otherwise
        # the process is left inside a stale clone that is never removed.
        os.chdir("..")
        subprocess.call("rm -rf %s" % repo_name, shell=True)

    return files_meta

def _get_git_commits():
    """
    Return the current working directory's formatted commit data.

    Uses `git log` to generate metadata about every single file in the
    repository's commit history.

    :return: The author, timestamp, and names of all modified files of every
        commit, newest commit first.
        .. code-block:: python
            sample_returned_array = [
                {
                    "author" : (str) "author",
                    "timestamp" : (int) 1396919293,
                    "filenames" : (str array) ["file1", "file2"]
                }
            ]
    :rtype: array of dictionaries
    """

    git_log = subprocess.check_output("git --no-pager log --name-only \
--pretty=format:'%n%n%an%n%at' -z", shell=True)

    commits = []
    for commit in git_log.split("\n\n"):
        fields = commit.split("\n")
        # Records with fewer than three fields touched no files; skip them.
        if len(fields) > 2:
            commits.append({
                "author" : fields[0],
                "timestamp" : int(fields[1]),
                # -z null-terminates each filename; drop the trailing empty
                # entries produced by the split.
                "filenames" : fields[2].split("\0")[:-2]
            })

    return commits

def _get_tracked_files():
    """
    Return a list of the filenames of all files in the Git repository.

    Get a list of the filenames of the non-binary (Perl heuristics used for
    filetype identification) files currently inside the current working
    directory's Git repository.

    :return: The filenames of all non-binary files.
    :rtype: str array
    """

    raw_listing = subprocess.check_output("perl -le 'for (@ARGV){ print if \
-f && -T }' $(find . -type d -name .git -prune -o -print)", shell=True)

    filenames = []
    # The final split entry is the empty string after the trailing newline.
    for path in raw_listing.split("\n")[:-1]:
        # Drop the "./" prefix that `find` places on every path.
        filenames.append(path[2:])
    return filenames

def _get_commits_metadata():
    """
    Return a dictionary containing every tracked file's metadata.

    :return: A dictionary with author names, time of creation, and time of
        last modification for every filename key.
        .. code-block:: python
            sample_returned_dict = {
                "my_file" : {
                    "authors" : (str array) ["author1", "author2"],
                    "time_created" : (int) 1395939566,
                    "time_last_modified" : (int) 1396920409
                }
            }
    :rtype: dictionary
    """

    commits = _get_git_commits()
    tracked_files = _get_tracked_files()

    files_meta = {}
    # `git log` lists commits newest-first, so the first commit mentioning a
    # file fixes its last-modified time, and each older commit pushes its
    # creation time further back.
    for commit in commits:
        for filename in commit["filenames"]:
            if filename not in tracked_files:
                continue

            if filename not in files_meta:
                files_meta[filename] = {
                    "authors" : [commit["author"]],
                    # Bug fix: also initialize "time_created" here; previously
                    # a file touched by exactly one commit never received the
                    # key, violating the documented return shape.
                    "time_created" : commit["timestamp"],
                    "time_last_modified" : commit["timestamp"]
                }
            else:
                if commit["author"] not in files_meta[filename]["authors"]:
                    files_meta[filename]["authors"].append(commit["author"])
                files_meta[filename]["time_created"] = commit["timestamp"]

    return files_meta

Cargando…
Cancelar
Guardar