Przeglądaj źródła

Mod Codelet, move Codelet creation from crawler.

Add:
    bitshift/crawler/(crawler, git_indexer).py
        -move Codelet creation from the crawler to the git_indexer, in
        preparation for making crawling/indexing independent, threaded
        processes.

Mod:
    bitshift/codelet.py
        -modify documentation for the author instance variable.
tags/v1.0^2
Severyn Kozak 10 lat temu
rodzic
commit
77b448c3de
3 zmienionych plików z 55 dodań i 37 usunięć
  1. +10
    -8
      bitshift/codelet.py
  2. +5
    -3
      bitshift/crawler/crawler.py
  3. +40
    -26
      bitshift/crawler/git_indexer.py

+ 10
- 8
bitshift/codelet.py Wyświetl plik

@@ -7,39 +7,41 @@ class Codelet(object):
:ivar code: (str) A string containing the raw source code.
:ivar filename: (str, or None) The filename of the snippet.
:ivar language: (str, or None) The inferred language of `code`.
:ivar author: (str, or None) The name of the code's author.
:ivar url: (str) The url of the (page containing the) source code.
:ivar authors: (array of str tuple) An array of tuples containing an
author's name and profile URL (on the service the code was pulled from).
:ivar code_url: (str) The url of the (page containing the) source code.
:ivar date_created: (str, or None) The date the code was published.
:ivar date_modified: (str, or None) The date the code was last modified.
"""

def __init__(self, code, filename, author, language, code_url, author_url,
def __init__(self, name, code, filename, language, authors, code_url,
date_created, date_modified):
"""
Create a Codelet instance.

:param code: The raw source code.
:param filename: The filename of the code, if any.
:param author: The author of the code.
:param language: The inferred language.
:param authors: An array of tuples containing an author's name and
profile URL (on the service the code was pulled from).
:param code_url: The url of the (page containing the) source code.
:param date_created: The date the code was published.
:param date_modified: The date the code was last modified.

:type code: str
:type filename: str, or None
:type authors: array of str tuples, or None
:type language: str, or None
:type author: str, or None
:type url: str
:type code_url: str
:type author_urls: str array, or none
:type date_created: str, or None
:type date_modified: str, or None
"""

self.code = code
self.filename = filename
self.author = author
self.language = language
self.authors = authors
self.code_url = code_url
self.author_url = author_url
self.date_created = date_created
self.date_modified = date_modified

+ 5
- 3
bitshift/crawler/crawler.py Wyświetl plik

@@ -1,13 +1,15 @@
"""
:synopsis: Main crawler module, to oversee all site-specific crawlers.

...more info soon...
"""

import requests, time

import git_indexer

# from .codelet import Codelet
# from .database import Database
from .codelet import Codelet
from .database import Database

def github():
"""
@@ -29,7 +31,7 @@ def github():
response = requests.get(next_api_url, params=authentication_params)

for repo in response.json():
codelets = git_indexer.index_repository(repo["html_url"])
index_repository(repo["html_url"], framework)

if int(response.headers["x-ratelimit-remaining"]) == 0:
time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time())


+ 40
- 26
bitshift/crawler/git_indexer.py Wyświetl plik

@@ -1,48 +1,61 @@
"""
:synopsis: Index all the files in a Git repository.

Clone a Git repository, and retrieve the following information about each file:
filename, contributor names, dates of creation and last modification, and the
file text.
...more info soon...
"""

import fileinput, subprocess, os

def index_repository(repo_url):
from .database import Database

def index_repository(repo_url, framework_name):
"""
Generate metadata for every file in a Git repository.
Insert a Codelet for every file in a Git repository.

`git clone` the Git repository located at **repo_url**, and return metadata
about every one of the non-binary (text) files in its main branch (usually
`git clone` the Git repository located at **repo_url**, and create a Codelet
for every one of the non-binary (text) files in its main branch (usually
*master*).

:return: An array of metadata dictionaries.
.. code-block:: python
sample_returned_array = [
{
"filename" : (str) "myfile"
"time_created" : (int) 1395939566,
"time_last_modified" : (int) 1396920409,
"source" : (str) "The source code of the file."
}
]
"""

repo_name = repo_url.split("/")[-1]
subprocess.call("git clone %s" % repo_url, shell=True)
os.chdir(repo_name)

files_meta = []
commits_meta = _get_commits_metadata()
for filename in commits_meta.keys():
commits_meta[filename]["filename"] = filename
with open(filename, "r") as source_file:
commits_meta[filename]["source"] = source_file.read()
files_meta.append(commits_meta[filename])
source = source_file.read()

authors = [(author,) for author in commits_meta["authors"]]
codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
None, authors, _generate_file_url(filename, repo_url),
framework_name, commits_meta["time_created"],
commits_meta["time_last_modified"])
Database.insert(codelet)

os.chdir("..")
subprocess.call("rm -rf %s" % repo_name, shell=True)
return files_meta

def _generate_file_url(filename, repo_url, framework_name):
"""
Return a url for a filename from a Git wrapper framework.

:param filename: The path of the file.
:param repo_url: The url of the file's parent repository.
:param framework_name: The name of the framework the repository is from.

:type filename: str
:type repo_url: str
:type framework_name: str

:return: The file's full url on the given framework.
:rtype: str
"""

if framework_name == "github":
default branch = subprocess.check_output("git branch --no-color", \
shell=True)[2:-1]
return "%s/blob/%s/%s" % (repo_url, default_branch, filename)

def _get_git_commits():
"""
@@ -58,14 +71,15 @@ def _get_git_commits():
{
"author" : (str) "author"
"timestamp" : (int) 1396919293,
"filename" : (str array) ["file1", "file2"]
"filenames" : (str array) ["file1", "file2"]
}
]
:rtype: dictionary
"""

git_log = subprocess.check_output("git --no-pager log --name-only \
--pretty=format:'%n%n%an%n%at' -z", shell=True)
git_log_cmd = ("git --no-pager --no-color log --name-only "
"--pretty=format:'%n%n%an%n%at' -z")
git_log = subprocess.check_output(git_log_cmd, shell=True)

commits = []
for commit in git_log.split("\n\n"):


Ładowanie…
Anuluj
Zapisz