瀏覽代碼

Clean up crawler/, fix minor bugs.

Add:
    bitshift/codelet.py
        -add name field to Codelet.

    bitshift/crawler/crawler.py
        -fix previously defunct code (which was committed while still
        incomplete) -- incorrect dictionary keys, etc.
        -reformat some function calls' argument alignment to fit PEP 8.

    bitshift/crawler/crawler.py
        -add sleep() to ensure that an API query is made at regular intervals
        (determined by the GitHub API limit).
tags/v1.0^2
Severyn Kozak 10 年之前
父節點
當前提交
9fc4598001
共有 5 個文件被更改,包括 45 次插入和 22 次删除
  1. +1
    -1
      bitshift/__init__.py
  2. +6
    -3
      bitshift/codelet.py
  3. +6
    -0
      bitshift/crawler/__init__.py
  4. +10
    -4
      bitshift/crawler/crawler.py
  5. +22
    -14
      bitshift/crawler/git_indexer.py

+ 1
- 1
bitshift/__init__.py 查看文件

@@ -1 +1 @@
from . import assets, codelet, config, database, parser, query
from . import assets, codelet, config, database, parser, query, crawler

+ 6
- 3
bitshift/codelet.py 查看文件

@@ -4,10 +4,11 @@ class Codelet(object):
"""
A source-code object with code metadata and composition analysis.

:ivar name: (str) A suitable name for the codelet.
:ivar code: (str) A string containing the raw source code.
:ivar filename: (str, or None) The filename of the snippet.
:ivar language: (str, or None) The inferred language of `code`.
:ivar authors: (array of str tuple) An array of tuples containing an
:ivar authors: (array of str tuples) An array of tuples containing an
author's name and profile URL (on the service the code was pulled from).
:ivar code_url: (str) The url of the (page containing the) source code.
:ivar date_created: (str, or None) The date the code was published.
@@ -19,6 +20,7 @@ class Codelet(object):
"""
Create a Codelet instance.

:param name: The name of the codelet.
:param code: The raw source code.
:param filename: The filename of the code, if any.
:param language: The inferred language.
@@ -28,16 +30,17 @@ class Codelet(object):
:param date_created: The date the code was published.
:param date_modified: The date the code was last modified.

:type name: str
:type code: str
:type filename: str, or None
:type authors: array of str tuples, or None
:type language: str, or None
:type authors: array of str tuples, or None
:type code_url: str
:type author_urls: str array, or None
:type date_created: str, or None
:type date_modified: str, or None
"""

self.name = name
self.code = code
self.filename = filename
self.language = language


+ 6
- 0
bitshift/crawler/__init__.py 查看文件

@@ -0,0 +1,6 @@
import crawler

__all__ = ["crawl"]

def crawl():
pass

+ 10
- 4
bitshift/crawler/crawler.py 查看文件

@@ -8,8 +8,8 @@ import requests, time

import git_indexer

from .codelet import Codelet
from .database import Database
from ..codelet import Codelet
from ..database import Database

def github():
"""
@@ -26,14 +26,20 @@ def github():
"client_id" : "436cb884ae09be7f2a4e",
"client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
}
api_request_interval = 5e3 / 60 ** 2

while len(next_api_url) > 0:
start_time = time.time()
response = requests.get(next_api_url, params=authentication_params)

for repo in response.json():
index_repository(repo["html_url"], framework)
print repo["id"]

if int(response.headers["x-ratelimit-remaining"]) == 0:
time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time())

next_api_url = requests.headers["link"].split(">")[0][1:]
next_api_url = response.headers["link"].split(">")[0][1:]

sleep_time = api_request_interval - (time.time() - start_time)
if sleep_time > 0:
time.sleep(sleep_time)

+ 22
- 14
bitshift/crawler/git_indexer.py 查看文件

@@ -6,7 +6,8 @@

import fileinput, subprocess, os

from .database import Database
from ..database import Database
from ..codelet import Codelet

def index_repository(repo_url, framework_name):
"""
@@ -21,20 +22,25 @@ def index_repository(repo_url, framework_name):
subprocess.call("git clone %s" % repo_url, shell=True)
os.chdir(repo_name)

codelets = []
commits_meta = _get_commits_metadata()
for filename in commits_meta.keys():
with open(filename, "r") as source_file:
source = source_file.read()

authors = [(author,) for author in commits_meta["authors"]]
codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
None, authors, _generate_file_url(filename, repo_url),
framework_name, commits_meta["time_created"],
commits_meta["time_last_modified"])
Database.insert(codelet)
authors = [(author,) for author in commits_meta[filename]["authors"]]
codelets.append(
Codelet("%s:%s" % (repo_name, filename), source, filename,
None, authors, _generate_file_url(filename, repo_url,
framework_name),
commits_meta[filename]["time_created"],
commits_meta[filename]["time_last_modified"]))

# Database.insert(codelet)

os.chdir("..")
subprocess.call("rm -rf %s" % repo_name, shell=True)
return codelets

def _generate_file_url(filename, repo_url, framework_name):
"""
@@ -53,7 +59,7 @@ def _generate_file_url(filename, repo_url, framework_name):
"""

if framework_name == "github":
default branch = subprocess.check_output("git branch --no-color", \
default_branch = subprocess.check_output("git branch --no-color",
shell=True)[2:-1]
return "%s/blob/%s/%s" % (repo_url, default_branch, filename)

@@ -77,9 +83,9 @@ def _get_git_commits():
:rtype: dictionary
"""

git_log_cmd = ("git --no-pager --no-color log --name-only "
"--pretty=format:'%n%n%an%n%at' -z")
git_log = subprocess.check_output(git_log_cmd, shell=True)
git_log = subprocess.check_output(
("git --no-pager log --name-only"
" --pretty=format:'%n%n%an%n%at' -z"), shell=True)

commits = []
for commit in git_log.split("\n\n"):
@@ -105,8 +111,9 @@ def _get_tracked_files():
:rtype: str array
"""

tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if \
-f && -T }' $(find . -type d -name .git -prune -o -print)", shell=True)
tracked_files = subprocess.check_output(
("perl -le 'for (@ARGV){ print if -f && -T }'"
" $(find . -type d -name .git -prune -o -print)"), shell=True)
return [filename[2:] for filename in tracked_files.split("\n")[:-1]]

def _get_commits_metadata():
@@ -138,7 +145,8 @@ def _get_commits_metadata():
if filename not in files_meta.keys():
files_meta[filename] = {
"authors" : [commit["author"]],
"time_last_modified" : commit["timestamp"]
"time_last_modified" : commit["timestamp"],
"time_created" : commit["timestamp"]
}
else:
if commit["author"] not in files_meta[filename]["authors"]:


Loading…
取消
儲存