
Clean up crawler/, fix minor bugs.

Add:
    bitshift/codelet.py
        -add name field to Codelet.

    bitshift/crawler/crawler.py
        -fix previously defunct code (which was committed in an incomplete
        state): incorrect dictionary keys, etc.
        -reformat some function calls' argument alignment to fit PEP 8.
        -add sleep() to ensure that API queries are made at regular intervals
        (determined by the GitHub API rate limit).
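
The pacing described above works out as follows (a worked sketch; the constant name and the 5,000-requests/hour figure come from the crawler.py diff below, the comments are illustrative):

    api_request_interval = 5e3 / 60 ** 2   # 5000 / 3600, i.e. ~1.39 seconds per query
    # Each pass through the query loop is padded to at least this long, giving
    # roughly 2,600 requests/hour, comfortably under GitHub's 5,000/hour
    # authenticated limit (an even 5,000/hour spacing would only need
    # 60 ** 2 / 5e3 = 0.72 seconds).
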
tags/v1.0^2
Severyn Kozak, 10 years ago
commit 9fc4598001
5 changed files with 45 additions and 22 deletions
  1. bitshift/__init__.py             +1   -1
  2. bitshift/codelet.py              +6   -3
  3. bitshift/crawler/__init__.py     +6   -0
  4. bitshift/crawler/crawler.py      +10  -4
  5. bitshift/crawler/git_indexer.py  +22  -14

bitshift/__init__.py (+1, -1)

@@ -1 +1 @@
-from . import assets, codelet, config, database, parser, query
+from . import assets, codelet, config, database, parser, query, crawler

bitshift/codelet.py (+6, -3)

@@ -4,10 +4,11 @@ class Codelet(object):
     """
     A source-code object with code metadata and composition analysis.
 
+    :ivar name: (str) A suitable name for the codelet.
     :ivar code: (str) A containing the raw source code.
     :ivar filename: (str, or None) The filename of the snippet.
     :ivar language: (str, or None) The inferred language of `code`.
-    :ivar authors: (array of str tuple) An array of tuples containing an
+    :ivar authors: (array of str tuples) An array of tuples containing an
         author's name and profile URL (on the service the code was pulled from).
     :ivar code_url: (str) The url of the (page containing the) source code.
     :ivar date_created: (str, or None) The date the code was published.
@@ -19,6 +20,7 @@ class Codelet(object):
         """
         Create a Codelet instance.
 
+        :param name: The name of the codelet.
         :param code: The raw source code.
         :param filename: The filename of the code, if any.
         :param language: The inferred language.
@@ -28,16 +30,17 @@ class Codelet(object):
         :param date_created: The date the code was published.
         :param date_modified: The date the code was last modified.
 
+        :type name: str
         :type code: str
         :type filename: str, or None
-        :type authors: array of str tuples, or None
         :type language: str, or None
+        :type authors: array of str tuples, or None
         :type code_url: str
-        :type author_urls: str array, or none
         :type date_created: str, or None
         :type date_modified: str, or None
         """
 
+        self.name = name
        self.code = code
         self.filename = filename
         self.language = language
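
A minimal construction sketch against the updated docstring (the positional order mirrors the :type list above and the index_repository() call further down; all values here are illustrative):

    from bitshift.codelet import Codelet

    codelet = Codelet(
        "example-repo:hello.py",                 # name
        "def hello():\n    return 1\n",          # code
        "hello.py",                              # filename
        "Python",                                # language
        [("Ada Lovelace", "https://example.com/users/ada")],       # (author, profile URL)
        "https://example.com/example-repo/blob/master/hello.py",   # code_url
        "1397000000",                            # date_created
        "1397600000")                            # date_modified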


bitshift/crawler/__init__.py (+6, -0)

@@ -0,0 +1,6 @@
+import crawler
+
+__all__ = ["crawl"]
+
+def crawl():
+    pass
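
As the stub and __all__ suggest, the package's public entry point would presumably be used along these lines (illustrative; crawl() is still a no-op at this commit):

    from bitshift.crawler import crawl
    crawl()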

bitshift/crawler/crawler.py (+10, -4)

@@ -8,8 +8,8 @@ import requests, time
 
 import git_indexer
 
-from .codelet import Codelet
-from .database import Database
+from ..codelet import Codelet
+from ..database import Database
 
 def github():
     """
@@ -26,14 +26,20 @@ def github():
         "client_id" : "436cb884ae09be7f2a4e",
         "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
     }
+    api_request_interval = 5e3 / 60 ** 2
 
     while len(next_api_url) > 0:
+        start_time = time.time()
         response = requests.get(next_api_url, params=authentication_params)
 
         for repo in response.json():
-            index_repository(repo["html_url"], framework)
+            print repo["id"]
 
         if int(response.headers["x-ratelimit-remaining"]) == 0:
             time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time())
 
-        next_api_url = requests.headers["link"].split(">")[0][1:]
+        next_api_url = response.headers["link"].split(">")[0][1:]
+
+        sleep_time = api_request_interval - (time.time() - start_time)
+        if sleep_time > 0:
+            time.sleep(sleep_time)
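
For context, the pagination line keeps only the first URL in GitHub's Link response header. A worked example against an illustrative header value (the format is GitHub's, the literal value is made up):

    link = ('<https://api.github.com/repositories?since=364>; rel="next", '
            '<https://api.github.com/repositories{?since}>; rel="first"')
    next_api_url = link.split(">")[0][1:]
    # -> "https://api.github.com/repositories?since=364"

The code implicitly assumes the rel="next" link is always listed first in the header.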

bitshift/crawler/git_indexer.py (+22, -14)

@@ -6,7 +6,8 @@
 
 import fileinput, subprocess, os
 
-from .database import Database
+from ..database import Database
+from ..codelet import Codelet
 
 def index_repository(repo_url, framework_name):
     """
@@ -21,20 +22,25 @@ def index_repository(repo_url, framework_name):
     subprocess.call("git clone %s" % repo_url, shell=True)
     os.chdir(repo_name)
 
+    codelets = []
     commits_meta = _get_commits_metadata()
     for filename in commits_meta.keys():
         with open(filename, "r") as source_file:
             source = source_file.read()
 
-            authors = [(author,) for author in commits_meta["authors"]]
-            codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
-                    None, authors, _generate_file_url(filename, repo_url),
-                    framework_name, commits_meta["time_created"],
-                    commits_meta["time_last_modified"])
-            Database.insert(codelet)
+            authors = [(author,) for author in commits_meta[filename]["authors"]]
+            codelets.append(
+                    Codelet("%s:%s" % (repo_name, filename), source, filename,
+                            None, authors, _generate_file_url(filename, repo_url,
+                                                              framework_name),
+                            commits_meta[filename]["time_created"],
+                            commits_meta[filename]["time_last_modified"]))
+
+            # Database.insert(codelet)
 
     os.chdir("..")
     subprocess.call("rm -rf %s" % repo_name, shell=True)
+    return codelets
@@ -53,7 +59,7 @@ def _generate_file_url(filename, repo_url, framework_name):
     """
 
     if framework_name == "github":
-        default branch = subprocess.check_output("git branch --no-color", \
+        default_branch = subprocess.check_output("git branch --no-color",
                 shell=True)[2:-1]
         return "%s/blob/%s/%s" % (repo_url, default_branch, filename)
@@ -77,9 +83,9 @@ def _get_git_commits():
     :rtype: dictionary
     """
 
-    git_log_cmd = ("git --no-pager --no-color log --name-only "
-            "--pretty=format:'%n%n%an%n%at' -z")
-    git_log = subprocess.check_output(git_log_cmd, shell=True)
+    git_log = subprocess.check_output(
+            ("git --no-pager log --name-only"
+            " --pretty=format:'%n%n%an%n%at' -z"), shell=True)
 
     commits = []
     for commit in git_log.split("\n\n"):
@@ -105,8 +111,9 @@ def _get_tracked_files():
     :rtype: str array
     """
 
-    tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if \
-        -f && -T }' $(find . -type d -name .git -prune -o -print)", shell=True)
+    tracked_files = subprocess.check_output(
+            ("perl -le 'for (@ARGV){ print if -f && -T }'"
+            " $(find . -type d -name .git -prune -o -print)"), shell=True)
     return [filename[2:] for filename in tracked_files.split("\n")[:-1]]
@@ -138,7 +145,8 @@ def _get_commits_metadata():
         if filename not in files_meta.keys():
             files_meta[filename] = {
                 "authors" : [commit["author"]],
-                "time_last_modified" : commit["timestamp"]
+                "time_last_modified" : commit["timestamp"],
+                "time_created" : commit["timestamp"]
             }
         else:
             if commit["author"] not in files_meta[filename]["authors"]:

