
Add tested indexer.

Add:
    bitshift/crawler/indexer.py
        -add _debug().
        -add content to the module docstring; add documentation to GitIndexer
         and the functions that were lacking it.
        -add another Perl one-liner to supplement the `git clone` subprocess
         call, which terminates it after a set amount of time (should it have
         frozen) -- fixes a major bug that caused the entire indexer to hang
         (see the sketch below).
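The one-liner works because perl's alarm() schedules a SIGALRM, and pending alarms survive exec(), so the signal still reaches the git process if the clone outlives the limit. A minimal sketch of the pattern, with a hypothetical repository URL (the real call appears in _index_repository below):

    import subprocess

    GIT_CLONE_TIMEOUT = 60  # seconds; the value used later in the diff

    # perl consumes the first argument as the alarm() delay, then exec()s the
    # rest ("git clone <url>"); SIGALRM kills the clone if it hangs, and
    # call() then returns nonzero.
    exit_code = subprocess.call(
        "perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone %s"
        % (GIT_CLONE_TIMEOUT, "https://github.com/user/repo"), shell=True)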
tags/v1.0^2
Severyn Kozak committed 10 years ago · commit 627c848f20
3 changed files, with 120 additions and 63 deletions:
  1. bitshift/crawler/__init__.py (+4, -5)
  2. bitshift/crawler/crawler.py (+7, -18)
  3. bitshift/crawler/indexer.py (+109, -40)

bitshift/crawler/__init__.py (+4, -5)

@@ -6,8 +6,7 @@ Contains functions for initializing all subsidiary, threaded crawlers.

 import Queue
 
-from bitshift.crawler import crawler
-from bitshift.crawler import git_indexer
+from bitshift.crawler import crawler, indexer
 
 __all__ = ["crawl"]

@@ -19,12 +18,12 @@ def crawl():

     Start the:
         1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`
-        2. Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer`
+        2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`
     """
 
     repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
-    indexer = git_indexer.GitIndexer(repository_queue)
+    git_indexer = indexer.GitIndexer(repository_queue)
 
-    for thread in [github_crawler, indexer]:
+    for thread in [github_crawler, git_indexer]:
         thread.start()
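For reference, each item that flows through repository_queue is a plain dict; the keys below come from GitHubCrawler.run() in the crawler.py diff that follows, while the values are hypothetical:

    sample_repository = {
        "url" : "https://github.com/user/repo",  # repo["html_url"]
        "name" : "repo",                         # repo["name"], as of this commit
        "framework_name" : "GitHub"
    }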

bitshift/crawler/crawler.py (+7, -18)

@@ -1,12 +1,12 @@
"""
:synopsis: Main crawler module, to oversee all site-specific crawlers.

...more info soon...
Contains all website/framework-specific Class crawlers.
"""

import requests, time, threading

import bitshift.crawler.git_indexer
import bitshift.crawler.indexer

from ..codelet import Codelet
from ..database import Database
@@ -17,12 +17,12 @@ class GitHubCrawler(threading.Thread):

     GitHubCrawler is a threaded singleton that queries GitHub's API for URLs
     to its public repositories, which it inserts into a :class:`Queue.Queue`
-    shared with :class:`bitshift.crawler.git_indexer.GitIndexer`.
+    shared with :class:`bitshift.crawler.indexer.GitIndexer`.
 
     :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with
         repository information retrieved by `GitHubCrawler`, and other Git
         crawlers, to be processed by
-        :class:`bitshift.crawler.git_indexer.GitIndexer`.
+        :class:`bitshift.crawler.indexer.GitIndexer`.
     """
 
     def __init__(self, repository_queue):
@@ -31,7 +31,7 @@ class GitHubCrawler(threading.Thread):

         :param repository_queue: A queue containing dictionaries of repository
             metadata retrieved by `GitHubCrawler`, meant to be processed by an
-            instance of :class:`bitshift.crawler.git_indexer.GitIndexer`.
+            instance of :class:`bitshift.crawler.indexer.GitIndexer`.
 
             .. code-block:: python
                 sample_dict = {
@@ -43,7 +43,6 @@ class GitHubCrawler(threading.Thread):
         :type repository_queue: :class:`Queue.Queue`
         """
 
-
         self.repository_queue = repository_queue
         super(GitHubCrawler, self).__init__()

@@ -65,26 +64,16 @@ class GitHubCrawler(threading.Thread):
         api_request_interval = 5e3 / 60 ** 2
 
         while len(next_api_url) > 0:
-            # DEBUG
-            db.log.insert({
-                "time" : str(time.time()).split(".")[0][-4:],
-                "qsize" : self.repository_queue.qsize()
-            })
-
             start_time = time.time()
             response = requests.get(next_api_url, params=authentication_params)
 
             for repo in response.json():
-                logging.basicConfig(filename="crawler.log", level=logging.DEBUG)
-                logging.debug("crawler: %-20s: %-5s: %-5s: %s",
-                        str(time.time()).split(".")[0],
-                        self.repository_queue.qsize(), repo["id"],
-                        repo["name"])
                 while self.repository_queue.full():
                     pass
 
                 self.repository_queue.put({
                     "url" : repo["html_url"],
-                    "name" : repo["html_url"].split("/")[-1],
+                    "name" : repo["name"],
                     "framework_name" : "GitHub"
                 })
-
-
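Two notes on this hunk. The rate-limit arithmetic: api_request_interval = 5e3 / 60 ** 2 works out to 5000 / 3600 ≈ 1.39 seconds between requests, matching GitHub's 5,000-requests-per-hour authenticated limit. Also, the `while ... full(): pass` spin loop burns CPU needlessly; Queue.put() blocks until a slot frees up, so a plain blocking put would behave the same. A minimal sketch, with a hypothetical queue and item:

    import Queue

    queue = Queue.Queue(maxsize=10)

    # block=True is the default: put() sleeps until the consumer frees a
    # slot, so no spin loop is required.
    queue.put("https://github.com/user/repo")  # hypothetical item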
bitshift/crawler/indexer.py (+109, -40)

@@ -1,28 +1,60 @@
"""
:synopsis: Index all the files in a Git repository.

.. todo::
Add documentation, threaded Indexer class.
:synopsis: Contains a singleton GitIndexer class, which clones and indexes git
repositories.
"""

import os, shutil, subprocess, threading
import bs4, os, re, shutil, subprocess, threading

from ..database import Database
from ..codelet import Codelet

GIT_CLONE_DIR = "/tmp"
GIT_CLONE_DIR = "/tmp/bitshift"

class GitIndexer(threading.Thread):
"""
A singleton Git repository indexer.

`GitIndexer` clones and indexes the repositories at urls found by the
:mod:`bitshift.crawler.crawler` Git crawlers.

:ivar repository_queue: (:class:`Queue.Queue`) A queue containing urls found
by the :mod:`bitshift.crawler.crawler` Git crawlers.
"""

def __init__(self, repository_queue):
"""
Create an instance of the singleton `GitIndexer`.

:param repository_queue: see :attr:`GitIndexer.repository_queue`

:type repository_queue: see :attr:`GitIndexer.repository_queue`
"""

self.repository_queue = repository_queue
super(GitIndexer, self).__init__()

def run(self):
"""
Retrieve new repository urls, clone, and index them.

Blocks until new urls appear in :attr:`GitIndexer.repository_queue`,
then retrieves one, and attempts cloning/indexing it. Should any errors
occur, the new repository will be discarded and the crawler will
index the next in the queue.
"""

while True:
while self.repository_queue.empty():
pass
new_repo = self.repository_queue.get()
_index_repository(new_repo["url"], new_repo["framework_name"])

repo = self.repository_queue.get()
self.repository_queue.task_done()

try:
_index_repository(repo["url"], repo["name"],
repo["framework_name"])
except: # desperate times -- will be modified later
pass

class _ChangeDir(object):
"""
@@ -62,7 +94,7 @@ class _ChangeDir(object):

         os.chdir(self.old_path)
 
-def _index_repository(repo_url, framework_name):
+def _index_repository(repo_url, repo_name, framework_name):
     """
     Clone and index (create and insert Codelets for) a Git repository.

@@ -70,32 +102,30 @@ def _index_repository(repo_url, framework_name):
     _insert_repository_codelets, then remove said repository.
 
     :param repo_url: The url the Git repository was cloned from.
+    :param repo_name: The name of the repository.
     :param framework_name: The name of the framework the repository is from.
 
     :type repo_url: str
+    :type repo_name: str
     :type framework_name: str
-
-    :return: Temporary: the new codelets, for testing purposes.
-    :rtype: Codelet array
     """
 
-    repo_name = repo_url.split("/")[-1]
-    codelets = []
+    GIT_CLONE_TIMEOUT = 60
 
     with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
-        subprocess.call("git clone %s" % repo_url, shell=True)
+        if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \
+                clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0:
+            return
+
         with _ChangeDir(repo_name) as repository_dir:
-            codelets = _insert_repository_codelets(repo_url, repo_name,
-                    framework_name)
+            _insert_repository_codelets(repo_url, repo_name, framework_name)
         shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
-
-    return codelets
 
 def _insert_repository_codelets(repo_url, repo_name, framework_name):
     """
-    Create a Codelet for the files inside a Git repository.
+    Create and insert a Codelet for the files inside a Git repository.
 
-    Create a new Codelet, and insert it into the Database singlet, for every
+    Create a new Codelet, and insert it into the Database singleton, for every
     file inside the current working directory's default branch (usually
     *master*).
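Since this codebase targets Python 2 (subprocess gained a timeout parameter only in Python 3.3), the Perl alarm wrapper above is a pragmatic workaround. A pure-Python alternative is to kill the child from a threading.Timer; the helper below is a hypothetical sketch, not part of the module:

    import shlex, subprocess, threading

    def _call_with_timeout(command, timeout):
        """Run `command`, killing it after `timeout` seconds if necessary."""
        process = subprocess.Popen(shlex.split(command))
        timer = threading.Timer(timeout, process.kill)
        timer.start()
        try:
            return process.wait()  # negative on death-by-signal, like call()
        finally:
            timer.cancel()

    # e.g., mirroring the diff's failure check:
    # if _call_with_timeout("git clone %s" % repo_url, 60) != 0:
    #     return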

@@ -108,21 +138,27 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name):
     :type framework_name: str
     """
 
-    codelets = []
     commits_meta = _get_commits_metadata()
     for filename in commits_meta.keys():
         with open(filename, "r") as source_file:
-            source = source_file.read()
+            source = _decode(source_file.read())
+            if source is None:
+                return
 
-        authors = [(author,) for author in commits_meta[filename]["authors"]]
-        codelets.append(
-                Codelet("%s:%s" % (repo_name, filename), source, filename,
+        authors = [(_decode(author),) for author in \
+                commits_meta[filename]["authors"]]
+        codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
                         None, authors, _generate_file_url(filename, repo_url,
-                        framework_name),
+                                framework_name),
                         commits_meta[filename]["time_created"],
-                        commits_meta[filename]["time_last_modified"]))
+                        commits_meta[filename]["time_last_modified"])
 
-    return codelets
+        db.codelets.insert({
+            "name" : codelet.name,
+            "authors" : codelet.authors
+        })
+
+        # Database.insert(codelet)
 
 def _generate_file_url(filename, repo_url, framework_name):
     """
@@ -142,7 +178,7 @@ def _generate_file_url(filename, repo_url, framework_name):

if framework_name == "GitHub":
default_branch = subprocess.check_output("git branch --no-color",
shell=True)[2:-1]
shell=True)[2:-1]
return "%s/blob/%s/%s" % (repo_url, default_branch, filename)

def _get_git_commits():
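For concreteness: `git branch --no-color` prints lines like "* master", so the [2:-1] slice drops the "* " prefix and the trailing newline. With hypothetical inputs, the URL assembled above looks like:

    repo_url = "https://github.com/user/repo"  # hypothetical
    default_branch = "* master\n"[2:-1]        # -> "master"
    filename = "src/module.py"                 # hypothetical

    print "%s/blob/%s/%s" % (repo_url, default_branch, filename)
    # https://github.com/user/repo/blob/master/src/module.py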
@@ -165,8 +201,7 @@ def _get_git_commits():
     :rtype: dictionary
     """
 
-    git_log = subprocess.check_output(
-            ("git --no-pager log --name-only"
+    git_log = subprocess.check_output(("git --no-pager log --name-only"
             " --pretty=format:'%n%n%an%n%at' -z"), shell=True)
 
     commits = []
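In the format string, %an is the author name and %at the author date as a Unix timestamp; -z NUL-terminates entries so filenames containing newlines survive parsing. A rough sketch of extracting those fields from one record -- an illustration only, since the module's actual parsing code falls outside this hunk:

    # One hypothetical record: two leading newlines, author, timestamp, a
    # blank line, then the NUL-terminated --name-only file list.
    raw = "\n\nJohn Doe\n1398000000\n\nfile_a.py\x00file_b.py\x00"

    lines = raw.strip("\x00").split("\n")
    author, timestamp = lines[2], int(lines[3])
    filenames = lines[5].split("\x00")
    # -> "John Doe", 1398000000, ["file_a.py", "file_b.py"]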
@@ -183,24 +218,34 @@ def _get_git_commits():

 def _get_tracked_files():
     """
-    Return a list of the filenames of all files in the Git repository.
+    Return a list of the filenames of all valuable files in the Git repository.
 
     Get a list of the filenames of the non-binary (Perl heuristics used for
     filetype identification) files currently inside the current working
-    directory's Git repository.
+    directory's Git repository. Then, weed out any boilerplate/non-code files
+    that match the regex rules in GIT_IGNORE_FILES.
 
-    :return: The filenames of all non-binary files.
+    :return: The filenames of all index-worthy non-binary files.
     :rtype: str array
     """
 
-    tracked_files = subprocess.check_output(
-            ("perl -le 'for (@ARGV){ print if -f && -T }'"
-            " $(find . -type d -name .git -prune -o -print)"), shell=True)
-    return [filename[2:] for filename in tracked_files.split("\n")[:-1]]
+    GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]
+
+    tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \
+            -f && -T }' $(find . -type d -name .git -prune -o -print)"),
+            shell=True).split("\n")[:-1]
+
+    valuable_files = []
+    for filename in tracked_files:
+        filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
+                for pattern in GIT_IGNORE_FILES])
+        if not filename_match:
+            valuable_files.append(filename[2:])
+    return valuable_files
 
 def _get_commits_metadata():
     """
-    Return a dictionary containing every tracked file's metadata.
+    Return a dictionary containing every valuable tracked file's metadata.
 
     :return: A dictionary with author names, time of creation, and time of last
         modification for every filename key.
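Perl's -T file test, used by _get_tracked_files above, is a content heuristic: it samples the start of the file and reports "text" when the bytes are mostly printable and contain no NULs. (Note also that each GIT_IGNORE_FILES pattern begins with .*, so re.match effectively searches the whole filename.) A rough Python 2 analogue of the heuristic, offered only as a hypothetical sketch:

    def _probably_text(path, sample_size=512):
        """Crude stand-in for perl's -T file-test heuristic."""
        with open(path, "rb") as source_file:
            sample = source_file.read(sample_size)
        if "\0" in sample:  # NUL bytes almost always mean binary
            return False
        printable = sum(1 for char in sample
                        if 32 <= ord(char) <= 126 or char in "\t\n\r")
        return not sample or printable / float(len(sample)) > 0.7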
@@ -236,3 +281,27 @@ def _get_commits_metadata():
files_meta[filename]["time_created"] = commit["timestamp"]

return files_meta
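Putting the docstring and the assignments above together, the returned dictionary is shaped roughly like this (all values hypothetical):

    files_meta = {
        "src/module.py" : {
            "authors" : ["John Doe", "Jane Doe"],
            "time_created" : 1398000000,
            "time_last_modified" : 1398100000
        }
    }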

+def _decode(raw):
+    """
+    Return a decoded raw string.
+
+    :param raw: The string to decode.
+
+    :type raw: (str)
+
+    :return: If the original encoding is successfully inferred, return the
+        decoded string.
+    :rtype: str, or None
+
+    .. warning::
+        The raw string's original encoding is identified by heuristics which
+        can, and occasionally will, fail. Decoding will then fail, and None
+        will be returned.
+    """
+
+    try:
+        return raw.decode(bs4.BeautifulSoup(raw).original_encoding)
+
+    except (UnicodeDecodeError, UserWarning):
+        return None
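bs4 exposes the same charset-detection machinery directly as UnicodeDammit, which BeautifulSoup(raw).original_encoding consults under the hood; a short usage sketch:

    from bs4 import UnicodeDammit

    dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")  # raw UTF-8 bytes
    print dammit.original_encoding                # "utf-8"
    print dammit.unicode_markup                   # u"Sacré bleu!"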
