
Add tested indexer.

Add:
    bitshift/crawler/indexer.py
        -add _debug().
        -add content to the module docstring; add documentation to GitIndexer,
        and the functions that were lacking it.
        -add another perl one-liner to supplement the `git clone` subprocess
        call, which terminates it after a set amount of time (should it have
        frozen) -- fixes a major bug that caused the entire indexer to hang.
Severyn Kozak · 10 years ago · commit 627c848f20 · tags/v1.0^2

3 changed files with 120 additions and 63 deletions:

  1. bitshift/crawler/__init__.py (+4, -5)
  2. bitshift/crawler/crawler.py (+7, -18)
  3. bitshift/crawler/indexer.py (+109, -40)

bitshift/crawler/__init__.py (+4, -5)

@@ -6,8 +6,7 @@ Contains functions for initializing all subsidiary, threaded crawlers.
 
 import Queue
 
-from bitshift.crawler import crawler
-from bitshift.crawler import git_indexer
+from bitshift.crawler import crawler, indexer
 
 __all__ = ["crawl"]
@@ -19,12 +18,12 @@ def crawl():
 
     Start the:
         1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler`
-        2. Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer`
+        2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`
     """
 
     repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
     github_crawler = crawler.GitHubCrawler(repository_queue)
-    indexer = git_indexer.GitIndexer(repository_queue)
+    git_indexer = indexer.GitIndexer(repository_queue)
 
-    for thread in [github_crawler, indexer]:
+    for thread in [github_crawler, git_indexer]:
        thread.start()
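For context, the wiring `crawl()` now performs: one bounded queue shared by a producer (the crawler) and a consumer (the indexer). The instance is named `git_indexer` so it no longer shadows the freshly imported `indexer` module. A condensed sketch, assuming a value for MAX_URL_QUEUE_SIZE (the real constant is defined elsewhere in the package):

    import Queue

    from bitshift.crawler import crawler, indexer

    MAX_URL_QUEUE_SIZE = 500  # assumed value for illustration

    repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)  # bounded queue
    github_crawler = crawler.GitHubCrawler(repository_queue)    # producer thread
    git_indexer = indexer.GitIndexer(repository_queue)          # consumer thread

    for thread in [github_crawler, git_indexer]:
        thread.start()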

bitshift/crawler/crawler.py (+7, -18)

@@ -1,12 +1,12 @@
 """
 :synopsis: Main crawler module, to oversee all site-specific crawlers.
 
-...more info soon...
+Contains all website/framework-specific Class crawlers.
 """
 
 import requests, time, threading
 
-import bitshift.crawler.git_indexer
+import bitshift.crawler.indexer
 
 from ..codelet import Codelet
 from ..database import Database
@@ -17,12 +17,12 @@ class GitHubCrawler(threading.Thread):
 
     GitHubCrawler is a threaded singleton that queries GitHub's API for URLs
     to its public repositories, which it inserts into a :class:`Queue.Queue`
-    shared with :class:`bitshift.crawler.git_indexer.GitIndexer`.
+    shared with :class:`bitshift.crawler.indexer.GitIndexer`.
 
     :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with
         repository information retrieved by `GitHubCrawler`, and other Git
         crawlers, to be processed by
-        :class:`bitshift.crawler.git_indexer.GitIndexer`.
+        :class:`bitshift.crawler.indexer.GitIndexer`.
     """
 
     def __init__(self, repository_queue):
@@ -31,7 +31,7 @@ class GitHubCrawler(threading.Thread):
 
         :param repository_queue: A queue containing dictionaries of repository
             metadata retrieved by `GitHubCrawler`, meant to be processed by an
-            instance of :class:`bitshift.crawler.git_indexer.GitIndexer`.
+            instance of :class:`bitshift.crawler.indexer.GitIndexer`.
 
         .. code-block:: python
             sample_dict = {
@@ -43,7 +43,6 @@ class GitHubCrawler(threading.Thread):
         :type repository_queue: :class:`Queue.Queue`
         """
 
-
         self.repository_queue = repository_queue
         super(GitHubCrawler, self).__init__()
@@ -65,26 +64,16 @@ class GitHubCrawler(threading.Thread):
         api_request_interval = 5e3 / 60 ** 2
 
         while len(next_api_url) > 0:
-            # DEBUG
-            db.log.insert({
-                "time" : str(time.time()).split(".")[0][-4:],
-                "qsize" : self.repository_queue.qsize()
-            })
-
             start_time = time.time()
             response = requests.get(next_api_url, params=authentication_params)
 
             for repo in response.json():
-                logging.basicConfig(filename="crawler.log", level=logging.DEBUG)
-                logging.debug("crawler: %-20s: %-5s: %-5s: %s",
-                        str(time.time()).split(".")[0],
-                        self.repository_queue.qsize(), repo["id"],
-                        repo["name"])
                 while self.repository_queue.full():
                     pass
 
                 self.repository_queue.put({
                     "url" : repo["html_url"],
-                    "name" : repo["html_url"].split("/")[-1],
+                    "name" : repo["name"],
                     "framework_name" : "GitHub"
                 })
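One design note on the loop above: `while self.repository_queue.full(): pass` busy-waits, burning a core. `Queue.Queue.put()` already blocks when the queue is full, so an equivalent, CPU-friendly form would be (a sketch with hypothetical sample values, not the committed code):

    import Queue

    repository_queue = Queue.Queue(maxsize=10)

    # put(block=True) sleeps until a slot frees up, instead of spinning:
    repository_queue.put({
        "url" : "https://github.com/user/repo",   # hypothetical values
        "name" : "repo",
        "framework_name" : "GitHub"
    }, block=True)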




bitshift/crawler/indexer.py (+109, -40)

@@ -1,28 +1,60 @@
 """
-:synopsis: Index all the files in a Git repository.
-
-.. todo::
-    Add documentation, threaded Indexer class.
+:synopsis: Contains a singleton GitIndexer class, which clones and indexes git
+    repositories.
 """
 
-import os, shutil, subprocess, threading
+import bs4, os, re, shutil, subprocess, threading
 
 from ..database import Database
 from ..codelet import Codelet
 
-GIT_CLONE_DIR = "/tmp"
+GIT_CLONE_DIR = "/tmp/bitshift"
 
 class GitIndexer(threading.Thread):
+    """
+    A singleton Git repository indexer.
+
+    `GitIndexer` clones and indexes the repositories at urls found by the
+    :mod:`bitshift.crawler.crawler` Git crawlers.
+
+    :ivar repository_queue: (:class:`Queue.Queue`) A queue containing urls found
+        by the :mod:`bitshift.crawler.crawler` Git crawlers.
+    """
+
     def __init__(self, repository_queue):
+        """
+        Create an instance of the singleton `GitIndexer`.
+
+        :param repository_queue: see :attr:`GitIndexer.repository_queue`
+
+        :type repository_queue: see :attr:`GitIndexer.repository_queue`
+        """
+
         self.repository_queue = repository_queue
         super(GitIndexer, self).__init__()
 
     def run(self):
+        """
+        Retrieve new repository urls, clone, and index them.
+
+        Blocks until new urls appear in :attr:`GitIndexer.repository_queue`,
+        then retrieves one, and attempts cloning/indexing it. Should any errors
+        occur, the new repository will be discarded and the crawler will
+        index the next in the queue.
+        """
+
         while True:
             while self.repository_queue.empty():
                 pass
-            new_repo = self.repository_queue.get()
-            _index_repository(new_repo["url"], new_repo["framework_name"])
+
+            repo = self.repository_queue.get()
+            self.repository_queue.task_done()
+
+            try:
+                _index_repository(repo["url"], repo["name"],
+                        repo["framework_name"])
+            except: # desperate times -- will be modified later
+                pass
 
 class _ChangeDir(object):
     """
@@ -62,7 +94,7 @@ class _ChangeDir(object):
 
         os.chdir(self.old_path)
 
-def _index_repository(repo_url, framework_name):
+def _index_repository(repo_url, repo_name, framework_name):
     """
     Clone and index (create and insert Codelets for) a Git repository.
@@ -70,32 +102,30 @@ def _index_repository(repo_url, repo_name, framework_name):
         _insert_repository_codelets, then remove said repository.
 
     :param repo_url: The url the Git repository was cloned from.
+    :param repo_name: The name of the repository.
     :param framework_name: The name of the framework the repository is from.
 
     :type repo_url: str
+    :type repo_name: str
     :type framework_name: str
-
-    :return: Temporary: the new codelets, for testing purposes.
-    :rtype: Codelet array
     """
 
-    repo_name = repo_url.split("/")[-1]
-    codelets = []
+    GIT_CLONE_TIMEOUT = 60
 
     with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir:
-        subprocess.call("git clone %s" % repo_url, shell=True)
+        if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \
+                clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0:
+            return
+
         with _ChangeDir(repo_name) as repository_dir:
-            codelets = _insert_repository_codelets(repo_url, repo_name,
-                    framework_name)
+            _insert_repository_codelets(repo_url, repo_name, framework_name)
        shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name))
 
-    return codelets
-
 def _insert_repository_codelets(repo_url, repo_name, framework_name):
     """
-    Create a Codelet for the files inside a Git repository.
+    Create and insert a Codelet for the files inside a Git repository.
 
-    Create a new Codelet, and insert it into the Database singlet, for every
+    Create a new Codelet, and insert it into the Database singleton, for every
     file inside the current working directory's default branch (usually
     *master*).
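`_index_repository` leans on the `_ChangeDir` context manager, whose body falls mostly outside these hunks; only its `__exit__` chdir is visible above. A sketch of such a manager, reconstructed from that context (an assumption about the unshown parts, not the committed code):

    import os

    class _ChangeDir(object):
        """Context manager: enter new_path, restore the old cwd on exit."""

        def __init__(self, new_path):
            self.new_path = new_path

        def __enter__(self):
            self.old_path = os.getcwd()   # remember the original directory
            os.chdir(self.new_path)
            return self

        def __exit__(self, exc_type, exc_value, traceback):
            os.chdir(self.old_path)       # restore it, even if the body raised

    # usage mirroring the clone/index flow:
    with _ChangeDir("/tmp/bitshift") as git_clone_dir:
        pass  # clone and index here; the cwd reverts afterwards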


@@ -108,21 +138,27 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name):
     :type framework_name: str
     """
 
-    codelets = []
     commits_meta = _get_commits_metadata()
     for filename in commits_meta.keys():
         with open(filename, "r") as source_file:
-            source = source_file.read()
+            source = _decode(source_file.read())
+            if source is None:
+                return
 
-        authors = [(author,) for author in commits_meta[filename]["authors"]]
-        codelets.append(
-                Codelet("%s:%s" % (repo_name, filename), source, filename,
+        authors = [(_decode(author),) for author in \
+                commits_meta[filename]["authors"]]
+        codelet = Codelet("%s:%s" % (repo_name, filename), source, filename,
                 None, authors, _generate_file_url(filename, repo_url,
-                        framework_name),
+                framework_name),
                 commits_meta[filename]["time_created"],
-                commits_meta[filename]["time_last_modified"]))
+                commits_meta[filename]["time_last_modified"])
 
-    return codelets
+        db.codelets.insert({
+            "name" : codelet.name,
+            "authors" : codelet.authors
+        })
+
+        # Database.insert(codelet)
 
 def _generate_file_url(filename, repo_url, framework_name):
     """
@@ -142,7 +178,7 @@ def _generate_file_url(filename, repo_url, framework_name):
 
     if framework_name == "GitHub":
         default_branch = subprocess.check_output("git branch --no-color",
-            shell=True)[2:-1]
+                shell=True)[2:-1]
         return "%s/blob/%s/%s" % (repo_url, default_branch, filename)
 
 def _get_git_commits():
@@ -165,8 +201,7 @@ def _get_git_commits():
     :rtype: dictionary
     """
 
-    git_log = subprocess.check_output(
-            ("git --no-pager log --name-only"
+    git_log = subprocess.check_output(("git --no-pager log --name-only"
             " --pretty=format:'%n%n%an%n%at' -z"), shell=True)
 
     commits = []
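The `--pretty=format:'%n%n%an%n%at' -z` invocation renders each commit as two blank lines, the author name, and a Unix timestamp, followed by the `--name-only` file list. The parsing loop sits outside this hunk; one plausible reading of that output, assuming `-z` NUL-separates the records (a sketch, not the committed parser):

    import subprocess

    git_log = subprocess.check_output(("git --no-pager log --name-only"
            " --pretty=format:'%n%n%an%n%at' -z"), shell=True)

    commits = []
    for record in git_log.split("\0"):       # assumed NUL-separated records
        lines = record.strip("\n").split("\n")
        if len(lines) < 2:
            continue                         # skip empty/truncated records
        commits.append({
            "author" : lines[0],             # %an
            "timestamp" : int(lines[1]),     # %at, seconds since the epoch
            "filenames" : lines[2:]          # the --name-only file list
        })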
@@ -183,24 +218,34 @@
 
 def _get_tracked_files():
     """
-    Return a list of the filenames of all files in the Git repository.
+    Return a list of the filenames of all valuable files in the Git repository.
 
     Get a list of the filenames of the non-binary (Perl heuristics used for
     filetype identification) files currently inside the current working
-    directory's Git repository.
+    directory's Git repository. Then, weed out any boilerplate/non-code files
+    that match the regex rules in GIT_IGNORE_FILES.
 
-    :return: The filenames of all non-binary files.
+    :return: The filenames of all index-worthy non-binary files.
     :rtype: str array
     """
 
-    tracked_files = subprocess.check_output(
-            ("perl -le 'for (@ARGV){ print if -f && -T }'"
-            " $(find . -type d -name .git -prune -o -print)"), shell=True)
-    return [filename[2:] for filename in tracked_files.split("\n")[:-1]]
+    GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]
+
+    tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \
+            -f && -T }' $(find . -type d -name .git -prune -o -print)"),
+            shell=True).split("\n")[:-1]
+
+    valuable_files = []
+    for filename in tracked_files:
+        filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE)
+                for pattern in GIT_IGNORE_FILES])
+        if not filename_match:
+            valuable_files.append(filename[2:])
+    return valuable_files
 
 def _get_commits_metadata():
     """
-    Return a dictionary containing every tracked file's metadata.
+    Return a dictionary containing every valuable tracked file's metadata.
 
     :return: A dictionary with author names, time of creation, and time of last
         modification for every filename key.
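The GIT_IGNORE_FILES patterns above are matched case-insensitively against each tracked filename. A quick illustration of what they weed out, using hypothetical filenames and a hypothetical helper name:

    import re

    GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"]

    def _is_boilerplate(filename):
        # True if the filename matches any boilerplate pattern above
        return any(re.match(pattern, filename, flags=re.IGNORECASE)
                for pattern in GIT_IGNORE_FILES)

    print(_is_boilerplate("README.md"))   # True  -- weeded out
    print(_is_boilerplate("LICENCE"))     # True  -- both spellings covered
    print(_is_boilerplate("indexer.py"))  # False -- indexed as code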
@@ -236,3 +281,27 @@ def _get_commits_metadata():
             files_meta[filename]["time_created"] = commit["timestamp"]
 
     return files_meta
+
+def _decode(raw):
+    """
+    Return a decoded raw string.
+
+    :param raw: The string to decode.
+
+    :type raw: (str)
+
+    :return: If the original encoding is successfully inferred, return the
+        decoded string.
+    :rtype: str, or None
+
+    .. warning::
+        The raw string's original encoding is identified by heuristics which
+        can, and occasionally will, fail. Decoding will then fail, and None
+        will be returned.
+    """
+
+    try:
+        return raw.decode(bs4.BeautifulSoup(raw).original_encoding)
+
+    except (UnicodeDecodeError, UserWarning):
+        return None
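`_decode` builds a throwaway BeautifulSoup object solely for its encoding sniffing; bs4 exposes the same detector directly as `UnicodeDammit`, which an equivalent sketch could use instead (an alternative, not the committed code; note too that catching `UserWarning` as an exception only matters if warnings are escalated to errors elsewhere):

    from bs4 import UnicodeDammit

    raw = open("some_source_file", "rb").read()  # hypothetical input
    dammit = UnicodeDammit(raw)
    print(dammit.original_encoding)  # e.g. "utf-8", or None if sniffing fails
    decoded = dammit.unicode_markup  # the decoded text, or None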
