Browse Source

Merge branch 'develop' of github.com:earwig/bitshift into develop

Conflicts:
	app.py
	setup.py
tags/v1.0^2
Ben Kurtovic 10 years ago
parent
commit
2cf98df3e2
8 changed files with 825 additions and 25 deletions
  1. +1
    -0
      app.py
  2. +1
    -1
      bitshift/__init__.py
  3. +3
    -0
      bitshift/assets.py
  4. +35
    -23
      bitshift/codelet.py
  5. +55
    -0
      bitshift/crawler/__init__.py
  6. +240
    -0
      bitshift/crawler/crawler.py
  7. +489
    -0
      bitshift/crawler/indexer.py
  8. +1
    -1
      setup.py

+ 1
- 0
app.py View File

@@ -5,6 +5,7 @@ Module to contain all the project's Flask server plumbing.
from flask import Flask from flask import Flask
from flask import render_template, session from flask import render_template, session


from bitshift import assets
from bitshift.database import Database from bitshift.database import Database
from bitshift.query import parse_query from bitshift.query import parse_query




+ 1
- 1
bitshift/__init__.py View File

@@ -1 +1 @@
from . import assets, codelet, config, database, parser, query
from . import assets, codelet, config, database, parser, query, crawler

+ 3
- 0
bitshift/assets.py View File

@@ -15,8 +15,11 @@ def tag(filename):


:param filename: The filename of the asset to create a tag for. :param filename: The filename of the asset to create a tag for.


:type filename: str

:return: A string containing a `<source>` tag for JS files, and a `<link>` :return: A string containing a `<source>` tag for JS files, and a `<link>`
for CSS files. for CSS files.
:rtype: str
""" """


file_ext = filename.split(".")[-1] file_ext = filename.split(".")[-1]


+ 35
- 23
bitshift/codelet.py View File

@@ -4,42 +4,54 @@ class Codelet(object):
""" """
A source-code object with code metadata and composition analysis. A source-code object with code metadata and composition analysis.


:ivar name: (str) A suitable name for the codelet.
:ivar code: (str) A containing the raw source code. :ivar code: (str) A containing the raw source code.
:ivar filename: (str, or None) The filename of the snippet. :ivar filename: (str, or None) The filename of the snippet.
:ivar language: (str, or None) The inferred language of `code`.
:ivar author: (str, or None) The name of the code's author.
:ivar url: (str) The url of the (page containing the) source code.
:ivar date_created: (str, or None) The date the code was published.
:ivar date_modified: (str, or None) The date the code was last modified.
:ivar language: (int, or None) The inferred language of `code`.
:ivar authors: (array of tuples (str, str or None)) An array of tuples
containing an author's name and profile URL (on the service the code
was pulled from).
:ivar code_url: (str) The url of the (page containing the) source code.
:ivar date_created: (:class:`datetime.datetime`, or None) The date the code
was published.
:ivar date_modified: (:class:`datetime.datetime`, or None) The date the
code was last modified.
:ivar rank: (float) A quantification of the source code's quality, as
per available ratings (stars, forks, upvotes, etc.).
""" """


def __init__(self, code, filename, author, language, code_url, author_url,
date_created, date_modified):
def __init__(self, name, code, filename, language, authors, code_url,
date_created, date_modified, rank):
""" """
Create a Codelet instance. Create a Codelet instance.


:param code: The raw source code.
:param filename: The filename of the code, if any.
:param author: The author of the code.
:param language: The inferred language.
:param code_url: The url of the (page containing the) source code.
:param date_created: The date the code was published.
:param date_modified: The date the code was last modified.
:param name: see :attr:`self.name`
:param code: see :attr:`self.code`
:param filename: see :attr:`self.filename`
:param language: see :attr:`self.language`
:param authors: see :attr:`self.authors`
:param code_url: see :attr:`self.code_url`
:param date_created: see :attr:`self.date_created`
:param date_modified: see :attr:`self.date_modified`
:param rank: see :attr:`self.rank`


:type code: str
:type filename: str, or None
:type language: str, or None
:type author: str, or None
:type url: str
:type date_created: str, or None
:type date_modified: str, or None
:type name: see :attr:`self.name`
:type code: see :attr:`self.code`
:type filename: see :attr:`self.filename`
:type language: see :attr:`self.language`
:type authors: see :attr:`self.authors`
:type code_url: see :attr:`self.code_url`
:type date_created: see :attr:`self.date_created`
:type date_modified: see :attr:`self.date_modified`
:type rank: see :attr:`self.rank`
""" """


self.name = name
self.code = code self.code = code
self.filename = filename self.filename = filename
self.author = author
self.language = language self.language = language
self.authors = authors
self.code_url = code_url self.code_url = code_url
self.author_url = author_url
self.date_created = date_created self.date_created = date_created
self.date_modified = date_modified self.date_modified = date_modified
self.rank = rank

+ 55
- 0
bitshift/crawler/__init__.py View File

@@ -0,0 +1,55 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import logging, logging.handlers, os, Queue

from bitshift.crawler import crawler, indexer

__all__ = ["crawl"]

def crawl():
    """
    Spin up every crawler (and indexer) thread.

    Launches:
        1. the GitHub crawler, :class:`crawler.GitHubCrawler`,
        2. the Bitbucket crawler, :class:`crawler.BitbucketCrawler`,
        3. the Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
    """

    _configure_logging()

    MAX_URL_QUEUE_SIZE = 5e3

    # A single bounded queue is shared by both crawlers (producers) and the
    # indexer (consumer).
    repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
    workers = (crawler.GitHubCrawler(repo_clone_queue),
               crawler.BitbucketCrawler(repo_clone_queue),
               indexer.GitIndexer(repo_clone_queue))

    for worker in workers:
        worker.start()

def _configure_logging():
LOG_FILE_DIR = "log"

if not os.path.exists(LOG_FILE_DIR):
os.mkdir(LOG_FILE_DIR)

logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("urllib3").setLevel(logging.WARNING)

formatter = logging.Formatter(
fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
" %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

handler = logging.handlers.TimedRotatingFileHandler(
"%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
backupCount=20)
handler.setFormatter(formatter)

root_logger = logging.getLogger()
root_logger.addHandler(handler)
root_logger.setLevel(logging.NOTSET)

+ 240
- 0
bitshift/crawler/crawler.py View File

@@ -0,0 +1,240 @@
"""
:synopsis: Main crawler module, to oversee all site-specific crawlers.

Contains all website/framework-specific Class crawlers.
"""

import logging, requests, time, threading

from bitshift.crawler import indexer

from ..codelet import Codelet
from ..database import Database

class GitHubCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of GitHub's public repositories.

    GitHubCrawler is a threaded singleton that queries GitHub's API for urls
    to its public repositories, which it inserts into a :class:`Queue.Queue`
    shared with :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
        with repository metadata retrieved by :class:`GitHubCrawler`, and
        other Git crawlers, to be processed by :class:`indexer.GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    AUTHENTICATION = {
        "client_id" : "436cb884ae09be7f2a4e",
        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
    }

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitHubCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the GitHub API for data about every public repository.

        Pull all of GitHub's repositories by making calls to its API in a
        loop, accessing a subsequent page of results via the "next" URL
        returned in an API response header. Uses Severyn Kozak's (sevko)
        authentication credentials. For every new repository, a
        :class:`GitRepository` is inserted into :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.github.com/repositories"
        # Spread the 5000-requests/hour API quota evenly over time.
        api_request_interval = 5e3 / 60 ** 2

        while len(next_api_url) > 0:
            start_time = time.time()

            try:
                resp = requests.get(next_api_url, params=self.AUTHENTICATION)
            # BUGFIX: the bare name `ConnectionError` is not a builtin in
            # Python 2 and would itself raise NameError here; catch the
            # requests exception explicitly.
            except requests.ConnectionError as excep:
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                time.sleep(0.5)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%.",
                    self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full)

            repo_names = [repo["full_name"] for repo in resp.json()]
            repo_stars = self._get_repositories_stars(repo_names)

            for repo in resp.json():
                # Back off while the shared queue is full, so the indexer
                # can drain it.
                while self.clone_queue.full():
                    time.sleep(1)

                self.clone_queue.put(indexer.GitRepository(
                        repo["html_url"], repo["full_name"].replace("/", ""),
                        "GitHub", repo_stars[repo["full_name"]]))

            # Block until the rate-limit window resets if the quota is spent.
            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                time.sleep(int(resp.headers["x-ratelimit-reset"]) -
                        time.time())

            # The "link" response header lists the paginated "next" URL
            # first, wrapped in angle brackets.
            next_api_url = resp.headers["link"].split(">")[0][1:]

            sleep_time = api_request_interval - (time.time() - start_time)
            if sleep_time > 0:
                time.sleep(sleep_time)

    def _get_repositories_stars(self, repo_names):
        """
        Return the number of stargazers for several repositories.

        Queries the GitHub API for the number of stargazers for any given
        repositories, and blocks if the query limit is exceeded.

        :param repo_names: An array of repository names, in
            `username/repository_name` format.

        :type repo_names: list of str

        :return: A dictionary with repository name keys, and corresponding
            stargazer count values.

            Example dictionary:
            .. code-block:: python
                {
                    "user/repository" : 100
                }

        :rtype: dictionary
        """

        API_URL = "https://api.github.com/search/repositories"
        REPOS_PER_QUERY = 25

        repo_stars = {}
        # Batch the search queries, REPOS_PER_QUERY repositories at a time.
        for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
                xrange(0, len(repo_names), REPOS_PER_QUERY)]:
            query_url = "%s?q=%s" % (API_URL,
                    "+".join("repo:%s" % name for name in names))

            params = self.AUTHENTICATION
            resp = requests.get(query_url,
                    params=params,
                    headers={
                        "Accept" : "application/vnd.github.preview"
                    })

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time() + 1
                if sleep_time > 0:
                    # CONSISTENCY FIX: use the instance logger, as every
                    # other message in this class does, instead of the
                    # root logger.
                    self._logger.info("API quota exceeded. Sleep time: %d.",
                            sleep_time)
                    time.sleep(sleep_time)

            for repo in resp.json()["items"]:
                # Normalize star counts to a rank in [0.0, 1.0]; 1000+
                # stars saturates at 1.0.
                rank = float(repo["stargazers_count"]) / 1000
                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0

        # Repositories the search API did not return get a neutral rank.
        for name in repo_names:
            if name not in repo_stars:
                repo_stars[name] = 0.5

        return repo_stars

class BitbucketCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of Bitbucket's public repositories.

    BitbucketCrawler is a threaded singleton that queries Bitbucket's API for
    urls to its public repositories, and inserts them as
    :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with
    :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
        :class:`indexer.GitRepository` repository urls into.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `BitbucketCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the Bitbucket API for data about every public repository.

        Query the Bitbucket API's "/repositories" endpoint and read its
        paginated responses in a loop; any "git" repositories have their
        clone-urls and names inserted into a :class:`indexer.GitRepository`
        in :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.bitbucket.org/2.0/repositories"

        while True:
            try:
                response = requests.get(next_api_url).json()
            # BUGFIX: catch the requests exception (bare `ConnectionError`
            # is not a Python 2 builtin), and bind it as `excep` -- the old
            # code bound `exception` but logged the undefined name `excep`,
            # raising NameError inside the handler.
            except requests.ConnectionError as excep:
                time.sleep(0.5)
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%.",
                    self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full)

            for repo in response["values"]:
                if repo["scm"] == "git":
                    # Back off while the shared queue is full.
                    while self.clone_queue.full():
                        time.sleep(1)

                    # Prefer the HTTPS clone link; the API lists the HTTPS
                    # and SSH links in unspecified order.
                    clone_links = repo["links"]["clone"]
                    clone_url = (clone_links[0]["href"] if
                            clone_links[0]["name"] == "https" else
                            clone_links[1]["href"])
                    # BUGFIX: removed `links.append("clone_url")`, which
                    # referenced an undefined name and raised NameError.

                    try:
                        watchers = requests.get(
                                repo["links"]["watchers"]["href"])
                        # BUGFIX: float division -- integer division
                        # truncated every repository with fewer than 100
                        # watchers to a rank of 0.
                        rank = len(watchers.json()["values"]) / 100.0
                    except requests.ConnectionError as excep:
                        time.sleep(0.5)
                        self._logger.warning("API %s call failed: %s: %s",
                                next_api_url, excep.__class__.__name__,
                                excep)
                        continue

                    # BUGFIX: the capped rank is GitRepository's fourth
                    # argument; it was previously passed to Queue.put() as
                    # its `block` parameter, leaving GitRepository one
                    # required argument short (TypeError).
                    self.clone_queue.put(indexer.GitRepository(
                            clone_url, repo["full_name"], "Bitbucket",
                            rank if rank < 1.0 else 1.0))

            next_api_url = response["next"]
            time.sleep(0.2)

+ 489
- 0
bitshift/crawler/indexer.py View File

@@ -0,0 +1,489 @@
"""
:synopsis: Contains a singleton GitIndexer class, which clones and indexes git
repositories.
"""

import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\
threading

from ..database import Database
from ..codelet import Codelet

GIT_CLONE_DIR = "/tmp/bitshift"
THREAD_QUEUE_SLEEP = 0.5

class GitRepository(object):
    """
    A value object holding a Git repository's metadata.

    :ivar url: (str) The repository's url.
    :ivar name: (str) The name of the repository.
    :ivar framework_name: (str) The name of the online Git framework that
        the repository belongs to (eg, GitHub, BitBucket).
    :ivar rank: (float) The rank of the repository, as assigned by
        :class:`crawler.GitHubCrawler`.
    """

    def __init__(self, url, name, framework_name, rank):
        """
        Instantiate a GitRepository from its metadata.

        :param url: see :attr:`GitRepository.url`
        :param name: see :attr:`GitRepository.name`
        :param framework_name: see :attr:`GitRepository.framework_name`
        :param rank: see :attr:`GitRepository.rank`

        :type url: str
        :type name: str
        :type framework_name: str
        :type rank: float
        """

        self.url, self.name = url, name
        self.framework_name, self.rank = framework_name, rank

class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    :class:`GitIndexer` indexes the repositories cloned by the
    :class:`_GitCloner` singleton.

    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
        :class:`GitRepository` objects for every new repository successfully
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository
        cloner, which feeds :class:`GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitIndexer`.

        :param clone_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.index_queue`
        """

        MAX_INDEX_QUEUE_SIZE = 10

        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        # The cloner consumes clone_queue and feeds self.index_queue; start
        # it immediately so clones accumulate before this thread starts.
        self.git_cloner = _GitCloner(clone_queue, self.index_queue)
        self.git_cloner.start()
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")

        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)

        super(GitIndexer, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly cloned repositories and index them.

        Blocks until new repositories appear in :attr:`self.index_queue`,
        then retrieves one, and attempts indexing it. Should any errors
        occur, the new repository will be discarded and the indexer will
        index the next in the queue.
        """

        while True:
            # Poll rather than block on get(), matching the other threads'
            # THREAD_QUEUE_SLEEP cadence.
            while self.index_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.index_queue.get()
            self.index_queue.task_done()
            try:
                self._index_repository(repo)
            except Exception as excep:
                # One bad repository must not kill the indexer thread; log
                # and move on to the next queue entry.
                self._logger.warning("%s: %s.", excep.__class__.__name__,
                        excep)

    def _index_repository(self, repo):
        """
        Index a previously cloned Git repository, then delete its clone.

        Enter the repository's clone directory, call
        `_insert_repository_codelets()`, and remove the clone afterwards
        whether or not indexing succeeded.

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
            try:
                self._insert_repository_codelets(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__,
                        excep)

        # Remove the clone only after leaving its directory.
        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))

    def _insert_repository_codelets(self, repo):
        """
        Create a Codelet for every indexable file in a Git repository.

        Create a new Codelet for every tracked, non-binary file inside the
        current working directory's default branch (usually *master*).

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        commits_meta = self._get_commits_metadata()
        if commits_meta is None:
            return

        for filename in commits_meta.keys():
            try:
                with open(filename) as source_file:
                    source = self._decode(source_file.read())
                    if source is None:
                        continue
            except IOError as exception:
                # Unreadable file (permissions, dangling symlink): skip it.
                continue

            authors = [(self._decode(author), None) for author in
                    commits_meta[filename]["authors"]]
            codelet = Codelet("%s:%s" % (repo.name, filename), source,
                    filename, None, authors,
                    self._generate_file_url(filename, repo.url,
                            repo.framework_name),
                    commits_meta[filename]["time_created"],
                    commits_meta[filename]["time_last_modified"],
                    repo.rank)
            # TODO(review): `codelet` is constructed but never handed to the
            # Database singleton this module imports -- confirm whether an
            # insertion call is missing here.

    def _generate_file_url(self, filename, repo_url, framework_name):
        """
        Return a url for a filename from a Git wrapper framework.

        :param filename: The path of the file.
        :param repo_url: The url of the file's parent repository.
        :param framework_name: The name of the framework the repository is
            from.

        :type filename: str
        :type repo_url: str
        :type framework_name: str

        :return: The file's full url on the given framework, if successfully
            derived.
        :rtype: str, or None

        .. warning::
            Various Git subprocesses will occasionally fail, and, seeing as
            the information they provide is a crucial component of some
            repository file urls, None may be returned.
        """

        try:
            if framework_name == "GitHub":
                # On a --single-branch clone, `git branch` prints one line,
                # "* <branch>\n"; strip the leading "* " and trailing "\n".
                default_branch = subprocess.check_output("git branch"
                        " --no-color", shell=True)[2:-1]
                # BUGFIX: the old `.replace("//", "/")` also collapsed the
                # "//" in "https://", corrupting every generated url. Strip
                # a trailing slash from repo_url instead.
                return "%s/blob/%s/%s" % (repo_url.rstrip("/"),
                        default_branch, filename)
            elif framework_name == "Bitbucket":
                commit_hash = subprocess.check_output("git rev-parse HEAD",
                        shell=True).replace("\n", "")
                return "%s/src/%s/%s" % (repo_url.rstrip("/"), commit_hash,
                        filename)
        except subprocess.CalledProcessError as exception:
            return None

    def _get_git_commits(self):
        """
        Return the current working directory's formatted commit data.

        Uses `git log` to generate metadata about every single file in the
        repository's commit history.

        :return: The author, timestamp, and names of all modified files of
            every commit.
            .. code-block:: python
                sample_returned_array = [
                    {
                        "author" : (str) "author"
                        "timestamp" : (`datetime.datetime`) <object>,
                        "filenames" : (str array) ["file1", "file2"]
                    }
                ]
        :rtype: array of dictionaries
        """

        # -z NUL-terminates filenames, and the leading %n%n makes every
        # commit start with a blank line, so commits split on "\n\n".
        git_log = subprocess.check_output(("git --no-pager log --name-only"
                " --pretty=format:'%n%n%an%n%at' -z"), shell=True)

        commits = []
        for commit in git_log.split("\n\n"):
            fields = commit.split("\n")
            if len(fields) > 2:
                commits.append({
                    "author" : fields[0],
                    "timestamp" : datetime.datetime.fromtimestamp(
                            int(fields[1])),
                    "filenames" : fields[2].split("\x00")[:-2]
                })

        return commits

    def _get_tracked_files(self):
        """
        Return a list of the filenames of all valuable files in the Git
        repository.

        Get a list of the filenames of the non-binary (Perl heuristics used
        for filetype identification) files currently inside the current
        working directory's Git repository.

        :return: The filenames of all index-worthy non-binary files.
        :rtype: str array
        """

        files = []
        for dirname, subdir_names, filenames in os.walk("."):
            for filename in filenames:
                path = os.path.join(dirname, filename)
                if self._is_ascii(path):
                    # Drop the leading "./" produced by os.walk(".").
                    files.append(path[2:])

        return files

    def _get_commits_metadata(self):
        """
        Return a dictionary containing every valuable tracked file's
        metadata.

        :return: A dictionary with author names, time of creation, and time
            of last modification for every filename key.
            .. code-block:: python
                sample_returned_dict = {
                    "my_file" : {
                        "authors" : (str array) ["author1", "author2"],
                        "time_created" : (`datetime.datetime`) <object>,
                        "time_last_modified" : (`datetime.datetime`) <object>
                    }
                }
        :rtype: dictionary of dictionaries
        """

        commits = self._get_git_commits()
        # PERF: a set gives O(1) membership tests in the loop below, instead
        # of a linear scan per (commit, filename) pair.
        tracked_files = set(self._get_tracked_files())

        files_meta = {}
        for commit in commits:
            for filename in commit["filenames"]:
                if filename not in tracked_files:
                    continue

                if filename not in files_meta:
                    files_meta[filename] = {
                        "authors" : [commit["author"]],
                        "time_last_modified" : commit["timestamp"],
                        "time_created" : commit["timestamp"]
                    }
                else:
                    if commit["author"] not in \
                            files_meta[filename]["authors"]:
                        files_meta[filename]["authors"].append(
                                commit["author"])
                    # `git log` output is newest-first, so the last commit
                    # seen for a file is its oldest: keep overwriting
                    # time_created.
                    files_meta[filename]["time_created"] = \
                            commit["timestamp"]

        return files_meta

    def _decode(self, raw):
        """
        Return a decoded raw string.

        :param raw: The string to decode.

        :type raw: (str)

        :return: If the original encoding is successfully inferenced, return
            the decoded string.
        :rtype: str, or None

        .. warning::
            The raw string's original encoding is identified by heuristics
            which can, and occasionally will, fail. Decoding will then fail,
            and None will be returned.
        """

        try:
            # BeautifulSoup's UnicodeDammit heuristics guess the encoding.
            encoding = bs4.BeautifulSoup(raw).original_encoding
            return raw.decode(encoding) if encoding is not None else None

        except (LookupError, UnicodeDecodeError, UserWarning) as exception:
            return None

    def _is_ascii(self, filename):
        """
        Heuristically determine whether a file is ASCII text or binary.

        If a portion of the file contains null bytes, or the percentage of
        bytes that aren't ASCII is greater than 30%, then the file is
        concluded to be binary. This heuristic is used by the `file` utility
        and Perl's inbuilt `-T` operator, and is the de-facto method for
        determining whether a file is ASCII.

        :param filename: The path of the file to test.

        :type filename: str

        :return: Whether the file is probably ASCII.
        :rtype: Boolean
        """

        try:
            with open(filename) as source:
                file_snippet = source.read(512)

                if not file_snippet:
                    return True

                ascii_characters = "".join(map(chr, range(32, 127)) +
                        list("\n\r\t\b"))
                null_trans = string.maketrans("", "")

                if "\0" in file_snippet:
                    return False

                # Delete every printable-ASCII byte; whatever is left is
                # the non-ASCII residue.
                non_ascii = file_snippet.translate(null_trans,
                        ascii_characters)
                return not float(len(non_ascii)) / len(file_snippet) > 0.30

        except IOError as exception:
            return False

class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, index_queue):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        :type index_queue: see :attr:`self.index_queue`
        """

        self.clone_queue = clone_queue
        self.index_queue = index_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Blocks until new :class:`GitRepository` appear in
        :attr:`self.clone_queue`, then attempts cloning them. If
        successful, the cloned repository is added to
        :attr:`self.index_queue` for the `GitIndexer` to index; otherwise,
        it is discarded.
        """

        while True:
            # Poll for work rather than blocking on get(), keeping the
            # sleep interval consistent with the other worker threads.
            while self.clone_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)
            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception as exception:
                # NOTE(review): failures are silently discarded here;
                # consider logging `exception` via self._logger.
                pass

    def _clone_repository(self, repo):
        """
        Attempt cloning a Git repository.

        :param repo: Metadata about the repository to clone.

        :type repo: :class:`GitRepository`
        """

        # Hard cap, in seconds, on any single `git clone` attempt.
        GIT_CLONE_TIMEOUT = 500

        # NOTE(review): computed but never used -- presumably intended for
        # a log message; confirm.
        queue_percent_full = (float(self.index_queue.qsize()) /
                self.index_queue.maxsize) * 100

        exit_code = None
        # Perl's alarm() aborts the clone if it exceeds the timeout; the
        # `|| pkill -f git` then reaps any git processes left behind.
        # NOTE(review): repo.url and repo.name are interpolated into a
        # shell string -- a crafted repository name/url could inject shell
        # commands; confirm these values are sanitized upstream.
        command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
                " --single-branch %s %s/%s || pkill -f git")

        command_attempt = 0
        while exit_code is None:
            try:
                exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
                        repo.url, GIT_CLONE_DIR, repo.name), shell=True)
            except Exception as exception:
                # Retry (up to 20 times) when launching the subprocess
                # itself fails; a non-zero git exit status is handled below.
                time.sleep(1)
                command_attempt += 1
                if command_attempt == 20:
                    break
                else:
                    continue
            else:
                break

        if exit_code != 0:
            # Failed or timed-out clone: remove any partial checkout.
            if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
                shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
            return

        # Wait for the indexer to drain its queue before handing over.
        while self.index_queue.full():
            time.sleep(THREAD_QUEUE_SLEEP)

        self.index_queue.put(repo)

class _ChangeDir(object):
"""
A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

:ivar new_path: (str) The path to change the current directory to.
:ivar old_path: (str) The path of the directory to return to.
"""

def __init__(self, new_path):
"""
Create a _ChangeDir instance.

:param new_path: The directory to enter.

:type new_path: str
"""

self.new_path = new_path

def __enter__(self):
"""
Change the current working-directory to **new_path**.
"""

self.old_path = os.getcwd()
os.chdir(self.new_path)

def __exit__(self, *exception):
"""
Change the current working-directory to **old_path**.

:param exception: Various exception arguments passed by `with`.

:type exception: varargs
"""

os.chdir(self.old_path)

+ 1
- 1
setup.py View File

@@ -6,7 +6,7 @@ setup(
packages = find_packages(), packages = find_packages(),
install_requires = [ install_requires = [
"Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0",
"BeautifulSoup>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
"beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
license = "MIT", license = "MIT",
url = "https://github.com/earwig/bitshift" url = "https://github.com/earwig/bitshift"


Loading…
Cancel
Save