Conflicts:
    app.py
    setup.py
tags/v1.0^2
@@ -5,6 +5,7 @@ Module to contain all the project's Flask server plumbing.
from flask import Flask
from flask import render_template, session
from bitshift import assets
from bitshift.database import Database
from bitshift.query import parse_query
@@ -1 +1 @@
from . import assets, codelet, config, database, parser, query
from . import assets, codelet, config, database, parser, query, crawler
@@ -15,8 +15,11 @@ def tag(filename):
    :param filename: The filename of the asset to create a tag for.
    :type filename: str

    :return: A string containing a `<script>` tag for JS files, and a
        `<link>` for CSS files.
    :rtype: str
    """

    file_ext = filename.split(".")[-1]
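A hypothetical usage sketch for `tag()` (the tag-building body falls outside this hunk, so the exact markup below is an assumption, not taken from the diff)::

    # Illustrative only: assumes tag() returns finished asset markup.
    from bitshift.assets import tag

    script_markup = tag("main.js")   # e.g. '<script src="..."></script>'
    link_markup = tag("main.css")    # e.g. '<link rel="stylesheet" href="...">'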
@@ -4,42 +4,54 @@ class Codelet(object):
    """
    A source-code object with code metadata and composition analysis.

    :ivar name: (str) A suitable name for the codelet.
    :ivar code: (str) A string containing the raw source code.
    :ivar filename: (str, or None) The filename of the snippet.
    :ivar language: (str, or None) The inferred language of `code`.
    :ivar author: (str, or None) The name of the code's author.
    :ivar url: (str) The url of the (page containing the) source code.
    :ivar date_created: (str, or None) The date the code was published.
    :ivar date_modified: (str, or None) The date the code was last modified.
    :ivar language: (int, or None) The inferred language of `code`.
    :ivar authors: (array of tuples (str, str or None)) An array of tuples
        containing an author's name and profile URL (on the service the code
        was pulled from).
    :ivar code_url: (str) The url of the (page containing the) source code.
    :ivar date_created: (:class:`datetime.datetime`, or None) The date the
        code was published.
    :ivar date_modified: (:class:`datetime.datetime`, or None) The date the
        code was last modified.
    :ivar rank: (float) A quantification of the source code's quality, as
        per available ratings (stars, forks, upvotes, etc.).
    """

    def __init__(self, code, filename, author, language, code_url, author_url,
                 date_created, date_modified):
    def __init__(self, name, code, filename, language, authors, code_url,
                 date_created, date_modified, rank):
        """
        Create a Codelet instance.

        :param code: The raw source code.
        :param filename: The filename of the code, if any.
        :param author: The author of the code.
        :param language: The inferred language.
        :param code_url: The url of the (page containing the) source code.
        :param date_created: The date the code was published.
        :param date_modified: The date the code was last modified.
        :param name: see :attr:`self.name`
        :param code: see :attr:`self.code`
        :param filename: see :attr:`self.filename`
        :param language: see :attr:`self.language`
        :param authors: see :attr:`self.authors`
        :param code_url: see :attr:`self.code_url`
        :param date_created: see :attr:`self.date_created`
        :param date_modified: see :attr:`self.date_modified`
        :param rank: see :attr:`self.rank`

        :type code: str
        :type filename: str, or None
        :type language: str, or None
        :type author: str, or None
        :type url: str
        :type date_created: str, or None
        :type date_modified: str, or None
        :type name: see :attr:`self.name`
        :type code: see :attr:`self.code`
        :type filename: see :attr:`self.filename`
        :type language: see :attr:`self.language`
        :type authors: see :attr:`self.authors`
        :type code_url: see :attr:`self.code_url`
        :type date_created: see :attr:`self.date_created`
        :type date_modified: see :attr:`self.date_modified`
        :type rank: see :attr:`self.rank`
        """

        self.name = name
        self.code = code
        self.filename = filename
        self.author = author
        self.language = language
        self.authors = authors
        self.code_url = code_url
        self.author_url = author_url
        self.date_created = date_created
        self.date_modified = date_modified
        self.rank = rank
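A minimal construction sketch of the new signature (every value below is illustrative, not taken from the diff)::

    import datetime

    codelet = Codelet(
        name="user/repo:hello.py",
        code="print 'hello'",
        filename="hello.py",
        language=None,  # left for the language parser to infer
        authors=[("Jane Doe", None), ("John Roe", None)],
        code_url="https://github.com/user/repo/blob/master/hello.py",
        date_created=datetime.datetime(2014, 4, 1),
        date_modified=datetime.datetime(2014, 4, 2),
        rank=0.5)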
@@ -0,0 +1,55 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import logging, logging.handlers, os, Queue

from bitshift.crawler import crawler, indexer

__all__ = ["crawl"]

def crawl():
    """
    Initialize all crawlers (and indexers).

    Start the:

    1. GitHub crawler, :class:`crawler.GitHubCrawler`.
    2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
    3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
    """

    _configure_logging()

    MAX_URL_QUEUE_SIZE = 5e3

    repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
    threads = [crawler.GitHubCrawler(repo_clone_queue),
               crawler.BitbucketCrawler(repo_clone_queue),
               indexer.GitIndexer(repo_clone_queue)]

    for thread in threads:
        thread.start()

def _configure_logging():
    LOG_FILE_DIR = "log"

    if not os.path.exists(LOG_FILE_DIR):
        os.mkdir(LOG_FILE_DIR)

    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    formatter = logging.Formatter(
        fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
             " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

    handler = logging.handlers.TimedRotatingFileHandler(
        "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
        backupCount=20)
    handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.NOTSET)
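A minimal driver sketch, assuming only that the package is on the import path::

    # Starts the GitHubCrawler, BitbucketCrawler, and GitIndexer threads,
    # then returns; the threads themselves run indefinitely.
    from bitshift.crawler import crawl

    if __name__ == "__main__":
        crawl()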
@@ -0,0 +1,240 @@
"""
:synopsis: Main crawler module, to oversee all site-specific crawlers.

Contains all website/framework-specific crawler classes.
"""

import logging, requests, time, threading

from bitshift.crawler import indexer
from ..codelet import Codelet
from ..database import Database

class GitHubCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of GitHub's public repositories.

    GitHubCrawler is a threaded singleton that queries GitHub's API for urls
    to its public repositories, which it inserts into a :class:`Queue.Queue`
    shared with :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
        with repository metadata retrieved by :class:`GitHubCrawler`, and
        other Git crawlers, to be processed by :class:`indexer.GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    AUTHENTICATION = {
        "client_id" : "436cb884ae09be7f2a4e",
        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
    }
    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitHubCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`
        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the GitHub API for data about every public repository.

        Pull all of GitHub's repositories by making calls to its API in a
        loop, accessing a subsequent page of results via the "next" URL
        returned in an API response header. Uses Severyn Kozak's (sevko)
        authentication credentials. For every new repository, a
        :class:`GitRepository` is inserted into :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.github.com/repositories"
        # Seconds to wait between API requests, to stay within GitHub's
        # limit of 5,000 authenticated requests per hour.
        api_request_interval = 5e3 / 60 ** 2

        while len(next_api_url) > 0:
            start_time = time.time()

            try:
                resp = requests.get(next_api_url, params=self.AUTHENTICATION)
            except requests.ConnectionError as excep:
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                time.sleep(0.5)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    (self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full))

            repo_names = [repo["full_name"] for repo in resp.json()]
            repo_stars = self._get_repositories_stars(repo_names)

            for repo in resp.json():
                while self.clone_queue.full():
                    time.sleep(1)

                self.clone_queue.put(indexer.GitRepository(
                    repo["html_url"], repo["full_name"].replace("/", ""),
                    "GitHub", repo_stars[repo["full_name"]]))

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time()
                if sleep_time > 0:
                    time.sleep(sleep_time)

            next_api_url = resp.headers["link"].split(">")[0][1:]

            sleep_time = api_request_interval - (time.time() - start_time)
            if sleep_time > 0:
                time.sleep(sleep_time)
    def _get_repositories_stars(self, repo_names):
        """
        Return the number of stargazers for several repositories.

        Queries the GitHub API for the number of stargazers for any given
        repositories, and blocks if the query limit is exceeded.

        :param repo_names: An array of repository names, in
            `username/repository_name` format.
        :type repo_names: array of str

        :return: A dictionary with repository name keys, and corresponding
            stargazer count values.

            Example dictionary:

            .. code-block:: python

                {
                    "user/repository" : 100
                }

        :rtype: dictionary
        """

        API_URL = "https://api.github.com/search/repositories"
        REPOS_PER_QUERY = 25

        repo_stars = {}
        for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
                xrange(0, len(repo_names), REPOS_PER_QUERY)]:
            query_url = "%s?q=%s" % (API_URL,
                    "+".join("repo:%s" % name for name in names))

            params = self.AUTHENTICATION
            resp = requests.get(query_url,
                    params=params,
                    headers={
                        "Accept" : "application/vnd.github.preview"
                    })

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time() + 1
                if sleep_time > 0:
                    self._logger.info("API quota exceeded. Sleep time: %d." %
                            sleep_time)
                    time.sleep(sleep_time)

            for repo in resp.json()["items"]:
                # Normalize stargazer counts to a rank in [0, 1.0].
                rank = float(repo["stargazers_count"]) / 1000
                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0

        for name in repo_names:
            if name not in repo_stars:
                repo_stars[name] = 0.5

        return repo_stars
class BitbucketCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of Bitbucket's public repositories.

    BitbucketCrawler is a threaded singleton that queries Bitbucket's API
    for urls to its public repositories, and inserts them as
    :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with
    :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
        :class:`indexer.GitRepository` repository urls into.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `BitbucketCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`
        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the Bitbucket API for data about every public repository.

        Query the Bitbucket API's "/repositories" endpoint and read its
        paginated responses in a loop; any "git" repositories have their
        clone-urls and names inserted into a :class:`indexer.GitRepository`
        in :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.bitbucket.org/2.0/repositories"

        while True:
            try:
                response = requests.get(next_api_url).json()
            except requests.ConnectionError as excep:
                time.sleep(0.5)
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    (self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full))

            for repo in response["values"]:
                if repo["scm"] == "git":
                    while self.clone_queue.full():
                        time.sleep(1)

                    clone_links = repo["links"]["clone"]
                    clone_url = (clone_links[0]["href"] if
                            clone_links[0]["name"] == "https" else
                            clone_links[1]["href"])

                    try:
                        watchers = requests.get(
                                repo["links"]["watchers"]["href"])
                        # Float division, so repositories with fewer than 100
                        # watchers don't all rank as 0.
                        rank = len(watchers.json()["values"]) / 100.0
                    except requests.ConnectionError as excep:
                        time.sleep(0.5)
                        self._logger.warning("API %s call failed: %s: %s",
                                next_api_url, excep.__class__.__name__, excep)
                        continue

                    self.clone_queue.put(indexer.GitRepository(
                            clone_url, repo["full_name"], "Bitbucket",
                            rank if rank < 1.0 else 1.0))

            next_api_url = response["next"]
            time.sleep(0.2)
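`GitHubCrawler.run()` finds the next results page by slicing the raw Link header (`resp.headers["link"].split(">")[0][1:]`), which relies on the `rel="next"` url always appearing first. A more defensive sketch (an alternative, not what the diff does) selects it by name::

    import re

    def next_page_url(link_header):
        """Return the url tagged rel="next", or None if the header lacks one."""
        for url, rel in re.findall(r'<([^>]+)>;\s*rel="([^"]+)"',
                                   link_header or ""):
            if rel == "next":
                return url
        return None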
@@ -0,0 +1,489 @@
"""
:synopsis: Contains a singleton GitIndexer class, which clones and indexes
    git repositories.
"""

import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, \
        time, threading

from ..database import Database
from ..codelet import Codelet

GIT_CLONE_DIR = "/tmp/bitshift"
THREAD_QUEUE_SLEEP = 0.5

class GitRepository(object):
    """
    A representation of a Git repository's metadata.

    :ivar url: (str) The repository's url.
    :ivar name: (str) The name of the repository.
    :ivar framework_name: (str) The name of the online Git framework that
        the repository belongs to (eg, GitHub, BitBucket).
    :ivar rank: (float) The rank of the repository, as assigned by
        :class:`crawler.GitHubCrawler`.
    """

    def __init__(self, url, name, framework_name, rank):
        """
        Create a GitRepository instance.

        :param url: see :attr:`GitRepository.url`
        :param name: see :attr:`GitRepository.name`
        :param framework_name: see :attr:`GitRepository.framework_name`
        :param rank: see :attr:`GitRepository.rank`

        :type url: str
        :type name: str
        :type framework_name: str
        :type rank: float
        """

        self.url = url
        self.name = name
        self.framework_name = framework_name
        self.rank = rank
class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    :class:`GitIndexer` indexes the repositories cloned by the
    :class:`_GitCloner` singleton.

    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
        :class:`GitRepository` objects for every new repository successfully
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository
        cloner, which feeds :class:`GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitIndexer`.

        :param clone_queue: see :attr:`self.index_queue`
        :type clone_queue: see :attr:`self.index_queue`
        """

        MAX_INDEX_QUEUE_SIZE = 10

        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        self.git_cloner = _GitCloner(clone_queue, self.index_queue)
        self.git_cloner.start()
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")

        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)

        super(GitIndexer, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly cloned repositories and index them.

        Blocks until new repositories appear in :attr:`self.index_queue`,
        then retrieves one, and attempts indexing it. Should any errors
        occur, the new repository will be discarded and the indexer will
        index the next in the queue.
        """

        while True:
            while self.index_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.index_queue.get()
            self.index_queue.task_done()

            try:
                self._index_repository(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__,
                        excep)
    def _index_repository(self, repo):
        """
        Index (create and insert Codelets for) a cloned Git repository.

        Change into the directory of the repository cloned from **repo.url**
        by :class:`_GitCloner`, call `_insert_repository_codelets()`, then
        remove said repository.

        :param repo: The metadata of the repository to be indexed.
        :type repo: :class:`GitRepository`
        """

        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
            try:
                self._insert_repository_codelets(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__,
                        excep)

        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))

    def _insert_repository_codelets(self, repo):
        """
        Create and insert a Codelet for the files inside a Git repository.

        Create a new Codelet, and insert it into the Database singleton, for
        every file inside the current working directory's default branch
        (usually *master*).

        :param repo: The metadata of the repository to be indexed.
        :type repo: :class:`GitRepository`
        """

        commits_meta = self._get_commits_metadata()
        if commits_meta is None:
            return

        for filename in commits_meta.keys():
            try:
                with open(filename) as source_file:
                    source = self._decode(source_file.read())
                    if source is None:
                        continue
            except IOError:
                continue

            authors = [(self._decode(author), None) for author in
                    commits_meta[filename]["authors"]]
            codelet = Codelet("%s:%s" % (repo.name, filename), source,
                    filename, None, authors,
                    self._generate_file_url(filename, repo.url,
                            repo.framework_name),
                    commits_meta[filename]["time_created"],
                    commits_meta[filename]["time_last_modified"],
                    repo.rank)
    def _generate_file_url(self, filename, repo_url, framework_name):
        """
        Return a url for a filename from a Git wrapper framework.

        :param filename: The path of the file.
        :param repo_url: The url of the file's parent repository.
        :param framework_name: The name of the framework the repository is
            from.

        :type filename: str
        :type repo_url: str
        :type framework_name: str

        :return: The file's full url on the given framework, if successfully
            derived.
        :rtype: str, or None

        .. warning::
            Various Git subprocesses will occasionally fail, and, seeing as
            the information they provide is a crucial component of some
            repository file urls, None may be returned.
        """

        try:
            if framework_name == "GitHub":
                default_branch = subprocess.check_output("git branch"
                        " --no-color", shell=True)[2:-1]
                url = "%s/blob/%s/%s" % (repo_url, default_branch, filename)
            elif framework_name == "Bitbucket":
                commit_hash = subprocess.check_output("git rev-parse HEAD",
                        shell=True).replace("\n", "")
                url = "%s/src/%s/%s" % (repo_url, commit_hash, filename)
            else:
                return None

            # Collapse duplicate slashes in the path without mangling the
            # "//" in the url scheme.
            return re.sub("(?<!:)//", "/", url)
        except subprocess.CalledProcessError:
            return None

    def _get_git_commits(self):
        """
        Return the current working directory's formatted commit data.

        Uses `git log` to generate metadata about every single file in the
        repository's commit history.

        :return: The author, timestamp, and names of all modified files of
            every commit.

            .. code-block:: python

                sample_returned_array = [
                    {
                        "author" : (str) "author",
                        "timestamp" : (`datetime.datetime`) <object>,
                        "filenames" : (str array) ["file1", "file2"]
                    }
                ]

        :rtype: array of dictionaries
        """

        git_log = subprocess.check_output(("git --no-pager log --name-only"
                " --pretty=format:'%n%n%an%n%at' -z"), shell=True)

        commits = []
        for commit in git_log.split("\n\n"):
            fields = commit.split("\n")
            if len(fields) > 2:
                commits.append({
                    "author" : fields[0],
                    "timestamp" : datetime.datetime.fromtimestamp(
                            int(fields[1])),
                    "filenames" : fields[2].split("\x00")[:-2]
                })

        return commits
    def _get_tracked_files(self):
        """
        Return a list of the filenames of all valuable files in the Git
        repository.

        Get a list of the filenames of the non-binary (Perl heuristics used
        for filetype identification) files currently inside the current
        working directory's Git repository.

        :return: The filenames of all index-worthy non-binary files.
        :rtype: str array
        """

        files = []
        for dirname, subdir_names, filenames in os.walk("."):
            for filename in filenames:
                path = os.path.join(dirname, filename)
                if self._is_ascii(path):
                    files.append(path[2:])

        return files

    def _get_commits_metadata(self):
        """
        Return a dictionary containing every valuable tracked file's
        metadata.

        :return: A dictionary with author names, time of creation, and time
            of last modification for every filename key.

            .. code-block:: python

                sample_returned_dict = {
                    "my_file" : {
                        "authors" : (str array) ["author1", "author2"],
                        "time_created" : (`datetime.datetime`) <object>,
                        "time_last_modified" : (`datetime.datetime`) <object>
                    }
                }

        :rtype: dictionary of dictionaries
        """

        commits = self._get_git_commits()
        tracked_files = self._get_tracked_files()

        files_meta = {}
        for commit in commits:
            for filename in commit["filenames"]:
                if filename not in tracked_files:
                    continue

                if filename not in files_meta.keys():
                    files_meta[filename] = {
                        "authors" : [commit["author"]],
                        "time_last_modified" : commit["timestamp"],
                        "time_created" : commit["timestamp"]
                    }
                else:
                    if commit["author"] not in files_meta[filename]["authors"]:
                        files_meta[filename]["authors"].append(
                                commit["author"])
                    # `git log` lists commits newest-first, so the last
                    # timestamp seen for a file is its creation time.
                    files_meta[filename]["time_created"] = commit["timestamp"]

        return files_meta
    def _decode(self, raw):
        """
        Return a decoded raw string.

        :param raw: The string to decode.
        :type raw: (str)

        :return: If the original encoding is successfully inferred, return
            the decoded string.
        :rtype: str, or None

        .. warning::
            The raw string's original encoding is identified by heuristics
            which can, and occasionally will, fail. Decoding will then fail,
            and None will be returned.
        """

        try:
            encoding = bs4.BeautifulSoup(raw).original_encoding
            return raw.decode(encoding) if encoding is not None else None
        except (LookupError, UnicodeDecodeError, UserWarning):
            return None

    def _is_ascii(self, filename):
        """
        Heuristically determine whether a file is ASCII text or binary.

        If a portion of the file contains null bytes, or the percentage of
        bytes that aren't ASCII is greater than 30%, then the file is
        concluded to be binary. This heuristic is used by the `file` utility,
        Perl's inbuilt `-T` operator, and is the de-facto method for
        determining whether a file is ASCII.

        :param filename: The path of the file to test.
        :type filename: str

        :return: Whether the file is probably ASCII.
        :rtype: Boolean
        """

        try:
            with open(filename) as source:
                file_snippet = source.read(512)

                if not file_snippet:
                    return True

                ascii_characters = "".join(map(chr, range(32, 127)) +
                        list("\n\r\t\b"))
                null_trans = string.maketrans("", "")

                if "\0" in file_snippet:
                    return False

                non_ascii = file_snippet.translate(null_trans,
                        ascii_characters)
                return not float(len(non_ascii)) / len(file_snippet) > 0.30
        except IOError:
            return False
class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, index_queue):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        :type index_queue: see :attr:`self.index_queue`
        """

        self.clone_queue = clone_queue
        self.index_queue = index_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Blocks until new :class:`GitRepository` appear in
        :attr:`self.clone_queue`, then attempts cloning them. If successful,
        the cloned repository is added to :attr:`self.index_queue` for the
        :class:`GitIndexer` to index; otherwise, it is discarded.
        """

        while True:
            while self.clone_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception:
                pass

    def _clone_repository(self, repo):
        """
        Attempt cloning a Git repository.

        :param repo: Metadata about the repository to clone.
        :type repo: :class:`GitRepository`
        """

        GIT_CLONE_TIMEOUT = 500

        queue_percent_full = (float(self.index_queue.qsize()) /
                self.index_queue.maxsize) * 100

        exit_code = None
        # Run `git clone` under a Perl alarm, so a hung clone is killed after
        # GIT_CLONE_TIMEOUT seconds.
        command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
                " --single-branch %s %s/%s || pkill -f git")

        command_attempt = 0
        while exit_code is None:
            try:
                exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
                        repo.url, GIT_CLONE_DIR, repo.name), shell=True)
            except Exception:
                time.sleep(1)
                command_attempt += 1
                if command_attempt == 20:
                    break
                else:
                    continue
            else:
                break

        if exit_code != 0:
            if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
                shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
            return

        while self.index_queue.full():
            time.sleep(THREAD_QUEUE_SLEEP)

        self.index_queue.put(repo)
class _ChangeDir(object):
    """
    A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

    :ivar new_path: (str) The path to change the current directory to.
    :ivar old_path: (str) The path of the directory to return to.
    """

    def __init__(self, new_path):
        """
        Create a _ChangeDir instance.

        :param new_path: The directory to enter.
        :type new_path: str
        """

        self.new_path = new_path

    def __enter__(self):
        """
        Change the current working-directory to **new_path**.
        """

        self.old_path = os.getcwd()
        os.chdir(self.new_path)

    def __exit__(self, *exception):
        """
        Change the current working-directory to **old_path**.

        :param exception: Various exception arguments passed by `with`.
        :type exception: varargs
        """

        os.chdir(self.old_path)
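`_ChangeDir` in action, a short sketch with an illustrative path::

    import os

    # __exit__() always runs, so the original working directory is restored
    # even if the with-body raises.
    with _ChangeDir("/tmp/bitshift/some_repo"):
        print os.getcwd()   # /tmp/bitshift/some_repo
    print os.getcwd()       # back to the starting directory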
@@ -6,7 +6,7 @@ setup(
    packages = find_packages(),
    install_requires = [
        "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0",
        "BeautifulSoup>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
        "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
    author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
    license = "MIT",
    url = "https://github.com/earwig/bitshift"