Conflicts:
	app.py
	setup.py

--- a/app.py
+++ b/app.py
@@ -5,6 +5,7 @@ Module to contain all the project's Flask server plumbing.
 from flask import Flask
 from flask import render_template, session
+from bitshift import assets
 from bitshift.database import Database
 from bitshift.query import parse_query

--- a/bitshift/__init__.py
+++ b/bitshift/__init__.py
@@ -1 +1 @@
-from . import assets, codelet, config, database, parser, query
+from . import assets, codelet, config, database, parser, query, crawler

--- a/bitshift/assets.py
+++ b/bitshift/assets.py
@@ -15,8 +15,11 @@ def tag(filename):
     :param filename: The filename of the asset to create a tag for.
+    :type filename: str
 
     :return: A string containing a `<source>` tag for JS files, and a `<link>`
         for CSS files.
+    :rtype: str
     """
 
     file_ext = filename.split(".")[-1]
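
The new `from bitshift import assets` line in app.py suggests that `tag()` is meant to be called from Jinja templates; a minimal sketch of such wiring (hypothetical, the diff shows only the import):

    # Hypothetical wiring, not part of the diff: expose assets.tag() so
    # templates can emit asset tags via {{ tag("main.css") }}.
    from flask import Flask
    from bitshift import assets

    app = Flask(__name__)
    app.jinja_env.globals["tag"] = assets.tag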

--- a/bitshift/codelet.py
+++ b/bitshift/codelet.py
@@ -4,42 +4,54 @@ class Codelet(object):
     """
     A source-code object with code metadata and composition analysis.
 
+    :ivar name: (str) A suitable name for the codelet.
     :ivar code: (str) A string containing the raw source code.
     :ivar filename: (str, or None) The filename of the snippet.
-    :ivar language: (str, or None) The inferred language of `code`.
-    :ivar author: (str, or None) The name of the code's author.
-    :ivar url: (str) The url of the (page containing the) source code.
-    :ivar date_created: (str, or None) The date the code was published.
-    :ivar date_modified: (str, or None) The date the code was last modified.
+    :ivar language: (int, or None) The inferred language of `code`.
+    :ivar authors: (array of tuples (str, str or None)) An array of tuples
+        containing an author's name and profile URL (on the service the code
+        was pulled from).
+    :ivar code_url: (str) The url of the (page containing the) source code.
+    :ivar date_created: (:class:`datetime.datetime`, or None) The date the
+        code was published.
+    :ivar date_modified: (:class:`datetime.datetime`, or None) The date the
+        code was last modified.
+    :ivar rank: (float) A quantification of the source code's quality, as
+        per available ratings (stars, forks, upvotes, etc.).
     """
 
-    def __init__(self, code, filename, author, language, code_url, author_url,
-                 date_created, date_modified):
+    def __init__(self, name, code, filename, language, authors, code_url,
+                 date_created, date_modified, rank):
         """
         Create a Codelet instance.
 
-        :param code: The raw source code.
-        :param filename: The filename of the code, if any.
-        :param author: The author of the code.
-        :param language: The inferred language.
-        :param code_url: The url of the (page containing the) source code.
-        :param date_created: The date the code was published.
-        :param date_modified: The date the code was last modified.
+        :param name: see :attr:`self.name`
+        :param code: see :attr:`self.code`
+        :param filename: see :attr:`self.filename`
+        :param language: see :attr:`self.language`
+        :param authors: see :attr:`self.authors`
+        :param code_url: see :attr:`self.code_url`
+        :param date_created: see :attr:`self.date_created`
+        :param date_modified: see :attr:`self.date_modified`
+        :param rank: see :attr:`self.rank`
 
-        :type code: str
-        :type filename: str, or None
-        :type language: str, or None
-        :type author: str, or None
-        :type url: str
-        :type date_created: str, or None
-        :type date_modified: str, or None
+        :type name: see :attr:`self.name`
+        :type code: see :attr:`self.code`
+        :type filename: see :attr:`self.filename`
+        :type language: see :attr:`self.language`
+        :type authors: see :attr:`self.authors`
+        :type code_url: see :attr:`self.code_url`
+        :type date_created: see :attr:`self.date_created`
+        :type date_modified: see :attr:`self.date_modified`
+        :type rank: see :attr:`self.rank`
         """
 
+        self.name = name
         self.code = code
         self.filename = filename
-        self.author = author
         self.language = language
+        self.authors = authors
         self.code_url = code_url
-        self.author_url = author_url
         self.date_created = date_created
         self.date_modified = date_modified
+        self.rank = rank
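
For illustration, a minimal sketch of the new constructor in use (all values hypothetical):

    from datetime import datetime
    from bitshift.codelet import Codelet

    codelet = Codelet(
        name="user/repo:app.py",           # repository-qualified name
        code="print 'hello'",
        filename="app.py",
        language=None,                     # to be inferred; now an int ID
        authors=[("Ada Lovelace", None)],  # (name, profile url) tuples
        code_url="https://github.com/user/repo/blob/master/app.py",
        date_created=datetime(2014, 4, 1),
        date_modified=datetime(2014, 4, 2),
        rank=0.5)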

--- /dev/null
+++ b/bitshift/crawler/__init__.py
@@ -0,0 +1,55 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import logging, logging.handlers, os, Queue

from bitshift.crawler import crawler, indexer

__all__ = ["crawl"]

def crawl():
    """
    Initialize all crawlers (and indexers).

    Start the:
    1. GitHub crawler, :class:`crawler.GitHubCrawler`.
    2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
    3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
    """

    _configure_logging()

    MAX_URL_QUEUE_SIZE = 5e3

    repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
    threads = [crawler.GitHubCrawler(repo_clone_queue),
            crawler.BitbucketCrawler(repo_clone_queue),
            indexer.GitIndexer(repo_clone_queue)]

    for thread in threads:
        thread.start()

def _configure_logging():
    LOG_FILE_DIR = "log"

    if not os.path.exists(LOG_FILE_DIR):
        os.mkdir(LOG_FILE_DIR)

    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    formatter = logging.Formatter(
            fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
            " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

    handler = logging.handlers.TimedRotatingFileHandler(
            "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
            backupCount=20)
    handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.NOTSET)
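
`crawl()` is the module's public entry point; a minimal usage sketch (assuming the package is importable):

    from bitshift import crawler

    # Spawns GitHubCrawler, BitbucketCrawler, and GitIndexer; the threads
    # are non-daemon, so the process keeps running after crawl() returns.
    crawler.crawl()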

--- /dev/null
+++ b/bitshift/crawler/crawler.py
@@ -0,0 +1,240 @@
"""
:synopsis: Main crawler module, to oversee all site-specific crawlers.

Contains all website/framework-specific crawler classes.
"""

import logging, requests, time, threading

from bitshift.crawler import indexer

from ..codelet import Codelet
from ..database import Database

class GitHubCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of GitHub's public repositories.

    GitHubCrawler is a threaded singleton that queries GitHub's API for urls
    to its public repositories, which it inserts into a :class:`Queue.Queue`
    shared with :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
        instances with repository metadata retrieved by :class:`GitHubCrawler`,
        and other Git crawlers, to be processed by :class:`indexer.GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    AUTHENTICATION = {
        "client_id" : "436cb884ae09be7f2a4e",
        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
    }

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitHubCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`
        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the GitHub API for data about every public repository.

        Pull all of GitHub's repositories by making calls to its API in a
        loop, accessing a subsequent page of results via the "next" URL
        returned in an API response header. Uses Severyn Kozak's (sevko)
        authentication credentials. For every new repository, a
        :class:`GitRepository` is inserted into :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.github.com/repositories"
        # Authenticated clients may make 5,000 requests per hour; this is
        # the corresponding interval in seconds per request.
        api_request_interval = 60 ** 2 / 5e3

        while len(next_api_url) > 0:
            start_time = time.time()

            try:
                resp = requests.get(next_api_url, params=self.AUTHENTICATION)
            except requests.ConnectionError as excep:
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                time.sleep(0.5)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full)))

            repo_names = [repo["full_name"] for repo in resp.json()]
            repo_stars = self._get_repositories_stars(repo_names)

            for repo in resp.json():
                while self.clone_queue.full():
                    time.sleep(1)

                self.clone_queue.put(indexer.GitRepository(
                        repo["html_url"], repo["full_name"].replace("/", ""),
                        "GitHub", repo_stars[repo["full_name"]]))

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time()
                if sleep_time > 0:
                    time.sleep(sleep_time)

            next_api_url = resp.headers["link"].split(">")[0][1:]

            sleep_time = api_request_interval - (time.time() - start_time)
            if sleep_time > 0:
                time.sleep(sleep_time)

    def _get_repositories_stars(self, repo_names):
        """
        Return normalized stargazer ranks for several repositories.

        Queries the GitHub API for the number of stargazers for any given
        repositories, and blocks if the query limit is exceeded.

        :param repo_names: An array of repository names, in
            `username/repository_name` format.
        :type repo_names: str array

        :return: A dictionary with repository name keys, and corresponding
            normalized stargazer ranks in [0, 1] as values.

            Example dictionary:
            .. code-block:: python

                {
                    "user/repository" : 0.1
                }

        :rtype: dictionary
        """

        API_URL = "https://api.github.com/search/repositories"
        REPOS_PER_QUERY = 25

        repo_stars = {}
        for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
                xrange(0, len(repo_names), REPOS_PER_QUERY)]:
            query_url = "%s?q=%s" % (API_URL,
                    "+".join("repo:%s" % name for name in names))

            params = self.AUTHENTICATION
            resp = requests.get(query_url,
                    params=params,
                    headers={
                        "Accept" : "application/vnd.github.preview"
                    })

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time() + 1
                if sleep_time > 0:
                    self._logger.info("API quota exceeded. Sleep time: %d." %
                            sleep_time)
                    time.sleep(sleep_time)

            for repo in resp.json()["items"]:
                # Normalize stargazer counts to a [0, 1] rank; 1,000 or more
                # stars maps to 1.0.
                rank = float(repo["stargazers_count"]) / 1000
                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0

        for name in repo_names:
            if name not in repo_stars:
                repo_stars[name] = 0.5

        return repo_stars

class BitbucketCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of Bitbucket's public repositories.

    BitbucketCrawler is a threaded singleton that queries Bitbucket's API
    for urls to its public repositories, and inserts them as
    :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with
    :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
        :class:`indexer.GitRepository` repository urls into.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `BitbucketCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`
        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the Bitbucket API for data about every public repository.

        Query the Bitbucket API's "/repositories" endpoint and read its
        paginated responses in a loop; any "git" repositories have their
        clone-urls and names inserted into a :class:`indexer.GitRepository`
        in :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.bitbucket.org/2.0/repositories"

        while True:
            try:
                response = requests.get(next_api_url).json()
            except requests.ConnectionError as excep:
                time.sleep(0.5)
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full)))

            for repo in response["values"]:
                if repo["scm"] == "git":
                    while self.clone_queue.full():
                        time.sleep(1)

                    clone_links = repo["links"]["clone"]
                    clone_url = (clone_links[0]["href"] if
                            clone_links[0]["name"] == "https" else
                            clone_links[1]["href"])

                    try:
                        watchers = requests.get(
                                repo["links"]["watchers"]["href"])
                        # Float division, so small watcher counts do not
                        # truncate to a rank of zero.
                        rank = len(watchers.json()["values"]) / 100.0
                    except requests.ConnectionError as excep:
                        time.sleep(0.5)
                        self._logger.warning("API %s call failed: %s: %s",
                                next_api_url, excep.__class__.__name__,
                                excep)
                        continue

                    self.clone_queue.put(indexer.GitRepository(
                            clone_url, repo["full_name"], "Bitbucket",
                            rank if rank < 1.0 else 1.0))

            next_api_url = response["next"]
            time.sleep(0.2)
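
Note that `GitHubCrawler.run()` extracts the next page of results with `resp.headers["link"].split(">")[0][1:]`, which assumes the rel="next" entry always appears first in the Link header. A more defensive parse might look like the following (an illustrative sketch, not part of the diff):

    import re

    def next_page_url(link_header):
        # A GitHub Link header looks like:
        #   <https://api.github.com/repositories?since=369>; rel="next", ...
        # Return the rel="next" url, or None when there is no further page.
        match = re.search(r'<([^>]+)>;\s*rel="next"', link_header or "")
        return match.group(1) if match else None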

--- /dev/null
+++ b/bitshift/crawler/indexer.py
@@ -0,0 +1,489 @@
"""
:synopsis: Contains a singleton GitIndexer class, which clones and indexes
    git repositories.
"""

import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, \
        time, threading

from ..database import Database
from ..codelet import Codelet

GIT_CLONE_DIR = "/tmp/bitshift"
THREAD_QUEUE_SLEEP = 0.5

class GitRepository(object):
    """
    A representation of a Git repository's metadata.

    :ivar url: (str) The repository's url.
    :ivar name: (str) The name of the repository.
    :ivar framework_name: (str) The name of the online Git framework that
        the repository belongs to (eg, GitHub, BitBucket).
    :ivar rank: (float) The rank of the repository, as assigned by
        :class:`crawler.GitHubCrawler`.
    """

    def __init__(self, url, name, framework_name, rank):
        """
        Create a GitRepository instance.

        :param url: see :attr:`GitRepository.url`
        :param name: see :attr:`GitRepository.name`
        :param framework_name: see :attr:`GitRepository.framework_name`
        :param rank: see :attr:`GitRepository.rank`

        :type url: str
        :type name: str
        :type framework_name: str
        :type rank: float
        """

        self.url = url
        self.name = name
        self.framework_name = framework_name
        self.rank = rank

class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    :class:`GitIndexer` indexes the repositories cloned by the
    :class:`_GitCloner` singleton.

    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
        :class:`GitRepository` objects for every new repository successfully
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository
        cloner, which feeds :class:`GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitIndexer`.

        :param clone_queue: see :attr:`crawler.GitHubCrawler.clone_queue`
        :type clone_queue: :class:`Queue.Queue`
        """

        MAX_INDEX_QUEUE_SIZE = 10

        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        self.git_cloner = _GitCloner(clone_queue, self.index_queue)
        self.git_cloner.start()
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")

        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)

        super(GitIndexer, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly cloned repositories and index them.

        Blocks until new repositories appear in :attr:`self.index_queue`,
        then retrieves one, and attempts indexing it. Should any errors
        occur, the new repository will be discarded and the indexer will
        index the next in the queue.
        """

        while True:
            while self.index_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.index_queue.get()
            self.index_queue.task_done()

            try:
                self._index_repository(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__,
                        excep)

    def _index_repository(self, repo):
        """
        Index (create and insert Codelets for) a previously cloned Git
        repository.

        Enter the repository cloned to **GIT_CLONE_DIR/repo.name** by
        :class:`_GitCloner`, call `_insert_repository_codelets()`, then
        remove said repository.

        :param repo: The metadata of the repository to be indexed.
        :type repo: :class:`GitRepository`
        """

        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
            try:
                self._insert_repository_codelets(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__,
                        excep)

        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))

    def _insert_repository_codelets(self, repo):
        """
        Create and insert a Codelet for every file inside a Git repository.

        Create a new Codelet, and insert it into the Database singleton, for
        every file inside the current working directory's default branch
        (usually *master*).

        :param repo: The metadata of the repository to be indexed.
        :type repo: :class:`GitRepository`
        """

        commits_meta = self._get_commits_metadata()
        if commits_meta is None:
            return

        for filename in commits_meta.keys():
            try:
                with open(filename) as source_file:
                    source = self._decode(source_file.read())
                    if source is None:
                        continue
            except IOError:
                continue

            authors = [(self._decode(author), None) for author in
                    commits_meta[filename]["authors"]]
            codelet = Codelet("%s:%s" % (repo.name, filename), source,
                    filename, None, authors,
                    self._generate_file_url(filename, repo.url,
                            repo.framework_name),
                    commits_meta[filename]["time_created"],
                    commits_meta[filename]["time_last_modified"],
                    repo.rank)

    def _generate_file_url(self, filename, repo_url, framework_name):
        """
        Return a url for a filename from a Git wrapper framework.

        :param filename: The path of the file.
        :param repo_url: The url of the file's parent repository.
        :param framework_name: The name of the framework the repository is
            from.

        :type filename: str
        :type repo_url: str
        :type framework_name: str

        :return: The file's full url on the given framework, if successfully
            derived.
        :rtype: str, or None

        .. warning::
            Various Git subprocesses will occasionally fail, and, seeing as
            the information they provide is a crucial component of some
            repository file urls, None may be returned.
        """

        try:
            if framework_name == "GitHub":
                default_branch = subprocess.check_output("git branch"
                        " --no-color", shell=True)[2:-1]
                # Strip any trailing slash from the repository url rather
                # than collapsing "//", which would mangle "https://".
                return "%s/blob/%s/%s" % (repo_url.rstrip("/"),
                        default_branch, filename)
            elif framework_name == "Bitbucket":
                commit_hash = subprocess.check_output("git rev-parse HEAD",
                        shell=True).replace("\n", "")
                return "%s/src/%s/%s" % (repo_url.rstrip("/"), commit_hash,
                        filename)
        except subprocess.CalledProcessError:
            return None
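
    # Note: for hypothetical inputs repo_url "https://github.com/user/repo",
    # a default branch of "master", and filename "src/main.py",
    # _generate_file_url() returns
    # "https://github.com/user/repo/blob/master/src/main.py".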

    def _get_git_commits(self):
        """
        Return the current working directory's formatted commit data.

        Uses `git log` to generate metadata about every single file in the
        repository's commit history.

        :return: The author, timestamp, and names of all modified files of
            every commit.

            .. code-block:: python

                sample_returned_array = [
                    {
                        "author" : (str) "author",
                        "timestamp" : (`datetime.datetime`) <object>,
                        "filenames" : (str array) ["file1", "file2"]
                    }
                ]

        :rtype: array of dictionaries
        """

        git_log = subprocess.check_output(("git --no-pager log --name-only"
                " --pretty=format:'%n%n%an%n%at' -z"), shell=True)

        commits = []
        for commit in git_log.split("\n\n"):
            fields = commit.split("\n")
            if len(fields) > 2:
                commits.append({
                    "author" : fields[0],
                    "timestamp" : datetime.datetime.fromtimestamp(
                            int(fields[1])),
                    "filenames" : fields[2].split("\x00")[:-2]
                })

        return commits

    def _get_tracked_files(self):
        """
        Return a list of the filenames of all valuable files in the Git
        repository.

        Get a list of the filenames of the non-binary (Perl heuristics used
        for filetype identification) files currently inside the current
        working directory's Git repository.

        :return: The filenames of all index-worthy non-binary files.
        :rtype: str array
        """

        files = []
        for dirname, subdir_names, filenames in os.walk("."):
            for filename in filenames:
                path = os.path.join(dirname, filename)
                if self._is_ascii(path):
                    files.append(path[2:])  # strip the leading "./"

        return files

    def _get_commits_metadata(self):
        """
        Return a dictionary containing every valuable tracked file's
        metadata.

        :return: A dictionary with author names, time of creation, and time
            of last modification for every filename key.

            .. code-block:: python

                sample_returned_dict = {
                    "my_file" : {
                        "authors" : (str array) ["author1", "author2"],
                        "time_created" : (`datetime.datetime`) <object>,
                        "time_last_modified" : (`datetime.datetime`) <object>
                    }
                }

        :rtype: dictionary of dictionaries
        """

        commits = self._get_git_commits()
        tracked_files = self._get_tracked_files()

        files_meta = {}
        for commit in commits:
            for filename in commit["filenames"]:
                if filename not in tracked_files:
                    continue

                if filename not in files_meta.keys():
                    files_meta[filename] = {
                        "authors" : [commit["author"]],
                        "time_last_modified" : commit["timestamp"],
                        "time_created" : commit["timestamp"]
                    }
                else:
                    if commit["author"] not in \
                            files_meta[filename]["authors"]:
                        files_meta[filename]["authors"].append(
                                commit["author"])
                    # `git log` lists commits newest-first, so the last
                    # timestamp seen for a file is its creation time.
                    files_meta[filename]["time_created"] = \
                            commit["timestamp"]

        return files_meta

    def _decode(self, raw):
        """
        Return a decoded raw string.

        :param raw: The string to decode.
        :type raw: str

        :return: If the original encoding is successfully inferred, return
            the decoded string.
        :rtype: str, or None

        .. warning::
            The raw string's original encoding is identified by heuristics
            which can, and occasionally will, fail. Decoding will then fail,
            and None will be returned.
        """

        try:
            encoding = bs4.BeautifulSoup(raw).original_encoding
            return raw.decode(encoding) if encoding is not None else None
        except (LookupError, UnicodeDecodeError, UserWarning):
            return None
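
    # Note: _decode() relies on bs4's encoding detection (UnicodeDammit);
    # e.g., a UTF-8 byte string is typically sniffed as "utf-8" and returned
    # decoded, while an undetectable encoding yields None.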

    def _is_ascii(self, filename):
        """
        Heuristically determine whether a file is ASCII text or binary.

        If a portion of the file contains null bytes, or the percentage of
        bytes that aren't ASCII is greater than 30%, then the file is
        concluded to be binary. This heuristic is used by the `file` utility
        and Perl's inbuilt `-T` operator, and is the de-facto method for
        determining whether a file is ASCII.

        :param filename: The path of the file to test.
        :type filename: str

        :return: Whether the file is probably ASCII.
        :rtype: Boolean
        """

        try:
            with open(filename) as source:
                file_snippet = source.read(512)

                if not file_snippet:
                    return True

                ascii_characters = "".join(map(chr, range(32, 127)) +
                        list("\n\r\t\b"))
                null_trans = string.maketrans("", "")

                if "\0" in file_snippet:
                    return False

                non_ascii = file_snippet.translate(null_trans,
                        ascii_characters)
                return not float(len(non_ascii)) / len(file_snippet) > 0.30
        except IOError:
            return False

class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, index_queue):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        :type index_queue: see :attr:`self.index_queue`
        """

        self.clone_queue = clone_queue
        self.index_queue = index_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Blocks until new :class:`GitRepository` appear in
        :attr:`self.clone_queue`, then attempts cloning them. If successful,
        the cloned repository is added to :attr:`self.index_queue` for the
        :class:`GitIndexer` to index; otherwise, it is discarded.
        """

        while True:
            while self.clone_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception:
                # Cloning failures are tolerated; the repository is simply
                # dropped.
                pass

    def _clone_repository(self, repo):
        """
        Attempt cloning a Git repository.

        :param repo: Metadata about the repository to clone.
        :type repo: :class:`GitRepository`
        """

        GIT_CLONE_TIMEOUT = 500

        exit_code = None
        # Wrap `git clone` in a Perl alarm to enforce a timeout, killing
        # any lingering git processes if the clone overruns it.
        command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
                " --single-branch %s %s/%s || pkill -f git")
        command_attempt = 0

        while exit_code is None:
            try:
                exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
                        repo.url, GIT_CLONE_DIR, repo.name), shell=True)
            except Exception:
                time.sleep(1)
                command_attempt += 1
                if command_attempt == 20:
                    break
                else:
                    continue
            else:
                break

        if exit_code != 0:
            if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
                shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
            return

        while self.index_queue.full():
            time.sleep(THREAD_QUEUE_SLEEP)

        self.index_queue.put(repo)

class _ChangeDir(object):
    """
    A wrapper class for os.chdir(), to map onto `with` and handle
    exceptions.

    :ivar new_path: (str) The path to change the current directory to.
    :ivar old_path: (str) The path of the directory to return to.
    """

    def __init__(self, new_path):
        """
        Create a _ChangeDir instance.

        :param new_path: The directory to enter.
        :type new_path: str
        """

        self.new_path = new_path

    def __enter__(self):
        """
        Change the current working-directory to **new_path**.
        """

        self.old_path = os.getcwd()
        os.chdir(self.new_path)

    def __exit__(self, *exception):
        """
        Change the current working-directory to **old_path**.

        :param exception: Various exception arguments passed by `with`.
        :type exception: varargs
        """

        os.chdir(self.old_path)
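
A minimal sketch of `_ChangeDir` in use (hypothetical path), mirroring how `_index_repository()` wraps its work:

    # The old working directory is restored on exit, even if the body
    # raises; the exception itself still propagates.
    with _ChangeDir("/tmp/bitshift/user-repo"):
        pass  # read files, run `git log`, etc.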

--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@ setup(
     packages = find_packages(),
     install_requires = [
         "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0",
-        "BeautifulSoup>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
+        "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
     author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
     license = "MIT",
     url = "https://github.com/earwig/bitshift"