@@ -1,5 +1,6 @@
.sass-cache
.DS_Store
.my.cnf

# github premade rules
*.py[cod]
@@ -1,7 +1,8 @@
bitshift
========

bitshift is a semantic search engine for source code.
bitshift is a semantic search engine for source code developed by Benjamin
Attal, Ben Kurtovic, and Severyn Kozak.

Branches
--------
@@ -13,6 +14,11 @@ Branches
- `feature/*`: individual components of the project with untested, likely
  horribly broken code - branch off from and merge into `develop` when done

Style
-----

bitshift uses [SASS][SASS] for styling; compile the stylesheets to CSS with
`sass --watch static/sass/:static/css`.

Documentation
-------------
@@ -24,3 +30,5 @@ new modules or packages, but *not* when adding functions or changing
docstrings), run `sphinx-apidoc -fo docs/source/api bitshift` from the project
root. Note that this will revert any custom changes made to the files in
`docs/source/api`, so you might want to update them by hand instead.

[SASS]: http://sass-lang.com/guide
@@ -5,6 +5,8 @@ Module to contain all the project's Flask server plumbing.
from flask import Flask
from flask import render_template, session

from bitshift import assets
from bitshift.database import Database
from bitshift.query import parse_query

app = Flask(__name__)

@@ -12,7 +14,9 @@ app.config.from_object("bitshift.config")
app_env = app.jinja_env
app_env.line_statement_prefix = "="
app_env.globals.update(assets = assets)
app_env.globals.update(assets=assets)

database = Database()

@app.route("/")
def index():

@@ -20,8 +24,8 @@ def index():
@app.route("/search/<query>")
def search(query):
    ## tree = parse_query(query)
    ## database.search(tree)
    tree = parse_query(query)
    database.search(tree)
    pass

if __name__ == "__main__":

@@ -1 +1 @@
from . import assets, codelet, config, database, parser, query
from . import assets, codelet, config, database, parser, query, crawler
@@ -1,6 +1,5 @@
"""
.. module:: assets
    :synopsis: Helper functions for use inside the project's Jinja templates.
:synopsis: Helper functions for use inside the project's Jinja templates.
"""

from flask import Markup

@@ -16,8 +15,11 @@ def tag(filename):
    :param filename: The filename of the asset to create a tag for.

    :type filename: str

    :return: A string containing a `<source>` tag for JS files, and a `<link>`
        for CSS files.
    :rtype: str
    """

    file_ext = filename.split(".")[-1]
@@ -1,13 +1,57 @@
__all__ = ["Codelet"]

class Codelet(object):
    ## object to store the following (it doesn't need to do anything with it):
    ## author name, URL, date created/modified, language, source code itself
    ## for VCS: project name, file in project
    ## also: list of functions, etc (associations data)
    """
    A source-code object with code metadata and composition analysis.

    ## DICTIONARY MAPPING STRINGS REPRESENTING ASSOCIATION TYPE WITH DICTIONARIES
    ## MAPPING ASSOCIATION NAMES WITH TUPLES REPRESENTING THEIR PLACE IN THE FILE
    ## STORED AS TWO INTEGERS REPRESENTING THE ROW AND THE COLUMN

    :ivar name: (str) A suitable name for the codelet.
    :ivar code: (str) A string containing the raw source code.
    :ivar filename: (str, or None) The filename of the snippet.
    :ivar language: (int, or None) The inferred language of `code`.
    :ivar authors: (array of tuples (str, str or None)) An array of tuples
        containing an author's name and profile URL (on the service the code
        was pulled from).
    :ivar code_url: (str) The url of the (page containing the) source code.
    :ivar date_created: (:class:`datetime.datetime`, or None) The date the
        code was published.
    :ivar date_modified: (:class:`datetime.datetime`, or None) The date the
        code was last modified.
    :ivar rank: (float) A quantification of the source code's quality, as
        per available ratings (stars, forks, upvotes, etc.).
    """
    ## {"functions": {"foo": (12, 13), "bar": (53, 3)}}

    def __init__(self, name, code, filename, language, authors, code_url,
                 date_created, date_modified, rank):
        """
        Create a Codelet instance.

        :param name: see :attr:`self.name`
        :param code: see :attr:`self.code`
        :param filename: see :attr:`self.filename`
        :param language: see :attr:`self.language`
        :param authors: see :attr:`self.authors`
        :param code_url: see :attr:`self.code_url`
        :param date_created: see :attr:`self.date_created`
        :param date_modified: see :attr:`self.date_modified`
        :param rank: see :attr:`self.rank`

        :type name: see :attr:`self.name`
        :type code: see :attr:`self.code`
        :type filename: see :attr:`self.filename`
        :type language: see :attr:`self.language`
        :type authors: see :attr:`self.authors`
        :type code_url: see :attr:`self.code_url`
        :type date_created: see :attr:`self.date_created`
        :type date_modified: see :attr:`self.date_modified`
        :type rank: see :attr:`self.rank`
        """

        self.name = name
        self.code = code
        self.filename = filename
        self.language = language
        self.authors = authors
        self.code_url = code_url
        self.date_created = date_created
        self.date_modified = date_modified
        self.rank = rank
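
# Usage sketch (editor's addition, not part of the diff): constructing a
# Codelet by hand with the constructor signature defined above. Every literal
# value below is hypothetical and chosen purely for illustration.
if __name__ == "__main__":
    import datetime

    example = Codelet(
        name="example/hello.py",                  # hypothetical codelet name
        code="print('hello world')",              # raw source code
        filename="hello.py",
        language=None,                            # language not yet inferred
        authors=[("Jane Doe", None)],             # (name, profile URL) tuples
        code_url="https://example.com/hello.py",  # hypothetical URL
        date_created=datetime.datetime(2014, 1, 1),
        date_modified=datetime.datetime(2014, 2, 1),
        rank=0.5)
    print example.name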
@@ -0,0 +1,55 @@
"""
:synopsis: Parent crawler module, which supervises all crawlers.

Contains functions for initializing all subsidiary, threaded crawlers.
"""

import logging, logging.handlers, os, Queue

from bitshift.crawler import crawler, indexer

__all__ = ["crawl"]

def crawl():
    """
    Initialize all crawlers (and indexers).

    Start the:
    1. GitHub crawler, :class:`crawler.GitHubCrawler`.
    2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
    3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
    """

    _configure_logging()

    MAX_URL_QUEUE_SIZE = 5e3

    repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE)
    threads = [crawler.GitHubCrawler(repo_clone_queue),
               crawler.BitbucketCrawler(repo_clone_queue),
               indexer.GitIndexer(repo_clone_queue)]

    for thread in threads:
        thread.start()

def _configure_logging():
    LOG_FILE_DIR = "log"

    if not os.path.exists(LOG_FILE_DIR):
        os.mkdir(LOG_FILE_DIR)

    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)

    formatter = logging.Formatter(
            fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s"
                 " %(message)s"), datefmt="%y-%m-%d %H:%M:%S")

    handler = logging.handlers.TimedRotatingFileHandler(
            "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1,
            backupCount=20)
    handler.setFormatter(formatter)

    root_logger = logging.getLogger()
    root_logger.addHandler(handler)
    root_logger.setLevel(logging.NOTSET)
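
# Usage sketch (editor's addition, not part of the diff): a minimal driver
# that starts every crawler and indexer thread via crawl() above. It assumes
# this package is importable as `bitshift.crawler`; crawl() returns once the
# threads are spawned, and the threads run until the process is killed.
if __name__ == "__main__":
    from bitshift.crawler import crawl
    crawl()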
@@ -0,0 +1,240 @@
"""
:synopsis: Main crawler module, to oversee all site-specific crawlers.

Contains all website/framework-specific Class crawlers.
"""

import logging, requests, time, threading

from bitshift.crawler import indexer
from ..codelet import Codelet
from ..database import Database

class GitHubCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of GitHub's public repositories.

    GitHubCrawler is a threaded singleton that queries GitHub's API for urls
    to its public repositories, which it inserts into a :class:`Queue.Queue`
    shared with :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository`
        with repository metadata retrieved by :class:`GitHubCrawler`, and
        other Git crawlers, to be processed by :class:`indexer.GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    AUTHENTICATION = {
        "client_id" : "436cb884ae09be7f2a4e",
        "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e"
    }

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitHubCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(GitHubCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the GitHub API for data about every public repository.

        Pull all of GitHub's repositories by making calls to its API in a
        loop, accessing a subsequent page of results via the "next" URL
        returned in an API response header. Uses Severyn Kozak's (sevko)
        authentication credentials. For every new repository, a
        :class:`GitRepository` is inserted into :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.github.com/repositories"
        api_request_interval = 5e3 / 60 ** 2

        while len(next_api_url) > 0:
            start_time = time.time()

            try:
                resp = requests.get(next_api_url, params=self.AUTHENTICATION)
            except requests.ConnectionError as excep:
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, excep.__class__.__name__, excep)
                time.sleep(0.5)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full)))

            repo_names = [repo["full_name"] for repo in resp.json()]
            repo_stars = self._get_repositories_stars(repo_names)

            for repo in resp.json():
                while self.clone_queue.full():
                    time.sleep(1)

                self.clone_queue.put(indexer.GitRepository(
                        repo["html_url"], repo["full_name"].replace("/", ""),
                        "GitHub", repo_stars[repo["full_name"]]))

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                time.sleep(int(resp.headers["x-ratelimit-reset"]) -
                        time.time())

            next_api_url = resp.headers["link"].split(">")[0][1:]

            sleep_time = api_request_interval - (time.time() - start_time)
            if sleep_time > 0:
                time.sleep(sleep_time)

    def _get_repositories_stars(self, repo_names):
        """
        Return the number of stargazers for several repositories.

        Queries the GitHub API for the number of stargazers for any given
        repositories, and blocks if the query limit is exceeded.

        :param repo_names: An array of repository names, in
            `username/repository_name` format.

        :type repo_names: array of str

        :return: A dictionary with repository name keys, and corresponding
            stargazer count values.

            Example dictionary:
            .. code-block:: python

                {
                    "user/repository" : 100
                }

        :rtype: dictionary
        """

        API_URL = "https://api.github.com/search/repositories"
        REPOS_PER_QUERY = 25

        repo_stars = {}
        for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in
                xrange(0, len(repo_names), REPOS_PER_QUERY)]:
            query_url = "%s?q=%s" % (API_URL,
                    "+".join("repo:%s" % name for name in names))

            params = self.AUTHENTICATION
            resp = requests.get(query_url,
                    params=params,
                    headers={
                        "Accept" : "application/vnd.github.preview"
                    })

            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                sleep_time = int(resp.headers["x-ratelimit-reset"]) - \
                        time.time() + 1
                if sleep_time > 0:
                    logging.info("API quota exceeded. Sleep time: %d." %
                            sleep_time)
                    time.sleep(sleep_time)

            for repo in resp.json()["items"]:
                rank = float(repo["stargazers_count"]) / 1000
                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0

        for name in repo_names:
            if name not in repo_stars:
                repo_stars[name] = 0.5

        return repo_stars
class BitbucketCrawler(threading.Thread):
    """
    Crawler that retrieves links to all of Bitbucket's public repositories.

    BitbucketCrawler is a threaded singleton that queries Bitbucket's API for
    urls to its public repositories, and inserts them as
    :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with
    :class:`indexer.GitIndexer`.

    :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert
        :class:`indexer.GitRepository` repository urls into.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `BitbucketCrawler`.

        :param clone_queue: see :attr:`self.clone_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        """

        self.clone_queue = clone_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(BitbucketCrawler, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Query the Bitbucket API for data about every public repository.

        Query the Bitbucket API's "/repositories" endpoint and read its
        paginated responses in a loop; any "git" repositories have their
        clone-urls and names inserted into a :class:`indexer.GitRepository` in
        :attr:`self.clone_queue`.
        """

        next_api_url = "https://api.bitbucket.org/2.0/repositories"

        while True:
            try:
                response = requests.get(next_api_url).json()
            except requests.ConnectionError as exception:
                time.sleep(0.5)
                self._logger.warning("API %s call failed: %s: %s",
                        next_api_url, exception.__class__.__name__, exception)
                continue

            queue_percent_full = (float(self.clone_queue.qsize()) /
                    self.clone_queue.maxsize) * 100
            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
                    queue_percent_full)))

            for repo in response["values"]:
                if repo["scm"] == "git":
                    while self.clone_queue.full():
                        time.sleep(1)

                    clone_links = repo["links"]["clone"]
                    clone_url = (clone_links[0]["href"] if
                            clone_links[0]["name"] == "https" else
                            clone_links[1]["href"])

                    try:
                        watchers = requests.get(
                                repo["links"]["watchers"]["href"])
                        rank = float(len(watchers.json()["values"])) / 100
                    except requests.ConnectionError as exception:
                        time.sleep(0.5)
                        self._logger.warning("API %s call failed: %s: %s",
                                next_api_url, exception.__class__.__name__,
                                exception)
                        continue

                    self.clone_queue.put(indexer.GitRepository(
                            clone_url, repo["full_name"], "Bitbucket",
                            rank if rank < 1.0 else 1.0))

            next_api_url = response["next"]
            time.sleep(0.2)
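
# Editor's sketch (not part of the diff): GitHubCrawler.run() extracts the
# "next" page URL by slicing the raw Link header. requests also exposes the
# parsed header through the response's `links` attribute, so an equivalent,
# slightly more defensive lookup could be written as the hypothetical helper
# below; `resp` is assumed to be the requests.get(...) response used in run().
def _next_page_url(resp):
    # Returns "" when there is no next page, which ends the crawl loop.
    return resp.links.get("next", {}).get("url", "")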
@@ -0,0 +1,489 @@
"""
:synopsis: Contains a singleton GitIndexer class, which clones and indexes git
    repositories.
"""

import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\
        threading

from ..database import Database
from ..codelet import Codelet

GIT_CLONE_DIR = "/tmp/bitshift"
THREAD_QUEUE_SLEEP = 0.5

class GitRepository(object):
    """
    A representation of a Git repository's metadata.

    :ivar url: (str) The repository's url.
    :ivar name: (str) The name of the repository.
    :ivar framework_name: (str) The name of the online Git framework that the
        repository belongs to (eg, GitHub, BitBucket).
    :ivar rank: (float) The rank of the repository, as assigned by
        :class:`crawler.GitHubCrawler`.
    """

    def __init__(self, url, name, framework_name, rank):
        """
        Create a GitRepository instance.

        :param url: see :attr:`GitRepository.url`
        :param name: see :attr:`GitRepository.name`
        :param framework_name: see :attr:`GitRepository.framework_name`
        :param rank: see :attr:`GitRepository.rank`

        :type url: str
        :type name: str
        :type framework_name: str
        :type rank: float
        """

        self.url = url
        self.name = name
        self.framework_name = framework_name
        self.rank = rank

class GitIndexer(threading.Thread):
    """
    A singleton Git repository indexer.

    :class:`GitIndexer` indexes the repositories cloned by the
    :class:`_GitCloner` singleton.

    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
        :class:`GitRepository` objects for every new repository successfully
        cloned by :class:`_GitCloner`, which are to be indexed.
    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository
        cloner, which feeds :class:`GitIndexer`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue):
        """
        Create an instance of the singleton `GitIndexer`.

        :param clone_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.index_queue`
        """

        MAX_INDEX_QUEUE_SIZE = 10

        self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE)
        self.git_cloner = _GitCloner(clone_queue, self.index_queue)
        self.git_cloner.start()
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")

        if not os.path.exists(GIT_CLONE_DIR):
            os.makedirs(GIT_CLONE_DIR)

        super(GitIndexer, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly cloned repositories and index them.

        Blocks until new repositories appear in :attr:`self.index_queue`, then
        retrieves one, and attempts indexing it. Should any errors occur, the
        new repository will be discarded and the indexer will index the next
        in the queue.
        """

        while True:
            while self.index_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.index_queue.get()
            self.index_queue.task_done()

            try:
                self._index_repository(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__,
                        excep)

    def _index_repository(self, repo):
        """
        Index (create and insert Codelets for) a previously cloned Git
        repository.

        Enter the repository cloned to **GIT_CLONE_DIR/repo.name**, call
        `_insert_repository_codelets()`, then remove said repository.

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
            try:
                self._insert_repository_codelets(repo)
            except Exception as excep:
                self._logger.warning("%s: %s.", excep.__class__.__name__,
                        excep)

        if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
            shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))

    def _insert_repository_codelets(self, repo):
        """
        Create and insert a Codelet for the files inside a Git repository.

        Create a new Codelet, and insert it into the Database singleton, for
        every file inside the current working directory's default branch
        (usually *master*).

        :param repo: The metadata of the repository to be indexed.

        :type repo: :class:`GitRepository`
        """

        commits_meta = self._get_commits_metadata()
        if commits_meta is None:
            return

        for filename in commits_meta.keys():
            try:
                with open(filename) as source_file:
                    source = self._decode(source_file.read())
                    if source is None:
                        continue
            except IOError as exception:
                continue

            authors = [(self._decode(author), None) for author in
                       commits_meta[filename]["authors"]]
            codelet = Codelet("%s:%s" % (repo.name, filename), source,
                              filename, None, authors,
                              self._generate_file_url(filename, repo.url,
                                                      repo.framework_name),
                              commits_meta[filename]["time_created"],
                              commits_meta[filename]["time_last_modified"],
                              repo.rank)
    def _generate_file_url(self, filename, repo_url, framework_name):
        """
        Return a url for a filename from a Git wrapper framework.

        :param filename: The path of the file.
        :param repo_url: The url of the file's parent repository.
        :param framework_name: The name of the framework the repository is
            from.

        :type filename: str
        :type repo_url: str
        :type framework_name: str

        :return: The file's full url on the given framework, if successfully
            derived.
        :rtype: str, or None

        .. warning::
            Various Git subprocesses will occasionally fail, and, seeing as
            the information they provide is a crucial component of some
            repository file urls, None may be returned.
        """

        try:
            if framework_name == "GitHub":
                default_branch = subprocess.check_output("git branch"
                        " --no-color", shell=True)[2:-1]
                return ("%s/blob/%s/%s" % (repo_url, default_branch,
                        filename)).replace("//", "/")
            elif framework_name == "Bitbucket":
                commit_hash = subprocess.check_output("git rev-parse HEAD",
                        shell=True).replace("\n", "")
                return ("%s/src/%s/%s" % (repo_url, commit_hash,
                        filename)).replace("//", "/")
        except subprocess.CalledProcessError as exception:
            return None

    def _get_git_commits(self):
        """
        Return the current working directory's formatted commit data.

        Uses `git log` to generate metadata about every single file in the
        repository's commit history.

        :return: The author, timestamp, and names of all modified files of
            every commit.

            .. code-block:: python

                sample_returned_array = [
                    {
                        "author" : (str) "author",
                        "timestamp" : (`datetime.datetime`) <object>,
                        "filenames" : (str array) ["file1", "file2"]
                    }
                ]
        :rtype: array of dictionaries
        """

        git_log = subprocess.check_output(("git --no-pager log --name-only"
                " --pretty=format:'%n%n%an%n%at' -z"), shell=True)

        commits = []
        for commit in git_log.split("\n\n"):
            fields = commit.split("\n")
            if len(fields) > 2:
                commits.append({
                    "author" : fields[0],
                    "timestamp" : datetime.datetime.fromtimestamp(
                            int(fields[1])),
                    "filenames" : fields[2].split("\x00")[:-2]
                })

        return commits

    def _get_tracked_files(self):
        """
        Return a list of the filenames of all valuable files in the Git
        repository.

        Get a list of the filenames of the non-binary (Perl heuristics used
        for filetype identification) files currently inside the current
        working directory's Git repository. Then, weed out any
        boilerplate/non-code files that match the regex rules in
        GIT_IGNORE_FILES.

        :return: The filenames of all index-worthy non-binary files.
        :rtype: str array
        """

        files = []
        for dirname, subdir_names, filenames in os.walk("."):
            for filename in filenames:
                path = os.path.join(dirname, filename)
                if self._is_ascii(path):
                    files.append(path[2:])

        return files

    def _get_commits_metadata(self):
        """
        Return a dictionary containing every valuable tracked file's metadata.

        :return: A dictionary with author names, time of creation, and time of
            last modification for every filename key.

            .. code-block:: python

                sample_returned_dict = {
                    "my_file" : {
                        "authors" : (str array) ["author1", "author2"],
                        "time_created" : (`datetime.datetime`) <object>,
                        "time_last_modified" : (`datetime.datetime`) <object>
                    }
                }
        :rtype: dictionary of dictionaries
        """

        commits = self._get_git_commits()
        tracked_files = self._get_tracked_files()

        files_meta = {}
        for commit in commits:
            for filename in commit["filenames"]:
                if filename not in tracked_files:
                    continue

                if filename not in files_meta.keys():
                    files_meta[filename] = {
                        "authors" : [commit["author"]],
                        "time_last_modified" : commit["timestamp"],
                        "time_created" : commit["timestamp"]
                    }
                else:
                    if commit["author"] not in files_meta[filename]["authors"]:
                        files_meta[filename]["authors"].append(
                                commit["author"])
                    files_meta[filename]["time_created"] = commit["timestamp"]

        return files_meta

    def _decode(self, raw):
        """
        Return a decoded raw string.

        :param raw: The string to decode.

        :type raw: (str)

        :return: If the original encoding is successfully inferred, return the
            decoded string.
        :rtype: str, or None

        .. warning::
            The raw string's original encoding is identified by heuristics
            which can, and occasionally will, fail. Decoding will then fail,
            and None will be returned.
        """

        try:
            encoding = bs4.BeautifulSoup(raw).original_encoding
            return raw.decode(encoding) if encoding is not None else None
        except (LookupError, UnicodeDecodeError, UserWarning) as exception:
            return None

    def _is_ascii(self, filename):
        """
        Heuristically determine whether a file is ASCII text or binary.

        If a portion of the file contains null bytes, or the percentage of
        bytes that aren't ASCII is greater than 30%, then the file is
        concluded to be binary. This heuristic is used by the `file` utility,
        Perl's inbuilt `-T` operator, and is the de-facto method for
        determining whether a file is ASCII.

        :param filename: The path of the file to test.

        :type filename: str

        :return: Whether the file is probably ASCII.
        :rtype: Boolean
        """

        try:
            with open(filename) as source:
                file_snippet = source.read(512)

                if not file_snippet:
                    return True

                ascii_characters = "".join(map(chr, range(32, 127)) +
                        list("\n\r\t\b"))
                null_trans = string.maketrans("", "")

                if "\0" in file_snippet:
                    return False

                non_ascii = file_snippet.translate(null_trans,
                        ascii_characters)
                return not float(len(non_ascii)) / len(file_snippet) > 0.30
        except IOError as exception:
            return False
class _GitCloner(threading.Thread):
    """
    A singleton Git repository cloner.

    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
    :class:`GitIndexer` to index.

    :ivar clone_queue: (:class:`Queue.Queue`) see
        :attr:`crawler.GitHubCrawler.clone_queue`.
    :ivar index_queue: (:class:`Queue.Queue`) see
        :attr:`GitIndexer.index_queue`.
    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
    """

    def __init__(self, clone_queue, index_queue):
        """
        Create an instance of the singleton :class:`_GitCloner`.

        :param clone_queue: see :attr:`self.clone_queue`
        :param index_queue: see :attr:`self.index_queue`

        :type clone_queue: see :attr:`self.clone_queue`
        :type index_queue: see :attr:`self.index_queue`
        """

        self.clone_queue = clone_queue
        self.index_queue = index_queue
        self._logger = logging.getLogger("%s.%s" %
                (__name__, self.__class__.__name__))
        self._logger.info("Starting.")
        super(_GitCloner, self).__init__(name=self.__class__.__name__)

    def run(self):
        """
        Retrieve metadata about newly crawled repositories and clone them.

        Blocks until new :class:`GitRepository` appear in
        :attr:`self.clone_queue`, then attempts cloning them. If successful,
        the cloned repository is added to :attr:`self.index_queue` for the
        `GitIndexer` to index; otherwise, it is discarded.
        """

        while True:
            while self.clone_queue.empty():
                time.sleep(THREAD_QUEUE_SLEEP)

            repo = self.clone_queue.get()
            self.clone_queue.task_done()

            try:
                self._clone_repository(repo)
            except Exception as exception:
                pass

    def _clone_repository(self, repo):
        """
        Attempt cloning a Git repository.

        :param repo: Metadata about the repository to clone.

        :type repo: :class:`GitRepository`
        """

        GIT_CLONE_TIMEOUT = 500

        queue_percent_full = (float(self.index_queue.qsize()) /
                self.index_queue.maxsize) * 100

        exit_code = None
        command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
                " --single-branch %s %s/%s || pkill -f git")

        command_attempt = 0
        while exit_code is None:
            try:
                exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
                        repo.url, GIT_CLONE_DIR, repo.name), shell=True)
            except Exception as exception:
                time.sleep(1)
                command_attempt += 1
                if command_attempt == 20:
                    break
                else:
                    continue
            else:
                break

        if exit_code != 0:
            if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
                shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
            return

        while self.index_queue.full():
            time.sleep(THREAD_QUEUE_SLEEP)

        self.index_queue.put(repo)

class _ChangeDir(object):
    """
    A wrapper class for os.chdir(), to map onto `with` and handle exceptions.

    :ivar new_path: (str) The path to change the current directory to.
    :ivar old_path: (str) The path of the directory to return to.
    """

    def __init__(self, new_path):
        """
        Create a _ChangeDir instance.

        :param new_path: The directory to enter.

        :type new_path: str
        """

        self.new_path = new_path

    def __enter__(self):
        """
        Change the current working-directory to **new_path**.
        """

        self.old_path = os.getcwd()
        os.chdir(self.new_path)

    def __exit__(self, *exception):
        """
        Change the current working-directory to **old_path**.

        :param exception: Various exception arguments passed by `with`.

        :type exception: varargs
        """

        os.chdir(self.old_path)
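
# Usage sketch (editor's addition, not part of the diff): _ChangeDir is the
# context manager GitIndexer uses to step into a cloned repository and return
# afterwards. A minimal, self-contained illustration with a hypothetical path:
if __name__ == "__main__":
    import os
    print os.getcwd()           # e.g. the project root
    with _ChangeDir("/tmp"):    # hypothetical directory to enter
        print os.getcwd()       # now /tmp
    print os.getcwd()           # back where we started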
@@ -1,18 +0,0 @@
"""
Module with classes and functions to handle communication with the MySQL
database backend, which manages the search index.
"""

import oursql

class Database(object):
    """Represents the MySQL database."""

    def __init__(self):
        pass

    def _connect(self):
        pass

    def _create(self):
        pass
@@ -0,0 +1,153 @@
"""
Subpackage with classes and functions to handle communication with the MySQL
database backend, which manages the search index.
"""

import os

import mmh3
import oursql

from .migration import VERSION, MIGRATIONS

__all__ = ["Database"]

class Database(object):
    """Represents the MySQL database."""

    def __init__(self, migrate=False):
        self._conn = self._connect()
        self._check_version(migrate)

    def _connect(self):
        """Establish a connection to the database."""
        root = os.path.dirname(os.path.abspath(__file__))
        default_file = os.path.join(root, ".my.cnf")
        return oursql.connect(db="bitshift", read_default_file=default_file,
                              autoping=True, autoreconnect=True)

    def _migrate(self, cursor, current):
        """Migrate the database to the latest schema version."""
        for version in xrange(current, VERSION):
            print "Migrating to %d..." % (version + 1)
            for query in MIGRATIONS[version - 1]:
                cursor.execute(query)
            cursor.execute("UPDATE version SET version = ?", (version + 1,))

    def _check_version(self, migrate):
        """Check the database schema version and respond accordingly.

        If the schema is out of date, migrate if *migrate* is True, else raise
        an exception.
        """
        with self._conn.cursor() as cursor:
            cursor.execute("SELECT version FROM version")
            version = cursor.fetchone()[0]
            if version < VERSION:
                if migrate:
                    self._migrate(cursor, version)
                else:
                    err = "Database schema out of date. " \
                          "Run `python -m bitshift.database.migration`."
                    raise RuntimeError(err)

    def _get_codelets_from_ids(self, cursor, ids):
        """Return a list of Codelet objects given a list of codelet IDs."""
        raise NotImplementedError()  ## TODO

    def _decompose_url(self, cursor, url):
        """Break up a URL into an origin (with a URL base) and a suffix."""
        query = """SELECT origin_id, SUBSTR(?, LENGTH(origin_url_base))
                   FROM origins
                   WHERE origin_url_base IS NOT NULL
                   AND ? LIKE CONCAT(origin_url_base, "%")"""

        cursor.execute(query, (url, url))
        result = cursor.fetchone()
        return result if result else (1, url)

    def _insert_symbols(self, cursor, code_id, sym_type, symbols):
        """Insert a list of symbols of a given type into the database."""
        sym_types = ["functions", "classes", "variables"]
        query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)"
        query2 = """INSERT INTO symbol_locations VALUES
                    (DEFAULT, ?, ?, ?, ?, ?, ?)"""

        for (name, decls, uses) in symbols:
            cursor.execute(query1, (code_id, sym_types.index(sym_type), name))
            sym_id = cursor.lastrowid
            params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] +
                      [tuple([sym_id, 1] + list(loc)) for loc in uses])
            cursor.executemany(query2, params)

    def close(self):
        """Disconnect from the database."""
        self._conn.close()

    def search(self, query, page=1):
        """
        Search the database for a query and return the *n*\ th page of results.

        :param query: The query to search for.
        :type query: :py:class:`~.query.tree.Tree`
        :param page: The result page to display.
        :type page: int

        :return: The total number of results, and the *n*\ th page of results.
        :rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s)
        """
        query1 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp
                    FROM cache
                    INNER JOIN cache_data ON cache_id = cdata_cache
                    WHERE cache_id = ?"""
        query2 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)"
        query3 = "INSERT INTO cache_data VALUES (?, ?)"

        cache_id = mmh3.hash64(str(page) + ":" + query.serialize())[0]

        with self._conn.cursor() as cursor:
            cursor.execute(query1, (cache_id,))
            results = cursor.fetchall()
            if results:  # Cache hit
                num_results = results[0][1] * (10 ** results[0][2])
                ids = [res[0] for res in results]
            else:  # Cache miss
                ## TODO: build and execute search query
                results = cursor.fetchall()
                ids = NotImplemented  ## TODO: extract ids from results
                num_results = NotImplemented  ## TODO: num if results else 0
                num_exp = max(len(str(num_results)) - 3, 0)
                num_results = int(round(num_results, -num_exp))
                num_mnt = num_results / (10 ** num_exp)
                cursor.execute(query2, (cache_id, num_mnt, num_exp))
                cursor.executemany(query3, [(cache_id, c_id) for c_id in ids])
            return (num_results, self._get_codelets_from_ids(cursor, ids))

    def insert(self, codelet):
        """
        Insert a codelet into the database.

        :param codelet: The codelet to insert.
        :type codelet: :py:class:`.Codelet`
        """
        query1 = """INSERT INTO code VALUES (?, ?, ?)
                    ON DUPLICATE KEY UPDATE code_id=code_id"""
        query2 = """INSERT INTO codelets VALUES
                    (DEFAULT, ?, ?, ?, ?, ?, ?, ?)"""
        query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)"

        hash_key = str(codelet.language) + ":" + codelet.code.encode("utf8")
        code_id = mmh3.hash64(hash_key)[0]

        with self._conn.cursor() as cursor:
            cursor.execute(query1, (code_id, codelet.language, codelet.code))
            if cursor.rowcount == 1:
                for sym_type, symbols in codelet.symbols.iteritems():
                    self._insert_symbols(cursor, code_id, sym_type, symbols)
            origin, url = self._decompose_url(cursor, codelet.url)
            cursor.execute(query2, (codelet.name, code_id, origin, url,
                                    codelet.rank, codelet.date_created,
                                    codelet.date_modified))
            codelet_id = cursor.lastrowid
            authors = [(codelet_id, a[0], a[1]) for a in codelet.authors]
            cursor.executemany(query3, authors)
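
# Usage sketch (editor's addition, not part of the diff): connecting to the
# database and running a cached search. `parse_query` is assumed to return the
# query Tree that Database.search() expects; note that the cache-miss path of
# search() is still a TODO above, so this is illustrative only.
if __name__ == "__main__":
    from bitshift.database import Database
    from bitshift.query import parse_query

    db = Database(migrate=True)        # migrate the schema if out of date
    tree = parse_query("hello world")  # hypothetical query string
    total, codelets = db.search(tree, page=1)
    db.close()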
@@ -0,0 +1,97 @@
"""
Contains information about database schema versions, and SQL queries to update
between them.
"""

VERSION = 6

MIGRATIONS = [
    # 1 -> 2
    [
        """ALTER TABLE `codelets`
           DROP FOREIGN KEY `codelets_ibfk_1`""",
        """ALTER TABLE `code`
           DROP KEY `code_hash`,
           DROP COLUMN `code_hash`,
           MODIFY COLUMN `code_id` BIGINT NOT NULL""",
        """ALTER TABLE `codelets`
           MODIFY COLUMN `codelet_code_id` BIGINT NOT NULL,
           ADD KEY (`codelet_lang`),
           ADD CONSTRAINT `codelets_ibfk_1` FOREIGN KEY (`codelet_code_id`)
               REFERENCES `code` (`code_id`)
               ON DELETE RESTRICT ON UPDATE CASCADE""",
        """ALTER TABLE `symbols`
           ADD COLUMN `symbol_end_row` INT UNSIGNED NOT NULL,
           ADD COLUMN `symbol_end_col` INT UNSIGNED NOT NULL"""
    ],
    # 2 -> 3
    [
        """ALTER TABLE `symbols`
           DROP FOREIGN KEY `symbols_ibfk_1`,
           CHANGE COLUMN `symbol_codelet` `symbol_code` BIGINT NOT NULL,
           ADD CONSTRAINT `symbols_ibfk_1` FOREIGN KEY (`symbol_code`)
               REFERENCES `code` (`code_id`)
               ON DELETE CASCADE ON UPDATE CASCADE"""
    ],
    # 3 -> 4
    [
        """ALTER TABLE `symbols`
           DROP COLUMN `symbol_row`,
           DROP COLUMN `symbol_col`,
           DROP COLUMN `symbol_end_row`,
           DROP COLUMN `symbol_end_col`""",
        """CREATE TABLE `symbol_locations` (
           `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
           `sloc_symbol` BIGINT UNSIGNED NOT NULL,
           `sloc_type` TINYINT UNSIGNED NOT NULL,
           `sloc_row` INT UNSIGNED NOT NULL,
           `sloc_col` INT UNSIGNED NOT NULL,
           `sloc_end_row` INT UNSIGNED NOT NULL,
           `sloc_end_col` INT UNSIGNED NOT NULL,
           PRIMARY KEY (`sloc_id`),
           FOREIGN KEY (`sloc_symbol`)
               REFERENCES `symbols` (`symbol_id`)
               ON DELETE CASCADE ON UPDATE CASCADE
           ) ENGINE=InnoDB"""
    ],
    # 4 -> 5
    [
        """ALTER TABLE `origins`
           MODIFY COLUMN `origin_name` VARCHAR(64) DEFAULT NULL,
           MODIFY COLUMN `origin_url` VARCHAR(512) DEFAULT NULL,
           MODIFY COLUMN `origin_url_base` VARCHAR(512) DEFAULT NULL"""
    ],
    # 5 -> 6
    [
        """ALTER TABLE `code`
           ADD COLUMN `code_lang` SMALLINT UNSIGNED DEFAULT NULL
               AFTER `code_id`,
           ADD KEY (`code_lang`)""",
        """ALTER TABLE `codelets`
           DROP KEY `codelet_lang`,
           DROP COLUMN `codelet_lang`""",
        """ALTER TABLE `cache_data`
           DROP FOREIGN KEY `cache_data_ibfk_1`""",
        """ALTER TABLE `cache`
           MODIFY COLUMN `cache_id` BIGINT NOT NULL,
           DROP COLUMN `cache_hash`,
           DROP COLUMN `cache_last_used`,
           MODIFY COLUMN `cache_count_mnt` SMALLINT UNSIGNED NOT NULL""",
        """ALTER TABLE `cache_data`
           MODIFY COLUMN `cdata_cache` BIGINT NOT NULL,
           ADD PRIMARY KEY (`cdata_cache`, `cdata_codelet`),
           ADD CONSTRAINT `cache_data_ibfk_1` FOREIGN KEY (`cdata_codelet`)
               REFERENCES `codelets` (`codelet_id`)
               ON DELETE CASCADE ON UPDATE CASCADE""",
        """CREATE EVENT `flush_cache`
           ON SCHEDULE EVERY 1 HOUR
           DO
               DELETE FROM `cache`
               WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);"""
    ]
]

if __name__ == "__main__":
    from . import Database
    Database(migrate=True).close()
@@ -0,0 +1,114 @@
-- Schema version 6

CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci;
USE `bitshift`;

CREATE TABLE `version` (
    `version` INT UNSIGNED NOT NULL
) ENGINE=InnoDB;
INSERT INTO `version` VALUES (6);

CREATE TABLE `origins` (
    `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `origin_name` VARCHAR(64) DEFAULT NULL,
    `origin_url` VARCHAR(512) DEFAULT NULL,
    `origin_url_base` VARCHAR(512) DEFAULT NULL,
    `origin_image` BLOB DEFAULT NULL,
    PRIMARY KEY (`origin_id`)
) ENGINE=InnoDB;
INSERT INTO `origins` VALUES (1, NULL, NULL, NULL, NULL);

CREATE TABLE `code` (
    `code_id` BIGINT NOT NULL,
    `code_lang` SMALLINT UNSIGNED DEFAULT NULL,
    `code_code` MEDIUMTEXT NOT NULL,
    PRIMARY KEY (`code_id`),
    KEY (`code_lang`),
    FULLTEXT KEY (`code_code`)
) ENGINE=InnoDB;

CREATE TABLE `codelets` (
    `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `codelet_name` VARCHAR(300) NOT NULL,
    `codelet_code_id` BIGINT NOT NULL,
    `codelet_origin` TINYINT UNSIGNED NOT NULL,
    `codelet_url` VARCHAR(512) NOT NULL,
    `codelet_rank` FLOAT NOT NULL,
    `codelet_date_created` DATETIME DEFAULT NULL,
    `codelet_date_modified` DATETIME DEFAULT NULL,
    PRIMARY KEY (`codelet_id`),
    FULLTEXT KEY (`codelet_name`),
    KEY (`codelet_rank`),
    KEY (`codelet_date_created`),
    KEY (`codelet_date_modified`),
    FOREIGN KEY (`codelet_code_id`)
        REFERENCES `code` (`code_id`)
        ON DELETE RESTRICT ON UPDATE CASCADE,
    FOREIGN KEY (`codelet_origin`)
        REFERENCES `origins` (`origin_id`)
        ON DELETE RESTRICT ON UPDATE CASCADE
) ENGINE=InnoDB;

CREATE TABLE `authors` (
    `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `author_codelet` BIGINT UNSIGNED NOT NULL,
    `author_name` VARCHAR(128) NOT NULL,
    `author_url` VARCHAR(512) DEFAULT NULL,
    PRIMARY KEY (`author_id`),
    FULLTEXT KEY (`author_name`),
    FOREIGN KEY (`author_codelet`)
        REFERENCES `codelets` (`codelet_id`)
        ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;

CREATE TABLE `symbols` (
    `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `symbol_code` BIGINT NOT NULL,
    `symbol_type` TINYINT UNSIGNED NOT NULL,
    `symbol_name` VARCHAR(512) NOT NULL,
    PRIMARY KEY (`symbol_id`),
    KEY (`symbol_type`, `symbol_name`(32)),
    FOREIGN KEY (`symbol_code`)
        REFERENCES `code` (`code_id`)
        ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;

CREATE TABLE `symbol_locations` (
    `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    `sloc_symbol` BIGINT UNSIGNED NOT NULL,
    `sloc_type` TINYINT UNSIGNED NOT NULL,
    `sloc_row` INT UNSIGNED NOT NULL,
    `sloc_col` INT UNSIGNED NOT NULL,
    `sloc_end_row` INT UNSIGNED NOT NULL,
    `sloc_end_col` INT UNSIGNED NOT NULL,
    PRIMARY KEY (`sloc_id`),
    FOREIGN KEY (`sloc_symbol`)
        REFERENCES `symbols` (`symbol_id`)
        ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;

CREATE TABLE `cache` (
    `cache_id` BIGINT NOT NULL,
    `cache_count_mnt` SMALLINT UNSIGNED NOT NULL,
    `cache_count_exp` TINYINT UNSIGNED NOT NULL,
    `cache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (`cache_id`)
) ENGINE=InnoDB;

CREATE TABLE `cache_data` (
    `cdata_cache` BIGINT NOT NULL,
    `cdata_codelet` BIGINT UNSIGNED NOT NULL,
    PRIMARY KEY (`cdata_cache`, `cdata_codelet`),
    FOREIGN KEY (`cdata_cache`)
        REFERENCES `cache` (`cache_id`)
        ON DELETE CASCADE ON UPDATE CASCADE,
    FOREIGN KEY (`cdata_codelet`)
        REFERENCES `codelets` (`codelet_id`)
        ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB;

CREATE EVENT `flush_cache`
    ON SCHEDULE EVERY 1 HOUR
    DO
        DELETE FROM `cache`
        WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);
@@ -22,4 +22,6 @@ def parse_query(query):
    # gets a string, returns a Tree
    # TODO: note: resultant Trees should be normalized so that "foo OR bar"
    # and "bar OR foo" result in equivalent trees
    pass
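
# Editor's sketch (not part of the diff): one way the normalization noted in
# the TODO above could work. The node layout here is entirely hypothetical; it
# simply assumes boolean nodes expose an operator name and a list of children,
# and illustrates the idea that commutative operators ("OR", "AND") get their
# operands sorted into a canonical order so that "foo OR bar" and "bar OR foo"
# compare equal.
def _normalize(node):
    children = getattr(node, "children", None)
    if not children:
        return node
    node.children = [_normalize(child) for child in children]
    if getattr(node, "operator", None) in ("OR", "AND"):
        node.children.sort(key=repr)  # canonical operand order
    return node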
@@ -0,0 +1,11 @@
query Package
=============

:mod:`query` Package
--------------------

.. automodule:: bitshift.query
    :members:
    :undoc-members:
    :show-inheritance:

@@ -1,30 +1,51 @@
bitshift package
bitshift Package
================

Submodules

:mod:`bitshift` Package
-----------------------

bitshift.assets module

.. automodule:: bitshift.__init__
    :members:
    :undoc-members:
    :show-inheritance:

:mod:`assets` Module
--------------------

.. automodule:: bitshift.assets
    :members:
    :undoc-members:
    :show-inheritance:

bitshift.config module

:mod:`codelet` Module
---------------------

.. automodule:: bitshift.config
.. automodule:: bitshift.codelet
    :members:
    :undoc-members:
    :show-inheritance:

:mod:`config` Module
--------------------

Module contents

.. automodule:: bitshift.config
    :members:
    :undoc-members:
    :show-inheritance:

:mod:`database` Module
----------------------

.. automodule:: bitshift
.. automodule:: bitshift.database
    :members:
    :undoc-members:
    :show-inheritance:

Subpackages
-----------

.. toctree::

    bitshift.parser
    bitshift.query
@@ -4,7 +4,9 @@ setup(
    name = "bitshift",
    version = "0.1",
    packages = find_packages(),
    install_requires = ["Flask>=0.10.1", "pygments>=1.6"],
    install_requires = [
        "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0",
        "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"],
    author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak",
    license = "MIT",
    url = "https://github.com/earwig/bitshift"