@@ -1,5 +1,6 @@ | |||
.sass-cache | |||
.DS_Store | |||
.my.cnf | |||
# github premade rules | |||
*.py[cod] | |||
@@ -1,7 +1,8 @@ | |||
bitshift | |||
======== | |||
bitshift is a semantic search engine for source code. | |||
bitshift is a semantic search engine for source code developed by Benjamin | |||
Attal, Ben Kurtovic, and Severyn Kozak. | |||
Branches | |||
-------- | |||
@@ -13,6 +14,11 @@ Branches | |||
- `feature/*`: individual components of the project with untested, likely | |||
horribly broken code - branch off from and merge into `develop` when done | |||
Style | |||
----- | |||
bitshift uses [SASS][SASS] for styling; compile the stylesheets to CSS with | |||
`sass --watch static/sass/:static/css`. | |||
Documentation | |||
------------- | |||
@@ -24,3 +30,5 @@ new modules or packages, but *not* when adding functions or changing | |||
docstrings), run `sphinx-apidoc -fo docs/source/api bitshift` from the project | |||
root. Note that this will revert any custom changes made to the files in | |||
`docs/source/api`, so you might want to update them by hand instead. | |||
[SASS]: http://sass-lang.com/guide |
@@ -5,6 +5,8 @@ Module to contain all the project's Flask server plumbing. | |||
from flask import Flask | |||
from flask import render_template, session | |||
from bitshift import assets | |||
from bitshift.database import Database | |||
from bitshift.query import parse_query | |||
app = Flask(__name__) | |||
@@ -12,7 +14,9 @@ app.config.from_object("bitshift.config") | |||
app_env = app.jinja_env | |||
app_env.line_statement_prefix = "=" | |||
app_env.globals.update(assets = assets) | |||
app_env.globals.update(assets=assets) | |||
database = Database() | |||
@app.route("/") | |||
def index(): | |||
@@ -20,8 +24,8 @@ def index(): | |||
@app.route("/search/<query>") | |||
def search(query): | |||
## tree = parse_query(query) | |||
## database.search(tree) | |||
tree = parse_query(query) | |||
database.search(tree) | |||
pass | |||
if __name__ == "__main__": | |||
@@ -1 +1 @@ | |||
from . import assets, codelet, config, database, parser, query | |||
from . import assets, codelet, config, database, parser, query, crawler |
@@ -1,6 +1,5 @@ | |||
""" | |||
.. module:: assets | |||
:synopsis: Helper functions for use inside the project's Jinja templates. | |||
:synopsis: Helper functions for use inside the project's Jinja templates. | |||
""" | |||
from flask import Markup | |||
@@ -16,8 +15,11 @@ def tag(filename): | |||
:param filename: The filename of the asset to create a tag for. | |||
:type filename: str | |||
:return: A string containing a `<script>` tag for JS files, and a `<link>` | |||
for CSS files. | |||
:rtype: str | |||
""" | |||
file_ext = filename.split(".")[-1] | |||
@@ -1,13 +1,57 @@ | |||
__all__ = ["Codelet"] | |||
class Codelet(object): | |||
## object to store the following (it doesn't need to do anything with it): | |||
## author name, URL, date created/modified, language, source code itself | |||
## for VCS: project name, file in project | |||
## also: list of functions, etc (associations data) | |||
""" | |||
A source-code object with code metadata and composition analysis. | |||
## DICTIONARY MAPPING STRINGS REPRESENTING ASSOCIATION TYPE WITH DICTIONARIES | |||
## MAPPING ASSOCIATION NAMES WITH TUPLES REPRESENTING THEIR PLACE IN THE FILE | |||
## STORED AS TWO INTEGERS REPRESENTING THE ROW AND THE COLUMN | |||
:ivar name: (str) A suitable name for the codelet. | |||
:ivar code: (str) A string containing the raw source code. | |||
:ivar filename: (str, or None) The filename of the snippet. | |||
:ivar language: (int, or None) The inferred language of `code`. | |||
:ivar authors: (array of tuples (str, str or None)) An array of tuples | |||
containing an author's name and profile URL (on the service the code | |||
was pulled from). | |||
:ivar code_url: (str) The url of the (page containing the) source code. | |||
:ivar date_created: (:class:`datetime.datetime`, or None) The date the code | |||
was published. | |||
:ivar date_modified: (:class:`datetime.datetime`, or None) The date the | |||
code was last modified. | |||
:ivar rank: (float) A quantification of the source code's quality, as | |||
per available ratings (stars, forks, upvotes, etc.). | |||
""" | |||
## {"functions": {"foo": (12, 13), "bar": (53, 3)}} | |||
def __init__(self, name, code, filename, language, authors, code_url, | |||
date_created, date_modified, rank): | |||
""" | |||
Create a Codelet instance. | |||
:param name: see :attr:`self.name` | |||
:param code: see :attr:`self.code` | |||
:param filename: see :attr:`self.filename` | |||
:param language: see :attr:`self.language` | |||
:param authors: see :attr:`self.authors` | |||
:param code_url: see :attr:`self.code_url` | |||
:param date_created: see :attr:`self.date_created` | |||
:param date_modified: see :attr:`self.date_modified` | |||
:param rank: see :attr:`self.rank` | |||
:type name: see :attr:`self.name` | |||
:type code: see :attr:`self.code` | |||
:type filename: see :attr:`self.filename` | |||
:type language: see :attr:`self.language` | |||
:type authors: see :attr:`self.authors` | |||
:type code_url: see :attr:`self.code_url` | |||
:type date_created: see :attr:`self.date_created` | |||
:type date_modified: see :attr:`self.date_modified` | |||
:type rank: see :attr:`self.rank` | |||
""" | |||
self.name = name | |||
self.code = code | |||
self.filename = filename | |||
self.language = language | |||
self.authors = authors | |||
self.code_url = code_url | |||
self.date_created = date_created | |||
self.date_modified = date_modified | |||
self.rank = rank |
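For reference, a minimal usage sketch of the class above with hypothetical values (the constructor itself performs no validation or language inference):

# Illustrative only -- every value below is made up.
from datetime import datetime
from bitshift.codelet import Codelet

codelet = Codelet(
    name="user/repo: hello.py",
    code="print('Hello, world!')",
    filename="hello.py",
    language=None,                 # language detection is handled elsewhere
    authors=[("Jane Doe", None)],
    code_url="https://github.com/user/repo/blob/master/hello.py",
    date_created=datetime(2014, 4, 1),
    date_modified=datetime(2014, 4, 2),
    rank=0.5)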
@@ -0,0 +1,55 @@ | |||
""" | |||
:synopsis: Parent crawler module, which supervises all crawlers. | |||
Contains functions for initializing all subsidiary, threaded crawlers. | |||
""" | |||
import logging, logging.handlers, os, Queue | |||
from bitshift.crawler import crawler, indexer | |||
__all__ = ["crawl"] | |||
def crawl(): | |||
""" | |||
Initialize all crawlers (and indexers). | |||
Start the: | |||
1. GitHub crawler, :class:`crawler.GitHubCrawler`. | |||
2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`. | |||
3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`. | |||
""" | |||
_configure_logging() | |||
MAX_URL_QUEUE_SIZE = 5e3 | |||
repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) | |||
threads = [crawler.GitHubCrawler(repo_clone_queue), | |||
crawler.BitbucketCrawler(repo_clone_queue), | |||
indexer.GitIndexer(repo_clone_queue)] | |||
for thread in threads: | |||
thread.start() | |||
def _configure_logging(): | |||
LOG_FILE_DIR = "log" | |||
if not os.path.exists(LOG_FILE_DIR): | |||
os.mkdir(LOG_FILE_DIR) | |||
logging.getLogger("requests").setLevel(logging.WARNING) | |||
logging.getLogger("urllib3").setLevel(logging.WARNING) | |||
formatter = logging.Formatter( | |||
fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s" | |||
" %(message)s"), datefmt="%y-%m-%d %H:%M:%S") | |||
handler = logging.handlers.TimedRotatingFileHandler( | |||
"%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1, | |||
backupCount=20) | |||
handler.setFormatter(formatter) | |||
root_logger = logging.getLogger() | |||
root_logger.addHandler(handler) | |||
root_logger.setLevel(logging.NOTSET) |
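A usage sketch: the package is driven by a single call to `crawl()`, which configures logging and starts the crawler and indexer threads (assuming the package is importable and a writable `log` directory can be created):

# Minimal sketch: kick off all crawlers and indexers from a script.
from bitshift.crawler import crawl

if __name__ == "__main__":
    crawl()  # starts GitHubCrawler, BitbucketCrawler, and GitIndexer threads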
@@ -0,0 +1,240 @@ | |||
""" | |||
:synopsis: Main crawler module, to oversee all site-specific crawlers. | |||
Contains all website/framework-specific crawler classes. | |||
""" | |||
import logging, requests, time, threading | |||
from bitshift.crawler import indexer | |||
from ..codelet import Codelet | |||
from ..database import Database | |||
class GitHubCrawler(threading.Thread): | |||
""" | |||
Crawler that retrieves links to all of GitHub's public repositories. | |||
GitHubCrawler is a threaded singleton that queries GitHub's API for urls | |||
to its public repositories, which it inserts into a :class:`Queue.Queue` | |||
shared with :class:`indexer.GitIndexer`. | |||
:ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository` | |||
with repository metadata retrieved by :class:`GitHubCrawler`, and other Git | |||
crawlers, to be processed by :class:`indexer.GitIndexer`. | |||
:ivar _logger: (:class:`logging.Logger`) A class-specific logger object. | |||
""" | |||
AUTHENTICATION = { | |||
"client_id" : "436cb884ae09be7f2a4e", | |||
"client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" | |||
} | |||
def __init__(self, clone_queue): | |||
""" | |||
Create an instance of the singleton `GitHubCrawler`. | |||
:param clone_queue: see :attr:`self.clone_queue` | |||
:type clone_queue: see :attr:`self.clone_queue` | |||
""" | |||
self.clone_queue = clone_queue | |||
self._logger = logging.getLogger("%s.%s" % | |||
(__name__, self.__class__.__name__)) | |||
self._logger.info("Starting.") | |||
super(GitHubCrawler, self).__init__(name=self.__class__.__name__) | |||
def run(self): | |||
""" | |||
Query the GitHub API for data about every public repository. | |||
Pull all of GitHub's repositories by making calls to its API in a loop, | |||
accessing a subsequent page of results via the "next" URL returned in an | |||
API response header. Uses Severyn Kozak's (sevko) authentication | |||
credentials. For every new repository, a :class:`GitRepository` is | |||
inserted into :attr:`self.clone_queue`. | |||
""" | |||
next_api_url = "https://api.github.com/repositories" | |||
api_request_interval = 5e3 / 60 ** 2 | |||
while len(next_api_url) > 0: | |||
start_time = time.time() | |||
try: | |||
resp = requests.get(next_api_url, params=self.AUTHENTICATION) | |||
except requests.ConnectionError as excep: | |||
self._logger.warning("API %s call failed: %s: %s", | |||
next_api_url, excep.__class__.__name__, excep) | |||
time.sleep(0.5) | |||
continue | |||
queue_percent_full = (float(self.clone_queue.qsize()) / | |||
self.clone_queue.maxsize) * 100 | |||
self._logger.info("API call made. Queue size: %d/%d, %d%%." % | |||
((self.clone_queue.qsize(), self.clone_queue.maxsize, | |||
queue_percent_full))) | |||
repo_names = [repo["full_name"] for repo in resp.json()] | |||
repo_stars = self._get_repositories_stars(repo_names) | |||
for repo in resp.json(): | |||
while self.clone_queue.full(): | |||
time.sleep(1) | |||
self.clone_queue.put(indexer.GitRepository( | |||
repo["html_url"], repo["full_name"].replace("/", ""), | |||
"GitHub", repo_stars[repo["full_name"]])) | |||
if int(resp.headers["x-ratelimit-remaining"]) == 0: | |||
time.sleep(int(resp.headers["x-ratelimit-reset"]) - | |||
time.time()) | |||
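# The "link" header is assumed to follow GitHub's pagination format, e.g.
# '<https://api.github.com/repositories?since=1234>; rel="next", ...', so the
# next page's URL is the text between the leading "<" and the first ">".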
next_api_url = resp.headers["link"].split(">")[0][1:] | |||
sleep_time = api_request_interval - (time.time() - start_time) | |||
if sleep_time > 0: | |||
time.sleep(sleep_time) | |||
def _get_repositories_stars(self, repo_names): | |||
""" | |||
Return the number of stargazers for several repositories. | |||
Queries the GitHub API for the number of stargazers for any given | |||
repositories, and blocks if the query limit is exceeded. | |||
:param repo_names: An array of repository names, in | |||
`username/repository_name` format. | |||
:type repo_names: str array | |||
:return: A dictionary with repository name keys, and corresponding | |||
stargazer count values. | |||
Example dictionary: | |||
.. code-block:: python | |||
{ | |||
"user/repository" : 100 | |||
} | |||
:rtype: dictionary | |||
""" | |||
API_URL = "https://api.github.com/search/repositories" | |||
REPOS_PER_QUERY = 25 | |||
repo_stars = {} | |||
for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in | |||
xrange(0, len(repo_names), REPOS_PER_QUERY)]: | |||
query_url = "%s?q=%s" % (API_URL, | |||
"+".join("repo:%s" % name for name in names)) | |||
params = self.AUTHENTICATION | |||
resp = requests.get(query_url, | |||
params=params, | |||
headers={ | |||
"Accept" : "application/vnd.github.preview" | |||
}) | |||
if int(resp.headers["x-ratelimit-remaining"]) == 0: | |||
sleep_time = int(resp.headers["x-ratelimit-reset"]) - \ | |||
time.time() + 1 | |||
if sleep_time > 0: | |||
logging.info("API quota exceeded. Sleep time: %d." % | |||
sleep_time) | |||
time.sleep(sleep_time) | |||
for repo in resp.json()["items"]: | |||
rank = float(repo["stargazers_count"]) / 1000 | |||
repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0 | |||
for name in repo_names: | |||
if name not in repo_stars: | |||
repo_stars[name] = 0.5 | |||
return repo_stars | |||
class BitbucketCrawler(threading.Thread): | |||
""" | |||
Crawler that retrieves links to all of Bitbucket's public repositories. | |||
BitbucketCrawler is a threaded singleton that queries Bitbucket's API for | |||
urls to its public repositories, and inserts them as | |||
:class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with | |||
:class:`indexer.GitIndexer`. | |||
:ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert | |||
:class:`indexer.GitRepository` repository urls into. | |||
:ivar _logger: (:class:`logging.Logger`) A class-specific logger object. | |||
""" | |||
def __init__(self, clone_queue): | |||
""" | |||
Create an instance of the singleton `BitbucketCrawler`. | |||
:param clone_queue: see :attr:`self.clone_queue` | |||
:type clone_queue: see :attr:`self.clone_queue` | |||
""" | |||
self.clone_queue = clone_queue | |||
self._logger = logging.getLogger("%s.%s" % | |||
(__name__, self.__class__.__name__)) | |||
self._logger.info("Starting.") | |||
super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) | |||
def run(self): | |||
""" | |||
Query the Bitbucket API for data about every public repository. | |||
Query the Bitbucket API's "/repositories" endpoint and read its | |||
paginated responses in a loop; any "git" repositories have their | |||
clone-urls and names inserted into a :class:`indexer.GitRepository` in | |||
:attr:`self.clone_queue`. | |||
""" | |||
next_api_url = "https://api.bitbucket.org/2.0/repositories" | |||
while True: | |||
try: | |||
response = requests.get(next_api_url).json() | |||
except requests.ConnectionError as excep: | |||
time.sleep(0.5) | |||
self._logger.warning("API %s call failed: %s: %s", | |||
next_api_url, excep.__class__.__name__, excep) | |||
continue | |||
queue_percent_full = (float(self.clone_queue.qsize()) / | |||
self.clone_queue.maxsize) * 100 | |||
self._logger.info("API call made. Queue size: %d/%d, %d%%." % | |||
((self.clone_queue.qsize(), self.clone_queue.maxsize, | |||
queue_percent_full))) | |||
for repo in response["values"]: | |||
if repo["scm"] == "git": | |||
while self.clone_queue.full(): | |||
time.sleep(1) | |||
clone_links = repo["links"]["clone"] | |||
clone_url = (clone_links[0]["href"] if | |||
clone_links[0]["name"] == "https" else | |||
clone_links[1]["href"]) | |||
try: | |||
watchers = requests.get( | |||
repo["links"]["watchers"]["href"]) | |||
rank = len(watchers.json()["values"]) / 100.0 | |||
except requests.ConnectionError as excep: | |||
time.sleep(0.5) | |||
self._logger.warning("API %s call failed: %s: %s", | |||
next_api_url, excep.__class__.__name__, excep) | |||
continue | |||
self.clone_queue.put(indexer.GitRepository( | |||
clone_url, repo["full_name"], "Bitbucket", | |||
rank if rank < 1.0 else 1.0)) | |||
next_api_url = response["next"] | |||
time.sleep(0.2) |
@@ -0,0 +1,489 @@ | |||
""" | |||
:synopsis: Contains a singleton GitIndexer class, which clones and indexes git | |||
repositories. | |||
""" | |||
import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\ | |||
threading | |||
from ..database import Database | |||
from ..codelet import Codelet | |||
GIT_CLONE_DIR = "/tmp/bitshift" | |||
THREAD_QUEUE_SLEEP = 0.5 | |||
class GitRepository(object): | |||
""" | |||
A representation of a Git repository's metadata. | |||
:ivar url: (str) The repository's url. | |||
:ivar name: (str) The name of the repository. | |||
:ivar framework_name: (str) The name of the online Git framework that the | |||
repository belongs to (eg, GitHub, BitBucket). | |||
:ivar rank: (float) The rank of the repository, as assigned by | |||
:class:`crawler.GitHubCrawler`. | |||
""" | |||
def __init__(self, url, name, framework_name, rank): | |||
""" | |||
Create a GitRepository instance. | |||
:param url: see :attr:`GitRepository.url` | |||
:param name: see :attr:`GitRepository.name` | |||
:param framework_name: see :attr:`GitRepository.framework_name` | |||
:param rank: see :attr:`GitRepository.rank` | |||
:type url: str | |||
:type name: str | |||
:type framework_name: str | |||
:type rank: float | |||
""" | |||
self.url = url | |||
self.name = name | |||
self.framework_name = framework_name | |||
self.rank = rank | |||
class GitIndexer(threading.Thread): | |||
""" | |||
A singleton Git repository indexer. | |||
:class:`GitIndexer` indexes the repositories cloned by the | |||
:class:`_GitCloner` singleton. | |||
:ivar index_queue: (:class:`Queue.Queue`) A queue containing | |||
:class:`GitRepository` objects for every new repository successfully | |||
cloned by :class:`_GitCloner`, which are to be indexed. | |||
:ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner, | |||
which feeds :class:`GitIndexer`. | |||
:ivar _logger: (:class:`logging.Logger`) A class-specific logger object. | |||
""" | |||
def __init__(self, clone_queue): | |||
""" | |||
Create an instance of the singleton `GitIndexer`. | |||
:param clone_queue: see :attr:`self.index_queue` | |||
:type clone_queue: see :attr:`self.index_queue` | |||
""" | |||
MAX_INDEX_QUEUE_SIZE = 10 | |||
self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) | |||
self.git_cloner = _GitCloner(clone_queue, self.index_queue) | |||
self.git_cloner.start() | |||
self._logger = logging.getLogger("%s.%s" % | |||
(__name__, self.__class__.__name__)) | |||
self._logger.info("Starting.") | |||
if not os.path.exists(GIT_CLONE_DIR): | |||
os.makedirs(GIT_CLONE_DIR) | |||
super(GitIndexer, self).__init__(name=self.__class__.__name__) | |||
def run(self): | |||
""" | |||
Retrieve metadata about newly cloned repositories and index them. | |||
Blocks until new repositories appear in :attr:`self.index_queue`, then | |||
retrieves one, and attempts indexing it. Should any errors occur, the | |||
new repository will be discarded and the indexer will index the next in | |||
the queue. | |||
""" | |||
while True: | |||
while self.index_queue.empty(): | |||
time.sleep(THREAD_QUEUE_SLEEP) | |||
repo = self.index_queue.get() | |||
self.index_queue.task_done() | |||
try: | |||
self._index_repository(repo) | |||
except Exception as excep: | |||
self._logger.warning("%s: %s.", excep.__class__.__name__, excep) | |||
def _index_repository(self, repo): | |||
""" | |||
Clone and index (create and insert Codelets for) a Git repository. | |||
`git clone` the Git repository located at **repo.url**, call | |||
`_insert_repository_codelets()`, then remove said repository. | |||
:param repo: The metadata of the repository to be indexed. | |||
:type repo: :class:`GitRepository` | |||
""" | |||
with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)) as repository_dir: | |||
try: | |||
self._insert_repository_codelets(repo) | |||
except Exception as excep: | |||
self._logger.warning("%s: %s.", excep.__class__.__name__, excep) | |||
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): | |||
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) | |||
def _insert_repository_codelets(self, repo): | |||
""" | |||
Create and insert Codelets for the files inside a Git repository. | |||
Create a new Codelet, and insert it into the Database singleton, for | |||
every file inside the current working directory's default branch | |||
(usually *master*). | |||
:param repo: The metadata of the repository to be indexed. | |||
:type repo: :class:`GitRepository` | |||
""" | |||
commits_meta = self._get_commits_metadata() | |||
if commits_meta is None: | |||
return | |||
for filename in commits_meta.keys(): | |||
try: | |||
with open(filename) as source_file: | |||
source = self._decode(source_file.read()) | |||
if source is None: | |||
continue | |||
except IOError as exception: | |||
continue | |||
authors = [(self._decode(author), None) for author in \ | |||
commits_meta[filename]["authors"]] | |||
codelet = Codelet("%s:%s" % (repo.name, filename), source, filename, | |||
None, authors, self._generate_file_url(filename, | |||
repo.url, repo.framework_name), | |||
commits_meta[filename]["time_created"], | |||
commits_meta[filename]["time_last_modified"], | |||
repo.rank) | |||
def _generate_file_url(self, filename, repo_url, framework_name): | |||
""" | |||
Return a url for a filename from a Git wrapper framework. | |||
:param filename: The path of the file. | |||
:param repo_url: The url of the file's parent repository. | |||
:param framework_name: The name of the framework the repository is from. | |||
:type filename: str | |||
:type repo_url: str | |||
:type framework_name: str | |||
:return: The file's full url on the given framework, if successfully | |||
derived. | |||
:rtype: str, or None | |||
.. warning:: | |||
Various Git subprocesses will occasionally fail, and, seeing as the | |||
information they provide is a crucial component of some repository file | |||
urls, None may be returned. | |||
""" | |||
try: | |||
if framework_name == "GitHub": | |||
default_branch = subprocess.check_output("git branch" | |||
" --no-color", shell=True)[2:-1] | |||
return ("%s/blob/%s/%s" % (repo_url, default_branch, | |||
filename)).replace("//", "/") | |||
elif framework_name == "Bitbucket": | |||
commit_hash = subprocess.check_output("git rev-parse HEAD", | |||
shell=True).replace("\n", "") | |||
return ("%s/src/%s/%s" % (repo_url, commit_hash, | |||
filename)).replace("//", "/") | |||
except subprocess.CalledProcessError as exception: | |||
return None | |||
def _get_git_commits(self): | |||
""" | |||
Return the current working directory's formatted commit data. | |||
Uses `git log` to generate metadata about every single file in the | |||
repository's commit history. | |||
:return: The author, timestamp, and names of all modified files of every | |||
commit. | |||
.. code-block:: python | |||
sample_returned_array = [ | |||
{ | |||
"author" : (str) "author" | |||
"timestamp" : (`datetime.datetime`) <object>, | |||
"filenames" : (str array) ["file1", "file2"] | |||
} | |||
] | |||
:rtype: array of dictionaries | |||
""" | |||
git_log = subprocess.check_output(("git --no-pager log --name-only" | |||
" --pretty=format:'%n%n%an%n%at' -z"), shell=True) | |||
commits = [] | |||
for commit in git_log.split("\n\n"): | |||
fields = commit.split("\n") | |||
if len(fields) > 2: | |||
commits.append({ | |||
"author" : fields[0], | |||
"timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), | |||
"filenames" : fields[2].split("\x00")[:-2] | |||
}) | |||
return commits | |||
def _get_tracked_files(self): | |||
""" | |||
Return a list of the filenames of all valuable files in the Git repository. | |||
Get a list of the filenames of the non-binary (Perl heuristics used for | |||
filetype identification) files currently inside the current working | |||
directory's Git repository. Then, weed out any boilerplate/non-code files | |||
that match the regex rules in GIT_IGNORE_FILES. | |||
:return: The filenames of all index-worthy non-binary files. | |||
:rtype: str array | |||
""" | |||
files = [] | |||
for dirname, subdir_names, filenames in os.walk("."): | |||
for filename in filenames: | |||
path = os.path.join(dirname, filename) | |||
if self._is_ascii(path): | |||
files.append(path[2:]) | |||
return files | |||
def _get_commits_metadata(self): | |||
""" | |||
Return a dictionary containing every valuable tracked file's metadata. | |||
:return: A dictionary with author names, time of creation, and time of last | |||
modification for every filename key. | |||
.. code-block:: python | |||
sample_returned_dict = { | |||
"my_file" : { | |||
"authors" : (str array) ["author1", "author2"], | |||
"time_created" : (`datetime.datetime`) <object>, | |||
"time_last_modified" : (`datetime.datetime`) <object> | |||
} | |||
} | |||
:rtype: dictionary of dictionaries | |||
""" | |||
commits = self._get_git_commits() | |||
tracked_files = self._get_tracked_files() | |||
files_meta = {} | |||
for commit in commits: | |||
for filename in commit["filenames"]: | |||
if filename not in tracked_files: | |||
continue | |||
if filename not in files_meta.keys(): | |||
files_meta[filename] = { | |||
"authors" : [commit["author"]], | |||
"time_last_modified" : commit["timestamp"], | |||
"time_created" : commit["timestamp"] | |||
} | |||
else: | |||
if commit["author"] not in files_meta[filename]["authors"]: | |||
files_meta[filename]["authors"].append(commit["author"]) | |||
files_meta[filename]["time_created"] = commit["timestamp"] | |||
return files_meta | |||
def _decode(self, raw): | |||
""" | |||
Return a decoded raw string. | |||
:param raw: The string to decode. | |||
:type raw: (str) | |||
:return: If the original encoding is successfully inferred, return the | |||
decoded string. | |||
:rtype: str, or None | |||
.. warning:: | |||
The raw string's original encoding is identified by heuristics which | |||
can, and occasionally will, fail. Decoding will then fail, and None | |||
will be returned. | |||
""" | |||
try: | |||
encoding = bs4.BeautifulSoup(raw).original_encoding | |||
return raw.decode(encoding) if encoding is not None else None | |||
except (LookupError, UnicodeDecodeError, UserWarning) as exception: | |||
return None | |||
def _is_ascii(self, filename): | |||
""" | |||
Heuristically determine whether a file is ASCII text or binary. | |||
If a portion of the file contains null bytes, or the percentage of bytes | |||
that aren't ASCII is greater than 30%, then the file is concluded to be | |||
binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T` | |||
operator, and is the de facto method for determining whether a | |||
file is ASCII. | |||
:param filename: The path of the file to test. | |||
:type filename: str | |||
:return: Whether the file is probably ASCII. | |||
:rtype: Boolean | |||
""" | |||
try: | |||
with open(filename) as source: | |||
file_snippet = source.read(512) | |||
if not file_snippet: | |||
return True | |||
ascii_characters = "".join(map(chr, range(32, 127)) + | |||
list("\n\r\t\b")) | |||
null_trans = string.maketrans("", "") | |||
if "\0" in file_snippet: | |||
return False | |||
non_ascii = file_snippet.translate(null_trans, ascii_characters) | |||
return not float(len(non_ascii)) / len(file_snippet) > 0.30 | |||
except IOError as exception: | |||
return False | |||
class _GitCloner(threading.Thread): | |||
""" | |||
A singleton Git repository cloner. | |||
Clones the repositories crawled by :class:`crawler.GitHubCrawler` for | |||
:class:`GitIndexer` to index. | |||
:ivar clone_queue: (:class:`Queue.Queue`) see | |||
:attr:`crawler.GitHubCrawler.clone_queue`. | |||
:ivar index_queue: (:class:`Queue.Queue`) see | |||
:attr:`GitIndexer.index_queue`. | |||
:ivar _logger: (:class:`logging.Logger`) A class-specific logger object. | |||
""" | |||
def __init__(self, clone_queue, index_queue): | |||
""" | |||
Create an instance of the singleton :class:`_GitCloner`. | |||
:param clone_queue: see :attr:`self.clone_queue` | |||
:param index_queue: see :attr:`self.index_queue` | |||
:type clone_queue: see :attr:`self.clone_queue` | |||
:type index_queue: see :attr:`self.index_queue` | |||
""" | |||
self.clone_queue = clone_queue | |||
self.index_queue = index_queue | |||
self._logger = logging.getLogger("%s.%s" % | |||
(__name__, self.__class__.__name__)) | |||
self._logger.info("Starting.") | |||
super(_GitCloner, self).__init__(name=self.__class__.__name__) | |||
def run(self): | |||
""" | |||
Retrieve metadata about newly crawled repositories and clone them. | |||
Blocks until new :class:`GitRepository` objects appear in | |||
:attr:`self.clone_queue`, then attempts cloning them. If | |||
successful, the cloned repository is added to :attr:`self.index_queue` | |||
for the `GitIndexer` to index; otherwise, it is discarded. | |||
""" | |||
while True: | |||
while self.clone_queue.empty(): | |||
time.sleep(THREAD_QUEUE_SLEEP) | |||
repo = self.clone_queue.get() | |||
self.clone_queue.task_done() | |||
try: | |||
self._clone_repository(repo) | |||
except Exception as exception: | |||
pass | |||
def _clone_repository(self, repo): | |||
""" | |||
Attempt cloning a Git repository. | |||
:param repo: Metadata about the repository to clone. | |||
:type repo: :class:`GitRepository` | |||
""" | |||
GIT_CLONE_TIMEOUT = 500 | |||
queue_percent_full = (float(self.index_queue.qsize()) / | |||
self.index_queue.maxsize) * 100 | |||
exit_code = None | |||
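# The perl one-liner below acts as a timeout wrapper: it sets an alarm for
# GIT_CLONE_TIMEOUT seconds and then exec's `git clone`; if the alarm fires
# before the clone finishes, the trailing `pkill -f git` cleans up the
# leftover git processes.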
command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone" | |||
" --single-branch %s %s/%s || pkill -f git") | |||
command_attempt = 0 | |||
while exit_code is None: | |||
try: | |||
exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT, | |||
repo.url, GIT_CLONE_DIR, repo.name), shell=True) | |||
except Exception as exception: | |||
time.sleep(1) | |||
command_attempt += 1 | |||
if command_attempt == 20: | |||
break | |||
else: | |||
continue | |||
else: | |||
break | |||
if exit_code != 0: | |||
if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): | |||
shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) | |||
return | |||
while self.index_queue.full(): | |||
time.sleep(THREAD_QUEUE_SLEEP) | |||
self.index_queue.put(repo) | |||
class _ChangeDir(object): | |||
""" | |||
A wrapper class for os.chdir(), to map onto `with` and handle exceptions. | |||
:ivar new_path: (str) The path to change the current directory to. | |||
:ivar old_path: (str) The path of the directory to return to. | |||
""" | |||
def __init__(self, new_path): | |||
""" | |||
Create a _ChangeDir instance. | |||
:param new_path: The directory to enter. | |||
:type new_path: str | |||
""" | |||
self.new_path = new_path | |||
def __enter__(self): | |||
""" | |||
Change the current working-directory to **new_path**. | |||
""" | |||
self.old_path = os.getcwd() | |||
os.chdir(self.new_path) | |||
def __exit__(self, *exception): | |||
""" | |||
Change the current working-directory to **old_path**. | |||
:param exception: Various exception arguments passed by `with`. | |||
:type exception: varargs | |||
""" | |||
os.chdir(self.old_path) |
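For illustration, `_ChangeDir` is intended to be used as a context manager around per-repository work (the path below is hypothetical):

# Sketch: temporarily work inside a cloned repository's directory.
with _ChangeDir("/tmp/bitshift/some_repo"):
    pass  # e.g. run `git log` or read tracked files relative to the repo root
# the previous working directory is restored here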
@@ -1,18 +0,0 @@ | |||
""" | |||
Module with classes and functions to handle communication with the MySQL | |||
database backend, which manages the search index. | |||
""" | |||
import oursql | |||
class Database(object): | |||
"""Represents the MySQL database.""" | |||
def __init__(self): | |||
pass | |||
def _connect(self): | |||
pass | |||
def _create(self): | |||
pass |
@@ -0,0 +1,153 @@ | |||
""" | |||
Subpackage with classes and functions to handle communication with the MySQL | |||
database backend, which manages the search index. | |||
""" | |||
import os | |||
import mmh3 | |||
import oursql | |||
from .migration import VERSION, MIGRATIONS | |||
__all__ = ["Database"] | |||
class Database(object): | |||
"""Represents the MySQL database.""" | |||
def __init__(self, migrate=False): | |||
self._conn = self._connect() | |||
self._check_version(migrate) | |||
def _connect(self): | |||
"""Establish a connection to the database.""" | |||
root = os.path.dirname(os.path.abspath(__file__)) | |||
default_file = os.path.join(root, ".my.cnf") | |||
return oursql.connect(db="bitshift", read_default_file=default_file, | |||
autoping=True, autoreconnect=True) | |||
def _migrate(self, cursor, current): | |||
"""Migrate the database to the latest schema version.""" | |||
for version in xrange(current, VERSION): | |||
print "Migrating to %d..." % version + 1 | |||
for query in MIGRATIONS[version - 1]: | |||
cursor.execute(query) | |||
cursor.execute("UPDATE version SET version = ?", (version + 1,)) | |||
def _check_version(self, migrate): | |||
"""Check the database schema version and respond accordingly. | |||
If the schema is out of date, migrate if *migrate* is True, else raise | |||
an exception. | |||
""" | |||
with self._conn.cursor() as cursor: | |||
cursor.execute("SELECT version FROM version") | |||
version = cursor.fetchone()[0] | |||
if version < VERSION: | |||
if migrate: | |||
self._migrate(cursor, version) | |||
else: | |||
err = "Database schema out of date. " \ | |||
"Run `python -m bitshift.database.migration`." | |||
raise RuntimeError(err) | |||
def _get_codelets_from_ids(self, cursor, ids): | |||
"""Return a list of Codelet objects given a list of codelet IDs.""" | |||
raise NotImplementedError() ## TODO | |||
def _decompose_url(self, cursor, url): | |||
"""Break up a URL into an origin (with a URL base) and a suffix.""" | |||
query = """SELECT origin_id, SUBSTR(?, LENGTH(origin_url_base)) | |||
FROM origins | |||
WHERE origin_url_base IS NOT NULL | |||
AND ? LIKE CONCAT(origin_url_base, "%")""" | |||
cursor.execute(query, (url, url)) | |||
result = cursor.fetchone() | |||
return result if result else (1, url) | |||
def _insert_symbols(self, cursor, code_id, sym_type, symbols): | |||
"""Insert a list of symbols of a given type into the database.""" | |||
sym_types = ["functions", "classes", "variables"] | |||
query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)" | |||
query2 = """INSERT INTO symbol_locations VALUES | |||
(DEFAULT, ?, ?, ?, ?, ?, ?)""" | |||
for (name, decls, uses) in symbols: | |||
cursor.execute(query1, (code_id, sym_types.index(sym_type), name)) | |||
sym_id = cursor.lastrowid | |||
params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] + | |||
[tuple([sym_id, 1] + list(loc)) for loc in uses]) | |||
cursor.executemany(query2, params) | |||
def close(self): | |||
"""Disconnect from the database.""" | |||
self._conn.close() | |||
def search(self, query, page=1): | |||
""" | |||
Search the database for a query and return the *n*\ th page of results. | |||
:param query: The query to search for. | |||
:type query: :py:class:`~.query.tree.Tree` | |||
:param page: The result page to display. | |||
:type page: int | |||
:return: The total number of results, and the *n*\ th page of results. | |||
:rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s) | |||
""" | |||
query1 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp | |||
FROM cache | |||
INNER JOIN cache_data ON cache_id = cdata_cache | |||
WHERE cache_id = ?""" | |||
query2 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)" | |||
query3 = "INSERT INTO cache_data VALUES (?, ?)" | |||
cache_id = mmh3.hash64(str(page) + ":" + query.serialize())[0] | |||
with self._conn.cursor() as cursor: | |||
cursor.execute(query1, (cache_id,)) | |||
results = cursor.fetchall() | |||
if results: # Cache hit | |||
num_results = results[0][1] * (10 ** results[0][2]) | |||
ids = [res[0] for res in results] | |||
else: # Cache miss | |||
## TODO: build and execute search query | |||
results = cursor.fetchall() | |||
ids = NotImplemented ## TODO: extract ids from results | |||
num_results = NotImplemented ## TODO: num if results else 0 | |||
num_exp = max(len(str(num_results)) - 3, 0) | |||
num_results = int(round(num_results, -num_exp)) | |||
num_mnt = num_results / (10 ** num_exp) | |||
cursor.execute(query2, (cache_id, num_mnt, num_exp)) | |||
cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) | |||
return (num_results, self._get_codelets_from_ids(cursor, ids)) | |||
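To make the cache arithmetic above concrete, the result count is stored as a rounded mantissa/exponent pair; a small worked example with an illustrative count:

# Worked example: encode a result count, as done on a cache miss.
num_results = 123456
num_exp = max(len(str(num_results)) - 3, 0)                   # 3
num_mnt = int(round(num_results, -num_exp)) / 10 ** num_exp   # 123
# A later cache hit reconstructs the (rounded) count:
assert num_mnt * 10 ** num_exp == 123000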
def insert(self, codelet): | |||
""" | |||
Insert a codelet into the database. | |||
:param codelet: The codelet to insert. | |||
:type codelet: :py:class:`.Codelet` | |||
""" | |||
query1 = """INSERT INTO code VALUES (?, ?, ?) | |||
ON DUPLICATE KEY UPDATE code_id=code_id""" | |||
query2 = """INSERT INTO codelets VALUES | |||
(DEFAULT, ?, ?, ?, ?, ?, ?, ?)""" | |||
query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)" | |||
hash_key = str(codelet.language) + ":" + codelet.code.encode("utf8") | |||
code_id = mmh3.hash64(hash_key)[0] | |||
with self._conn.cursor() as cursor: | |||
cursor.execute(query1, (code_id, codelet.language, codelet.code)) | |||
if cursor.rowcount == 1: | |||
for sym_type, symbols in codelet.symbols.iteritems(): | |||
self._insert_symbols(cursor, code_id, sym_type, symbols) | |||
origin, url = self._decompose_url(cursor, codelet.url) | |||
cursor.execute(query2, (codelet.name, code_id, origin, url, | |||
codelet.rank, codelet.date_created, | |||
codelet.date_modified)) | |||
codelet_id = cursor.lastrowid | |||
authors = [(codelet_id, a[0], a[1]) for a in codelet.authors] | |||
cursor.executemany(query3, authors) |
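A usage sketch of the class, assuming a parsed query tree and a populated Codelet are available (parts of the search path above are still stubbed out):

# Minimal sketch of the intended workflow; `tree` and `codelet` are assumed
# to come from bitshift.query.parse_query() and the crawler, respectively.
from bitshift.database import Database

db = Database(migrate=False)      # reads connection settings from .my.cnf
db.insert(codelet)
total, codelets = db.search(tree, page=1)
db.close()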
@@ -0,0 +1,97 @@ | |||
""" | |||
Contains information about database schema versions, and SQL queries to update | |||
between them. | |||
""" | |||
VERSION = 6 | |||
MIGRATIONS = [ | |||
# 1 -> 2 | |||
[ | |||
"""ALTER TABLE `codelets` | |||
DROP FOREIGN KEY `codelets_ibfk_1`""", | |||
"""ALTER TABLE `code` | |||
DROP KEY `code_hash`, | |||
DROP COLUMN `code_hash`, | |||
MODIFY COLUMN `code_id` BIGINT NOT NULL""", | |||
"""ALTER TABLE `codelets` | |||
MODIFY COLUMN `codelet_code_id` BIGINT NOT NULL, | |||
ADD KEY (`codelet_lang`), | |||
ADD CONSTRAINT `codelets_ibfk_1` FOREIGN KEY (`codelet_code_id`) | |||
REFERENCES `code` (`code_id`) | |||
ON DELETE RESTRICT ON UPDATE CASCADE""", | |||
"""ALTER TABLE `symbols` | |||
ADD COLUMN `symbol_end_row` INT UNSIGNED NOT NULL, | |||
ADD COLUMN `symbol_end_col` INT UNSIGNED NOT NULL""" | |||
], | |||
# 2 -> 3 | |||
[ | |||
"""ALTER TABLE `symbols` | |||
DROP FOREIGN KEY `symbols_ibfk_1`, | |||
CHANGE COLUMN `symbol_codelet` `symbol_code` BIGINT NOT NULL, | |||
ADD CONSTRAINT `symbols_ibfk_1` FOREIGN KEY (`symbol_code`) | |||
REFERENCES `code` (`code_id`) | |||
ON DELETE CASCADE ON UPDATE CASCADE""" | |||
], | |||
# 3 -> 4 | |||
[ | |||
"""ALTER TABLE `symbols` | |||
DROP COLUMN `symbol_row`, | |||
DROP COLUMN `symbol_col`, | |||
DROP COLUMN `symbol_end_row`, | |||
DROP COLUMN `symbol_end_col`""", | |||
"""CREATE TABLE `symbol_locations` ( | |||
`sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, | |||
`sloc_symbol` BIGINT UNSIGNED NOT NULL, | |||
`sloc_type` TINYINT UNSIGNED NOT NULL, | |||
`sloc_row` INT UNSIGNED NOT NULL, | |||
`sloc_col` INT UNSIGNED NOT NULL, | |||
`sloc_end_row` INT UNSIGNED NOT NULL, | |||
`sloc_end_col` INT UNSIGNED NOT NULL, | |||
PRIMARY KEY (`sloc_id`), | |||
FOREIGN KEY (`sloc_symbol`) | |||
REFERENCES `symbols` (`symbol_id`) | |||
ON DELETE CASCADE ON UPDATE CASCADE | |||
) ENGINE=InnoDB""" | |||
], | |||
# 4 -> 5 | |||
[ | |||
"""ALTER TABLE `origins` | |||
MODIFY COLUMN `origin_name` VARCHAR(64) DEFAULT NULL, | |||
MODIFY COLUMN `origin_url` VARCHAR(512) DEFAULT NULL, | |||
MODIFY COLUMN `origin_url_base` VARCHAR(512) DEFAULT NULL""" | |||
], | |||
# 5 -> 6 | |||
[ | |||
"""ALTER TABLE `code` | |||
ADD COLUMN `code_lang` SMALLINT UNSIGNED DEFAULT NULL | |||
AFTER `code_id`, | |||
ADD KEY (`code_lang`)""", | |||
"""ALTER TABLE `codelets` | |||
DROP KEY `codelet_lang`, | |||
DROP COLUMN `codelet_lang`""", | |||
"""ALTER TABLE `cache_data` | |||
DROP FOREIGN KEY `cache_data_ibfk_1`""", | |||
"""ALTER TABLE `cache` | |||
MODIFY COLUMN `cache_id` BIGINT NOT NULL, | |||
DROP COLUMN `cache_hash`, | |||
DROP COLUMN `cache_last_used`, | |||
MODIFY COLUMN `cache_count_mnt` SMALLINT UNSIGNED NOT NULL""", | |||
"""ALTER TABLE `cache_data` | |||
MODIFY COLUMN `cdata_cache` BIGINT NOT NULL, | |||
ADD PRIMARY KEY (`cdata_cache`, `cdata_codelet`), | |||
ADD CONSTRAINT `cache_data_ibfk_1` FOREIGN KEY (`cdata_codelet`) | |||
REFERENCES `codelets` (`codelet_id`) | |||
ON DELETE CASCADE ON UPDATE CASCADE""", | |||
"""CREATE EVENT `flush_cache` | |||
ON SCHEDULE EVERY 1 HOUR | |||
DO | |||
DELETE FROM `cache` | |||
WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);""" | |||
] | |||
] | |||
if __name__ == "__main__": | |||
from . import Database | |||
Database(migrate=True).close() |
@@ -0,0 +1,114 @@ | |||
-- Schema version 6 | |||
CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; | |||
USE `bitshift`; | |||
CREATE TABLE `version` ( | |||
`version` INT UNSIGNED NOT NULL | |||
) ENGINE=InnoDB; | |||
INSERT INTO `version` VALUES (6); | |||
CREATE TABLE `origins` ( | |||
`origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, | |||
`origin_name` VARCHAR(64) DEFAULT NULL, | |||
`origin_url` VARCHAR(512) DEFAULT NULL, | |||
`origin_url_base` VARCHAR(512) DEFAULT NULL, | |||
`origin_image` BLOB DEFAULT NULL, | |||
PRIMARY KEY (`origin_id`) | |||
) ENGINE=InnoDB; | |||
INSERT INTO `origins` VALUES (1, NULL, NULL, NULL, NULL); | |||
CREATE TABLE `code` ( | |||
`code_id` BIGINT NOT NULL, | |||
`code_lang` SMALLINT UNSIGNED DEFAULT NULL, | |||
`code_code` MEDIUMTEXT NOT NULL, | |||
PRIMARY KEY (`code_id`), | |||
KEY (`code_lang`), | |||
FULLTEXT KEY (`code_code`) | |||
) ENGINE=InnoDB; | |||
CREATE TABLE `codelets` ( | |||
`codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, | |||
`codelet_name` VARCHAR(300) NOT NULL, | |||
`codelet_code_id` BIGINT NOT NULL, | |||
`codelet_origin` TINYINT UNSIGNED NOT NULL, | |||
`codelet_url` VARCHAR(512) NOT NULL, | |||
`codelet_rank` FLOAT NOT NULL, | |||
`codelet_date_created` DATETIME DEFAULT NULL, | |||
`codelet_date_modified` DATETIME DEFAULT NULL, | |||
PRIMARY KEY (`codelet_id`), | |||
FULLTEXT KEY (`codelet_name`), | |||
KEY (`codelet_rank`), | |||
KEY (`codelet_date_created`), | |||
KEY (`codelet_date_modified`), | |||
FOREIGN KEY (`codelet_code_id`) | |||
REFERENCES `code` (`code_id`) | |||
ON DELETE RESTRICT ON UPDATE CASCADE, | |||
FOREIGN KEY (`codelet_origin`) | |||
REFERENCES `origins` (`origin_id`) | |||
ON DELETE RESTRICT ON UPDATE CASCADE | |||
) ENGINE=InnoDB; | |||
CREATE TABLE `authors` ( | |||
`author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, | |||
`author_codelet` BIGINT UNSIGNED NOT NULL, | |||
`author_name` VARCHAR(128) NOT NULL, | |||
`author_url` VARCHAR(512) DEFAULT NULL, | |||
PRIMARY KEY (`author_id`), | |||
FULLTEXT KEY (`author_name`), | |||
FOREIGN KEY (`author_codelet`) | |||
REFERENCES `codelets` (`codelet_id`) | |||
ON DELETE CASCADE ON UPDATE CASCADE | |||
) ENGINE=InnoDB; | |||
CREATE TABLE `symbols` ( | |||
`symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, | |||
`symbol_code` BIGINT NOT NULL, | |||
`symbol_type` TINYINT UNSIGNED NOT NULL, | |||
`symbol_name` VARCHAR(512) NOT NULL, | |||
PRIMARY KEY (`symbol_id`), | |||
KEY (`symbol_type`, `symbol_name`(32)), | |||
FOREIGN KEY (`symbol_code`) | |||
REFERENCES `code` (`code_id`) | |||
ON DELETE CASCADE ON UPDATE CASCADE | |||
) ENGINE=InnoDB; | |||
CREATE TABLE `symbol_locations` ( | |||
`sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, | |||
`sloc_symbol` BIGINT UNSIGNED NOT NULL, | |||
`sloc_type` TINYINT UNSIGNED NOT NULL, | |||
`sloc_row` INT UNSIGNED NOT NULL, | |||
`sloc_col` INT UNSIGNED NOT NULL, | |||
`sloc_end_row` INT UNSIGNED NOT NULL, | |||
`sloc_end_col` INT UNSIGNED NOT NULL, | |||
PRIMARY KEY (`sloc_id`), | |||
FOREIGN KEY (`sloc_symbol`) | |||
REFERENCES `symbols` (`symbol_id`) | |||
ON DELETE CASCADE ON UPDATE CASCADE | |||
) ENGINE=InnoDB; | |||
CREATE TABLE `cache` ( | |||
`cache_id` BIGINT NOT NULL, | |||
`cache_count_mnt` SMALLINT UNSIGNED NOT NULL, | |||
`cache_count_exp` TINYINT UNSIGNED NOT NULL, | |||
`cache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, | |||
PRIMARY KEY (`cache_id`) | |||
) ENGINE=InnoDB; | |||
CREATE TABLE `cache_data` ( | |||
`cdata_cache` BIGINT NOT NULL, | |||
`cdata_codelet` BIGINT UNSIGNED NOT NULL, | |||
PRIMARY KEY (`cdata_cache`, `cdata_codelet`), | |||
FOREIGN KEY (`cdata_cache`) | |||
REFERENCES `cache` (`cache_id`) | |||
ON DELETE CASCADE ON UPDATE CASCADE, | |||
FOREIGN KEY (`cdata_codelet`) | |||
REFERENCES `codelets` (`codelet_id`) | |||
ON DELETE CASCADE ON UPDATE CASCADE | |||
) ENGINE=InnoDB; | |||
CREATE EVENT `flush_cache` | |||
ON SCHEDULE EVERY 1 HOUR | |||
DO | |||
DELETE FROM `cache` | |||
WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY); |
@@ -22,4 +22,6 @@ def parse_query(query): | |||
# gets a string, returns a Tree | |||
# TODO: note: resultant Trees should be normalized so that "foo OR bar" | |||
# and "bar OR foo" result in equivalent trees | |||
pass |
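One way to satisfy the normalization note above is to sort the children of commutative operators after parsing; a rough sketch, where the `Or`/`And` node classes and their `children`/`serialize()` members are hypothetical and not yet part of this module:

# Hypothetical sketch: canonicalize commutative nodes so that "foo OR bar"
# and "bar OR foo" produce equivalent trees.
def _normalize(node):
    if isinstance(node, (And, Or)):
        node.children = sorted((_normalize(child) for child in node.children),
                               key=lambda child: child.serialize())
    return node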
@@ -0,0 +1,11 @@ | |||
query Package | |||
============= | |||
:mod:`query` Package | |||
-------------------- | |||
.. automodule:: bitshift.query | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
@@ -1,30 +1,51 @@ | |||
bitshift package | |||
bitshift Package | |||
================ | |||
Submodules | |||
:mod:`bitshift` Package | |||
----------------------- | |||
bitshift.assets module | |||
.. automodule:: bitshift.__init__ | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
:mod:`assets` Module | |||
-------------------- | |||
.. automodule:: bitshift.assets | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
bitshift.config module | |||
:mod:`codelet` Module | |||
--------------------- | |||
.. automodule:: bitshift.config | |||
.. automodule:: bitshift.codelet | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
:mod:`config` Module | |||
-------------------- | |||
Module contents | |||
.. automodule:: bitshift.config | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
:mod:`database` Module | |||
---------------------- | |||
.. automodule:: bitshift | |||
.. automodule:: bitshift.database | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
Subpackages | |||
----------- | |||
.. toctree:: | |||
bitshift.parser | |||
bitshift.query | |||
@@ -4,7 +4,9 @@ setup( | |||
name = "bitshift", | |||
version = "0.1", | |||
packages = find_packages(), | |||
install_requires = ["Flask>=0.10.1", "pygments>=1.6"], | |||
install_requires = [ | |||
"Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", | |||
"beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"], | |||
author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", | |||
license = "MIT", | |||
url = "https://github.com/earwig/bitshift" | |||