diff --git a/.gitignore b/.gitignore index 6a014f5..7e00121 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .sass-cache .DS_Store +.my.cnf # github premade rules *.py[cod] diff --git a/README.md b/README.md index 3cb81a1..8ca31d7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ bitshift ======== -bitshift is a semantic search engine for source code. +bitshift is a semantic search engine for source code developed by Benjamin +Attal, Ben Kurtovic, and Severyn Kozak. Branches -------- @@ -13,6 +14,11 @@ Branches - `feature/*`: individual components of the project with untested, likely horribly broken code - branch off from and merge into `develop` when done +Style +----- +bitshift uses [SASS][SASS] for styling; compile the stylesheets to CSS with +`sass --watch static/sass/:static/css`. + Documentation ------------- @@ -24,3 +30,5 @@ new modules or packages, but *not* when adding functions or changing docstrings), run `sphinx-apidoc -fo docs/source/api bitshift` from the project root. Note that this will revert any custom changes made to the files in `docs/source/api`, so you might want to update them by hand instead. + +[SASS]: http://sass-lang.com/guide diff --git a/app.py b/app.py index c4083c9..b5e8b57 100644 --- a/app.py +++ b/app.py @@ -5,6 +5,8 @@ Module to contain all the project's Flask server plumbing. from flask import Flask from flask import render_template, session +from bitshift import assets +from bitshift.database import Database from bitshift.query import parse_query app = Flask(__name__) @@ -12,7 +14,9 @@ app.config.from_object("bitshift.config") app_env = app.jinja_env app_env.line_statement_prefix = "=" -app_env.globals.update(assets = assets) +app_env.globals.update(assets=assets) + +database = Database() @app.route("/") def index(): @@ -20,8 +24,8 @@ def index(): @app.route("/search/") def search(query): - ## tree = parse_query(query) - ## database.search(tree) + tree = parse_query(query) + database.search(tree) pass if __name__ == "__main__": diff --git a/bitshift/__init__.py b/bitshift/__init__.py index 9a18c9b..78ca5e9 100644 --- a/bitshift/__init__.py +++ b/bitshift/__init__.py @@ -1 +1 @@ -from . import assets, codelet, config, database, parser, query +from . import assets, codelet, config, database, parser, query, crawler diff --git a/bitshift/assets.py b/bitshift/assets.py index 90564d2..b4f597b 100644 --- a/bitshift/assets.py +++ b/bitshift/assets.py @@ -1,6 +1,5 @@ """ -.. module:: assets - :synopsis: Helper functions for use inside the project's Jinja templates. +:synopsis: Helper functions for use inside the project's Jinja templates. """ from flask import Markup @@ -16,8 +15,11 @@ def tag(filename): :param filename: The filename of the asset to create a tag for. + :type filename: str + :return: A string containing a `` tag for JS files, and a `` for CSS files. + :rtype: str """ file_ext = filename.split(".")[-1] diff --git a/bitshift/codelet.py b/bitshift/codelet.py index df81294..453ace0 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -1,13 +1,57 @@ __all__ = ["Codelet"] class Codelet(object): - ## object to store the following (it doesn't need to do anything with it): - ## author name, URL, date created/modified, language, source code itself - ## for VCS: project name, file in project - ## also: list of functions, etc (associations data) + """ + A source-code object with code metadata and composition analysis. 
-    ## DICTIONARY MAPPING STRINGS REPRESENTING ASSOCIATION TYPE WITH DICTIONARIES
-    ## MAPPING ASSOCIATION NAMES WITH TUPLES REPRESENTING THEIR PLACE IN THE FILE
-    ## STORED AS TWO INTEGERS REPRESENTING THE ROW AND THE COLUMN
+    :ivar name: (str) A suitable name for the codelet.
+    :ivar code: (str) A string containing the raw source code.
+    :ivar filename: (str, or None) The filename of the snippet.
+    :ivar language: (int, or None) The inferred language of `code`.
+    :ivar authors: (array of tuples (str, str or None)) An array of tuples
+        containing an author's name and profile URL (on the service the code
+        was pulled from).
+    :ivar code_url: (str) The url of the (page containing the) source code.
+    :ivar date_created: (:class:`datetime.datetime`, or None) The date the code
+        was published.
+    :ivar date_modified: (:class:`datetime.datetime`, or None) The date the
+        code was last modified.
+    :ivar rank: (float) A quantification of the source code's quality, as
+        per available ratings (stars, forks, upvotes, etc.).
+    """

-    ## {"functions": {"foo": (12, 13), "bar": (53, 3)}}
+    def __init__(self, name, code, filename, language, authors, code_url,
+                 date_created, date_modified, rank):
+        """
+        Create a Codelet instance.
+
+        :param name: see :attr:`self.name`
+        :param code: see :attr:`self.code`
+        :param filename: see :attr:`self.filename`
+        :param language: see :attr:`self.language`
+        :param authors: see :attr:`self.authors`
+        :param code_url: see :attr:`self.code_url`
+        :param date_created: see :attr:`self.date_created`
+        :param date_modified: see :attr:`self.date_modified`
+        :param rank: see :attr:`self.rank`
+
+        :type name: see :attr:`self.name`
+        :type code: see :attr:`self.code`
+        :type filename: see :attr:`self.filename`
+        :type language: see :attr:`self.language`
+        :type authors: see :attr:`self.authors`
+        :type code_url: see :attr:`self.code_url`
+        :type date_created: see :attr:`self.date_created`
+        :type date_modified: see :attr:`self.date_modified`
+        :type rank: see :attr:`self.rank`
+        """
+
+        self.name = name
+        self.code = code
+        self.filename = filename
+        self.language = language
+        self.authors = authors
+        self.code_url = code_url
+        self.date_created = date_created
+        self.date_modified = date_modified
+        self.rank = rank
diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py
new file mode 100644
index 0000000..73b1c22
--- /dev/null
+++ b/bitshift/crawler/__init__.py
@@ -0,0 +1,55 @@
+"""
+:synopsis: Parent crawler module, which supervises all crawlers.
+
+Contains functions for initializing all subsidiary, threaded crawlers.
+"""
+
+import logging, logging.handlers, os, Queue
+
+from bitshift.crawler import crawler, indexer
+
+__all__ = ["crawl"]
+
+def crawl():
+    """
+    Initialize all crawlers (and indexers).
+
+    Start the:
+    1. GitHub crawler, :class:`crawler.GitHubCrawler`.
+    2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`.
+    3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`.
+ """ + + _configure_logging() + + MAX_URL_QUEUE_SIZE = 5e3 + + repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) + threads = [crawler.GitHubCrawler(repo_clone_queue), + crawler.BitbucketCrawler(repo_clone_queue), + indexer.GitIndexer(repo_clone_queue)] + + for thread in threads: + thread.start() + +def _configure_logging(): + LOG_FILE_DIR = "log" + + if not os.path.exists(LOG_FILE_DIR): + os.mkdir(LOG_FILE_DIR) + + logging.getLogger("requests").setLevel(logging.WARNING) + logging.getLogger("urllib3").setLevel(logging.WARNING) + + formatter = logging.Formatter( + fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s" + " %(message)s"), datefmt="%y-%m-%d %H:%M:%S") + + handler = logging.handlers.TimedRotatingFileHandler( + "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1, + backupCount=20) + handler.setFormatter(formatter) + + root_logger = logging.getLogger() + root_logger.addHandler(handler) + root_logger.setLevel(logging.NOTSET) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py new file mode 100644 index 0000000..9501bd0 --- /dev/null +++ b/bitshift/crawler/crawler.py @@ -0,0 +1,240 @@ +""" +:synopsis: Main crawler module, to oversee all site-specific crawlers. + +Contains all website/framework-specific Class crawlers. +""" + +import logging, requests, time, threading + +from bitshift.crawler import indexer + +from ..codelet import Codelet +from ..database import Database + +class GitHubCrawler(threading.Thread): + """ + Crawler that retrieves links to all of GitHub's public repositories. + + GitHubCrawler is a threaded singleton that queries GitHub's API for urls + to its public repositories, which it inserts into a :class:`Queue.Queue` + shared with :class:`indexer.GitIndexer`. + + :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository` + with repository metadata retrieved by :class:`GitHubCrawler`, and other Git + crawlers, to be processed by :class:`indexer.GitIndexer`. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. + """ + + AUTHENTICATION = { + "client_id" : "436cb884ae09be7f2a4e", + "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" + } + + def __init__(self, clone_queue): + """ + Create an instance of the singleton `GitHubCrawler`. + + :param clone_queue: see :attr:`self.clone_queue` + + :type clone_queue: see :attr:`self.clone_queue` + """ + + self.clone_queue = clone_queue + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") + super(GitHubCrawler, self).__init__(name=self.__class__.__name__) + + def run(self): + """ + Query the GitHub API for data about every public repository. + + Pull all of GitHub's repositories by making calls to its API in a loop, + accessing a subsequent page of results via the "next" URL returned in an + API response header. Uses Severyn Kozak's (sevko) authentication + credentials. For every new repository, a :class:`GitRepository` is + inserted into :attr:`self.clone_queue`. + """ + + next_api_url = "https://api.github.com/repositories" + api_request_interval = 5e3 / 60 ** 2 + + while len(next_api_url) > 0: + start_time = time.time() + + try: + resp = requests.get(next_api_url, params=self.AUTHENTICATION) + except ConnectionError as excep: + self._logger.warning("API %s call failed: %s: %s", + next_api_url, excep.__class__.__name__, excep) + time.sleep(0.5) + continue + + queue_percent_full = (float(self.clone_queue.qsize()) / + self.clone_queue.maxsize) * 100 + self._logger.info("API call made. 
Queue size: %d/%d, %d%%." % + ((self.clone_queue.qsize(), self.clone_queue.maxsize, + queue_percent_full))) + + repo_names = [repo["full_name"] for repo in resp.json()] + repo_stars = self._get_repositories_stars(repo_names) + + for repo in resp.json(): + while self.clone_queue.full(): + time.sleep(1) + + self.clone_queue.put(indexer.GitRepository( + repo["html_url"], repo["full_name"].replace("/", ""), + "GitHub", repo_stars[repo["full_name"]])) + + if int(resp.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(resp.headers["x-ratelimit-reset"]) - + time.time()) + + next_api_url = resp.headers["link"].split(">")[0][1:] + + sleep_time = api_request_interval - (time.time() - start_time) + if sleep_time > 0: + time.sleep(sleep_time) + + def _get_repositories_stars(self, repo_names): + """ + Return the number of stargazers for several repositories. + + Queries the GitHub API for the number of stargazers for any given + repositories, and blocks if the query limit is exceeded. + + :param repo_names: An array of repository names, in + `username/repository_name` format. + + :type repo_names: str + + :return: A dictionary with repository name keys, and corresponding + stargazer count values. + + Example dictionary: + .. code-block:: python + { + "user/repository" : 100 + } + + :rtype: dictionary + """ + + API_URL = "https://api.github.com/search/repositories" + REPOS_PER_QUERY = 25 + + repo_stars = {} + for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in + xrange(0, len(repo_names), REPOS_PER_QUERY)]: + query_url = "%s?q=%s" % (API_URL, + "+".join("repo:%s" % name for name in names)) + + params = self.AUTHENTICATION + resp = requests.get(query_url, + params=params, + headers={ + "Accept" : "application/vnd.github.preview" + }) + + if int(resp.headers["x-ratelimit-remaining"]) == 0: + sleep_time = int(resp.headers["x-ratelimit-reset"]) - \ + time.time() + 1 + if sleep_time > 0: + logging.info("API quota exceeded. Sleep time: %d." % + sleep_time) + time.sleep(sleep_time) + + for repo in resp.json()["items"]: + rank = float(repo["stargazers_count"]) / 1000 + repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0 + + for name in repo_names: + if name not in repo_stars: + repo_stars[name] = 0.5 + + return repo_stars + +class BitbucketCrawler(threading.Thread): + """ + Crawler that retrieves links to all of Bitbucket's public repositories. + + BitbucketCrawler is a threaded singleton that queries Bitbucket's API for + urls to its public repositories, and inserts them as + :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with + :class:`indexer.GitIndexer`. + + :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert + :class:`indexer.GitRepository` repository urls into. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. + """ + + def __init__(self, clone_queue): + """ + Create an instance of the singleton `BitbucketCrawler`. + + :param clone_queue: see :attr:`self.clone_queue` + + :type clone_queue: see :attr:`self.clone_queue` + """ + + self.clone_queue = clone_queue + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") + super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) + + def run(self): + """ + Query the Bitbucket API for data about every public repository. 
+
+        Query the Bitbucket API's "/repositories" endpoint and read its
+        paginated responses in a loop; any "git" repositories have their
+        clone-urls and names inserted into a :class:`indexer.GitRepository` in
+        :attr:`self.clone_queue`.
+        """
+
+        next_api_url = "https://api.bitbucket.org/2.0/repositories"
+
+        while True:
+            try:
+                response = requests.get(next_api_url).json()
+            except ConnectionError as exception:
+                time.sleep(0.5)
+                self._logger.warning("API %s call failed: %s: %s",
+                        next_api_url, exception.__class__.__name__, exception)
+                continue
+
+            queue_percent_full = (float(self.clone_queue.qsize()) /
+                    self.clone_queue.maxsize) * 100
+            self._logger.info("API call made. Queue size: %d/%d, %d%%." %
+                    ((self.clone_queue.qsize(), self.clone_queue.maxsize,
+                    queue_percent_full)))
+
+            for repo in response["values"]:
+                if repo["scm"] == "git":
+                    while self.clone_queue.full():
+                        time.sleep(1)
+
+                    clone_links = repo["links"]["clone"]
+                    clone_url = (clone_links[0]["href"] if
+                            clone_links[0]["name"] == "https" else
+                            clone_links[1]["href"])
+
+                    try:
+                        watchers = requests.get(
+                                repo["links"]["watchers"]["href"])
+                        rank = float(len(watchers.json()["values"])) / 100
+                    except ConnectionError as exception:
+                        time.sleep(0.5)
+                        self._logger.warning("API %s call failed: %s: %s",
+                                next_api_url, exception.__class__.__name__,
+                                exception)
+                        continue
+
+                    self.clone_queue.put(indexer.GitRepository(
+                            clone_url, repo["full_name"], "Bitbucket",
+                            rank if rank < 1.0 else 1.0))
+
+            next_api_url = response["next"]
+            time.sleep(0.2)
diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py
new file mode 100644
index 0000000..c1c77ad
--- /dev/null
+++ b/bitshift/crawler/indexer.py
@@ -0,0 +1,489 @@
+"""
+:synopsis: Contains a singleton GitIndexer class, which clones and indexes git
+    repositories.
+"""
+
+import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\
+    threading
+
+from ..database import Database
+from ..codelet import Codelet
+
+GIT_CLONE_DIR = "/tmp/bitshift"
+THREAD_QUEUE_SLEEP = 0.5
+
+class GitRepository(object):
+    """
+    A representation of a Git repository's metadata.
+
+    :ivar url: (str) The repository's url.
+    :ivar name: (str) The name of the repository.
+    :ivar framework_name: (str) The name of the online Git framework that the
+        repository belongs to (eg, GitHub, BitBucket).
+    :ivar rank: (float) The rank of the repository, as assigned by
+        :class:`crawler.GitHubCrawler`.
+    """
+
+    def __init__(self, url, name, framework_name, rank):
+        """
+        Create a GitRepository instance.
+
+        :param url: see :attr:`GitRepository.url`
+        :param name: see :attr:`GitRepository.name`
+        :param framework_name: see :attr:`GitRepository.framework_name`
+        :param rank: see :attr:`GitRepository.rank`
+
+        :type url: str
+        :type name: str
+        :type framework_name: str
+        :type rank: float
+        """
+
+        self.url = url
+        self.name = name
+        self.framework_name = framework_name
+        self.rank = rank
+
+class GitIndexer(threading.Thread):
+    """
+    A singleton Git repository indexer.
+
+    :class:`GitIndexer` indexes the repositories cloned by the
+    :class:`_GitCloner` singleton.
+
+    :ivar index_queue: (:class:`Queue.Queue`) A queue containing
+        :class:`GitRepository` objects for every new repository successfully
+        cloned by :class:`_GitCloner`, which are to be indexed.
+    :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner,
+        which feeds :class:`GitIndexer`.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
+ """ + + def __init__(self, clone_queue): + """ + Create an instance of the singleton `GitIndexer`. + + :param clone_queue: see :attr:`self.index_queue` + + :type index_queue: see :attr:`self.index_queue` + """ + + MAX_INDEX_QUEUE_SIZE = 10 + + self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) + self.git_cloner = _GitCloner(clone_queue, self.index_queue) + self.git_cloner.start() + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") + + if not os.path.exists(GIT_CLONE_DIR): + os.makedirs(GIT_CLONE_DIR) + + super(GitIndexer, self).__init__(name=self.__class__.__name__) + + def run(self): + """ + Retrieve metadata about newly cloned repositories and index them. + + Blocks until new repositories appear in :attr:`self.index_queue`, then + retrieves one, and attempts indexing it. Should any errors occur, the + new repository will be discarded and the indexer will index the next in + the queue. + """ + + while True: + while self.index_queue.empty(): + time.sleep(THREAD_QUEUE_SLEEP) + + repo = self.index_queue.get() + self.index_queue.task_done() + try: + self._index_repository(repo) + except Exception as excep: + self._logger.warning("%s: %s.", excep.__class__.__name__, excep) + + def _index_repository(self, repo): + """ + Clone and index (create and insert Codeletes for) a Git repository. + + `git clone` the Git repository located at **repo.url**, call + `_insert_repository_codelets()`, then remove said repository. + + :param repo_url: The metadata of the repository to be indexed. + + :type repo_url: :class:`GitRepository` + """ + + with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)) as repository_dir: + try: + self._insert_repository_codelets(repo) + except Exception as excep: + self._logger.warning("%s: %s.", excep.__class__.__name__, excep) + + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) + + def _insert_repository_codelets(self, repo): + """ + Create and insert a Codelet for the files inside a Git repository. + + Create a new Codelet, and insert it into the Database singleton, for + every file inside the current working directory's default branch + (usually *master*). + + :param repo_url: The metadata of the repository to be indexed. + + :type repo_url: :class:`GitRepository` + """ + + commits_meta = self._get_commits_metadata() + if commits_meta is None: + return + + for filename in commits_meta.keys(): + try: + with open(filename) as source_file: + source = self._decode(source_file.read()) + if source is None: + continue + except IOError as exception: + continue + + authors = [(self._decode(author), None) for author in \ + commits_meta[filename]["authors"]] + codelet = Codelet("%s:%s" % (repo.name, filename), source, filename, + None, authors, self._generate_file_url(filename, + repo.url, repo.framework_name), + commits_meta[filename]["time_created"], + commits_meta[filename]["time_last_modified"], + repo.rank) + + def _generate_file_url(self, filename, repo_url, framework_name): + """ + Return a url for a filename from a Git wrapper framework. + + :param filename: The path of the file. + :param repo_url: The url of the file's parent repository. + :param framework_name: The name of the framework the repository is from. + + :type filename: str + :type repo_url: str + :type framework_name: str + + :return: The file's full url on the given framework, if successfully + derived. + :rtype: str, or None + + .. 
warning::
+            Various Git subprocesses will occasionally fail, and, seeing as the
+            information they provide is a crucial component of some repository file
+            urls, None may be returned.
+        """
+
+        try:
+            if framework_name == "GitHub":
+                default_branch = subprocess.check_output("git branch"
+                        " --no-color", shell=True)[2:-1]
+                return ("%s/blob/%s/%s" % (repo_url, default_branch,
+                        filename)).replace("//", "/")
+            elif framework_name == "Bitbucket":
+                commit_hash = subprocess.check_output("git rev-parse HEAD",
+                        shell=True).replace("\n", "")
+                return ("%s/src/%s/%s" % (repo_url, commit_hash,
+                        filename)).replace("//", "/")
+        except subprocess.CalledProcessError as exception:
+            return None
+
+    def _get_git_commits(self):
+        """
+        Return the current working directory's formatted commit data.
+
+        Uses `git log` to generate metadata about every single file in the
+        repository's commit history.
+
+        :return: The author, timestamp, and names of all modified files of every
+            commit.
+            .. code-block:: python
+                sample_returned_array = [
+                    {
+                        "author" : (str) "author",
+                        "timestamp" : (`datetime.datetime`),
+                        "filenames" : (str array) ["file1", "file2"]
+                    }
+                ]
+        :rtype: array of dictionaries
+        """
+
+        git_log = subprocess.check_output(("git --no-pager log --name-only"
+                " --pretty=format:'%n%n%an%n%at' -z"), shell=True)
+
+        commits = []
+        for commit in git_log.split("\n\n"):
+            fields = commit.split("\n")
+            if len(fields) > 2:
+                commits.append({
+                    "author" : fields[0],
+                    "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])),
+                    "filenames" : fields[2].split("\x00")[:-2]
+                })
+
+        return commits
+
+    def _get_tracked_files(self):
+        """
+        Return a list of the filenames of all valuable files in the Git repository.
+
+        Get a list of the filenames of the non-binary (Perl heuristics used for
+        filetype identification) files currently inside the current working
+        directory's Git repository.
+
+        :return: The filenames of all index-worthy non-binary files.
+        :rtype: str array
+        """
+
+        files = []
+        for dirname, subdir_names, filenames in os.walk("."):
+            for filename in filenames:
+                path = os.path.join(dirname, filename)
+                if self._is_ascii(path):
+                    files.append(path[2:])
+
+        return files
+
+    def _get_commits_metadata(self):
+        """
+        Return a dictionary containing every valuable tracked file's metadata.
+
+        :return: A dictionary with author names, time of creation, and time of last
+            modification for every filename key.
+            .. code-block:: python
+                sample_returned_dict = {
+                    "my_file" : {
+                        "authors" : (str array) ["author1", "author2"],
+                        "time_created" : (`datetime.datetime`),
+                        "time_last_modified" : (`datetime.datetime`)
+                    }
+                }
+        :rtype: dictionary of dictionaries
+        """
+
+        commits = self._get_git_commits()
+        tracked_files = self._get_tracked_files()
+
+        files_meta = {}
+        for commit in commits:
+            for filename in commit["filenames"]:
+                if filename not in tracked_files:
+                    continue
+
+                if filename not in files_meta.keys():
+                    files_meta[filename] = {
+                        "authors" : [commit["author"]],
+                        "time_last_modified" : commit["timestamp"],
+                        "time_created" : commit["timestamp"]
+                    }
+                else:
+                    if commit["author"] not in files_meta[filename]["authors"]:
+                        files_meta[filename]["authors"].append(commit["author"])
+                    files_meta[filename]["time_created"] = commit["timestamp"]
+
+        return files_meta
+
+    def _decode(self, raw):
+        """
+        Return a decoded raw string.
+
+        :param raw: The string to decode.
+
+        :type raw: str
+
+        :return: If the original encoding is successfully inferred, return the
+            decoded string.
+        :rtype: str, or None
+
+        .. warning::
+            The raw string's original encoding is identified by heuristics which
+            can, and occasionally will, fail. Decoding will then fail, and None
+            will be returned.
+        """
+
+        try:
+            encoding = bs4.BeautifulSoup(raw).original_encoding
+            return raw.decode(encoding) if encoding is not None else None
+
+        except (LookupError, UnicodeDecodeError, UserWarning) as exception:
+            return None
+
+    def _is_ascii(self, filename):
+        """
+        Heuristically determine whether a file is ASCII text or binary.
+
+        If a portion of the file contains null bytes, or the percentage of bytes
+        that aren't ASCII is greater than 30%, then the file is concluded to be
+        binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T`
+        operator, and is the de-facto method for determining whether a
+        file is ASCII.
+
+        :param filename: The path of the file to test.
+
+        :type filename: str
+
+        :return: Whether the file is probably ASCII.
+        :rtype: Boolean
+        """
+
+        try:
+            with open(filename) as source:
+                file_snippet = source.read(512)
+
+                if not file_snippet:
+                    return True
+
+                ascii_characters = "".join(map(chr, range(32, 127)) +
+                        list("\n\r\t\b"))
+                null_trans = string.maketrans("", "")
+
+                if "\0" in file_snippet:
+                    return False
+
+                non_ascii = file_snippet.translate(null_trans, ascii_characters)
+                return not float(len(non_ascii)) / len(file_snippet) > 0.30
+
+        except IOError as exception:
+            return False
+
+class _GitCloner(threading.Thread):
+    """
+    A singleton Git repository cloner.
+
+    Clones the repositories crawled by :class:`crawler.GitHubCrawler` for
+    :class:`GitIndexer` to index.
+
+    :ivar clone_queue: (:class:`Queue.Queue`) see
+        :attr:`crawler.GitHubCrawler.clone_queue`.
+    :ivar index_queue: (:class:`Queue.Queue`) see
+        :attr:`GitIndexer.index_queue`.
+    :ivar _logger: (:class:`logging.Logger`) A class-specific logger object.
+    """
+
+    def __init__(self, clone_queue, index_queue):
+        """
+        Create an instance of the singleton :class:`_GitCloner`.
+
+        :param clone_queue: see :attr:`self.clone_queue`
+        :param index_queue: see :attr:`self.index_queue`
+
+        :type clone_queue: see :attr:`self.clone_queue`
+        :type index_queue: see :attr:`self.index_queue`
+        """
+
+        self.clone_queue = clone_queue
+        self.index_queue = index_queue
+        self._logger = logging.getLogger("%s.%s" %
+                (__name__, self.__class__.__name__))
+        self._logger.info("Starting.")
+        super(_GitCloner, self).__init__(name=self.__class__.__name__)
+
+    def run(self):
+        """
+        Retrieve metadata about newly crawled repositories and clone them.
+
+        Blocks until new :class:`GitRepository` appear in
+        :attr:`self.clone_queue`, then attempts cloning them. If
+        successful, the cloned repository is added to :attr:`self.index_queue`
+        for the `GitIndexer` to index; otherwise, it is discarded.
+        """
+
+        while True:
+            while self.clone_queue.empty():
+                time.sleep(THREAD_QUEUE_SLEEP)
+            repo = self.clone_queue.get()
+            self.clone_queue.task_done()
+
+            try:
+                self._clone_repository(repo)
+            except Exception as exception:
+                pass
+
+    def _clone_repository(self, repo):
+        """
+        Attempt cloning a Git repository.
+
+        :param repo: Metadata about the repository to clone.
+
+        :type repo: :class:`GitRepository`
+        """
+
+        GIT_CLONE_TIMEOUT = 500
+
+        queue_percent_full = (float(self.index_queue.qsize()) /
+                self.index_queue.maxsize) * 100
+
+        exit_code = None
+        command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone"
+                " --single-branch %s %s/%s || pkill -f git")
+
+        command_attempt = 0
+        while exit_code is None:
+            try:
+                exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT,
+                        repo.url, GIT_CLONE_DIR, repo.name), shell=True)
+            except Exception as exception:
+                time.sleep(1)
+                command_attempt += 1
+                if command_attempt == 20:
+                    break
+                else:
+                    continue
+            else:
+                break
+
+        if exit_code != 0:
+            if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)):
+                shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name))
+            return
+
+        while self.index_queue.full():
+            time.sleep(THREAD_QUEUE_SLEEP)
+
+        self.index_queue.put(repo)
+
+class _ChangeDir(object):
+    """
+    A wrapper class for os.chdir(), to map onto `with` and handle exceptions.
+
+    :ivar new_path: (str) The path to change the current directory to.
+    :ivar old_path: (str) The path of the directory to return to.
+    """
+
+    def __init__(self, new_path):
+        """
+        Create a _ChangeDir instance.
+
+        :param new_path: The directory to enter.
+
+        :type new_path: str
+        """
+
+        self.new_path = new_path
+
+    def __enter__(self):
+        """
+        Change the current working-directory to **new_path**.
+        """
+
+        self.old_path = os.getcwd()
+        os.chdir(self.new_path)
+
+    def __exit__(self, *exception):
+        """
+        Change the current working-directory to **old_path**.
+
+        :param exception: Various exception arguments passed by `with`.
+
+        :type exception: varargs
+        """
+
+        os.chdir(self.old_path)
diff --git a/bitshift/database.py b/bitshift/database.py
deleted file mode 100644
index b8995ee..0000000
--- a/bitshift/database.py
+++ /dev/null
@@ -1,18 +0,0 @@
-"""
-Module with classes and functions to handle communication with the MySQL
-database backend, which manages the search index.
-"""
-
-import oursql
-
-class Database(object):
-    """Represents the MySQL database."""
-
-    def __init__(self):
-        pass
-
-    def _connect(self):
-        pass
-
-    def _create(self):
-        pass
diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py
new file mode 100644
index 0000000..75f39da
--- /dev/null
+++ b/bitshift/database/__init__.py
@@ -0,0 +1,153 @@
+"""
+Subpackage with classes and functions to handle communication with the MySQL
+database backend, which manages the search index.
+"""
+
+import os
+
+import mmh3
+import oursql
+
+from .migration import VERSION, MIGRATIONS
+
+__all__ = ["Database"]
+
+class Database(object):
+    """Represents the MySQL database."""
+
+    def __init__(self, migrate=False):
+        self._conn = self._connect()
+        self._check_version(migrate)
+
+    def _connect(self):
+        """Establish a connection to the database."""
+        root = os.path.dirname(os.path.abspath(__file__))
+        default_file = os.path.join(root, ".my.cnf")
+        return oursql.connect(db="bitshift", read_default_file=default_file,
+                              autoping=True, autoreconnect=True)
+
+    def _migrate(self, cursor, current):
+        """Migrate the database to the latest schema version."""
+        for version in xrange(current, VERSION):
+            print "Migrating to %d..." % (version + 1)
+            for query in MIGRATIONS[version - 1]:
+                cursor.execute(query)
+            cursor.execute("UPDATE version SET version = ?", (version + 1,))
+
+    def _check_version(self, migrate):
+        """Check the database schema version and respond accordingly.
+
+        If the schema is out of date, migrate if *migrate* is True, else raise
+        an exception.
+ """ + with self._conn.cursor() as cursor: + cursor.execute("SELECT version FROM version") + version = cursor.fetchone()[0] + if version < VERSION: + if migrate: + self._migrate(cursor, version) + else: + err = "Database schema out of date. " \ + "Run `python -m bitshift.database.migration`." + raise RuntimeError(err) + + def _get_codelets_from_ids(self, cursor, ids): + """Return a list of Codelet objects given a list of codelet IDs.""" + raise NotImplementedError() ## TODO + + def _decompose_url(self, cursor, url): + """Break up a URL into an origin (with a URL base) and a suffix.""" + query = """SELECT origin_id, SUBSTR(?, LENGTH(origin_url_base)) + FROM origins + WHERE origin_url_base IS NOT NULL + AND ? LIKE CONCAT(origin_url_base, "%")""" + + cursor.execute(query, (url, url)) + result = cursor.fetchone() + return result if result else (1, url) + + def _insert_symbols(self, cursor, code_id, sym_type, symbols): + """Insert a list of symbols of a given type into the database.""" + sym_types = ["functions", "classes", "variables"] + query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)" + query2 = """INSERT INTO symbol_locations VALUES + (DEFAULT, ?, ?, ?, ?, ?, ?)""" + + for (name, decls, uses) in symbols: + cursor.execute(query1, (code_id, sym_types.index(sym_type), name)) + sym_id = cursor.lastrowid + params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] + + [tuple([sym_id, 1] + list(loc)) for loc in uses]) + cursor.executemany(query2, params) + + def close(self): + """Disconnect from the database.""" + self._conn.close() + + def search(self, query, page=1): + """ + Search the database for a query and return the *n*\ th page of results. + + :param query: The query to search for. + :type query: :py:class:`~.query.tree.Tree` + :param page: The result page to display. + :type page: int + + :return: The total number of results, and the *n*\ th page of results. + :rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s) + """ + query1 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp + FROM cache + INNER JOIN cache_data ON cache_id = cdata_cache + WHERE cache_id = ?""" + query2 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)" + query3 = "INSERT INTO cache_data VALUES (?, ?)" + + cache_id = mmh3.hash64(str(page) + ":" + query.serialize())[0] + + with self._conn.cursor() as cursor: + cursor.execute(query1, (cache_id,)) + results = cursor.fetchall() + if results: # Cache hit + num_results = results[0][1] * (10 ** results[0][2]) + ids = [res[0] for res in results] + else: # Cache miss + ## TODO: build and execute search query + results = cursor.fetchall() + ids = NotImplemented ## TODO: extract ids from results + num_results = NotImplemented ## TODO: num if results else 0 + num_exp = max(len(str(num_results)) - 3, 0) + num_results = int(round(num_results, -num_exp)) + num_mnt = num_results / (10 ** num_exp) + cursor.execute(query2, (cache_id, num_mnt, num_exp)) + cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) + return (num_results, self._get_codelets_from_ids(cursor, ids)) + + def insert(self, codelet): + """ + Insert a codelet into the database. + + :param codelet: The codelet to insert. + :type codelet: :py:class:`.Codelet` + """ + query1 = """INSERT INTO code VALUES (?, ?, ?) 
+ ON DUPLICATE KEY UPDATE code_id=code_id""" + query2 = """INSERT INTO codelets VALUES + (DEFAULT, ?, ?, ?, ?, ?, ?, ?)""" + query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)" + + hash_key = str(codelet.language) + ":" + codelet.code.encode("utf8") + code_id = mmh3.hash64(hash_key)[0] + + with self._conn.cursor() as cursor: + cursor.execute(query1, (code_id, codelet.language, codelet.code)) + if cursor.rowcount == 1: + for sym_type, symbols in codelet.symbols.iteritems(): + self._insert_symbols(cursor, code_id, sym_type, symbols) + origin, url = self._decompose_url(cursor, codelet.url) + cursor.execute(query2, (codelet.name, code_id, origin, url, + codelet.rank, codelet.date_created, + codelet.date_modified)) + codelet_id = cursor.lastrowid + authors = [(codelet_id, a[0], a[1]) for a in codelet.authors] + cursor.executemany(query3, authors) diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py new file mode 100644 index 0000000..24f744a --- /dev/null +++ b/bitshift/database/migration.py @@ -0,0 +1,97 @@ +""" +Contains information about database schema versions, and SQL queries to update +between them. +""" + +VERSION = 6 + +MIGRATIONS = [ + # 1 -> 2 + [ + """ALTER TABLE `codelets` + DROP FOREIGN KEY `codelets_ibfk_1`""", + """ALTER TABLE `code` + DROP KEY `code_hash`, + DROP COLUMN `code_hash`, + MODIFY COLUMN `code_id` BIGINT NOT NULL""", + """ALTER TABLE `codelets` + MODIFY COLUMN `codelet_code_id` BIGINT NOT NULL, + ADD KEY (`codelet_lang`), + ADD CONSTRAINT `codelets_ibfk_1` FOREIGN KEY (`codelet_code_id`) + REFERENCES `code` (`code_id`) + ON DELETE RESTRICT ON UPDATE CASCADE""", + """ALTER TABLE `symbols` + ADD COLUMN `symbol_end_row` INT UNSIGNED NOT NULL, + ADD COLUMN `symbol_end_col` INT UNSIGNED NOT NULL""" + ], + # 2 -> 3 + [ + """ALTER TABLE `symbols` + DROP FOREIGN KEY `symbols_ibfk_1`, + CHANGE COLUMN `symbol_codelet` `symbol_code` BIGINT NOT NULL, + ADD CONSTRAINT `symbols_ibfk_1` FOREIGN KEY (`symbol_code`) + REFERENCES `code` (`code_id`) + ON DELETE CASCADE ON UPDATE CASCADE""" + ], + # 3 -> 4 + [ + """ALTER TABLE `symbols` + DROP COLUMN `symbol_row`, + DROP COLUMN `symbol_col`, + DROP COLUMN `symbol_end_row`, + DROP COLUMN `symbol_end_col`""", + """CREATE TABLE `symbol_locations` ( + `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `sloc_symbol` BIGINT UNSIGNED NOT NULL, + `sloc_type` TINYINT UNSIGNED NOT NULL, + `sloc_row` INT UNSIGNED NOT NULL, + `sloc_col` INT UNSIGNED NOT NULL, + `sloc_end_row` INT UNSIGNED NOT NULL, + `sloc_end_col` INT UNSIGNED NOT NULL, + PRIMARY KEY (`sloc_id`), + FOREIGN KEY (`sloc_symbol`) + REFERENCES `symbols` (`symbol_id`) + ON DELETE CASCADE ON UPDATE CASCADE + ) ENGINE=InnoDB""" + ], + # 4 -> 5 + [ + """ALTER TABLE `origins` + MODIFY COLUMN `origin_name` VARCHAR(64) DEFAULT NULL, + MODIFY COLUMN `origin_url` VARCHAR(512) DEFAULT NULL, + MODIFY COLUMN `origin_url_base` VARCHAR(512) DEFAULT NULL""" + ], + # 5 -> 6 + [ + """ALTER TABLE `code` + ADD COLUMN `code_lang` SMALLINT UNSIGNED DEFAULT NULL + AFTER `code_id`, + ADD KEY (`code_lang`)""", + """ALTER TABLE `codelets` + DROP KEY `codelet_lang`, + DROP COLUMN `codelet_lang`""", + """ALTER TABLE `cache_data` + DROP FOREIGN KEY `cache_data_ibfk_1`""", + """ALTER TABLE `cache` + MODIFY COLUMN `cache_id` BIGINT NOT NULL, + DROP COLUMN `cache_hash`, + DROP COLUMN `cache_last_used`, + MODIFY COLUMN `cache_count_mnt` SMALLINT UNSIGNED NOT NULL""", + """ALTER TABLE `cache_data` + MODIFY COLUMN `cdata_cache` BIGINT NOT NULL, + ADD PRIMARY KEY (`cdata_cache`, 
`cdata_codelet`), + ADD CONSTRAINT `cache_data_ibfk_1` FOREIGN KEY (`cdata_codelet`) + REFERENCES `codelets` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE""", + """CREATE EVENT `flush_cache` + ON SCHEDULE EVERY 1 HOUR + DO + DELETE FROM `cache` + WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);""" + ] +] + +if __name__ == "__main__": + from . import Database + + Database(migrate=True).close() diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql new file mode 100644 index 0000000..8634416 --- /dev/null +++ b/bitshift/database/schema.sql @@ -0,0 +1,114 @@ +-- Schema version 6 + +CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; +USE `bitshift`; + +CREATE TABLE `version` ( + `version` INT UNSIGNED NOT NULL +) ENGINE=InnoDB; +INSERT INTO `version` VALUES (6); + +CREATE TABLE `origins` ( + `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, + `origin_name` VARCHAR(64) DEFAULT NULL, + `origin_url` VARCHAR(512) DEFAULT NULL, + `origin_url_base` VARCHAR(512) DEFAULT NULL, + `origin_image` BLOB DEFAULT NULL, + PRIMARY KEY (`origin_id`) +) ENGINE=InnoDB; +INSERT INTO `origins` VALUES (1, NULL, NULL, NULL, NULL); + +CREATE TABLE `code` ( + `code_id` BIGINT NOT NULL, + `code_lang` SMALLINT UNSIGNED DEFAULT NULL, + `code_code` MEDIUMTEXT NOT NULL, + PRIMARY KEY (`code_id`), + KEY (`code_lang`), + FULLTEXT KEY (`code_code`) +) ENGINE=InnoDB; + +CREATE TABLE `codelets` ( + `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `codelet_name` VARCHAR(300) NOT NULL, + `codelet_code_id` BIGINT NOT NULL, + `codelet_origin` TINYINT UNSIGNED NOT NULL, + `codelet_url` VARCHAR(512) NOT NULL, + `codelet_rank` FLOAT NOT NULL, + `codelet_date_created` DATETIME DEFAULT NULL, + `codelet_date_modified` DATETIME DEFAULT NULL, + PRIMARY KEY (`codelet_id`), + FULLTEXT KEY (`codelet_name`), + KEY (`codelet_rank`), + KEY (`codelet_date_created`), + KEY (`codelet_date_modified`), + FOREIGN KEY (`codelet_code_id`) + REFERENCES `code` (`code_id`) + ON DELETE RESTRICT ON UPDATE CASCADE, + FOREIGN KEY (`codelet_origin`) + REFERENCES `origins` (`origin_id`) + ON DELETE RESTRICT ON UPDATE CASCADE +) ENGINE=InnoDB; + +CREATE TABLE `authors` ( + `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `author_codelet` BIGINT UNSIGNED NOT NULL, + `author_name` VARCHAR(128) NOT NULL, + `author_url` VARCHAR(512) DEFAULT NULL, + PRIMARY KEY (`author_id`), + FULLTEXT KEY (`author_name`), + FOREIGN KEY (`author_codelet`) + REFERENCES `codelets` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; + +CREATE TABLE `symbols` ( + `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `symbol_code` BIGINT NOT NULL, + `symbol_type` TINYINT UNSIGNED NOT NULL, + `symbol_name` VARCHAR(512) NOT NULL, + PRIMARY KEY (`symbol_id`), + KEY (`symbol_type`, `symbol_name`(32)), + FOREIGN KEY (`symbol_code`) + REFERENCES `code` (`code_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; + +CREATE TABLE `symbol_locations` ( + `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `sloc_symbol` BIGINT UNSIGNED NOT NULL, + `sloc_type` TINYINT UNSIGNED NOT NULL, + `sloc_row` INT UNSIGNED NOT NULL, + `sloc_col` INT UNSIGNED NOT NULL, + `sloc_end_row` INT UNSIGNED NOT NULL, + `sloc_end_col` INT UNSIGNED NOT NULL, + PRIMARY KEY (`sloc_id`), + FOREIGN KEY (`sloc_symbol`) + REFERENCES `symbols` (`symbol_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; + +CREATE TABLE `cache` ( + `cache_id` BIGINT NOT NULL, + `cache_count_mnt` SMALLINT UNSIGNED NOT NULL, + 
`cache_count_exp` TINYINT UNSIGNED NOT NULL, + `cache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`cache_id`) +) ENGINE=InnoDB; + +CREATE TABLE `cache_data` ( + `cdata_cache` BIGINT NOT NULL, + `cdata_codelet` BIGINT UNSIGNED NOT NULL, + PRIMARY KEY (`cdata_cache`, `cdata_codelet`), + FOREIGN KEY (`cdata_cache`) + REFERENCES `cache` (`cache_id`) + ON DELETE CASCADE ON UPDATE CASCADE, + FOREIGN KEY (`cdata_codelet`) + REFERENCES `codelets` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; + +CREATE EVENT `flush_cache` + ON SCHEDULE EVERY 1 HOUR + DO + DELETE FROM `cache` + WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY); diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index bc70cde..5498b62 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -22,4 +22,6 @@ def parse_query(query): # gets a string, returns a Tree + # TODO: note: resultant Trees should be normalized so that "foo OR bar" + # and "bar OR foo" result in equivalent trees pass diff --git a/docs/source/api/bitshift.query.rst b/docs/source/api/bitshift.query.rst new file mode 100644 index 0000000..35b39a6 --- /dev/null +++ b/docs/source/api/bitshift.query.rst @@ -0,0 +1,11 @@ +query Package +============= + +:mod:`query` Package +-------------------- + +.. automodule:: bitshift.query + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/api/bitshift.rst b/docs/source/api/bitshift.rst index a5f0898..1b1c703 100644 --- a/docs/source/api/bitshift.rst +++ b/docs/source/api/bitshift.rst @@ -1,30 +1,51 @@ -bitshift package +bitshift Package ================ -Submodules ----------- +:mod:`bitshift` Package +----------------------- -bitshift.assets module ----------------------- +.. automodule:: bitshift.__init__ + :members: + :undoc-members: + :show-inheritance: + +:mod:`assets` Module +-------------------- .. automodule:: bitshift.assets :members: :undoc-members: :show-inheritance: -bitshift.config module ----------------------- +:mod:`codelet` Module +--------------------- -.. automodule:: bitshift.config +.. automodule:: bitshift.codelet :members: :undoc-members: :show-inheritance: +:mod:`config` Module +-------------------- -Module contents ---------------- +.. automodule:: bitshift.config + :members: + :undoc-members: + :show-inheritance: + +:mod:`database` Module +---------------------- -.. automodule:: bitshift +.. automodule:: bitshift.database :members: :undoc-members: :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + bitshift.parser + bitshift.query + diff --git a/setup.py b/setup.py index 0ec5f77..47508e9 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,9 @@ setup( name = "bitshift", version = "0.1", packages = find_packages(), - install_requires = ["Flask>=0.10.1", "pygments>=1.6"], + install_requires = [ + "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", + "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift"
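
For reviewers trying the patch locally, here is a minimal, hypothetical driver sketch (not part of the patch) showing how the pieces it introduces fit together; it assumes the `bitshift` schema from `schema.sql` already exists and that `bitshift/database/.my.cnf` holds valid MySQL credentials:

    # demo_crawl.py -- illustrative only, not included in this patch
    from bitshift.crawler import crawl
    from bitshift.database import Database

    # Opening the database with migrate=True upgrades an out-of-date schema;
    # without it, an outdated schema raises RuntimeError.
    db = Database(migrate=True)
    db.close()

    # Spawns the GitHubCrawler, BitbucketCrawler, and GitIndexer threads,
    # which keep running (and logging to log/app.log) after crawl() returns.
    crawl()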