From 878088f9ab7558f540ef0cd8af642e343f331a97 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 29 May 2014 13:04:18 -0400 Subject: [PATCH 01/34] Split symbols into implicit OR groups (closes #28) --- bitshift/query/__init__.py | 148 ++++++++++++++++++++++++--------------------- 1 file changed, 80 insertions(+), 68 deletions(-) diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 41d01cf..73d6f65 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -36,10 +36,75 @@ class _QueryParser(object): self._parse_variable: ["v", "var", "variable"] } + def _scan_query(self, query, markers): + """Scan a query (sub)string for the first occurance of some markers. + + Returns a 2-tuple of (first_marker_found, marker_index). + """ + def is_escaped(query, index): + """Return whether a query marker is backslash-escaped.""" + return (index > 0 and query[index - 1] == "\\" and + (index < 2 or query[index - 2] != "\\")) + + best_marker, best_index = None, maxsize + for marker in markers: + index = query.find(marker) + if is_escaped(query, index): + _, new_index = self._scan_query(query[index + 1:], marker) + index += new_index + 1 + if index >= 0 and index < best_index: + best_marker, best_index = marker, index + return best_marker, best_index + + def _split_query(self, query, markers, parens=False): + """Split a query string into a nested list of query terms. + + Returns a list of terms and/or nested sublists of terms. Each term and + sublist is guarenteed to be non-empty. + """ + query = query.lstrip() + if not query: + return [] + marker, index = self._scan_query(query, markers) + if not marker: + return [query] + nest = [query[:index]] if index > 0 else [] + after = query[index + 1:] + + if marker == " ": + nest += self._split_query(after, markers, parens) + elif marker in ('"', "'"): + close_marker, close_index = self._scan_query(after, marker) + if close_marker: + if close_index > 0: + nest.append(after[:close_index]) + after = after[close_index + 1:] + nest += self._split_query(after, markers, parens) + elif after: + nest.append(after) + elif marker == "(": + inner, after = self._split_query(after, markers, True), [] + if inner and isinstance(inner[-1], tuple): + after = self._split_query(inner.pop()[0], markers, parens) + if inner: + nest.append(inner) + if after: + nest += after + elif marker == ")": + if parens: + nest.append((after,)) + else: + nest += self._split_query(after, markers) + return nest + def _parse_literal(self, literal): """Parse part of a search query into a string or regular expression.""" if literal.startswith(("r:", "re:", "regex:", "regexp:")): - return Regex(literal.split(":", 1)[1]) + arg = literal.split(":", 1)[1] + if not arg: + err = 'Incomplete query term: "%s"' % literal + raise QueryParseException(err) + return Regex(arg) return String(literal) def _parse_language(self, term): @@ -98,21 +163,29 @@ class _QueryParser(object): """Parse part of a query into a date created node and return it.""" return self._parse_date(term, Date.CREATE) - def _parse_symbol(self, term): + def _parse_symbol(self, term, stype=Symbol.ALL): """Parse part of a query into a symbol node and return it.""" - return Symbol(Symbol.ALL, self._parse_literal(term)) + literal = self._parse_literal(term) + if isinstance(literal, String): + make_symbol = lambda lit: Symbol(stype, String(lit)) + symbols = self._split_query(literal.string, " \"'") + node = make_symbol(symbols.pop()) + while symbols: + node = BinaryOp(make_symbol(symbols.pop()), BinaryOp.OR, node) + return node + return Symbol(stype, literal) def _parse_function(self, term): """Parse part of a query into a function node and return it.""" - return Symbol(Symbol.FUNCTION, self._parse_literal(term)) + return self._parse_symbol(term, Symbol.FUNCTION) def _parse_class(self, term): """Parse part of a query into a class node and return it.""" - return Symbol(Symbol.CLASS, self._parse_literal(term)) + return self._parse_symbol(term, Symbol.CLASS) def _parse_variable(self, term): """Parse part of a query into a variable node and return it.""" - return Symbol(Symbol.VARIABLE, self._parse_literal(term)) + return self._parse_symbol(term, Symbol.VARIABLE) def _parse_term(self, term): """Parse a query term into a tree node and return it.""" @@ -134,67 +207,6 @@ class _QueryParser(object): return meth(arg) return Text(self._parse_literal(term)) - def _scan_query(self, query, markers): - """Scan a query (sub)string for the first occurance of some markers. - - Returns a 2-tuple of (first_marker_found, marker_index). - """ - def is_escaped(query, index): - """Return whether a query marker is backslash-escaped.""" - return (index > 0 and query[index - 1] == "\\" and - (index < 2 or query[index - 2] != "\\")) - - best_marker, best_index = None, maxsize - for marker in markers: - index = query.find(marker) - if is_escaped(query, index): - _, new_index = self._scan_query(query[index + 1:], marker) - index += new_index + 1 - if index >= 0 and index < best_index: - best_marker, best_index = marker, index - return best_marker, best_index - - def _split_query(self, query, parens=False): - """Split a query string into a nested list of query terms. - - Returns a list of terms and/or nested sublists of terms. Each term and - sublist is guarenteed to be non-empty. - """ - query = query.lstrip() - if not query: - return [] - marker, index = self._scan_query(query, " \"'()") - if not marker: - return [query] - nest = [query[:index]] if index > 0 else [] - after = query[index + 1:] - - if marker == " ": - nest += self._split_query(after, parens) - elif marker in ('"', "'"): - close_marker, close_index = self._scan_query(after, marker) - if close_marker: - if close_index > 0: - nest.append(after[:close_index]) - after = after[close_index + 1:] - nest += self._split_query(after, parens) - elif after: - nest.append(after) - elif marker == "(": - inner, after = self._split_query(after, True), [] - if inner and isinstance(inner[-1], tuple): - after = self._split_query(inner.pop()[0], parens) - if inner: - nest.append(inner) - if after: - nest += after - elif marker == ")": - if parens: - nest.append((after,)) - else: - nest += self._split_query(after) - return nest - def _parse_boolean_operators(self, nest): """Parse boolean operators in a nested query list.""" op_lookup = { @@ -271,7 +283,7 @@ class _QueryParser(object): :raises: :py:class:`.QueryParseException` """ - nest = self._split_query(query.rstrip()) + nest = self._split_query(query.rstrip(), " \"'()") if not nest: raise QueryParseException('Empty query: "%s"' % query) self._parse_boolean_operators(nest) From 326be3050f8e088578297a00ae641fc0a50b5bfa Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 29 May 2014 13:15:25 -0400 Subject: [PATCH 02/34] Start work on serializing; clean up languages. --- bitshift/codelet.py | 15 +++++++++++++++ bitshift/{resources => }/languages.json | 0 bitshift/languages.py | 6 +++--- 3 files changed, 18 insertions(+), 3 deletions(-) rename bitshift/{resources => }/languages.json (100%) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 92debf4..2562d27 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -1,3 +1,7 @@ +import json + +from .languages import LANGS + __all__ = ["Codelet"] class Codelet(object): @@ -65,3 +69,14 @@ class Codelet(object): self.rank = rank self.symbols = symbols or {} self.origin = origin or (None, None, None) + + def serialize(self): + """ + Convert the codelet into a JSON string representation for the frontend. + + :return: The JSON codelet representation. + :rtype: str + """ + lang = LANGS[self.language] + data = {"name": self.name, "code": self.code, "lang": lang} + return json.dumps(data) diff --git a/bitshift/resources/languages.json b/bitshift/languages.json similarity index 100% rename from bitshift/resources/languages.json rename to bitshift/languages.json diff --git a/bitshift/languages.py b/bitshift/languages.py index 78c0830..36d7f63 100644 --- a/bitshift/languages.py +++ b/bitshift/languages.py @@ -1,5 +1,5 @@ import json +from os import path -with open("bitshift/resources/languages.json") as lang_json: - LANGS = [lang.encode("ascii","ignore") for lang in - json.load(lang_json)["languages"]] +with open(path.join(path.dirname(__file__), "languages.json")) as lang_json: + LANGS = [lang for lang in json.load(lang_json)["languages"]] From 52c7e9cf5ebbbe6ab73a621ae19a7f0db41d6088 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 29 May 2014 14:14:14 -0400 Subject: [PATCH 03/34] Finish JSON stringification (closes #29) --- bitshift/codelet.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 2562d27..8897249 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -77,6 +77,10 @@ class Codelet(object): :return: The JSON codelet representation. :rtype: str """ - lang = LANGS[self.language] - data = {"name": self.name, "code": self.code, "lang": lang} - return json.dumps(data) + return json.dumps({ + "name": self.name, "code": self.code, "lang": LANGS[self.language], + "authors": self.authors, "url": self.code_url, + "created": self.date_created.isoformat(), + "modified": self.date_modified.isoformat(), + "symbols": self.symbols, "origin": self.origin + }) From 4dc83e913aaf4cd75a731b4dcbee860b1428969e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 29 May 2014 14:37:44 -0400 Subject: [PATCH 04/34] Implement /search.json route (#27) --- app.py | 38 +++++++++++++++++++++++++++----------- bitshift/codelet.py | 10 ++++------ bitshift/database/__init__.py | 2 +- 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/app.py b/app.py index 92949bd..20d8997 100644 --- a/app.py +++ b/app.py @@ -2,13 +2,13 @@ Module to contain all the project's Flask server plumbing. """ -from flask import Flask -from flask import render_template, session +from json import dumps + +from flask import Flask, make_response, render_template, request from bitshift import assets -from bitshift import languages -# from bitshift.database import Database -# from bitshift.query import parse_query +from bitshift.database import Database +from bitshift.query import parse_query, QueryParseException app = Flask(__name__) app.config.from_object("bitshift.config") @@ -17,17 +17,33 @@ app_env = app.jinja_env app_env.line_statement_prefix = "=" app_env.globals.update(assets=assets) -# database = Database() +database = Database() @app.route("/") def index(): return render_template("index.html", typeahead_languages=languages.LANGS) -@app.route("/search/") -def search(query): - # tree = parse_query(query) - # database.search(tree) - pass +@app.route("/search.json") +def search(): + def reply(json): + resp = make_response(dumps(json)) + resp.mimetype = "application/json" + return resp + + query, page = request.args.get("q"), request.args.get("p", 1) + if not query: + return reply({"error": "No query given"}) + try: + tree = parse_query(query) + except QueryParseException as exc: + return reply({"error": exc.args[0]}) + try: + page = int(page) + except ValueError: + return reply({"error": u"Invalid page number: %s" % page}) + count, codelets = database.search(tree, page) + results = [clt.serialize() for clt in codelets] + return reply({"count": count, "results": results}) @app.route("/about") def about(): diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 8897249..865ae52 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -1,5 +1,3 @@ -import json - from .languages import LANGS __all__ = ["Codelet"] @@ -72,15 +70,15 @@ class Codelet(object): def serialize(self): """ - Convert the codelet into a JSON string representation for the frontend. + Convert the codelet into a dictionary that can be sent as JSON. - :return: The JSON codelet representation. + :return: The codelet as a dictionary. :rtype: str """ - return json.dumps({ + return { "name": self.name, "code": self.code, "lang": LANGS[self.language], "authors": self.authors, "url": self.code_url, "created": self.date_created.isoformat(), "modified": self.date_modified.isoformat(), "symbols": self.symbols, "origin": self.origin - }) + } diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index e4fa430..1e49a8d 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -63,7 +63,7 @@ class Database(object): query, args = tree.build_query(page) cursor.execute(query, args) ids = [id for id, _ in cursor.fetchall()] - num_results = 0 # TODO: NotImplemented + num_results = len(ids) # TODO: NotImplemented return ids, num_results def _get_authors_for_codelet(self, cursor, codelet_id): From f8436fa484e2cc4316ee8552ed31f674c8740149 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 30 May 2014 12:44:31 -0400 Subject: [PATCH 05/34] Part of #26. Move __init__.py to crawl.py. Add: bitshift/crawler/(__init__, crawl).py -Move `__init__.py` to `crawl.py`, and add a `main` block to allow running the crawler via `python -m`. --- bitshift/crawler/__init__.py | 55 ----------------------------------------- bitshift/crawler/crawl.py | 59 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 55 deletions(-) create mode 100644 bitshift/crawler/crawl.py diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index 73b1c22..e69de29 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -1,55 +0,0 @@ -""" -:synopsis: Parent crawler module, which supervises all crawlers. - -Contains functions for initializing all subsidiary, threaded crawlers. -""" - -import logging, logging.handlers, os, Queue - -from bitshift.crawler import crawler, indexer - -__all__ = ["crawl"] - -def crawl(): - """ - Initialize all crawlers (and indexers). - - Start the: - 1. GitHub crawler, :class:`crawler.GitHubCrawler`. - 2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`. - 3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`. - """ - - _configure_logging() - - MAX_URL_QUEUE_SIZE = 5e3 - - repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) - threads = [crawler.GitHubCrawler(repo_clone_queue), - crawler.BitbucketCrawler(repo_clone_queue), - indexer.GitIndexer(repo_clone_queue)] - - for thread in threads: - thread.start() - -def _configure_logging(): - LOG_FILE_DIR = "log" - - if not os.path.exists(LOG_FILE_DIR): - os.mkdir(LOG_FILE_DIR) - - logging.getLogger("requests").setLevel(logging.WARNING) - logging.getLogger("urllib3").setLevel(logging.WARNING) - - formatter = logging.Formatter( - fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s" - " %(message)s"), datefmt="%y-%m-%d %H:%M:%S") - - handler = logging.handlers.TimedRotatingFileHandler( - "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1, - backupCount=20) - handler.setFormatter(formatter) - - root_logger = logging.getLogger() - root_logger.addHandler(handler) - root_logger.setLevel(logging.NOTSET) diff --git a/bitshift/crawler/crawl.py b/bitshift/crawler/crawl.py new file mode 100644 index 0000000..91e6675 --- /dev/null +++ b/bitshift/crawler/crawl.py @@ -0,0 +1,59 @@ +""" +:synopsis: Parent crawler module, which supervises all crawlers. + +Contains functions for initializing all subsidiary, threaded crawlers. +""" + +import logging, logging.handlers, os, Queue + +from bitshift.crawler import crawler, indexer + +__all__ = ["crawl"] + +def crawl(): + """ + Initialize all crawlers (and indexers). + + Start the: + 1. GitHub crawler, :class:`crawler.GitHubCrawler`. + 2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`. + 3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`. + """ + + _configure_logging() + + MAX_URL_QUEUE_SIZE = 5e3 + + repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) + threads = [crawler.GitHubCrawler(repo_clone_queue), + crawler.BitbucketCrawler(repo_clone_queue), + indexer.GitIndexer(repo_clone_queue)] + + for thread in threads: + thread.start() + +def _configure_logging(): + LOG_FILE_DIR = "log" + + if not os.path.exists(LOG_FILE_DIR): + os.mkdir(LOG_FILE_DIR) + + logging.getLogger("requests").setLevel(logging.WARNING) + logging.getLogger("urllib3").setLevel(logging.WARNING) + + formatter = logging.Formatter( + fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s" + " %(message)s"), datefmt="%y-%m-%d %H:%M:%S") + + handler = logging.handlers.TimedRotatingFileHandler( + "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1, + backupCount=20) + handler.setFormatter(formatter) + + root_logger = logging.getLogger() + root_logger.addHandler(handler) + root_logger.setLevel(logging.NOTSET) + +if __name__ == "__main__": + _configure_logging() + crawl() From b698a16c98850159ddd169c9a14cd901bb43ec03 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 30 May 2014 13:02:54 -0400 Subject: [PATCH 06/34] Add parse() and insert() calls to crawler. Add: bitshift/crawler/indexer.py -Add `parse()` and `insert()` calls to `_insert_repository_codelets()`. --- bitshift/crawler/indexer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index c1c77ad..ad730c8 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -7,6 +7,7 @@ import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\ threading from ..database import Database +from ..parser import parse from ..codelet import Codelet GIT_CLONE_DIR = "/tmp/bitshift" @@ -73,6 +74,7 @@ class GitIndexer(threading.Thread): self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.git_cloner = _GitCloner(clone_queue, self.index_queue) self.git_cloner.start() + self.database = Database() self._logger = logging.getLogger("%s.%s" % (__name__, self.__class__.__name__)) self._logger.info("Starting.") @@ -158,6 +160,8 @@ class GitIndexer(threading.Thread): commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"], repo.rank) + parse(codelet) + self.database.insert(codelet) def _generate_file_url(self, filename, repo_url, framework_name): """ From 0a7cccc74c351b05aeb64c00274695dc9cc33590 Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Fri, 30 May 2014 23:54:51 -0400 Subject: [PATCH 07/34] Change format of string returned from ruby server to correct symbol table format. --- parsers/ruby/lib/parser.rb | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/parsers/ruby/lib/parser.rb b/parsers/ruby/lib/parser.rb index c757fa0..6dfa175 100644 --- a/parsers/ruby/lib/parser.rb +++ b/parsers/ruby/lib/parser.rb @@ -3,6 +3,20 @@ require 'ruby_parser' require 'sexp_processor' module Bitshift + class Tuple + attr_accessor :objects + + def initialize(arr) + @objects = arr + end + + def inspect + s = "(" + @objects.each {|o| s += "#{o},"} + s = s[0..-2] + ')' + end + end + class Parser def initialize(source) @source = source @@ -25,7 +39,7 @@ module Bitshift def initialize(offset, tree) super() - module_hash = Hash.new {|hash, key| hash[key] = { assignments: [], uses: [] }} + module_hash = Hash.new {|hash, key| hash[key] = { decls: [], uses: [] }} class_hash = module_hash.clone function_hash = module_hash.clone var_hash = module_hash.clone @@ -50,7 +64,7 @@ module Bitshift break if cur_exp == nil end - pos = [start_ln, -1, end_ln, -1] + pos = Tuple.new([start_ln, -1, end_ln, -1]) return pos end @@ -58,7 +72,7 @@ module Bitshift pos = Hash.new end_ln = start_ln = exp.line - offset - pos = [start_ln, -1, end_ln, -1] + pos = Tuple.new([start_ln, -1, end_ln, -1]) return pos end @@ -66,7 +80,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:modules][name][:assignments] << pos + symbols[:modules][name][:decls] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -75,7 +89,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:classes][name][:assignments] << pos + symbols[:classes][name][:decls] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -84,7 +98,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:functions][name][:assignments] << pos + symbols[:functions][name][:decls] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -103,7 +117,7 @@ module Bitshift pos = statement_position(exp) exp.shift name = exp.shift - symbols[:vars][name][:assignments] << pos + symbols[:vars][name][:decls] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -112,14 +126,22 @@ module Bitshift pos = statement_position(exp) exp.shift name = exp.shift - symbols[:vars][name][:assignments] << pos + symbols[:vars][name][:decls] << pos exp.each_sexp {|s| process(s)} return exp.clear end def to_s - str = symbols.to_s - str = str.gsub(/:(\w*)=>/, '"\1":') + new_symbols = Hash.new {|hash, key| hash[key] = []} + + symbols.each do |type, sym_list| + sym_list.each do |name, sym| + new_symbols[type.to_s] << Tuple.new(["'#{name}'", sym[:decls], sym[:uses]]) + end + end + + str = new_symbols.to_s + str = str.gsub(/=>/, ":") return str end end From 9f935bbb74093b62c35840791f1eabd3ed4f87db Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 31 May 2014 22:02:25 -0400 Subject: [PATCH 08/34] This is ugly, but it improves the current setup. --- bitshift/crawler/crawl.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/bitshift/crawler/crawl.py b/bitshift/crawler/crawl.py index 91e6675..d1922c9 100644 --- a/bitshift/crawler/crawl.py +++ b/bitshift/crawler/crawl.py @@ -33,10 +33,13 @@ def crawl(): thread.start() def _configure_logging(): - LOG_FILE_DIR = "log" + # This isn't ideal, since it means the bitshift python package must be kept + # inside the app, but it works for now: + root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + log_dir = os.path.join(root, "logs") - if not os.path.exists(LOG_FILE_DIR): - os.mkdir(LOG_FILE_DIR) + if not os.path.exists(log_dir): + os.mkdir(log_dir) logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) @@ -46,7 +49,7 @@ def _configure_logging(): " %(message)s"), datefmt="%y-%m-%d %H:%M:%S") handler = logging.handlers.TimedRotatingFileHandler( - "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1, + "%s/%s" % (log_dir, "app.log"), when="H", interval=1, backupCount=20) handler.setFormatter(formatter) From 5d9ef2774d63531ad5f9aab29466acc051e3fca8 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 31 May 2014 22:59:14 -0400 Subject: [PATCH 09/34] Some fixes, mainly involving language detection. --- .gitignore | 2 +- bitshift/codelet.py | 2 +- bitshift/languages.json | 281 +++++++++++++++++++++++++++++++++++++++++++- bitshift/parser/__init__.py | 37 +++--- bitshift/query/nodes.py | 2 +- 5 files changed, 305 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index 6156aee..b8b2697 100644 --- a/.gitignore +++ b/.gitignore @@ -51,4 +51,4 @@ target # Ctags */tags -log +logs diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 865ae52..a12ec3a 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -27,7 +27,7 @@ class Codelet(object): """ def __init__(self, name, code, filename, language, authors, code_url, - date_created, date_modified, rank, symbols=None, origin=None): + date_created, date_modified, rank, symbols=None, origin=None): """ Create a Codelet instance. diff --git a/bitshift/languages.json b/bitshift/languages.json index 02ca0ad..d855164 100644 --- a/bitshift/languages.json +++ b/bitshift/languages.json @@ -1,4 +1,283 @@ { "_comment" : "A list of programming languages supported by `bitshift`.", - "languages" : ["Debian Sourcelist", "Delphi", "JavaScript+Mako", "Brainfuck", "Ceylon", "JavaScript+Django/Jinja", "HTML+Evoque", "NumPy", "Modula-2", "LiveScript", "Nimrod", "Bash", "HTML+Django/Jinja", "CSS+PHP", "XML+Lasso", "VimL", "CSS+Genshi Text", "Fancy", "Coldfusion HTML", "cfstatement", "Scalate Server Page", "Smarty", "XML+Evoque", "haXe", "PowerShell", "Tea", "HTML+Cheetah", "Mason", "Django/Jinja", "JAGS", "ApacheConf", "DTD", "Lighttpd configuration file", "Java", "JavaScript+Genshi Text", "Scheme", "Nemerle", "RHTML", "Ragel in Java Host", "Darcs Patch", "Puppet", "Octave", "CoffeeScript", "Ragel in D Host", "Scilab", "Monkey", "HTML+Myghty", "CSS", "JavaScript+Smarty", "Io", "COBOLFree", "Asymptote", "vhdl", "Python 3", "CSS+Ruby", "Fortran", "d-objdump", "MySQL", "REBOL", "C++", "ERB", "CBM BASIC V2", "Befunge", "Julia", "MoonScript", "Ruby", "XML+Smarty", "Dylan", "Groovy", "MoinMoin/Trac Wiki markup", "autohotkey", "C", "HTML", "Felix", "CMake", "NSIS", "SourcePawn", "Mako", "VGL", "Velocity", "Koka", "CUDA", "Gnuplot", "IRC logs", "Prolog", "Python", "CSS+Django/Jinja", "verilog", "Smalltalk", "JavaScript+Myghty", "YAML", "Julia console", "ANTLR With ActionScript Target", "XML+Mako", "XSLT", "UrbiScript", "Scaml", "S", "DylanLID", "MAQL", "sqlite3con", "Boo", "OCaml", "eC", "ActionScript", "VB.net", "SquidConf", "XQuery", "D", "Fantom", "Gettext Catalog", "Logos", "Lasso", "SCSS", "BBCode", "Haml", "FoxPro", "Python 3.0 Traceback", "MuPAD", "XML+Ruby", "Dart", "IDL", "dg", "Evoque", "Jade", "c-objdump", "Kconfig", "Java Server Page", "reg", "ABAP", "XML+Velocity", "JavaScript+Cheetah", "HTML+Mako", "Ragel in Ruby Host", "RobotFramework", "Protocol Buffer", "CFEngine3", "Ragel", "GLSL", "COBOL", "TypeScript", "Ada", "PostgreSQL SQL dialect", "Xtend", "Logtalk", "objdump", "CSS+Mako", "ca65", "Objective-C++", "Gherkin", "HTML+PHP", "Makefile", "PostScript", "Hxml", "Kotlin", "PL/pgSQL", "Vala", "Haskell", "Bro", "Lua", "POVRay", "Sass", "ANTLR With Java Target", "Tcl", "ANTLR With ObjectiveC Target", "JavaScript+Ruby", "Racket", "AspectJ", "Base Makefile", "ANTLR With Python Target", "cpp-objdump", "Genshi Text", "Ioke", "PyPy Log", "Croc", "Objective-J", "GAS", "Batchfile", "Snobol", "XML", "ANTLR", "Opa", "XML+Cheetah", "Go", "Diff", "MiniD", "Cython", "Ragel in C Host", "Erlang", "Debian Control file", "aspx-vb", "BUGS", "Ragel in CPP Host", "aspx-cs", "Properties", "Groff", "Clojure", "Modelica", "QML", "JavaScript+Lasso", "ANTLR With Perl Target", "Genshi", "BlitzMax", "Treetop", "Matlab", "Myghty", "HTML+Genshi", "Duel", "Perl", "FSharp", "reStructuredText", "NewLisp", "Scala", "CSS+Lasso", "XML+PHP", "Stan", "INI", "MOOCode", "Shell Session", "RPMSpec", "Newspeak", "Bash Session", "Coq", "Raw token data", "Tcsh", "HTML+Lasso", "C#", "Gosu Template", "RConsole", "MXML", "TeX", "CSS+Smarty", "Text only", "ANTLR With C# Target", "OpenEdge ABL", "Cheetah", "Smali", "CSS+Myghty", "Rd", "LLVM", "Standard ML", "Elixir", "Nginx configuration file", "GoodData-CL", "AppleScript", "HTML+Smarty", "Objective-C", "JavaScript", "Rust", "Common Lisp", "Embedded Ragel", "ActionScript 3", "systemverilog", "Literate Haskell", "Python Traceback", "PHP", "ANTLR With CPP Target", "Gosu", "Hybris", "JavaScript+PHP", "Factor", "HTML+Velocity", "Mscgen", "Ooc", "SQL", "HTTP", "ECL", "Redcode", "Ragel in Objective C Host", "XML+Django/Jinja", "Awk", "JSON", "NASM", "ANTLR With Ruby Target", "XML+Myghty", "AutoIt", "Mako", "CSS+Mako", "HTML+Mako", "XML+Mako", "JavaScript+Mako"] + "languages" : [ + "Debian Sourcelist", + "Delphi", + "JavaScript+Mako", + "Brainfuck", + "Ceylon", + "JavaScript+Django/Jinja", + "HTML+Evoque", + "NumPy", + "Modula-2", + "LiveScript", + "Nimrod", + "Bash", + "HTML+Django/Jinja", + "CSS+PHP", + "XML+Lasso", + "VimL", + "CSS+Genshi Text", + "Fancy", + "Coldfusion HTML", + "cfstatement", + "Scalate Server Page", + "Smarty", + "XML+Evoque", + "haXe", + "PowerShell", + "Tea", + "HTML+Cheetah", + "Mason", + "Django/Jinja", + "JAGS", + "ApacheConf", + "DTD", + "Lighttpd configuration file", + "Java", + "JavaScript+Genshi Text", + "Scheme", + "Nemerle", + "RHTML", + "Ragel in Java Host", + "Darcs Patch", + "Puppet", + "Octave", + "CoffeeScript", + "Ragel in D Host", + "Scilab", + "Monkey", + "HTML+Myghty", + "CSS", + "JavaScript+Smarty", + "Io", + "COBOLFree", + "Asymptote", + "vhdl", + "CSS+Ruby", + "Fortran", + "d-objdump", + "MySQL", + "REBOL", + "C++", + "ERB", + "CBM BASIC V2", + "Befunge", + "Julia", + "MoonScript", + "Ruby", + "XML+Smarty", + "Dylan", + "Groovy", + "MoinMoin/Trac Wiki markup", + "autohotkey", + "C", + "HTML", + "Felix", + "CMake", + "NSIS", + "SourcePawn", + "Mako", + "VGL", + "Velocity", + "Koka", + "CUDA", + "Gnuplot", + "IRC logs", + "Prolog", + "Python", + "CSS+Django/Jinja", + "verilog", + "Smalltalk", + "JavaScript+Myghty", + "YAML", + "Julia console", + "ANTLR With ActionScript Target", + "XML+Mako", + "XSLT", + "UrbiScript", + "Scaml", + "S", + "DylanLID", + "MAQL", + "sqlite3con", + "Boo", + "OCaml", + "eC", + "ActionScript", + "VB.net", + "SquidConf", + "XQuery", + "D", + "Fantom", + "Gettext Catalog", + "Logos", + "Lasso", + "SCSS", + "BBCode", + "Haml", + "FoxPro", + "MuPAD", + "XML+Ruby", + "Dart", + "IDL", + "dg", + "Evoque", + "Jade", + "c-objdump", + "Kconfig", + "Java Server Page", + "reg", + "ABAP", + "XML+Velocity", + "JavaScript+Cheetah", + "HTML+Mako", + "Ragel in Ruby Host", + "RobotFramework", + "Protocol Buffer", + "CFEngine3", + "Ragel", + "GLSL", + "COBOL", + "TypeScript", + "Ada", + "PostgreSQL SQL dialect", + "Xtend", + "Logtalk", + "objdump", + "CSS+Mako", + "ca65", + "Objective-C++", + "Gherkin", + "HTML+PHP", + "Makefile", + "PostScript", + "Hxml", + "Kotlin", + "PL/pgSQL", + "Vala", + "Haskell", + "Bro", + "Lua", + "POVRay", + "Sass", + "ANTLR With Java Target", + "Tcl", + "ANTLR With ObjectiveC Target", + "JavaScript+Ruby", + "Racket", + "AspectJ", + "Base Makefile", + "ANTLR With Python Target", + "cpp-objdump", + "Genshi Text", + "Ioke", + "PyPy Log", + "Croc", + "Objective-J", + "GAS", + "Batchfile", + "Snobol", + "XML", + "ANTLR", + "Opa", + "XML+Cheetah", + "Go", + "Diff", + "MiniD", + "Cython", + "Ragel in C Host", + "Erlang", + "Debian Control file", + "aspx-vb", + "BUGS", + "Ragel in CPP Host", + "aspx-cs", + "Properties", + "Groff", + "Clojure", + "Modelica", + "QML", + "JavaScript+Lasso", + "ANTLR With Perl Target", + "Genshi", + "BlitzMax", + "Treetop", + "Matlab", + "Myghty", + "HTML+Genshi", + "Duel", + "Perl", + "FSharp", + "reStructuredText", + "NewLisp", + "Scala", + "CSS+Lasso", + "XML+PHP", + "Stan", + "INI", + "MOOCode", + "Shell Session", + "RPMSpec", + "Newspeak", + "Bash Session", + "Coq", + "Raw token data", + "Tcsh", + "HTML+Lasso", + "C#", + "Gosu Template", + "RConsole", + "MXML", + "TeX", + "CSS+Smarty", + "Text only", + "ANTLR With C# Target", + "OpenEdge ABL", + "Cheetah", + "Smali", + "CSS+Myghty", + "Rd", + "LLVM", + "Standard ML", + "Elixir", + "Nginx configuration file", + "GoodData-CL", + "AppleScript", + "HTML+Smarty", + "Objective-C", + "JavaScript", + "Rust", + "Common Lisp", + "Embedded Ragel", + "ActionScript 3", + "systemverilog", + "Literate Haskell", + "PHP", + "ANTLR With CPP Target", + "Gosu", + "Hybris", + "JavaScript+PHP", + "Factor", + "HTML+Velocity", + "Mscgen", + "Ooc", + "SQL", + "HTTP", + "ECL", + "Redcode", + "Ragel in Objective C Host", + "XML+Django/Jinja", + "Awk", + "JSON", + "NASM", + "ANTLR With Ruby Target", + "XML+Myghty", + "AutoIt", + "Mako", + "CSS+Mako", + "HTML+Mako", + "XML+Mako", + "JavaScript+Mako" + ] } diff --git a/bitshift/parser/__init__.py b/bitshift/parser/__init__.py index bc22514..79fca78 100644 --- a/bitshift/parser/__init__.py +++ b/bitshift/parser/__init__.py @@ -1,4 +1,10 @@ -import json, pygments.lexers as pgl, sys, socket, struct +import json +import sys +import socket +import struct + +from pygments import lexers as pgl, util + from ..languages import LANGS from .python import parse_py @@ -19,13 +25,14 @@ def _lang(codelet): Modify function to incorporate tags from stackoverflow. """ - if codelet.filename is not None: - try: - return pgl.guess_lexer_for_filename(codelet.filename, codelet.code).name - except: - raise UnsupportedFileError('Could not find a lexer for the codelet\'s filename') - - return LANGS.index(pgl.guess_lexer(codelet.code)) + try: + if codelet.filename: + lex = pgl.guess_lexer_for_filename(codelet.filename, codelet.code) + else: + lex = pgl.guess_lexer(codelet.code) + except util.ClassNotFound: + raise UnsupportedFileError(codelet.filename) + return LANGS.index(lex.name) def _recv_data(server_socket): """ @@ -39,8 +46,9 @@ def _recv_data(server_socket): """ recv_size = 8192 - total_data = []; size_data = cur_data = '' - total_size = 0; size = sys.maxint + total_data = [] + size_data = cur_data = '' + total_size, size = 0, sys.maxint while total_size < size: cur_data = server_socket.recv(recv_size) @@ -61,8 +69,7 @@ def _recv_data(server_socket): total_size = sum([len(s) for s in total_data]) server_socket.close() - return ''.join(total_data); - + return ''.join(total_data) def parse(codelet): """ @@ -76,7 +83,8 @@ def parse(codelet): :type code: Codelet """ - lang = _lang(codelet); source = codelet.code + lang = _lang(codelet) + source = codelet.code codelet.language = lang server_socket_number = 5000 + lang @@ -86,8 +94,7 @@ def parse(codelet): else: server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server_socket.connect(("localhost", server_socket_number)) - server_socket.send("%d\n%s" % (len(source), source)); + server_socket.send("%d\n%s" % (len(source), source)) symbols = json.loads(_recv_data(server_socket)) codelet.symbols = symbols - diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 5d157b5..d375ffb 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -195,7 +195,7 @@ class Symbol(_Node): CLASS = 1 VARIABLE = 2 TYPES = {FUNCTION: "FUNCTION", CLASS: "CLASS", VARIABLE: "VARIABLE"} - TYPES_INV = ["functions", "classes", "variables"] + TYPES_INV = ["functions", "classes", "vars"] def __init__(self, type_, name): """ From a3eacc287ea16a661c78be4915e3d3cf7153d532 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 31 May 2014 23:11:42 -0400 Subject: [PATCH 10/34] Try to make exception reporting more useful. --- bitshift/crawler/indexer.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index ad730c8..b5b64eb 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -100,10 +100,7 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - try: - self._index_repository(repo) - except Exception as excep: - self._logger.warning("%s: %s.", excep.__class__.__name__, excep) + self._index_repository(repo) def _index_repository(self, repo): """ @@ -121,10 +118,10 @@ class GitIndexer(threading.Thread): try: self._insert_repository_codelets(repo) except Exception as excep: - self._logger.warning("%s: %s.", excep.__class__.__name__, excep) - - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) + self._logger.exception("Exception raised while indexing:") + finally: + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) def _insert_repository_codelets(self, repo): """ From fc8d478060b2168771650b4f68ac9890a8ead1fa Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 31 May 2014 23:22:28 -0400 Subject: [PATCH 11/34] Untested fix #33. Add: bitshift/crawler/indexer.py -Add conditional to remove the full path of a repository if the owner's directory contains only one sub-directory. --- bitshift/crawler/indexer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index b5b64eb..5a351c0 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -121,7 +121,12 @@ class GitIndexer(threading.Thread): self._logger.exception("Exception raised while indexing:") finally: if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) + if len([obj for obj in os.listdir('.') if + os.path.isdir(obj)]) <= 1: + shutil.rmtree("%s/%s" % ( + GIT_CLONE_DIR, repo.name.split("/")[0])) + else: + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) def _insert_repository_codelets(self, repo): """ From 5a83720617dff15f912d4401492cf1a6c47074c7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 31 May 2014 23:38:11 -0400 Subject: [PATCH 12/34] Strip encoding lines. --- bitshift/crawler/crawl.py | 2 +- bitshift/parser/python.py | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/bitshift/crawler/crawl.py b/bitshift/crawler/crawl.py index d1922c9..c121866 100644 --- a/bitshift/crawler/crawl.py +++ b/bitshift/crawler/crawl.py @@ -45,7 +45,7 @@ def _configure_logging(): logging.getLogger("urllib3").setLevel(logging.WARNING) formatter = logging.Formatter( - fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s" + fmt=("%(asctime)s %(levelname)s %(name)s:%(funcName)s" " %(message)s"), datefmt="%y-%m-%d %H:%M:%S") handler = logging.handlers.TimedRotatingFileHandler( diff --git a/bitshift/parser/python.py b/bitshift/parser/python.py index d0cd7d3..713cae9 100644 --- a/bitshift/parser/python.py +++ b/bitshift/parser/python.py @@ -1,4 +1,7 @@ import ast +import re + +encoding_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", re.UNICODE) class _CachedWalker(ast.NodeVisitor): """ @@ -154,7 +157,25 @@ def parse_py(codelet): :type code: Codelet """ - tree = ast.parse(codelet.code) + def strip_encoding(lines): + """Strips the encoding line from a file, which breaks the parser.""" + try: + first = next(lines) + if not encoding_re.match(first): + yield first + second = next(lines) + if not encoding_re.match(second): + yield second + except StopIteration: + return + for line in lines: + yield line + + try: + tree = ast.parse("\n".join(strip_encoding(codelet.code.splitlines()))) + except SyntaxError: + ## TODO: add some logging here? + return cutter = _CachedWalker() cutter.visit(tree) codelet.symbols = cutter.accum From 94953624c8ff761c6fb869dfa05a8d681c0010f1 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 31 May 2014 23:37:21 -0400 Subject: [PATCH 13/34] Fix #34. Add: bitshift/crawler/indexer.py -Add a `try-except` block to catch the `UnsupportedFileError` exception. --- bitshift/crawler/indexer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 5a351c0..c66df0b 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -162,8 +162,11 @@ class GitIndexer(threading.Thread): commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"], repo.rank) - parse(codelet) - self.database.insert(codelet) + try: + parse(codelet) + self.database.insert(codelet) + except UnsupportedFileError as excep: + pass def _generate_file_url(self, filename, repo_url, framework_name): """ From 0fb2cacbd4f13e881387d2fef152c3cb8fb92b5f Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Sat, 31 May 2014 23:42:10 -0400 Subject: [PATCH 14/34] Make sure parse servers return json loadable data. Make sure Java server closes client connection. --- .../src/main/java/com/bitshift/parsing/Parse.java | 13 +--- .../com/bitshift/parsing/parsers/JavaParser.java | 27 ++------ .../java/com/bitshift/parsing/parsers/Parser.java | 15 +++-- .../com/bitshift/parsing/symbols/JavaSymbols.java | 71 ++++++++++++---------- .../com/bitshift/parsing/utils/PackableMemory.java | 2 +- .../java/com/bitshift/parsing/utils/Tuple.java | 23 +++++++ parsers/ruby/lib/parse_server.rb | 2 +- parsers/ruby/lib/parser.rb | 37 ++++------- 8 files changed, 93 insertions(+), 97 deletions(-) create mode 100644 parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java diff --git a/parsers/java/src/main/java/com/bitshift/parsing/Parse.java b/parsers/java/src/main/java/com/bitshift/parsing/Parse.java index fc1d36f..302c083 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/Parse.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/Parse.java @@ -1,8 +1,5 @@ package com.bitshift.parsing; -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.io.PrintWriter; import java.io.IOException; import java.net.ServerSocket; @@ -13,18 +10,12 @@ import com.bitshift.parsing.parsers.JavaParser; public class Parse { public static void main(String[] args) { - String fromClient; - String toClient; - try { - ServerSocket server = new ServerSocket(5002); + ServerSocket server = new ServerSocket(5033); while(true) { Socket clientSocket = server.accept(); - - JavaParser parser = new JavaParser(clientSocket); - Thread parserTask = new Thread(parser); - parserTask.start(); + new Thread(new JavaParser(clientSocket)).start(); } } catch (IOException ex) { } diff --git a/parsers/java/src/main/java/com/bitshift/parsing/parsers/JavaParser.java b/parsers/java/src/main/java/com/bitshift/parsing/parsers/JavaParser.java index 4ba3623..989c0dd 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/parsers/JavaParser.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/parsers/JavaParser.java @@ -13,7 +13,6 @@ import org.eclipse.jdt.core.dom.ASTParser; import org.eclipse.jdt.core.dom.ASTVisitor; import org.eclipse.jdt.core.dom.CompilationUnit; import org.eclipse.jdt.core.dom.ClassInstanceCreation; -import org.eclipse.jdt.core.dom.FieldDeclaration; import org.eclipse.jdt.core.dom.MethodDeclaration; import org.eclipse.jdt.core.dom.MethodInvocation; import org.eclipse.jdt.core.dom.Name; @@ -71,22 +70,6 @@ public class JavaParser extends Parser { this._cache = new Stack>(); } - public boolean visit(FieldDeclaration node) { - HashMap data = new HashMap(); - int sl = this.root.getLineNumber(node.getStartPosition()); - int sc = this.root.getColumnNumber(node.getStartPosition()); - - data.put("coord", Symbols.createCoord(sl, sc, -1, -1)); - this._cache.push(data); - return true; - } - - public void endVisit(FieldDeclaration node) { - HashMap data = this._cache.pop(); - String name = (String)data.remove("name"); - this.symbols.insertFieldDeclaration(name, data); - } - public boolean visit(MethodDeclaration node) { HashMap data = new HashMap(); Name nameObj = node.getName(); @@ -115,7 +98,7 @@ public class JavaParser extends Parser { public void endVisit(MethodDeclaration node) { HashMap data = this._cache.pop(); String name = (String)data.remove("name"); - this.symbols.insertMethodDeclaration(name, data); + this.symbols.insertMethodDeclaration("\"" + name + "\"", data); } public boolean visit(MethodInvocation node) { @@ -136,7 +119,7 @@ public class JavaParser extends Parser { public void endVisit(MethodInvocation node) { HashMap data = this._cache.pop(); String name = (String)data.remove("name"); - this.symbols.insertMethodInvocation(name, data); + this.symbols.insertMethodInvocation("\"" + name + "\"", data); } public boolean visit(PackageDeclaration node) { @@ -167,9 +150,9 @@ public class JavaParser extends Parser { String name = (String)data.remove("name"); if (node.isInterface()) { - this.symbols.insertInterfaceDeclaration(name, data); + this.symbols.insertInterfaceDeclaration("\"" + name + "\"", data); } else { - this.symbols.insertClassDeclaration(name, data); + this.symbols.insertClassDeclaration("\"" + name + "\"", data); } } @@ -186,7 +169,7 @@ public class JavaParser extends Parser { public void endVisit(VariableDeclarationFragment node) { HashMap data = this._cache.pop(); String name = (String)data.remove("name"); - this.symbols.insertVariableDeclaration(name, data); + this.symbols.insertVariableDeclaration("\"" + name + "\"", data); } public boolean visit(QualifiedName node) { diff --git a/parsers/java/src/main/java/com/bitshift/parsing/parsers/Parser.java b/parsers/java/src/main/java/com/bitshift/parsing/parsers/Parser.java index 9d00954..83100f5 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/parsers/Parser.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/parsers/Parser.java @@ -1,8 +1,9 @@ package com.bitshift.parsing.parsers; import java.io.BufferedReader; +import java.io.BufferedWriter; import java.io.InputStreamReader; -import java.io.PrintWriter; +import java.io.OutputStreamWriter; import java.io.IOException; import java.net.Socket; @@ -46,12 +47,16 @@ public abstract class Parser implements Runnable { protected void writeToClient(String toClient) { try { - PrintWriter clientWriter = new PrintWriter( - this.clientSocket.getOutputStream(), true); + BufferedWriter clientWriter = new BufferedWriter( + new OutputStreamWriter(this.clientSocket.getOutputStream())); - PackableMemory mem = new PackableMemory(toClient.length()); + PackableMemory mem = new PackableMemory(4); + mem.pack(toClient.length(), 0); String dataSize = new String(mem.mem); - clientWriter.println(dataSize + toClient); + + clientWriter.write(dataSize + toClient); + clientWriter.flush(); + this.clientSocket.close(); } catch (IOException ex) { } } diff --git a/parsers/java/src/main/java/com/bitshift/parsing/symbols/JavaSymbols.java b/parsers/java/src/main/java/com/bitshift/parsing/symbols/JavaSymbols.java index 5419d5a..6f0caf1 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/symbols/JavaSymbols.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/symbols/JavaSymbols.java @@ -11,15 +11,16 @@ public class JavaSymbols extends Symbols { private HashMap> _classes; private HashMap> _interfaces; private HashMap> _methods; - private HashMap> _fields; private HashMap> _vars; + private final String assignKey = "\"assignments\""; + private final String useKey = "\"uses\""; + public JavaSymbols() { _packageName = null; _classes = new HashMap>(); _interfaces = new HashMap>(); _methods = new HashMap>(); - _fields = new HashMap>(); _vars = new HashMap>(); } @@ -34,15 +35,23 @@ public class JavaSymbols extends Symbols { HashMap klass = new HashMap(); assignments.add(data.get("coord")); - klass.put("assignments", assignments); - klass.put("uses", uses); + klass.put(assignKey, assignments); + klass.put(useKey, uses); this._classes.put(name, klass); return true; } public boolean insertInterfaceDeclaration(String name, HashMap data) { - this._interfaces.put(name, data); + ArrayList assignments = new ArrayList(10); + ArrayList uses = new ArrayList(10); + HashMap klass = new HashMap(); + + assignments.add(data.get("coord")); + klass.put(assignKey, assignments); + klass.put(useKey, uses); + + this._interfaces.put(name, klass); return true; } @@ -54,13 +63,13 @@ public class JavaSymbols extends Symbols { ArrayList uses = new ArrayList(10); assignments.add(data.get("coord")); - method.put("assignments", assignments); - method.put("uses", uses); + method.put(assignKey, assignments); + method.put(useKey, uses); } else { - ArrayList assignments = (ArrayList)method.get("assignments"); + ArrayList assignments = (ArrayList)method.get(assignKey); assignments.add(data.get("coord")); - method.put("assignments", assignments); + method.put(assignKey, assignments); } this._methods.put(name, method); @@ -74,24 +83,19 @@ public class JavaSymbols extends Symbols { ArrayList uses = new ArrayList(10); uses.add(data.get("coord")); - method.put("assignments", assignments); - method.put("uses", uses); + method.put(assignKey, assignments); + method.put(useKey, uses); } else { - ArrayList uses = (ArrayList)method.get("uses"); + ArrayList uses = (ArrayList)method.get(useKey); uses.add(data.get("coord")); - method.put("uses", uses); + method.put(useKey, uses); } this._methods.put(name, method); return true; } - public boolean insertFieldDeclaration(String name, HashMap data) { - this._fields.put(name, data); - return true; - } - public boolean insertVariableDeclaration(String name, HashMap data) { HashMap var = this._vars.get(name); if (var == null) { @@ -100,13 +104,13 @@ public class JavaSymbols extends Symbols { ArrayList uses = new ArrayList(10); assignments.add(data.get("coord")); - var.put("assignments", assignments); - var.put("uses", uses); + var.put(assignKey, assignments); + var.put(useKey, uses); } else { - ArrayList assignments = (ArrayList)var.get("assignments"); + ArrayList assignments = (ArrayList)var.get(assignKey); assignments.add(data.get("coord")); - var.put("assignments", assignments); + var.put(assignKey, assignments); } this._vars.put(name, var); @@ -120,13 +124,13 @@ public class JavaSymbols extends Symbols { ArrayList uses = new ArrayList(10); uses.add(data.get("coord")); - var.put("assignments", assignments); - var.put("uses", uses); + var.put(assignKey, assignments); + var.put(useKey, uses); } else { - ArrayList uses = (ArrayList)var.get("uses"); + ArrayList uses = (ArrayList)var.get(useKey); uses.add(data.get("coord")); - var.put("uses", uses); + var.put(useKey, uses); } this._vars.put(name, var); @@ -135,13 +139,14 @@ public class JavaSymbols extends Symbols { public String toString() { StringBuilder builder = new StringBuilder(); - builder.append("classes:" + this._classes + ","); - builder.append("interfaces:" + this._interfaces + ","); - builder.append("methods:" + this._methods + ","); - builder.append("fields:" + this._fields + ","); - builder.append("vars:" + this._vars + ","); - - return "{" + builder.toString() + "}"; + builder.append("\"classes\":" + this._classes + ","); + builder.append("\"interfaces\":" + this._interfaces + ","); + builder.append("\"methods\":" + this._methods + ","); + builder.append("\"vars\":" + this._vars + ","); + + String s = builder.toString().replaceAll("=", ":"); + s = s.substring(0, s.length() - 1); + return "{" + s + "}"; } } diff --git a/parsers/java/src/main/java/com/bitshift/parsing/utils/PackableMemory.java b/parsers/java/src/main/java/com/bitshift/parsing/utils/PackableMemory.java index 24d883c..1f54d99 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/utils/PackableMemory.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/utils/PackableMemory.java @@ -22,7 +22,7 @@ public class PackableMemory { // The most significant porion of the integer is stored in mem[loc]. // Bytes are masked out of the integer and stored in the array, working // from right(least significant) to left (most significant). - void pack(int val, int loc) + public void pack(int val, int loc) { final int MASK = 0xff; for (int i = 3; i >= 0; i--) diff --git a/parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java b/parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java new file mode 100644 index 0000000..115a3c6 --- /dev/null +++ b/parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java @@ -0,0 +1,23 @@ +package com.bitshift.parsing.utils; + +import java.util.List; +import java.util.Arrays; + +public class Tuple { + private List _objects; + + public Tuple(T... args) { + _objects = Arrays.asList(args); + } + + public String toString() { + StringBuilder builder = new StringBuilder(); + + for(T o: this._objects) { + builder.append(o + ","); + } + + String s = builder.toString(); + return "(" + s.substring(0, s.length() - 1) + ")"; + } +} diff --git a/parsers/ruby/lib/parse_server.rb b/parsers/ruby/lib/parse_server.rb index 916f434..9a929aa 100644 --- a/parsers/ruby/lib/parse_server.rb +++ b/parsers/ruby/lib/parse_server.rb @@ -14,7 +14,7 @@ end def start_server - server = TCPServer.new 5003 + server = TCPServer.new 5065 loop do # Start a new thread for each client accepted diff --git a/parsers/ruby/lib/parser.rb b/parsers/ruby/lib/parser.rb index 6dfa175..eec293b 100644 --- a/parsers/ruby/lib/parser.rb +++ b/parsers/ruby/lib/parser.rb @@ -3,20 +3,6 @@ require 'ruby_parser' require 'sexp_processor' module Bitshift - class Tuple - attr_accessor :objects - - def initialize(arr) - @objects = arr - end - - def inspect - s = "(" - @objects.each {|o| s += "#{o},"} - s = s[0..-2] + ')' - end - end - class Parser def initialize(source) @source = source @@ -39,7 +25,8 @@ module Bitshift def initialize(offset, tree) super() - module_hash = Hash.new {|hash, key| hash[key] = { decls: [], uses: [] }} + module_hash = Hash.new {|hash, key| + hash[key] = { assignments: [], uses: [] }} class_hash = module_hash.clone function_hash = module_hash.clone var_hash = module_hash.clone @@ -64,7 +51,7 @@ module Bitshift break if cur_exp == nil end - pos = Tuple.new([start_ln, -1, end_ln, -1]) + pos = [start_ln, -1, end_ln, -1] return pos end @@ -72,7 +59,7 @@ module Bitshift pos = Hash.new end_ln = start_ln = exp.line - offset - pos = Tuple.new([start_ln, -1, end_ln, -1]) + pos = [start_ln, -1, end_ln, -1] return pos end @@ -80,7 +67,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:modules][name][:decls] << pos + symbols[:modules][name][:assignments] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -89,7 +76,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:classes][name][:decls] << pos + symbols[:classes][name][:assignments] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -98,7 +85,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:functions][name][:decls] << pos + symbols[:functions][name][:assignments] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -117,7 +104,7 @@ module Bitshift pos = statement_position(exp) exp.shift name = exp.shift - symbols[:vars][name][:decls] << pos + symbols[:vars][name][:assignments] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -126,17 +113,19 @@ module Bitshift pos = statement_position(exp) exp.shift name = exp.shift - symbols[:vars][name][:decls] << pos + symbols[:vars][name][:assignments] << pos exp.each_sexp {|s| process(s)} return exp.clear end def to_s - new_symbols = Hash.new {|hash, key| hash[key] = []} + new_symbols = Hash.new {|hash, key| hash[key] = Hash.new} symbols.each do |type, sym_list| sym_list.each do |name, sym| - new_symbols[type.to_s] << Tuple.new(["'#{name}'", sym[:decls], sym[:uses]]) + new_symbols[type.to_s][name.to_s] = { + "assignments" => sym[:assignments], + "uses" => sym[:uses]} end end From 5e6e3fcbf59532a68c0cb3ec8c788050ae6ac814 Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Sat, 31 May 2014 23:42:44 -0400 Subject: [PATCH 15/34] Adjust test for new language dict. --- bitshift/parser/__init__.py | 3 ++- test/parser_test.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bitshift/parser/__init__.py b/bitshift/parser/__init__.py index bc22514..c3af22e 100644 --- a/bitshift/parser/__init__.py +++ b/bitshift/parser/__init__.py @@ -21,7 +21,8 @@ def _lang(codelet): if codelet.filename is not None: try: - return pgl.guess_lexer_for_filename(codelet.filename, codelet.code).name + return LANGS.index( + pgl.get_lexer_for_filename(codelet.filename).name) except: raise UnsupportedFileError('Could not find a lexer for the codelet\'s filename') diff --git a/test/parser_test.py b/test/parser_test.py index a1cfad3..ffee75c 100644 --- a/test/parser_test.py +++ b/test/parser_test.py @@ -21,7 +21,7 @@ if __name__ == '__main__': elif sys.argv[1] == 'ruby': file_name = "resources/parser.rb" - server_socket_number = 5003 + server_socket_number = 5065 server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server_socket.connect(("localhost", server_socket_number)) From 5d8ac664fee099ec970da3368fa1f7a425ededa1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 31 May 2014 23:44:30 -0400 Subject: [PATCH 16/34] HAHA WHAT ARE ITERATORS --- bitshift/parser/python.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bitshift/parser/python.py b/bitshift/parser/python.py index 713cae9..3cb141d 100644 --- a/bitshift/parser/python.py +++ b/bitshift/parser/python.py @@ -159,16 +159,17 @@ def parse_py(codelet): def strip_encoding(lines): """Strips the encoding line from a file, which breaks the parser.""" + it = iter(lines) try: - first = next(lines) + first = next(it) if not encoding_re.match(first): yield first - second = next(lines) + second = next(it) if not encoding_re.match(second): yield second except StopIteration: return - for line in lines: + for line in it: yield line try: From 5d3baf49d142edb91150e61a13113fb04ffdf536 Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Fri, 30 May 2014 23:54:51 -0400 Subject: [PATCH 17/34] Change format of string returned from ruby server to correct symbol table format. --- parsers/ruby/lib/parser.rb | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/parsers/ruby/lib/parser.rb b/parsers/ruby/lib/parser.rb index c757fa0..6dfa175 100644 --- a/parsers/ruby/lib/parser.rb +++ b/parsers/ruby/lib/parser.rb @@ -3,6 +3,20 @@ require 'ruby_parser' require 'sexp_processor' module Bitshift + class Tuple + attr_accessor :objects + + def initialize(arr) + @objects = arr + end + + def inspect + s = "(" + @objects.each {|o| s += "#{o},"} + s = s[0..-2] + ')' + end + end + class Parser def initialize(source) @source = source @@ -25,7 +39,7 @@ module Bitshift def initialize(offset, tree) super() - module_hash = Hash.new {|hash, key| hash[key] = { assignments: [], uses: [] }} + module_hash = Hash.new {|hash, key| hash[key] = { decls: [], uses: [] }} class_hash = module_hash.clone function_hash = module_hash.clone var_hash = module_hash.clone @@ -50,7 +64,7 @@ module Bitshift break if cur_exp == nil end - pos = [start_ln, -1, end_ln, -1] + pos = Tuple.new([start_ln, -1, end_ln, -1]) return pos end @@ -58,7 +72,7 @@ module Bitshift pos = Hash.new end_ln = start_ln = exp.line - offset - pos = [start_ln, -1, end_ln, -1] + pos = Tuple.new([start_ln, -1, end_ln, -1]) return pos end @@ -66,7 +80,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:modules][name][:assignments] << pos + symbols[:modules][name][:decls] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -75,7 +89,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:classes][name][:assignments] << pos + symbols[:classes][name][:decls] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -84,7 +98,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:functions][name][:assignments] << pos + symbols[:functions][name][:decls] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -103,7 +117,7 @@ module Bitshift pos = statement_position(exp) exp.shift name = exp.shift - symbols[:vars][name][:assignments] << pos + symbols[:vars][name][:decls] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -112,14 +126,22 @@ module Bitshift pos = statement_position(exp) exp.shift name = exp.shift - symbols[:vars][name][:assignments] << pos + symbols[:vars][name][:decls] << pos exp.each_sexp {|s| process(s)} return exp.clear end def to_s - str = symbols.to_s - str = str.gsub(/:(\w*)=>/, '"\1":') + new_symbols = Hash.new {|hash, key| hash[key] = []} + + symbols.each do |type, sym_list| + sym_list.each do |name, sym| + new_symbols[type.to_s] << Tuple.new(["'#{name}'", sym[:decls], sym[:uses]]) + end + end + + str = new_symbols.to_s + str = str.gsub(/=>/, ":") return str end end From 1f56300c73ef236936d340256333fb635b6c14dd Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Sat, 31 May 2014 23:42:10 -0400 Subject: [PATCH 18/34] Make sure parse servers return json loadable data. Make sure Java server closes client connection. --- .../src/main/java/com/bitshift/parsing/Parse.java | 13 +--- .../com/bitshift/parsing/parsers/JavaParser.java | 27 ++------ .../java/com/bitshift/parsing/parsers/Parser.java | 15 +++-- .../com/bitshift/parsing/symbols/JavaSymbols.java | 71 ++++++++++++---------- .../com/bitshift/parsing/utils/PackableMemory.java | 2 +- .../java/com/bitshift/parsing/utils/Tuple.java | 23 +++++++ parsers/ruby/lib/parse_server.rb | 2 +- parsers/ruby/lib/parser.rb | 37 ++++------- 8 files changed, 93 insertions(+), 97 deletions(-) create mode 100644 parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java diff --git a/parsers/java/src/main/java/com/bitshift/parsing/Parse.java b/parsers/java/src/main/java/com/bitshift/parsing/Parse.java index fc1d36f..302c083 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/Parse.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/Parse.java @@ -1,8 +1,5 @@ package com.bitshift.parsing; -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.io.PrintWriter; import java.io.IOException; import java.net.ServerSocket; @@ -13,18 +10,12 @@ import com.bitshift.parsing.parsers.JavaParser; public class Parse { public static void main(String[] args) { - String fromClient; - String toClient; - try { - ServerSocket server = new ServerSocket(5002); + ServerSocket server = new ServerSocket(5033); while(true) { Socket clientSocket = server.accept(); - - JavaParser parser = new JavaParser(clientSocket); - Thread parserTask = new Thread(parser); - parserTask.start(); + new Thread(new JavaParser(clientSocket)).start(); } } catch (IOException ex) { } diff --git a/parsers/java/src/main/java/com/bitshift/parsing/parsers/JavaParser.java b/parsers/java/src/main/java/com/bitshift/parsing/parsers/JavaParser.java index 4ba3623..989c0dd 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/parsers/JavaParser.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/parsers/JavaParser.java @@ -13,7 +13,6 @@ import org.eclipse.jdt.core.dom.ASTParser; import org.eclipse.jdt.core.dom.ASTVisitor; import org.eclipse.jdt.core.dom.CompilationUnit; import org.eclipse.jdt.core.dom.ClassInstanceCreation; -import org.eclipse.jdt.core.dom.FieldDeclaration; import org.eclipse.jdt.core.dom.MethodDeclaration; import org.eclipse.jdt.core.dom.MethodInvocation; import org.eclipse.jdt.core.dom.Name; @@ -71,22 +70,6 @@ public class JavaParser extends Parser { this._cache = new Stack>(); } - public boolean visit(FieldDeclaration node) { - HashMap data = new HashMap(); - int sl = this.root.getLineNumber(node.getStartPosition()); - int sc = this.root.getColumnNumber(node.getStartPosition()); - - data.put("coord", Symbols.createCoord(sl, sc, -1, -1)); - this._cache.push(data); - return true; - } - - public void endVisit(FieldDeclaration node) { - HashMap data = this._cache.pop(); - String name = (String)data.remove("name"); - this.symbols.insertFieldDeclaration(name, data); - } - public boolean visit(MethodDeclaration node) { HashMap data = new HashMap(); Name nameObj = node.getName(); @@ -115,7 +98,7 @@ public class JavaParser extends Parser { public void endVisit(MethodDeclaration node) { HashMap data = this._cache.pop(); String name = (String)data.remove("name"); - this.symbols.insertMethodDeclaration(name, data); + this.symbols.insertMethodDeclaration("\"" + name + "\"", data); } public boolean visit(MethodInvocation node) { @@ -136,7 +119,7 @@ public class JavaParser extends Parser { public void endVisit(MethodInvocation node) { HashMap data = this._cache.pop(); String name = (String)data.remove("name"); - this.symbols.insertMethodInvocation(name, data); + this.symbols.insertMethodInvocation("\"" + name + "\"", data); } public boolean visit(PackageDeclaration node) { @@ -167,9 +150,9 @@ public class JavaParser extends Parser { String name = (String)data.remove("name"); if (node.isInterface()) { - this.symbols.insertInterfaceDeclaration(name, data); + this.symbols.insertInterfaceDeclaration("\"" + name + "\"", data); } else { - this.symbols.insertClassDeclaration(name, data); + this.symbols.insertClassDeclaration("\"" + name + "\"", data); } } @@ -186,7 +169,7 @@ public class JavaParser extends Parser { public void endVisit(VariableDeclarationFragment node) { HashMap data = this._cache.pop(); String name = (String)data.remove("name"); - this.symbols.insertVariableDeclaration(name, data); + this.symbols.insertVariableDeclaration("\"" + name + "\"", data); } public boolean visit(QualifiedName node) { diff --git a/parsers/java/src/main/java/com/bitshift/parsing/parsers/Parser.java b/parsers/java/src/main/java/com/bitshift/parsing/parsers/Parser.java index 9d00954..83100f5 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/parsers/Parser.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/parsers/Parser.java @@ -1,8 +1,9 @@ package com.bitshift.parsing.parsers; import java.io.BufferedReader; +import java.io.BufferedWriter; import java.io.InputStreamReader; -import java.io.PrintWriter; +import java.io.OutputStreamWriter; import java.io.IOException; import java.net.Socket; @@ -46,12 +47,16 @@ public abstract class Parser implements Runnable { protected void writeToClient(String toClient) { try { - PrintWriter clientWriter = new PrintWriter( - this.clientSocket.getOutputStream(), true); + BufferedWriter clientWriter = new BufferedWriter( + new OutputStreamWriter(this.clientSocket.getOutputStream())); - PackableMemory mem = new PackableMemory(toClient.length()); + PackableMemory mem = new PackableMemory(4); + mem.pack(toClient.length(), 0); String dataSize = new String(mem.mem); - clientWriter.println(dataSize + toClient); + + clientWriter.write(dataSize + toClient); + clientWriter.flush(); + this.clientSocket.close(); } catch (IOException ex) { } } diff --git a/parsers/java/src/main/java/com/bitshift/parsing/symbols/JavaSymbols.java b/parsers/java/src/main/java/com/bitshift/parsing/symbols/JavaSymbols.java index 5419d5a..6f0caf1 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/symbols/JavaSymbols.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/symbols/JavaSymbols.java @@ -11,15 +11,16 @@ public class JavaSymbols extends Symbols { private HashMap> _classes; private HashMap> _interfaces; private HashMap> _methods; - private HashMap> _fields; private HashMap> _vars; + private final String assignKey = "\"assignments\""; + private final String useKey = "\"uses\""; + public JavaSymbols() { _packageName = null; _classes = new HashMap>(); _interfaces = new HashMap>(); _methods = new HashMap>(); - _fields = new HashMap>(); _vars = new HashMap>(); } @@ -34,15 +35,23 @@ public class JavaSymbols extends Symbols { HashMap klass = new HashMap(); assignments.add(data.get("coord")); - klass.put("assignments", assignments); - klass.put("uses", uses); + klass.put(assignKey, assignments); + klass.put(useKey, uses); this._classes.put(name, klass); return true; } public boolean insertInterfaceDeclaration(String name, HashMap data) { - this._interfaces.put(name, data); + ArrayList assignments = new ArrayList(10); + ArrayList uses = new ArrayList(10); + HashMap klass = new HashMap(); + + assignments.add(data.get("coord")); + klass.put(assignKey, assignments); + klass.put(useKey, uses); + + this._interfaces.put(name, klass); return true; } @@ -54,13 +63,13 @@ public class JavaSymbols extends Symbols { ArrayList uses = new ArrayList(10); assignments.add(data.get("coord")); - method.put("assignments", assignments); - method.put("uses", uses); + method.put(assignKey, assignments); + method.put(useKey, uses); } else { - ArrayList assignments = (ArrayList)method.get("assignments"); + ArrayList assignments = (ArrayList)method.get(assignKey); assignments.add(data.get("coord")); - method.put("assignments", assignments); + method.put(assignKey, assignments); } this._methods.put(name, method); @@ -74,24 +83,19 @@ public class JavaSymbols extends Symbols { ArrayList uses = new ArrayList(10); uses.add(data.get("coord")); - method.put("assignments", assignments); - method.put("uses", uses); + method.put(assignKey, assignments); + method.put(useKey, uses); } else { - ArrayList uses = (ArrayList)method.get("uses"); + ArrayList uses = (ArrayList)method.get(useKey); uses.add(data.get("coord")); - method.put("uses", uses); + method.put(useKey, uses); } this._methods.put(name, method); return true; } - public boolean insertFieldDeclaration(String name, HashMap data) { - this._fields.put(name, data); - return true; - } - public boolean insertVariableDeclaration(String name, HashMap data) { HashMap var = this._vars.get(name); if (var == null) { @@ -100,13 +104,13 @@ public class JavaSymbols extends Symbols { ArrayList uses = new ArrayList(10); assignments.add(data.get("coord")); - var.put("assignments", assignments); - var.put("uses", uses); + var.put(assignKey, assignments); + var.put(useKey, uses); } else { - ArrayList assignments = (ArrayList)var.get("assignments"); + ArrayList assignments = (ArrayList)var.get(assignKey); assignments.add(data.get("coord")); - var.put("assignments", assignments); + var.put(assignKey, assignments); } this._vars.put(name, var); @@ -120,13 +124,13 @@ public class JavaSymbols extends Symbols { ArrayList uses = new ArrayList(10); uses.add(data.get("coord")); - var.put("assignments", assignments); - var.put("uses", uses); + var.put(assignKey, assignments); + var.put(useKey, uses); } else { - ArrayList uses = (ArrayList)var.get("uses"); + ArrayList uses = (ArrayList)var.get(useKey); uses.add(data.get("coord")); - var.put("uses", uses); + var.put(useKey, uses); } this._vars.put(name, var); @@ -135,13 +139,14 @@ public class JavaSymbols extends Symbols { public String toString() { StringBuilder builder = new StringBuilder(); - builder.append("classes:" + this._classes + ","); - builder.append("interfaces:" + this._interfaces + ","); - builder.append("methods:" + this._methods + ","); - builder.append("fields:" + this._fields + ","); - builder.append("vars:" + this._vars + ","); - - return "{" + builder.toString() + "}"; + builder.append("\"classes\":" + this._classes + ","); + builder.append("\"interfaces\":" + this._interfaces + ","); + builder.append("\"methods\":" + this._methods + ","); + builder.append("\"vars\":" + this._vars + ","); + + String s = builder.toString().replaceAll("=", ":"); + s = s.substring(0, s.length() - 1); + return "{" + s + "}"; } } diff --git a/parsers/java/src/main/java/com/bitshift/parsing/utils/PackableMemory.java b/parsers/java/src/main/java/com/bitshift/parsing/utils/PackableMemory.java index 24d883c..1f54d99 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/utils/PackableMemory.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/utils/PackableMemory.java @@ -22,7 +22,7 @@ public class PackableMemory { // The most significant porion of the integer is stored in mem[loc]. // Bytes are masked out of the integer and stored in the array, working // from right(least significant) to left (most significant). - void pack(int val, int loc) + public void pack(int val, int loc) { final int MASK = 0xff; for (int i = 3; i >= 0; i--) diff --git a/parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java b/parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java new file mode 100644 index 0000000..115a3c6 --- /dev/null +++ b/parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java @@ -0,0 +1,23 @@ +package com.bitshift.parsing.utils; + +import java.util.List; +import java.util.Arrays; + +public class Tuple { + private List _objects; + + public Tuple(T... args) { + _objects = Arrays.asList(args); + } + + public String toString() { + StringBuilder builder = new StringBuilder(); + + for(T o: this._objects) { + builder.append(o + ","); + } + + String s = builder.toString(); + return "(" + s.substring(0, s.length() - 1) + ")"; + } +} diff --git a/parsers/ruby/lib/parse_server.rb b/parsers/ruby/lib/parse_server.rb index 916f434..9a929aa 100644 --- a/parsers/ruby/lib/parse_server.rb +++ b/parsers/ruby/lib/parse_server.rb @@ -14,7 +14,7 @@ end def start_server - server = TCPServer.new 5003 + server = TCPServer.new 5065 loop do # Start a new thread for each client accepted diff --git a/parsers/ruby/lib/parser.rb b/parsers/ruby/lib/parser.rb index 6dfa175..eec293b 100644 --- a/parsers/ruby/lib/parser.rb +++ b/parsers/ruby/lib/parser.rb @@ -3,20 +3,6 @@ require 'ruby_parser' require 'sexp_processor' module Bitshift - class Tuple - attr_accessor :objects - - def initialize(arr) - @objects = arr - end - - def inspect - s = "(" - @objects.each {|o| s += "#{o},"} - s = s[0..-2] + ')' - end - end - class Parser def initialize(source) @source = source @@ -39,7 +25,8 @@ module Bitshift def initialize(offset, tree) super() - module_hash = Hash.new {|hash, key| hash[key] = { decls: [], uses: [] }} + module_hash = Hash.new {|hash, key| + hash[key] = { assignments: [], uses: [] }} class_hash = module_hash.clone function_hash = module_hash.clone var_hash = module_hash.clone @@ -64,7 +51,7 @@ module Bitshift break if cur_exp == nil end - pos = Tuple.new([start_ln, -1, end_ln, -1]) + pos = [start_ln, -1, end_ln, -1] return pos end @@ -72,7 +59,7 @@ module Bitshift pos = Hash.new end_ln = start_ln = exp.line - offset - pos = Tuple.new([start_ln, -1, end_ln, -1]) + pos = [start_ln, -1, end_ln, -1] return pos end @@ -80,7 +67,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:modules][name][:decls] << pos + symbols[:modules][name][:assignments] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -89,7 +76,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:classes][name][:decls] << pos + symbols[:classes][name][:assignments] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -98,7 +85,7 @@ module Bitshift pos = block_position(exp) exp.shift name = exp.shift - symbols[:functions][name][:decls] << pos + symbols[:functions][name][:assignments] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -117,7 +104,7 @@ module Bitshift pos = statement_position(exp) exp.shift name = exp.shift - symbols[:vars][name][:decls] << pos + symbols[:vars][name][:assignments] << pos exp.each_sexp {|s| process(s)} return exp.clear end @@ -126,17 +113,19 @@ module Bitshift pos = statement_position(exp) exp.shift name = exp.shift - symbols[:vars][name][:decls] << pos + symbols[:vars][name][:assignments] << pos exp.each_sexp {|s| process(s)} return exp.clear end def to_s - new_symbols = Hash.new {|hash, key| hash[key] = []} + new_symbols = Hash.new {|hash, key| hash[key] = Hash.new} symbols.each do |type, sym_list| sym_list.each do |name, sym| - new_symbols[type.to_s] << Tuple.new(["'#{name}'", sym[:decls], sym[:uses]]) + new_symbols[type.to_s][name.to_s] = { + "assignments" => sym[:assignments], + "uses" => sym[:uses]} end end From f02dc4497cde81862e6aaef75c98b4250b0282df Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 31 May 2014 23:49:22 -0400 Subject: [PATCH 19/34] Fixes. --- bitshift/crawler/indexer.py | 14 +++++++------- bitshift/parser/__init__.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index c66df0b..5b5e83d 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -7,7 +7,7 @@ import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\ threading from ..database import Database -from ..parser import parse +from ..parser import parse, UnsupportedFileError from ..codelet import Codelet GIT_CLONE_DIR = "/tmp/bitshift" @@ -151,11 +151,11 @@ class GitIndexer(threading.Thread): source = self._decode(source_file.read()) if source is None: continue - except IOError as exception: + except IOError: continue - authors = [(self._decode(author), None) for author in \ - commits_meta[filename]["authors"]] + authors = [(self._decode(author), None) for author in + commits_meta[filename]["authors"]] codelet = Codelet("%s:%s" % (repo.name, filename), source, filename, None, authors, self._generate_file_url(filename, repo.url, repo.framework_name), @@ -164,9 +164,9 @@ class GitIndexer(threading.Thread): repo.rank) try: parse(codelet) - self.database.insert(codelet) - except UnsupportedFileError as excep: - pass + except UnsupportedFileError: + continue + self.database.insert(codelet) def _generate_file_url(self, filename, repo_url, framework_name): """ diff --git a/bitshift/parser/__init__.py b/bitshift/parser/__init__.py index 79fca78..a7446ab 100644 --- a/bitshift/parser/__init__.py +++ b/bitshift/parser/__init__.py @@ -8,7 +8,7 @@ from pygments import lexers as pgl, util from ..languages import LANGS from .python import parse_py -_all__ = ["parse"] +_all__ = ["parse", "UnsupportedFileError"] class UnsupportedFileError(Exception): pass From 65a2688be9175de2beae9bca833f501e8a683b59 Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Sat, 31 May 2014 23:42:44 -0400 Subject: [PATCH 20/34] Adjust test for new language dict. --- bitshift/parser/__init__.py | 3 ++- test/parser_test.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bitshift/parser/__init__.py b/bitshift/parser/__init__.py index 79fca78..927b397 100644 --- a/bitshift/parser/__init__.py +++ b/bitshift/parser/__init__.py @@ -27,11 +27,12 @@ def _lang(codelet): try: if codelet.filename: - lex = pgl.guess_lexer_for_filename(codelet.filename, codelet.code) + lex = pgl.get_lexer_for_filename(codelet.filename) else: lex = pgl.guess_lexer(codelet.code) except util.ClassNotFound: raise UnsupportedFileError(codelet.filename) + return LANGS.index(lex.name) def _recv_data(server_socket): diff --git a/test/parser_test.py b/test/parser_test.py index a1cfad3..ffee75c 100644 --- a/test/parser_test.py +++ b/test/parser_test.py @@ -21,7 +21,7 @@ if __name__ == '__main__': elif sys.argv[1] == 'ruby': file_name = "resources/parser.rb" - server_socket_number = 5003 + server_socket_number = 5065 server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server_socket.connect(("localhost", server_socket_number)) From 6edb142b4a7e9f854f9c0c0f32dcf7a2572384c5 Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Sun, 1 Jun 2014 00:18:02 -0400 Subject: [PATCH 21/34] Can specify port number to run java server on. --- .../src/main/java/com/bitshift/parsing/Parse.java | 18 +----- .../com/bitshift/parsing/utils/ParseServer.java | 65 ++++++++++++++++++++++ .../java/com/bitshift/parsing/utils/Tuple.java | 23 -------- 3 files changed, 68 insertions(+), 38 deletions(-) create mode 100644 parsers/java/src/main/java/com/bitshift/parsing/utils/ParseServer.java delete mode 100644 parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java diff --git a/parsers/java/src/main/java/com/bitshift/parsing/Parse.java b/parsers/java/src/main/java/com/bitshift/parsing/Parse.java index 302c083..ae54a9e 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/Parse.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/Parse.java @@ -1,24 +1,12 @@ package com.bitshift.parsing; -import java.io.IOException; - -import java.net.ServerSocket; -import java.net.Socket; - -import com.bitshift.parsing.parsers.JavaParser; +import com.bitshift.parsing.utils.ParseServer; public class Parse { public static void main(String[] args) { - try { - ServerSocket server = new ServerSocket(5033); - - while(true) { - Socket clientSocket = server.accept(); - new Thread(new JavaParser(clientSocket)).start(); - } - } catch (IOException ex) { - } + ParseServer server = new ParseServer(Integer.parseInt(args[0])); + new Thread(server).start(); } } diff --git a/parsers/java/src/main/java/com/bitshift/parsing/utils/ParseServer.java b/parsers/java/src/main/java/com/bitshift/parsing/utils/ParseServer.java new file mode 100644 index 0000000..291be34 --- /dev/null +++ b/parsers/java/src/main/java/com/bitshift/parsing/utils/ParseServer.java @@ -0,0 +1,65 @@ +/* Code for multithreaded server taken from Jakob Jenkov */ +package com.bitshift.parsing.utils; + +import java.net.ServerSocket; +import java.net.Socket; +import java.io.IOException; + +import com.bitshift.parsing.parsers.JavaParser; + +public class ParseServer implements Runnable{ + + protected int serverPort = 8080; + protected ServerSocket serverSocket = null; + protected boolean isStopped = false; + protected Thread runningThread= null; + + public ParseServer(int port){ + this.serverPort = port; + } + + public void run(){ + synchronized(this){ + this.runningThread = Thread.currentThread(); + } + openServerSocket(); + while(! isStopped()){ + Socket clientSocket = null; + try { + clientSocket = this.serverSocket.accept(); + } catch (IOException e) { + if(isStopped()) { + System.out.println("Server Stopped.") ; + return; + } + throw new RuntimeException( + "Error accepting client connection", e); + } + new Thread(new JavaParser(clientSocket)).start(); + } + System.out.println("Server Stopped.") ; + } + + + private synchronized boolean isStopped() { + return this.isStopped; + } + + public synchronized void stop(){ + this.isStopped = true; + try { + this.serverSocket.close(); + } catch (IOException e) { + throw new RuntimeException("Error closing server", e); + } + } + + private void openServerSocket() { + try { + this.serverSocket = new ServerSocket(this.serverPort); + } catch (IOException e) { + throw new RuntimeException("Cannot open port 8080", e); + } + } + +} diff --git a/parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java b/parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java deleted file mode 100644 index 115a3c6..0000000 --- a/parsers/java/src/main/java/com/bitshift/parsing/utils/Tuple.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.bitshift.parsing.utils; - -import java.util.List; -import java.util.Arrays; - -public class Tuple { - private List _objects; - - public Tuple(T... args) { - _objects = Arrays.asList(args); - } - - public String toString() { - StringBuilder builder = new StringBuilder(); - - for(T o: this._objects) { - builder.append(o + ","); - } - - String s = builder.toString(); - return "(" + s.substring(0, s.length() - 1) + ")"; - } -} From e64c81f66ffe6eae57d69b8b2ae4475b2ca517bf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 1 Jun 2014 00:51:43 -0400 Subject: [PATCH 22/34] Consistency fix: Use Codelet.url instead of Codelet.code_url. --- bitshift/codelet.py | 12 ++++++------ static/js/index.js | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index a12ec3a..3021ffe 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -13,7 +13,7 @@ class Codelet(object): :ivar authors: (array of tuples (str, str or None)) An array of tuples containing an author's name and profile URL (on the service the code was pulled from). - :ivar code_url: (str) The url of the (page containing the) source code. + :ivar url: (str) The url of the (page containing the) source code. :ivar date_created: (:class:`datetime.datetime`, or None) The date the code was published. :ivar date_modified: (:class:`datetime.datetime`, or None) The date the @@ -26,7 +26,7 @@ class Codelet(object): added by the database. """ - def __init__(self, name, code, filename, language, authors, code_url, + def __init__(self, name, code, filename, language, authors, url, date_created, date_modified, rank, symbols=None, origin=None): """ Create a Codelet instance. @@ -36,7 +36,7 @@ class Codelet(object): :param filename: see :attr:`self.filename` :param language: see :attr:`self.language` :param authors: see :attr:`self.authors` - :param code_url: see :attr:`self.code_url` + :param url: see :attr:`self.url` :param date_created: see :attr:`self.date_created` :param date_modified: see :attr:`self.date_modified` :param rank: see :attr:`self.rank` @@ -48,7 +48,7 @@ class Codelet(object): :type filename: see :attr:`self.filename` :type language: see :attr:`self.language` :type authors: see :attr:`self.authors` - :type code_url: see :attr:`self.code_url` + :type url: see :attr:`self.url` :type date_created: see :attr:`self.date_created` :type date_modified: see :attr:`self.date_modified` :type rank: see :attr:`self.rank` @@ -61,7 +61,7 @@ class Codelet(object): self.filename = filename self.language = language self.authors = authors - self.code_url = code_url + self.url = url self.date_created = date_created self.date_modified = date_modified self.rank = rank @@ -77,7 +77,7 @@ class Codelet(object): """ return { "name": self.name, "code": self.code, "lang": LANGS[self.language], - "authors": self.authors, "url": self.code_url, + "authors": self.authors, "url": self.url, "created": self.date_created.isoformat(), "modified": self.date_modified.isoformat(), "symbols": self.symbols, "origin": self.origin diff --git a/static/js/index.js b/static/js/index.js index 2d84791..e5a4e14 100644 --- a/static/js/index.js +++ b/static/js/index.js @@ -30,7 +30,7 @@ var codeExample = '
' + title.innerHTML = 'File ' + codelet.filename + ''; site.innerHTML = 'on ' + codelet.origin[0] +''; dateModified.innerHTML = 'Last modified ' + codelet.date_modified; From fd3a8b15985147b8665b87793aad430e087063b5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 1 Jun 2014 01:19:04 -0400 Subject: [PATCH 23/34] Fix missing import. --- app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app.py b/app.py index 20d8997..0dba068 100644 --- a/app.py +++ b/app.py @@ -8,6 +8,7 @@ from flask import Flask, make_response, render_template, request from bitshift import assets from bitshift.database import Database +from bitshift.languages import LANGS from bitshift.query import parse_query, QueryParseException app = Flask(__name__) @@ -21,7 +22,7 @@ database = Database() @app.route("/") def index(): - return render_template("index.html", typeahead_languages=languages.LANGS) + return render_template("index.html", typeahead_languages=LANGS) @app.route("/search.json") def search(): From 73dee778c58e30bdd5796257ceb05960fa287187 Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Sun, 1 Jun 2014 01:29:01 -0400 Subject: [PATCH 24/34] Can specify which port to run ruby server on. --- parsers/java/src/main/java/com/bitshift/parsing/Parse.java | 1 + parsers/ruby/Rakefile | 4 ++-- parsers/ruby/lib/parse_server.rb | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/parsers/java/src/main/java/com/bitshift/parsing/Parse.java b/parsers/java/src/main/java/com/bitshift/parsing/Parse.java index ae54a9e..1964b59 100644 --- a/parsers/java/src/main/java/com/bitshift/parsing/Parse.java +++ b/parsers/java/src/main/java/com/bitshift/parsing/Parse.java @@ -6,6 +6,7 @@ public class Parse { public static void main(String[] args) { ParseServer server = new ParseServer(Integer.parseInt(args[0])); + System.out.println("Java Server listening on port " + args[0]); new Thread(server).start(); } diff --git a/parsers/ruby/Rakefile b/parsers/ruby/Rakefile index e66f695..f8cdf64 100644 --- a/parsers/ruby/Rakefile +++ b/parsers/ruby/Rakefile @@ -1,5 +1,5 @@ require File.expand_path('../lib/parse_server.rb', __FILE__) -task :start_server do |t| - start_server +task :start_server, [:port_number] do |t, args| + start_server Integer(args[:port_number]) end diff --git a/parsers/ruby/lib/parse_server.rb b/parsers/ruby/lib/parse_server.rb index 9a929aa..2c87e49 100644 --- a/parsers/ruby/lib/parse_server.rb +++ b/parsers/ruby/lib/parse_server.rb @@ -13,8 +13,9 @@ def pack_int(i) end -def start_server - server = TCPServer.new 5065 +def start_server(port_number) + server = TCPServer.new port_number + puts "Ruby Server listening on port #{port_number}\n" loop do # Start a new thread for each client accepted From c9520fd4e3a562e30a1cdb50fa4ab9c72854076f Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Sun, 1 Jun 2014 01:29:28 -0400 Subject: [PATCH 25/34] Add function to start parse servers in parser/__init__.py --- bitshift/parser/__init__.py | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/bitshift/parser/__init__.py b/bitshift/parser/__init__.py index 927b397..7979f09 100644 --- a/bitshift/parser/__init__.py +++ b/bitshift/parser/__init__.py @@ -2,13 +2,24 @@ import json import sys import socket import struct +import subprocess +from os import path from pygments import lexers as pgl, util from ..languages import LANGS from .python import parse_py -_all__ = ["parse"] +_all__ = ["parse", "start_parse_servers"] + +PARSER_COMMANDS = [ + ('Java', ['mvn', '-f', + path.join(path.dirname(__file__), "../../parsers/java/pom.xml"), + 'exec:java', '-Dexec.args="%d"']), + ('Ruby', ['rake', '-f', + path.join(path.dirname(__file__), "../../parsers/ruby/Rakefile"), + "'start_server[%d]'"]) +] class UnsupportedFileError(Exception): pass @@ -72,6 +83,22 @@ def _recv_data(server_socket): server_socket.close() return ''.join(total_data) +def start_parse_servers(): + """ + Starts all the parse servers for languages besides python. + + :rtype: list + """ + + procs = [] + + for (lang, cmd) in PARSER_COMMANDS: + procs.append( + subprocess.Popen(' '.join(cmd) % (5001 + LANGS.index(lang)), + shell=True)) + + return procs + def parse(codelet): """ Dispatches the codelet to the correct parser based on its language. @@ -87,7 +114,7 @@ def parse(codelet): lang = _lang(codelet) source = codelet.code codelet.language = lang - server_socket_number = 5000 + lang + server_socket_number = 5001 + lang if lang == LANGS.index('Python'): parse_py(codelet) From 21cf52ea65dbe68f2529e70c799d8afa8796088e Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Sun, 1 Jun 2014 01:29:41 -0400 Subject: [PATCH 26/34] Call start_parse_servers from crawl.py --- bitshift/crawler/crawl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bitshift/crawler/crawl.py b/bitshift/crawler/crawl.py index c121866..b91fc95 100644 --- a/bitshift/crawler/crawl.py +++ b/bitshift/crawler/crawl.py @@ -7,6 +7,7 @@ Contains functions for initializing all subsidiary, threaded crawlers. import logging, logging.handlers, os, Queue from bitshift.crawler import crawler, indexer +from bitshift.parser import parse, start_parse_servers __all__ = ["crawl"] @@ -32,6 +33,8 @@ def crawl(): for thread in threads: thread.start() + parse_servers = start_parse_servers() + def _configure_logging(): # This isn't ideal, since it means the bitshift python package must be kept # inside the app, but it works for now: From b6fabc54ecc8d77751482221bf159cf55dcec7c0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 1 Jun 2014 01:45:06 -0400 Subject: [PATCH 27/34] Can't use executemany() here. --- bitshift/database/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 1e49a8d..6027dc1 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -103,8 +103,9 @@ class Database(object): WHERE codelet_id = ?""" with self._conn.cursor(oursql.DictCursor) as dict_cursor: - dict_cursor.executemany(query, [(id,) for id in ids]) - for row in dict_cursor.fetchone(): + for codelet_id in ids: + dict_cursor.execute(query, (codelet_id,)) + row = dict_cursor.fetchall()[0] codelet_id = row["codelet_id"] if row["origin_url_base"]: url = row["codelet_url"] From b2d9ad5c972c2727f676bdf8f3c31ab5ff210144 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 1 Jun 2014 01:45:27 -0400 Subject: [PATCH 28/34] Missed a line. --- bitshift/database/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 6027dc1..8235aa7 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -106,7 +106,6 @@ class Database(object): for codelet_id in ids: dict_cursor.execute(query, (codelet_id,)) row = dict_cursor.fetchall()[0] - codelet_id = row["codelet_id"] if row["origin_url_base"]: url = row["codelet_url"] else: From ca4bb8ff424b5acde1472f71a56a3c87007e154e Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Sun, 1 Jun 2014 01:52:35 -0400 Subject: [PATCH 29/34] Change format of symbols to fit databse model. --- bitshift/parser/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bitshift/parser/__init__.py b/bitshift/parser/__init__.py index 7979f09..295d45a 100644 --- a/bitshift/parser/__init__.py +++ b/bitshift/parser/__init__.py @@ -125,4 +125,11 @@ def parse(codelet): server_socket.send("%d\n%s" % (len(source), source)) symbols = json.loads(_recv_data(server_socket)) + symbols = {key: [(name, [tuple(loc) + for loc in syms[name]['assignments']], + [tuple(loc) for loc in syms[name]['uses']]) + for name in syms.keys()] + for key, syms in symbols.iteritems()} + + print symbols codelet.symbols = symbols From 7b9e98bc5b1c28b0440b861677f45e30fcb49396 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 1 Jun 2014 01:52:49 -0400 Subject: [PATCH 30/34] Thanks @riamse --- bitshift/database/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 8235aa7..0c7ec83 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -107,9 +107,9 @@ class Database(object): dict_cursor.execute(query, (codelet_id,)) row = dict_cursor.fetchall()[0] if row["origin_url_base"]: - url = row["codelet_url"] - else: url = row["origin_url_base"] + row["codelet_url"] + else: + url = row["codelet_url"] origin = (row["origin_name"], row["origin_url"], row["origin_image"]) authors = self._get_authors_for_codelet(cursor, codelet_id) From 7337638b72e2a78c8b1cc33302415ef3da370324 Mon Sep 17 00:00:00 2001 From: Benjamin Attal Date: Sun, 1 Jun 2014 01:55:00 -0400 Subject: [PATCH 31/34] Change form get_lexer to guess_lexer -- guess_lexer is more accurate. --- bitshift/parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bitshift/parser/__init__.py b/bitshift/parser/__init__.py index 295d45a..66484a4 100644 --- a/bitshift/parser/__init__.py +++ b/bitshift/parser/__init__.py @@ -38,7 +38,7 @@ def _lang(codelet): try: if codelet.filename: - lex = pgl.get_lexer_for_filename(codelet.filename) + lex = pgl.guess_lexer_for_filename(codelet.filename, codelet.code) else: lex = pgl.guess_lexer(codelet.code) except util.ClassNotFound: From 8acddb6834a4302743b0804668f15c9e86ae5a4b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 1 Jun 2014 02:44:04 -0400 Subject: [PATCH 32/34] Fix cache retrieval behavior. --- bitshift/database/__init__.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 0c7ec83..c5dfc19 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -160,22 +160,25 @@ class Database(object): :return: The total number of results, and the *n*\ th page of results. :rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s) """ - query1 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp + query1 = "SELECT 1 FROM cache WHERE cache_id = ?" + query2 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp FROM cache INNER JOIN cache_data ON cache_id = cdata_cache WHERE cache_id = ?""" - query2 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)" - query3 = "INSERT INTO cache_data VALUES (?, ?)" + query3 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)" + query4 = "INSERT INTO cache_data VALUES (?, ?)" cache_id = mmh3.hash64(str(page) + ":" + query.serialize())[0] with self._conn.cursor() as cursor: cursor.execute(query1, (cache_id,)) - results = cursor.fetchall() - if results: # Cache hit + cache_hit = cursor.fetchall() + if cache_hit: + cursor.execute(query2, (cache_id,)) + results = cursor.fetchall() num_results = results[0][1] * (10 ** results[0][2]) ids = [res[0] for res in results] - else: # Cache miss + else: ids, num_results = self._search_with_query(cursor, query, page) num_exp = max(len(str(num_results)) - 3, 0) num_results = int(round(num_results, -num_exp)) From 69b011ac129e293f773bfec236811e738564f155 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 1 Jun 2014 02:48:51 -0400 Subject: [PATCH 33/34] Fix. --- bitshift/database/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index c5dfc19..c1e6c2e 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -175,9 +175,9 @@ class Database(object): cache_hit = cursor.fetchall() if cache_hit: cursor.execute(query2, (cache_id,)) - results = cursor.fetchall() - num_results = results[0][1] * (10 ** results[0][2]) - ids = [res[0] for res in results] + rows = cursor.fetchall() + num_results = rows[0][1] * (10 ** rows[0][2]) if rows else 0 + ids = [row[0] for row in rows] else: ids, num_results = self._search_with_query(cursor, query, page) num_exp = max(len(str(num_results)) - 3, 0) From e0194ab3469e3368cd6f09c042a55b31b1b9d770 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 1 Jun 2014 02:52:24 -0400 Subject: [PATCH 34/34] Forgot to update query numbers. --- bitshift/database/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index c1e6c2e..311eb21 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -183,8 +183,8 @@ class Database(object): num_exp = max(len(str(num_results)) - 3, 0) num_results = int(round(num_results, -num_exp)) num_mnt = num_results / (10 ** num_exp) - cursor.execute(query2, (cache_id, num_mnt, num_exp)) - cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) + cursor.execute(query3, (cache_id, num_mnt, num_exp)) + cursor.executemany(query4, [(cache_id, c_id) for c_id in ids]) codelet_gen = self._get_codelets_from_ids(cursor, ids) return (num_results, list(codelet_gen))