From 5d9ef2774d63531ad5f9aab29466acc051e3fca8 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 31 May 2014 22:59:14 -0400 Subject: [PATCH] Some fixes, mainly involving language detection. --- .gitignore | 2 +- bitshift/codelet.py | 2 +- bitshift/languages.json | 281 +++++++++++++++++++++++++++++++++++++++++++- bitshift/parser/__init__.py | 37 +++--- bitshift/query/nodes.py | 2 +- 5 files changed, 305 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index 6156aee..b8b2697 100644 --- a/.gitignore +++ b/.gitignore @@ -51,4 +51,4 @@ target # Ctags */tags -log +logs diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 865ae52..a12ec3a 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -27,7 +27,7 @@ class Codelet(object): """ def __init__(self, name, code, filename, language, authors, code_url, - date_created, date_modified, rank, symbols=None, origin=None): + date_created, date_modified, rank, symbols=None, origin=None): """ Create a Codelet instance. diff --git a/bitshift/languages.json b/bitshift/languages.json index 02ca0ad..d855164 100644 --- a/bitshift/languages.json +++ b/bitshift/languages.json @@ -1,4 +1,283 @@ { "_comment" : "A list of programming languages supported by `bitshift`.", - "languages" : ["Debian Sourcelist", "Delphi", "JavaScript+Mako", "Brainfuck", "Ceylon", "JavaScript+Django/Jinja", "HTML+Evoque", "NumPy", "Modula-2", "LiveScript", "Nimrod", "Bash", "HTML+Django/Jinja", "CSS+PHP", "XML+Lasso", "VimL", "CSS+Genshi Text", "Fancy", "Coldfusion HTML", "cfstatement", "Scalate Server Page", "Smarty", "XML+Evoque", "haXe", "PowerShell", "Tea", "HTML+Cheetah", "Mason", "Django/Jinja", "JAGS", "ApacheConf", "DTD", "Lighttpd configuration file", "Java", "JavaScript+Genshi Text", "Scheme", "Nemerle", "RHTML", "Ragel in Java Host", "Darcs Patch", "Puppet", "Octave", "CoffeeScript", "Ragel in D Host", "Scilab", "Monkey", "HTML+Myghty", "CSS", "JavaScript+Smarty", "Io", "COBOLFree", "Asymptote", "vhdl", "Python 3", "CSS+Ruby", "Fortran", "d-objdump", "MySQL", "REBOL", "C++", "ERB", "CBM BASIC V2", "Befunge", "Julia", "MoonScript", "Ruby", "XML+Smarty", "Dylan", "Groovy", "MoinMoin/Trac Wiki markup", "autohotkey", "C", "HTML", "Felix", "CMake", "NSIS", "SourcePawn", "Mako", "VGL", "Velocity", "Koka", "CUDA", "Gnuplot", "IRC logs", "Prolog", "Python", "CSS+Django/Jinja", "verilog", "Smalltalk", "JavaScript+Myghty", "YAML", "Julia console", "ANTLR With ActionScript Target", "XML+Mako", "XSLT", "UrbiScript", "Scaml", "S", "DylanLID", "MAQL", "sqlite3con", "Boo", "OCaml", "eC", "ActionScript", "VB.net", "SquidConf", "XQuery", "D", "Fantom", "Gettext Catalog", "Logos", "Lasso", "SCSS", "BBCode", "Haml", "FoxPro", "Python 3.0 Traceback", "MuPAD", "XML+Ruby", "Dart", "IDL", "dg", "Evoque", "Jade", "c-objdump", "Kconfig", "Java Server Page", "reg", "ABAP", "XML+Velocity", "JavaScript+Cheetah", "HTML+Mako", "Ragel in Ruby Host", "RobotFramework", "Protocol Buffer", "CFEngine3", "Ragel", "GLSL", "COBOL", "TypeScript", "Ada", "PostgreSQL SQL dialect", "Xtend", "Logtalk", "objdump", "CSS+Mako", "ca65", "Objective-C++", "Gherkin", "HTML+PHP", "Makefile", "PostScript", "Hxml", "Kotlin", "PL/pgSQL", "Vala", "Haskell", "Bro", "Lua", "POVRay", "Sass", "ANTLR With Java Target", "Tcl", "ANTLR With ObjectiveC Target", "JavaScript+Ruby", "Racket", "AspectJ", "Base Makefile", "ANTLR With Python Target", "cpp-objdump", "Genshi Text", "Ioke", "PyPy Log", "Croc", "Objective-J", "GAS", "Batchfile", "Snobol", "XML", "ANTLR", "Opa", "XML+Cheetah", "Go", "Diff", "MiniD", "Cython", "Ragel in C Host", "Erlang", "Debian Control file", "aspx-vb", "BUGS", "Ragel in CPP Host", "aspx-cs", "Properties", "Groff", "Clojure", "Modelica", "QML", "JavaScript+Lasso", "ANTLR With Perl Target", "Genshi", "BlitzMax", "Treetop", "Matlab", "Myghty", "HTML+Genshi", "Duel", "Perl", "FSharp", "reStructuredText", "NewLisp", "Scala", "CSS+Lasso", "XML+PHP", "Stan", "INI", "MOOCode", "Shell Session", "RPMSpec", "Newspeak", "Bash Session", "Coq", "Raw token data", "Tcsh", "HTML+Lasso", "C#", "Gosu Template", "RConsole", "MXML", "TeX", "CSS+Smarty", "Text only", "ANTLR With C# Target", "OpenEdge ABL", "Cheetah", "Smali", "CSS+Myghty", "Rd", "LLVM", "Standard ML", "Elixir", "Nginx configuration file", "GoodData-CL", "AppleScript", "HTML+Smarty", "Objective-C", "JavaScript", "Rust", "Common Lisp", "Embedded Ragel", "ActionScript 3", "systemverilog", "Literate Haskell", "Python Traceback", "PHP", "ANTLR With CPP Target", "Gosu", "Hybris", "JavaScript+PHP", "Factor", "HTML+Velocity", "Mscgen", "Ooc", "SQL", "HTTP", "ECL", "Redcode", "Ragel in Objective C Host", "XML+Django/Jinja", "Awk", "JSON", "NASM", "ANTLR With Ruby Target", "XML+Myghty", "AutoIt", "Mako", "CSS+Mako", "HTML+Mako", "XML+Mako", "JavaScript+Mako"] + "languages" : [ + "Debian Sourcelist", + "Delphi", + "JavaScript+Mako", + "Brainfuck", + "Ceylon", + "JavaScript+Django/Jinja", + "HTML+Evoque", + "NumPy", + "Modula-2", + "LiveScript", + "Nimrod", + "Bash", + "HTML+Django/Jinja", + "CSS+PHP", + "XML+Lasso", + "VimL", + "CSS+Genshi Text", + "Fancy", + "Coldfusion HTML", + "cfstatement", + "Scalate Server Page", + "Smarty", + "XML+Evoque", + "haXe", + "PowerShell", + "Tea", + "HTML+Cheetah", + "Mason", + "Django/Jinja", + "JAGS", + "ApacheConf", + "DTD", + "Lighttpd configuration file", + "Java", + "JavaScript+Genshi Text", + "Scheme", + "Nemerle", + "RHTML", + "Ragel in Java Host", + "Darcs Patch", + "Puppet", + "Octave", + "CoffeeScript", + "Ragel in D Host", + "Scilab", + "Monkey", + "HTML+Myghty", + "CSS", + "JavaScript+Smarty", + "Io", + "COBOLFree", + "Asymptote", + "vhdl", + "CSS+Ruby", + "Fortran", + "d-objdump", + "MySQL", + "REBOL", + "C++", + "ERB", + "CBM BASIC V2", + "Befunge", + "Julia", + "MoonScript", + "Ruby", + "XML+Smarty", + "Dylan", + "Groovy", + "MoinMoin/Trac Wiki markup", + "autohotkey", + "C", + "HTML", + "Felix", + "CMake", + "NSIS", + "SourcePawn", + "Mako", + "VGL", + "Velocity", + "Koka", + "CUDA", + "Gnuplot", + "IRC logs", + "Prolog", + "Python", + "CSS+Django/Jinja", + "verilog", + "Smalltalk", + "JavaScript+Myghty", + "YAML", + "Julia console", + "ANTLR With ActionScript Target", + "XML+Mako", + "XSLT", + "UrbiScript", + "Scaml", + "S", + "DylanLID", + "MAQL", + "sqlite3con", + "Boo", + "OCaml", + "eC", + "ActionScript", + "VB.net", + "SquidConf", + "XQuery", + "D", + "Fantom", + "Gettext Catalog", + "Logos", + "Lasso", + "SCSS", + "BBCode", + "Haml", + "FoxPro", + "MuPAD", + "XML+Ruby", + "Dart", + "IDL", + "dg", + "Evoque", + "Jade", + "c-objdump", + "Kconfig", + "Java Server Page", + "reg", + "ABAP", + "XML+Velocity", + "JavaScript+Cheetah", + "HTML+Mako", + "Ragel in Ruby Host", + "RobotFramework", + "Protocol Buffer", + "CFEngine3", + "Ragel", + "GLSL", + "COBOL", + "TypeScript", + "Ada", + "PostgreSQL SQL dialect", + "Xtend", + "Logtalk", + "objdump", + "CSS+Mako", + "ca65", + "Objective-C++", + "Gherkin", + "HTML+PHP", + "Makefile", + "PostScript", + "Hxml", + "Kotlin", + "PL/pgSQL", + "Vala", + "Haskell", + "Bro", + "Lua", + "POVRay", + "Sass", + "ANTLR With Java Target", + "Tcl", + "ANTLR With ObjectiveC Target", + "JavaScript+Ruby", + "Racket", + "AspectJ", + "Base Makefile", + "ANTLR With Python Target", + "cpp-objdump", + "Genshi Text", + "Ioke", + "PyPy Log", + "Croc", + "Objective-J", + "GAS", + "Batchfile", + "Snobol", + "XML", + "ANTLR", + "Opa", + "XML+Cheetah", + "Go", + "Diff", + "MiniD", + "Cython", + "Ragel in C Host", + "Erlang", + "Debian Control file", + "aspx-vb", + "BUGS", + "Ragel in CPP Host", + "aspx-cs", + "Properties", + "Groff", + "Clojure", + "Modelica", + "QML", + "JavaScript+Lasso", + "ANTLR With Perl Target", + "Genshi", + "BlitzMax", + "Treetop", + "Matlab", + "Myghty", + "HTML+Genshi", + "Duel", + "Perl", + "FSharp", + "reStructuredText", + "NewLisp", + "Scala", + "CSS+Lasso", + "XML+PHP", + "Stan", + "INI", + "MOOCode", + "Shell Session", + "RPMSpec", + "Newspeak", + "Bash Session", + "Coq", + "Raw token data", + "Tcsh", + "HTML+Lasso", + "C#", + "Gosu Template", + "RConsole", + "MXML", + "TeX", + "CSS+Smarty", + "Text only", + "ANTLR With C# Target", + "OpenEdge ABL", + "Cheetah", + "Smali", + "CSS+Myghty", + "Rd", + "LLVM", + "Standard ML", + "Elixir", + "Nginx configuration file", + "GoodData-CL", + "AppleScript", + "HTML+Smarty", + "Objective-C", + "JavaScript", + "Rust", + "Common Lisp", + "Embedded Ragel", + "ActionScript 3", + "systemverilog", + "Literate Haskell", + "PHP", + "ANTLR With CPP Target", + "Gosu", + "Hybris", + "JavaScript+PHP", + "Factor", + "HTML+Velocity", + "Mscgen", + "Ooc", + "SQL", + "HTTP", + "ECL", + "Redcode", + "Ragel in Objective C Host", + "XML+Django/Jinja", + "Awk", + "JSON", + "NASM", + "ANTLR With Ruby Target", + "XML+Myghty", + "AutoIt", + "Mako", + "CSS+Mako", + "HTML+Mako", + "XML+Mako", + "JavaScript+Mako" + ] } diff --git a/bitshift/parser/__init__.py b/bitshift/parser/__init__.py index bc22514..79fca78 100644 --- a/bitshift/parser/__init__.py +++ b/bitshift/parser/__init__.py @@ -1,4 +1,10 @@ -import json, pygments.lexers as pgl, sys, socket, struct +import json +import sys +import socket +import struct + +from pygments import lexers as pgl, util + from ..languages import LANGS from .python import parse_py @@ -19,13 +25,14 @@ def _lang(codelet): Modify function to incorporate tags from stackoverflow. """ - if codelet.filename is not None: - try: - return pgl.guess_lexer_for_filename(codelet.filename, codelet.code).name - except: - raise UnsupportedFileError('Could not find a lexer for the codelet\'s filename') - - return LANGS.index(pgl.guess_lexer(codelet.code)) + try: + if codelet.filename: + lex = pgl.guess_lexer_for_filename(codelet.filename, codelet.code) + else: + lex = pgl.guess_lexer(codelet.code) + except util.ClassNotFound: + raise UnsupportedFileError(codelet.filename) + return LANGS.index(lex.name) def _recv_data(server_socket): """ @@ -39,8 +46,9 @@ def _recv_data(server_socket): """ recv_size = 8192 - total_data = []; size_data = cur_data = '' - total_size = 0; size = sys.maxint + total_data = [] + size_data = cur_data = '' + total_size, size = 0, sys.maxint while total_size < size: cur_data = server_socket.recv(recv_size) @@ -61,8 +69,7 @@ def _recv_data(server_socket): total_size = sum([len(s) for s in total_data]) server_socket.close() - return ''.join(total_data); - + return ''.join(total_data) def parse(codelet): """ @@ -76,7 +83,8 @@ def parse(codelet): :type code: Codelet """ - lang = _lang(codelet); source = codelet.code + lang = _lang(codelet) + source = codelet.code codelet.language = lang server_socket_number = 5000 + lang @@ -86,8 +94,7 @@ def parse(codelet): else: server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server_socket.connect(("localhost", server_socket_number)) - server_socket.send("%d\n%s" % (len(source), source)); + server_socket.send("%d\n%s" % (len(source), source)) symbols = json.loads(_recv_data(server_socket)) codelet.symbols = symbols - diff --git a/bitshift/query/nodes.py b/bitshift/query/nodes.py index 5d157b5..d375ffb 100644 --- a/bitshift/query/nodes.py +++ b/bitshift/query/nodes.py @@ -195,7 +195,7 @@ class Symbol(_Node): CLASS = 1 VARIABLE = 2 TYPES = {FUNCTION: "FUNCTION", CLASS: "CLASS", VARIABLE: "VARIABLE"} - TYPES_INV = ["functions", "classes", "variables"] + TYPES_INV = ["functions", "classes", "vars"] def __init__(self, type_, name): """