diff --git a/.gitignore b/.gitignore index b8b2697..bfad355 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,4 @@ target # Ctags */tags logs +Gemfile.lock diff --git a/bitshift/crawler/crawl.py b/bitshift/crawler/crawl.py index d34deb9..40238af 100644 --- a/bitshift/crawler/crawl.py +++ b/bitshift/crawler/crawl.py @@ -8,6 +8,7 @@ import logging import logging.handlers import os import Queue +import sys import time from threading import Event @@ -37,9 +38,9 @@ def crawl(): crawler.BitbucketCrawler(repo_clone_queue, run_event), indexer.GitIndexer(repo_clone_queue, run_event)] + parse_servers = start_parse_servers() for thread in threads: thread.start() - parse_servers = start_parse_servers() try: while 1: @@ -77,5 +78,4 @@ def _configure_logging(): root_logger.setLevel(logging.NOTSET) if __name__ == "__main__": - _configure_logging() crawl() diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 655725f..ba04412 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -86,8 +86,8 @@ class GitHubCrawler(threading.Thread): time.sleep(1) self.clone_queue.put(indexer.GitRepository( - repo["html_url"], repo["full_name"].replace("/", ""), - "GitHub", repo_ranks[repo["full_name"]])) + repo["html_url"], repo["full_name"], "GitHub", + repo_ranks[repo["full_name"]])) if int(resp.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(resp.headers["x-ratelimit-reset"]) - diff --git a/bitshift/languages.json b/bitshift/languages.json deleted file mode 100644 index d855164..0000000 --- a/bitshift/languages.json +++ /dev/null @@ -1,283 +0,0 @@ -{ - "_comment" : "A list of programming languages supported by `bitshift`.", - "languages" : [ - "Debian Sourcelist", - "Delphi", - "JavaScript+Mako", - "Brainfuck", - "Ceylon", - "JavaScript+Django/Jinja", - "HTML+Evoque", - "NumPy", - "Modula-2", - "LiveScript", - "Nimrod", - "Bash", - "HTML+Django/Jinja", - "CSS+PHP", - "XML+Lasso", - "VimL", - "CSS+Genshi Text", - "Fancy", - "Coldfusion HTML", - "cfstatement", - "Scalate Server Page", - "Smarty", - "XML+Evoque", - "haXe", - "PowerShell", - "Tea", - "HTML+Cheetah", - "Mason", - "Django/Jinja", - "JAGS", - "ApacheConf", - "DTD", - "Lighttpd configuration file", - "Java", - "JavaScript+Genshi Text", - "Scheme", - "Nemerle", - "RHTML", - "Ragel in Java Host", - "Darcs Patch", - "Puppet", - "Octave", - "CoffeeScript", - "Ragel in D Host", - "Scilab", - "Monkey", - "HTML+Myghty", - "CSS", - "JavaScript+Smarty", - "Io", - "COBOLFree", - "Asymptote", - "vhdl", - "CSS+Ruby", - "Fortran", - "d-objdump", - "MySQL", - "REBOL", - "C++", - "ERB", - "CBM BASIC V2", - "Befunge", - "Julia", - "MoonScript", - "Ruby", - "XML+Smarty", - "Dylan", - "Groovy", - "MoinMoin/Trac Wiki markup", - "autohotkey", - "C", - "HTML", - "Felix", - "CMake", - "NSIS", - "SourcePawn", - "Mako", - "VGL", - "Velocity", - "Koka", - "CUDA", - "Gnuplot", - "IRC logs", - "Prolog", - "Python", - "CSS+Django/Jinja", - "verilog", - "Smalltalk", - "JavaScript+Myghty", - "YAML", - "Julia console", - "ANTLR With ActionScript Target", - "XML+Mako", - "XSLT", - "UrbiScript", - "Scaml", - "S", - "DylanLID", - "MAQL", - "sqlite3con", - "Boo", - "OCaml", - "eC", - "ActionScript", - "VB.net", - "SquidConf", - "XQuery", - "D", - "Fantom", - "Gettext Catalog", - "Logos", - "Lasso", - "SCSS", - "BBCode", - "Haml", - "FoxPro", - "MuPAD", - "XML+Ruby", - "Dart", - "IDL", - "dg", - "Evoque", - "Jade", - "c-objdump", - "Kconfig", - "Java Server Page", - "reg", - "ABAP", - "XML+Velocity", - "JavaScript+Cheetah", - "HTML+Mako", - "Ragel in Ruby Host", - "RobotFramework", - "Protocol Buffer", - "CFEngine3", - "Ragel", - "GLSL", - "COBOL", - "TypeScript", - "Ada", - "PostgreSQL SQL dialect", - "Xtend", - "Logtalk", - "objdump", - "CSS+Mako", - "ca65", - "Objective-C++", - "Gherkin", - "HTML+PHP", - "Makefile", - "PostScript", - "Hxml", - "Kotlin", - "PL/pgSQL", - "Vala", - "Haskell", - "Bro", - "Lua", - "POVRay", - "Sass", - "ANTLR With Java Target", - "Tcl", - "ANTLR With ObjectiveC Target", - "JavaScript+Ruby", - "Racket", - "AspectJ", - "Base Makefile", - "ANTLR With Python Target", - "cpp-objdump", - "Genshi Text", - "Ioke", - "PyPy Log", - "Croc", - "Objective-J", - "GAS", - "Batchfile", - "Snobol", - "XML", - "ANTLR", - "Opa", - "XML+Cheetah", - "Go", - "Diff", - "MiniD", - "Cython", - "Ragel in C Host", - "Erlang", - "Debian Control file", - "aspx-vb", - "BUGS", - "Ragel in CPP Host", - "aspx-cs", - "Properties", - "Groff", - "Clojure", - "Modelica", - "QML", - "JavaScript+Lasso", - "ANTLR With Perl Target", - "Genshi", - "BlitzMax", - "Treetop", - "Matlab", - "Myghty", - "HTML+Genshi", - "Duel", - "Perl", - "FSharp", - "reStructuredText", - "NewLisp", - "Scala", - "CSS+Lasso", - "XML+PHP", - "Stan", - "INI", - "MOOCode", - "Shell Session", - "RPMSpec", - "Newspeak", - "Bash Session", - "Coq", - "Raw token data", - "Tcsh", - "HTML+Lasso", - "C#", - "Gosu Template", - "RConsole", - "MXML", - "TeX", - "CSS+Smarty", - "Text only", - "ANTLR With C# Target", - "OpenEdge ABL", - "Cheetah", - "Smali", - "CSS+Myghty", - "Rd", - "LLVM", - "Standard ML", - "Elixir", - "Nginx configuration file", - "GoodData-CL", - "AppleScript", - "HTML+Smarty", - "Objective-C", - "JavaScript", - "Rust", - "Common Lisp", - "Embedded Ragel", - "ActionScript 3", - "systemverilog", - "Literate Haskell", - "PHP", - "ANTLR With CPP Target", - "Gosu", - "Hybris", - "JavaScript+PHP", - "Factor", - "HTML+Velocity", - "Mscgen", - "Ooc", - "SQL", - "HTTP", - "ECL", - "Redcode", - "Ragel in Objective C Host", - "XML+Django/Jinja", - "Awk", - "JSON", - "NASM", - "ANTLR With Ruby Target", - "XML+Myghty", - "AutoIt", - "Mako", - "CSS+Mako", - "HTML+Mako", - "XML+Mako", - "JavaScript+Mako" - ] -} diff --git a/bitshift/languages.py b/bitshift/languages.py index 36d7f63..d6395bb 100644 --- a/bitshift/languages.py +++ b/bitshift/languages.py @@ -1,5 +1,21 @@ -import json from os import path -with open(path.join(path.dirname(__file__), "languages.json")) as lang_json: - LANGS = [lang for lang in json.load(lang_json)["languages"]] +import yaml + +__all__ = ["LANGS", "LANGS_ALL"] + +def _load_langs(): + filename = path.join(path.dirname(__file__), "languages.yml") + with open(filename) as fp: + data = yaml.load(fp)["languages"] + langs = [it.keys()[0] if isinstance(it, dict) else it for it in data] + all_langs = {} + for i, lang in enumerate(data): + if isinstance(lang, dict): + for val in lang.values()[0]: + all_langs[val] = i + else: + all_langs[lang] = i + return langs, all_langs + +LANGS, LANGS_ALL = _load_langs() diff --git a/bitshift/languages.yml b/bitshift/languages.yml new file mode 100644 index 0000000..cd29c7e --- /dev/null +++ b/bitshift/languages.yml @@ -0,0 +1,368 @@ +# A list of programming languages supported by bitshift: + +languages: + # With parsers: + - Python: + - Python + - Python 3 + - Python 3.0 Traceback + - Python console session + - Python Traceback + - NumPy + - PyPy Log + - C + - Java + - Ruby: + - Ruby + - Ruby irb session + + # Without parsers: + - ABAP + - APL + - ActionScript: + - ActionScript + - ActionScript 3 + - ANTLR: + - ANTLR + - ANTLR With ActionScript Target + - ANTLR With CPP Target + - "ANTLR With C# Target" + - ANTLR With Java Target + - ANTLR With ObjectiveC Target + - ANTLR With Perl Target + - ANTLR With Python Target + - ANTLR With Ruby Target + - Ada + - Agda: + - Agda + - Literate Agda + - Alloy + - AmbientTalk + - ApacheConf + - AppleScript + - AspectJ + - aspx-cs + - aspx-vb + - Asymptote + - autohotkey + - AutoIt + - Awk + - BBCode + - BUGS + - Bash: + - Bash + - Bash Session + - Batchfile + - Befunge + - BlitzBasic: + - BlitzBasic + - BlitzMax + - Boo + - Brainfuck + - Bro + - "C#" + - C++ + - ca65 + - CBM BASIC V2 + - Ceylon + - CFEngine3 + - cfstatement + - ChaiScript + - Chapel + - Cheetah + - Cirru + - Clay + - Clojure: + - Clojure + - ClojureScript + - CMake + - COBOL: + - COBOL + - COBOLFree + - CoffeeScript + - Coldfusion CFC + - Coldfusion HTML + - Common Lisp + - Coq + - Croc + - Cryptol: + - Cryptol + - Literate Cryptol + - CSS: + - CSS + - CSS+Django/Jinja + - CSS+Genshi Text + - CSS+Lasso + - CSS+Mako + - CSS+Mako + - CSS+Myghty + - CSS+PHP + - CSS+Ruby + - CSS+Smarty + - CUDA + - Cypher + - Cython + - D + - Darcs Patch + - Dart + - Debian Control file + - Debian Sourcelist + - Delphi + - dg + - Diff + - Django/Jinja + - Docker + - DTD + - Duel + - Dylan: + - Dylan + - Dylan session + - DylanLID + - EBNF + - eC + - ECL + - Eiffel + - Elixir: + - Elixir + - Elixir iex session + - Embedded Ragel + - ERB: + - ERB + - RHTML + - Erlang: + - Erlang + - Erlang erl session + - Evoque + - Factor + - Fancy + - Fantom + - Felix + - Fortran + - FoxPro + - FSharp + - GAP + - GAS + - Genshi + - Genshi Text + - Gettext Catalog + - Gherkin + - GLSL + - Gnuplot + - Go + - Golo + - GoodData-CL + - Gosu + - Gosu Template + - Groff + - Groovy + - Haml + - Handlebars + - Haskell: + - Haskell + - Literate Haskell + - Haxe + - HTML: + - HTML + - HTML+Cheetah + - HTML+Django/Jinja + - HTML+Evoque + - HTML+Genshi + - HTML+Lasso + - HTML+Mako + - HTML+Mako + - HTML+Myghty + - HTML+PHP + - HTML+Smarty + - HTML+Velocity + - Hxml + - Hy + - Hybris + - IDL + - Idris: + - Idris + - Literate Idris + - Igor + - Inform 6: + - Inform 6 + - Inform 6 template + - Inform 7 + - INI + - Io + - Ioke + - Jade + - JAGS + - Jasmin + - Java Server Page + - JavaScript: + - JavaScript + - JavaScript+Cheetah + - JavaScript+Django/Jinja + - JavaScript+Genshi Text + - JavaScript+Lasso + - JavaScript+Mak + - JavaScript+Mako + - JavaScript+Myghty + - JavaScript+PHP + - JavaScript+Ruby + - JavaScript+Smarty + - JSON + - Julia: + - Julia + - Julia console + - Kal + - Kconfig + - Koka + - Kotlin + - Lasso + - Lighttpd configuration file + - Limbo + - LiveScript + - LLVM + - Logos + - Logtalk + - LSL + - Lua + - Makefile + - Makefile + - Base Makefile + - Mako + - MAQL + - Mask + - Mason + - Mathematica + - Matlab: + - Matlab + - Matlab session + - MiniD + - Modelica + - Modula-2 + - Monkey + - MOOCode + - MoonScript + - MQL + - Mscgen + - MuPAD + - MXML + - Myghty + - NASM + - Nemerle + - nesC + - NewLisp + - Newspeak + - Nginx configuration file + - Nimrod + - Nix + - NSIS + - Objective-C + - Objective-C++ + - Objective-J + - OCaml + - Octave + - Ooc + - Opa + - OpenEdge ABL + - Pan + - Pawn + - Perl: + - Perl + - Perl6 + - PHP + - Pig + - Pike + - PostScript + - POVRay + - PowerShell + - Prolog + - Properties + - Protocol Buffer + - Puppet + - QBasic + - QML + - Racket + - Ragel: + - Ragel + - Ragel in C Host + - Ragel in CPP Host + - Ragel in D Host + - Ragel in Java Host + - Ragel in Objective C Host + - Ragel in Ruby Host + - RConsole + - Rd + - REBOL + - Red + - Redcode + - reg + - reStructuredText + - Rexx + - RobotFramework + - RPMSpec + - RQL + - RSL + - Rust + - S + - Sass: + - Sass + - SCSS + - Scala + - Scalate Server Page + - Scaml + - Scheme + - Scilab + - Shell Session + - Slim + - Smali + - Smalltalk + - Smarty + - Snobol + - SourcePawn + - SPARQL + - SQL: + - SQL + - MySQL + - PL/pgSQL + - PostgreSQL console (psql) + - PostgreSQL SQL dialect + - sqlite3con + + - SquidConf + - Stan + - Standard ML + - SWIG + - systemverilog + - Tcl + - Tcsh + - Tea + - TeX + - Todotxt + - Treetop + - TypeScript + - UrbiScript + - Vala + - VB.net + - VCTreeStatus + - Velocity + - verilog + - VGL + - vhdl + - VimL + - XML: + - XML + - XML+Cheetah + - XML+Django/Jinja + - XML+Evoque + - XML+Lasso + - XML+Mako + - XML+Mako + - XML+Myghty + - XML+PHP + - XML+Ruby + - XML+Smarty + - XML+Velocity + - XQuery + - XSLT + - Xtend + - YAML: + - YAML + - YAML+Jinja + - Zephir diff --git a/bitshift/parser/__init__.py b/bitshift/parser/__init__.py index 6d4229b..88dc03e 100644 --- a/bitshift/parser/__init__.py +++ b/bitshift/parser/__init__.py @@ -7,7 +7,7 @@ import subprocess from os import path from pygments import lexers as pgl, util -from ..languages import LANGS +from ..languages import LANGS, LANGS_ALL from .python import parse_py __all__ = ["parse", "UnsupportedFileError", "start_parse_servers"] @@ -41,11 +41,10 @@ def _lang(codelet): lex = pgl.guess_lexer_for_filename(codelet.filename, codelet.code) else: lex = pgl.guess_lexer(codelet.code) - except util.ClassNotFound: + return LANGS_ALL[lex.name] + except (util.ClassNotFound, KeyError): raise UnsupportedFileError(codelet.filename) - return LANGS.index(lex.name) - def _recv_data(server_socket): """ Private function to read string response from a server. It reads a certain diff --git a/setup.py b/setup.py index f268991..869c896 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( install_requires = [ "Flask>=0.10.1", "gunicorn>=18.0", "pygments>=1.6", "requests>=2.2.0", "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3", - "python-dateutil>=2.2"], + "PyYAML>=3.11", "python-dateutil>=2.2"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift"