From 6a4ba580ed024ada5efcfe2149d28b4f4d992d3d Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 11 Apr 2014 12:43:34 -0400 Subject: [PATCH 01/42] Add Codelet, crawler dependencies to setup. Add: bitshift/codelet.py -add Codelet class with constructor. README.md -add SASS stylesheet documentation --- README.md | 7 ++++++ bitshift/assets.py | 3 +-- bitshift/codelet.py | 46 +++++++++++++++++++++++++++++++------- docs/source/api/bitshift.query.rst | 11 +++++++++ docs/source/api/bitshift.rst | 43 ++++++++++++++++++++++++++--------- setup.py | 3 ++- 6 files changed, 91 insertions(+), 22 deletions(-) create mode 100644 docs/source/api/bitshift.query.rst diff --git a/README.md b/README.md index 3cb81a1..0fe39d0 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,11 @@ Branches - `feature/*`: individual components of the project with untested, likely horribly broken code - branch off from and merge into `develop` when done +Style +----- +bitshift uses [SASS][SASS] for styling; compile the stylesheets to CSS with +`sass --watch static/sass/:static/css`. + Documentation ------------- @@ -24,3 +29,5 @@ new modules or packages, but *not* when adding functions or changing docstrings), run `sphinx-apidoc -fo docs/source/api bitshift` from the project root. Note that this will revert any custom changes made to the files in `docs/source/api`, so you might want to update them by hand instead. + +[SASS]: http://sass-lang.com/guide diff --git a/bitshift/assets.py b/bitshift/assets.py index 90564d2..5d15304 100644 --- a/bitshift/assets.py +++ b/bitshift/assets.py @@ -1,6 +1,5 @@ """ -.. module:: assets - :synopsis: Helper functions for use inside the project's Jinja templates. +:synopsis: Helper functions for use inside the project's Jinja templates. """ from flask import Markup diff --git a/bitshift/codelet.py b/bitshift/codelet.py index df81294..5c8ec40 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -1,13 +1,43 @@ __all__ = ["Codelet"] class Codelet(object): - ## object to store the following (it doesn't need to do anything with it): - ## author name, URL, date created/modified, language, source code itself - ## for VCS: project name, file in project - ## also: list of functions, etc (associations data) + """ + A source-code object with code metadata and composition analysis. - ## DICTIONARY MAPPING STRINGS REPRESENTING ASSOCIATION TYPE WITH DICTIONARIES - ## MAPPING ASSOCIATION NAMES WITH TUPLES REPRESENTING THEIR PLACE IN THE FILE - ## STORED AS TWO INTEGERS REPRESENTING THE ROW AND THE COLUMN + :ivar code: (string) A containing the raw source code. + :ivar language: (string) The inferred language of `code`. + :ivar author: (string) The + :ivar url: The url of the (page containing the) source code. + :ivar date_created: The date the code was published. + :ivar date_modified: The date the code was last modified. + """ - ## {"functions": {"foo": (12, 13), "bar": (53, 3)}} + def __init__(self, code, author, language, code_url, author_url, + date_created, date_modified): + """ + Create a Codelet instance. + + :param code: The raw source code. + :param author: The author of the code. + :param language: The inferred language. + :param code_url: The url of the (page containing the) source code. + :param author_url: The url of the code author's public profile on the + framework said code was retrieved from. + :param date_created: The date the code was published. + :param date_modified: The date the code was last modified. 
+ + :type code: string + :type language: string + :type author: string + :type url: string + :type date_created: string + :type date_modified: string + """ + + self.code = code + self.author = author + self.language = language + self.code_url = code_url + self.author_url = author_url + self.date_created = date_created + self.date_modified = date_modified diff --git a/docs/source/api/bitshift.query.rst b/docs/source/api/bitshift.query.rst new file mode 100644 index 0000000..35b39a6 --- /dev/null +++ b/docs/source/api/bitshift.query.rst @@ -0,0 +1,11 @@ +query Package +============= + +:mod:`query` Package +-------------------- + +.. automodule:: bitshift.query + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/api/bitshift.rst b/docs/source/api/bitshift.rst index a5f0898..1b1c703 100644 --- a/docs/source/api/bitshift.rst +++ b/docs/source/api/bitshift.rst @@ -1,30 +1,51 @@ -bitshift package +bitshift Package ================ -Submodules ----------- +:mod:`bitshift` Package +----------------------- -bitshift.assets module ----------------------- +.. automodule:: bitshift.__init__ + :members: + :undoc-members: + :show-inheritance: + +:mod:`assets` Module +-------------------- .. automodule:: bitshift.assets :members: :undoc-members: :show-inheritance: -bitshift.config module ----------------------- +:mod:`codelet` Module +--------------------- -.. automodule:: bitshift.config +.. automodule:: bitshift.codelet :members: :undoc-members: :show-inheritance: +:mod:`config` Module +-------------------- -Module contents ---------------- +.. automodule:: bitshift.config + :members: + :undoc-members: + :show-inheritance: + +:mod:`database` Module +---------------------- -.. automodule:: bitshift +.. automodule:: bitshift.database :members: :undoc-members: :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + bitshift.parser + bitshift.query + diff --git a/setup.py b/setup.py index 0ec5f77..1faa5b9 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,8 @@ setup( name = "bitshift", version = "0.1", packages = find_packages(), - install_requires = ["Flask>=0.10.1", "pygments>=1.6"], + install_requires = ["Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", + "BeautifulSoup>=3.2.1"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift" From 20b518fccc730b0891229a02b43d0cf5cac4a683 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 11 Apr 2014 13:03:03 -0400 Subject: [PATCH 02/42] Minor refactor of codelet. Add: bitshift/codelet.py -complete docstrings, add filename to Codelet constructor. --- bitshift/codelet.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 5c8ec40..08b0d36 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -4,37 +4,39 @@ class Codelet(object): """ A source-code object with code metadata and composition analysis. - :ivar code: (string) A containing the raw source code. - :ivar language: (string) The inferred language of `code`. - :ivar author: (string) The - :ivar url: The url of the (page containing the) source code. - :ivar date_created: The date the code was published. - :ivar date_modified: The date the code was last modified. + :ivar code: (str) A containing the raw source code. + :ivar filename: (str, or None) The filename of the snippet. + :ivar language: (str, or None) The inferred language of `code`. + :ivar author: (str, or None) The name of the code's author. 
+ :ivar url: (str) The url of the (page containing the) source code. + :ivar date_created: (str, or None) The date the code was published. + :ivar date_modified: (str, or None) The date the code was last modified. """ - def __init__(self, code, author, language, code_url, author_url, - date_created, date_modified): + def __init__(self, code, filename, author, language, code_url, author_url, + date_created, date_modified): """ Create a Codelet instance. :param code: The raw source code. + :param filename: The filename of the code, if any. :param author: The author of the code. :param language: The inferred language. :param code_url: The url of the (page containing the) source code. - :param author_url: The url of the code author's public profile on the - framework said code was retrieved from. :param date_created: The date the code was published. :param date_modified: The date the code was last modified. - :type code: string - :type language: string - :type author: string - :type url: string - :type date_created: string - :type date_modified: string + :type code: str + :type filename: str, or None + :type language: str, or None + :type author: str, or None + :type url: str + :type date_created: str, or None + :type date_modified: str, or None """ self.code = code + self.filename = filename self.author = author self.language = language self.code_url = code_url From 962dd9aef55a50a5ffa395dc78e897158157b27d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 14 Apr 2014 12:02:23 -0400 Subject: [PATCH 03/42] Docstrings for Database methods; oursql dependency. --- app.py | 9 ++++++--- bitshift/database.py | 21 +++++++++++++++++++++ setup.py | 2 +- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/app.py b/app.py index c4083c9..2e3b0c8 100644 --- a/app.py +++ b/app.py @@ -5,6 +5,7 @@ Module to contain all the project's Flask server plumbing. from flask import Flask from flask import render_template, session +from bitshift.database import Database from bitshift.query import parse_query app = Flask(__name__) @@ -12,7 +13,9 @@ app.config.from_object("bitshift.config") app_env = app.jinja_env app_env.line_statement_prefix = "=" -app_env.globals.update(assets = assets) +app_env.globals.update(assets=assets) + +database = Database() @app.route("/") def index(): @@ -20,8 +23,8 @@ def index(): @app.route("/search/") def search(query): - ## tree = parse_query(query) - ## database.search(tree) + tree = parse_query(query) + database.search(tree) pass if __name__ == "__main__": diff --git a/bitshift/database.py b/bitshift/database.py index b8995ee..36b984e 100644 --- a/bitshift/database.py +++ b/bitshift/database.py @@ -16,3 +16,24 @@ class Database(object): def _create(self): pass + + def search(self, query): + """ + Search the database. + + :param query: The query to search for. + :type query: :py:class:`~.query.tree.Tree` + + :return: A list of search results. + :rtype: list of :py:class:`.Codelet`\ s + """ + pass + + def insert(self, codelet): + """ + Insert a codelet into the database. + + :param codelet: The codelet to insert. 
+ :type codelet: :py:class:`.Codelet` + """ + pass diff --git a/setup.py b/setup.py index 1faa5b9..5fa1189 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( version = "0.1", packages = find_packages(), install_requires = ["Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", - "BeautifulSoup>=3.2.1"], + "BeautifulSoup>=3.2.1", "oursql>=0.9.3.1"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift" From 085fd62704c1ee5d9b88daef4f5992082e9c56dc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 15 Apr 2014 00:38:12 -0400 Subject: [PATCH 04/42] Database schema, hashing module, some other things. --- .gitignore | 1 + bitshift/database.py | 10 +++++----- schema.sql | 23 +++++++++++++++++++++++ setup.py | 5 +++-- 4 files changed, 32 insertions(+), 7 deletions(-) create mode 100644 schema.sql diff --git a/.gitignore b/.gitignore index 6a014f5..7e00121 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .sass-cache .DS_Store +.my.cnf # github premade rules *.py[cod] diff --git a/bitshift/database.py b/bitshift/database.py index 36b984e..647fe55 100644 --- a/bitshift/database.py +++ b/bitshift/database.py @@ -3,19 +3,18 @@ Module with classes and functions to handle communication with the MySQL database backend, which manages the search index. """ +import mmh3 import oursql class Database(object): """Represents the MySQL database.""" def __init__(self): - pass + self._connect() def _connect(self): - pass - - def _create(self): - pass + """Establish a connection to the database.""" + self._conn = oursql.connect() def search(self, query): """ @@ -36,4 +35,5 @@ class Database(object): :param codelet: The codelet to insert. :type codelet: :py:class:`.Codelet` """ + # code_hash = mmh3.hash64(codelet.code)[0] pass diff --git a/schema.sql b/schema.sql new file mode 100644 index 0000000..3cb915c --- /dev/null +++ b/schema.sql @@ -0,0 +1,23 @@ +CREATE DATABASE bitshift DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; +USE `bitshift`; + +CREATE TABLE codelets ( + `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `codelet_name` VARCHAR(512) NOT NULL, + `codelet_code_id` BIGINT UNSIGNED NOT NULL, + `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, + `codelet_origin` TINYINT UNSIGNED DEFAULT NULL, + `codelet_url` VARCHAR(512) NOT NULL, + `codelet_date_created` DATETIME DEFAULT NULL, + `codelet_date_modified` DATETIME DEFAULT NULL, + PRIMARY KEY (`codelet_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +CREATE TABLE code ( + `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `code_hash` BIGINT NOT NULL, + `code_code` MEDIUMTEXT NOT NULL, + PRIMARY KEY (`code_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +-- separate tables: authors, symbols, caches, search indices diff --git a/setup.py b/setup.py index 5fa1189..97441b7 100644 --- a/setup.py +++ b/setup.py @@ -4,8 +4,9 @@ setup( name = "bitshift", version = "0.1", packages = find_packages(), - install_requires = ["Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", - "BeautifulSoup>=3.2.1", "oursql>=0.9.3.1"], + install_requires = [ + "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", + "BeautifulSoup>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift" From bc3b9e7587e40579bfceeb448c8260a554d87854 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 17 Apr 2014 17:33:14 -0400 Subject: [PATCH 05/42] Some more 
database design work. --- bitshift/database.py | 13 ++++++++-- bitshift/query/__init__.py | 2 ++ schema.sql | 65 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/bitshift/database.py b/bitshift/database.py index 647fe55..07c71c2 100644 --- a/bitshift/database.py +++ b/bitshift/database.py @@ -16,16 +16,25 @@ class Database(object): """Establish a connection to the database.""" self._conn = oursql.connect() - def search(self, query): + def search(self, query, page=1): """ - Search the database. + Search the database for a query and return the *n*\ th page of results. :param query: The query to search for. :type query: :py:class:`~.query.tree.Tree` + :param page: The result page to display. + :type page: int :return: A list of search results. :rtype: list of :py:class:`.Codelet`\ s """ + # query tree hash + page -> cached? + # cache HIT: + # if qcache_created is too old: invalidate cache, goto cache MISS + # update qcache_last_used + # parse qcache_results, fetch codelets + # cache MISS: + # build complex search query pass def insert(self, codelet): diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 7d6e0d5..6971c04 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -6,4 +6,6 @@ __all__ = ["parse_query"] def parse_query(query): # gets a string, returns a Tree + # TODO: note: resultant Trees should be normalized so that "foo OR bar" + # and "bar OR foo" result in equivalent trees pass diff --git a/schema.sql b/schema.sql index 3cb915c..d49fc6e 100644 --- a/schema.sql +++ b/schema.sql @@ -1,23 +1,68 @@ -CREATE DATABASE bitshift DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; +CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; -CREATE TABLE codelets ( +CREATE TABLE `languages` ( + `language_id` SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `language_name` VARCHAR(64) NOT NULL, + PRIMARY KEY (`language_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +CREATE TABLE `origins` ( + `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `origin_name` VARCHAR(64) NOT NULL, + `origin_url` VARCHAR(512) NOT NULL, + `origin_url_base` VARCHAR(512) NOT NULL, + `origin_image` TINYBLOB DEFAULT NULL, -- TODO: verify size (<64kB) + PRIMARY KEY (`origin_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +CREATE TABLE `codelets` ( `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, `codelet_name` VARCHAR(512) NOT NULL, `codelet_code_id` BIGINT UNSIGNED NOT NULL, - `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, - `codelet_origin` TINYINT UNSIGNED DEFAULT NULL, + `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, -- TODO: needs index + `codelet_origin` TINYINT UNSIGNED NOT NULL, `codelet_url` VARCHAR(512) NOT NULL, - `codelet_date_created` DATETIME DEFAULT NULL, - `codelet_date_modified` DATETIME DEFAULT NULL, + `codelet_date_created` DATETIME DEFAULT NULL, -- TODO: needs index + `codelet_date_modified` DATETIME DEFAULT NULL, -- TODO: needs index PRIMARY KEY (`codelet_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; -CREATE TABLE code ( +CREATE TABLE `code` ( `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, - `code_hash` BIGINT NOT NULL, - `code_code` MEDIUMTEXT NOT NULL, + `code_hash` BIGINT NOT NULL, -- TODO: needs index + `code_code` MEDIUMTEXT NOT NULL, -- TODO: verify size (16mB?) 
PRIMARY KEY (`code_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; --- separate tables: authors, symbols, caches, search indices +CREATE TABLE `authors` ( + `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `author_codelet` BIGINT UNSIGNED NOT NULL, -- TODO: foreign index? + `author_name` VARCHAR(128) NOT NULL, -- TODO: needs index + `author_url` VARCHAR(512) DEFAULT NULL, + PRIMARY KEY (`author_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +CREATE TABLE `symbols` ( + `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `symbol_codelet` BIGINT UNSIGNED NOT NULL, -- TODO: foreign index? + `symbol_type` TINYINT UNSIGNED NOT NULL, -- TODO: multi-column index? + `symbol_name` VARCHAR(512) NOT NULL, -- TODO: needs index + `symbol_row` INT UNSIGNED NOT NULL, + `symbol_col` INT UNSIGNED NOT NULL, + PRIMARY KEY (`symbol_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +CREATE TABLE `query_cache` ( + `qcache_id` INT NOT NULL UNIQUE, + `qcache_query` VARCHAR(512) NOT NULL, + `qcache_results` BLOB NOT NULL, -- TODO: verify; perhaps use some kind of array + `qcache_page` TINYINT UNSIGNED NOT NULL, + `qcache_count_mnt` TINYINT UNSIGNED NOT NULL, + `qcache_count_exp` TINYINT UNSIGNED NOT NULL, + `qcache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- TODO: verify + `qcache_last_used` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- TODO: verify + PRIMARY KEY (`cache_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +-- TODO: full-text search index table From 1cbe669c0247446fba178c07d3f8daf86e73e5ca Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 17 Apr 2014 19:25:42 -0400 Subject: [PATCH 06/42] More work on db schema; all except FTS indices. --- bitshift/database.py | 12 +++--- schema.sql | 108 ++++++++++++++++++++++++++++++++------------------- 2 files changed, 74 insertions(+), 46 deletions(-) diff --git a/bitshift/database.py b/bitshift/database.py index 07c71c2..b86b05a 100644 --- a/bitshift/database.py +++ b/bitshift/database.py @@ -28,13 +28,15 @@ class Database(object): :return: A list of search results. :rtype: list of :py:class:`.Codelet`\ s """ - # query tree hash + page -> cached? + # search for cache_hash = mmh3.hash(query.serialize() + str(page)) # cache HIT: - # if qcache_created is too old: invalidate cache, goto cache MISS - # update qcache_last_used - # parse qcache_results, fetch codelets + # update cache_last_used + # return codelets # cache MISS: # build complex search query + # fetch codelets + # cache results + # return codelets pass def insert(self, codelet): @@ -44,5 +46,5 @@ class Database(object): :param codelet: The codelet to insert. 
:type codelet: :py:class:`.Codelet` """ - # code_hash = mmh3.hash64(codelet.code)[0] + # code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] pass diff --git a/schema.sql b/schema.sql index d49fc6e..21c9c07 100644 --- a/schema.sql +++ b/schema.sql @@ -2,67 +2,93 @@ CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; CREATE TABLE `languages` ( - `language_id` SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `language_id` SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, `language_name` VARCHAR(64) NOT NULL, PRIMARY KEY (`language_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; +) ENGINE=InnoDB; CREATE TABLE `origins` ( - `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, `origin_name` VARCHAR(64) NOT NULL, `origin_url` VARCHAR(512) NOT NULL, `origin_url_base` VARCHAR(512) NOT NULL, - `origin_image` TINYBLOB DEFAULT NULL, -- TODO: verify size (<64kB) + `origin_image` BLOB DEFAULT NULL, PRIMARY KEY (`origin_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; +) ENGINE=InnoDB; + +CREATE TABLE `code` ( + `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `code_hash` BIGINT NOT NULL, + `code_code` MEDIUMTEXT NOT NULL, -- TODO: full-text search index + PRIMARY KEY (`code_id`), + KEY (`code_hash`) +) ENGINE=InnoDB; CREATE TABLE `codelets` ( - `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, - `codelet_name` VARCHAR(512) NOT NULL, + `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `codelet_name` VARCHAR(300) NOT NULL, -- TODO: full-text search index `codelet_code_id` BIGINT UNSIGNED NOT NULL, - `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, -- TODO: needs index + `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, `codelet_origin` TINYINT UNSIGNED NOT NULL, `codelet_url` VARCHAR(512) NOT NULL, - `codelet_date_created` DATETIME DEFAULT NULL, -- TODO: needs index - `codelet_date_modified` DATETIME DEFAULT NULL, -- TODO: needs index - PRIMARY KEY (`codelet_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; - -CREATE TABLE `code` ( - `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, - `code_hash` BIGINT NOT NULL, -- TODO: needs index - `code_code` MEDIUMTEXT NOT NULL, -- TODO: verify size (16mB?) - PRIMARY KEY (`code_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + `codelet_date_created` DATETIME DEFAULT NULL, + `codelet_date_modified` DATETIME DEFAULT NULL, + PRIMARY KEY (`codelet_id`), + KEY (`codelet_date_created`), + KEY (`codelet_date_modified`), + FOREIGN KEY (`codelet_code_id`) + REFERENCES `code` (`code_id`) + ON DELETE RESTRICT ON UPDATE CASCADE, + FOREIGN KEY (`codelet_lang`) + REFERENCES `languages` (`language_id`) + ON DELETE RESTRICT ON UPDATE CASCADE, + FOREIGN KEY (`codelet_origin`) + REFERENCES `origins` (`origin_id`) + ON DELETE RESTRICT ON UPDATE CASCADE +) ENGINE=InnoDB; CREATE TABLE `authors` ( - `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, - `author_codelet` BIGINT UNSIGNED NOT NULL, -- TODO: foreign index? 
- `author_name` VARCHAR(128) NOT NULL, -- TODO: needs index + `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `author_codelet` BIGINT UNSIGNED NOT NULL, + `author_name` VARCHAR(128) NOT NULL, -- TODO: full-text search index `author_url` VARCHAR(512) DEFAULT NULL, - PRIMARY KEY (`author_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + PRIMARY KEY (`author_id`), + FOREIGN KEY (`author_codelet`) + REFERENCES `codelet` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; CREATE TABLE `symbols` ( - `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, - `symbol_codelet` BIGINT UNSIGNED NOT NULL, -- TODO: foreign index? - `symbol_type` TINYINT UNSIGNED NOT NULL, -- TODO: multi-column index? - `symbol_name` VARCHAR(512) NOT NULL, -- TODO: needs index + `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `symbol_codelet` BIGINT UNSIGNED NOT NULL, + `symbol_type` TINYINT UNSIGNED NOT NULL, + `symbol_name` VARCHAR(512) NOT NULL, `symbol_row` INT UNSIGNED NOT NULL, `symbol_col` INT UNSIGNED NOT NULL, - PRIMARY KEY (`symbol_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + PRIMARY KEY (`symbol_id`), + KEY (`symbol_type`, `symbol_name`(32)), + FOREIGN KEY (`symbol_codelet`) + REFERENCES `codelet` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; -CREATE TABLE `query_cache` ( - `qcache_id` INT NOT NULL UNIQUE, - `qcache_query` VARCHAR(512) NOT NULL, - `qcache_results` BLOB NOT NULL, -- TODO: verify; perhaps use some kind of array - `qcache_page` TINYINT UNSIGNED NOT NULL, - `qcache_count_mnt` TINYINT UNSIGNED NOT NULL, - `qcache_count_exp` TINYINT UNSIGNED NOT NULL, - `qcache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- TODO: verify - `qcache_last_used` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- TODO: verify +CREATE TABLE `cache` ( + `cache_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `cache_hash` BIGINT NOT NULL, + `cache_count_mnt` TINYINT UNSIGNED NOT NULL, + `cache_count_exp` TINYINT UNSIGNED NOT NULL, + `cache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + `cache_last_used` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (`cache_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; +) ENGINE=InnoDB; --- TODO: full-text search index table +CREATE TABLE `cache_data` ( + `cdata_cache` INT UNSIGNED NOT NULL, + `cdata_codelet` BIGINT UNSIGNED NOT NULL, + FOREIGN KEY (`cdata_cache`) + REFERENCES `cache` (`cache_id`) + ON DELETE CASCADE ON UPDATE CASCADE, + FOREIGN KEY (`cdata_codelet`) + REFERENCES `codelet` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; From 75b243f6853f224593c6aff1153ea9a74f768ba4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 17 Apr 2014 20:33:14 -0400 Subject: [PATCH 07/42] Remove languages table; add indexed field for codelet rank. --- bitshift/database.py | 2 ++ schema.sql | 11 ++--------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/bitshift/database.py b/bitshift/database.py index b86b05a..02aa38e 100644 --- a/bitshift/database.py +++ b/bitshift/database.py @@ -6,6 +6,8 @@ database backend, which manages the search index. import mmh3 import oursql +# from .languages import ... 
+ class Database(object): """Represents the MySQL database.""" diff --git a/schema.sql b/schema.sql index 21c9c07..a76f8f8 100644 --- a/schema.sql +++ b/schema.sql @@ -1,12 +1,6 @@ CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; -CREATE TABLE `languages` ( - `language_id` SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, - `language_name` VARCHAR(64) NOT NULL, - PRIMARY KEY (`language_id`) -) ENGINE=InnoDB; - CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, `origin_name` VARCHAR(64) NOT NULL, @@ -31,17 +25,16 @@ CREATE TABLE `codelets` ( `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, `codelet_origin` TINYINT UNSIGNED NOT NULL, `codelet_url` VARCHAR(512) NOT NULL, + `codelet_rank` FLOAT NOT NULL, `codelet_date_created` DATETIME DEFAULT NULL, `codelet_date_modified` DATETIME DEFAULT NULL, PRIMARY KEY (`codelet_id`), + KEY (`codelet_rank`), KEY (`codelet_date_created`), KEY (`codelet_date_modified`), FOREIGN KEY (`codelet_code_id`) REFERENCES `code` (`code_id`) ON DELETE RESTRICT ON UPDATE CASCADE, - FOREIGN KEY (`codelet_lang`) - REFERENCES `languages` (`language_id`) - ON DELETE RESTRICT ON UPDATE CASCADE, FOREIGN KEY (`codelet_origin`) REFERENCES `origins` (`origin_id`) ON DELETE RESTRICT ON UPDATE CASCADE From fb4e0d5916d6e6edcae9e5c6ef6cedb55ed9725f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 18 Apr 2014 02:16:42 -0400 Subject: [PATCH 08/42] FULLTEXT KEYs where appropriate. --- schema.sql | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/schema.sql b/schema.sql index a76f8f8..df77720 100644 --- a/schema.sql +++ b/schema.sql @@ -13,14 +13,15 @@ CREATE TABLE `origins` ( CREATE TABLE `code` ( `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `code_hash` BIGINT NOT NULL, - `code_code` MEDIUMTEXT NOT NULL, -- TODO: full-text search index + `code_code` MEDIUMTEXT NOT NULL, PRIMARY KEY (`code_id`), - KEY (`code_hash`) + KEY (`code_hash`), + FULLTEXT KEY (`codelet_code`) ) ENGINE=InnoDB; CREATE TABLE `codelets` ( `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `codelet_name` VARCHAR(300) NOT NULL, -- TODO: full-text search index + `codelet_name` VARCHAR(300) NOT NULL, `codelet_code_id` BIGINT UNSIGNED NOT NULL, `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, `codelet_origin` TINYINT UNSIGNED NOT NULL, @@ -29,6 +30,7 @@ CREATE TABLE `codelets` ( `codelet_date_created` DATETIME DEFAULT NULL, `codelet_date_modified` DATETIME DEFAULT NULL, PRIMARY KEY (`codelet_id`), + FULLTEXT KEY (`codelet_name`), KEY (`codelet_rank`), KEY (`codelet_date_created`), KEY (`codelet_date_modified`), @@ -43,9 +45,10 @@ CREATE TABLE `codelets` ( CREATE TABLE `authors` ( `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `author_codelet` BIGINT UNSIGNED NOT NULL, - `author_name` VARCHAR(128) NOT NULL, -- TODO: full-text search index + `author_name` VARCHAR(128) NOT NULL, `author_url` VARCHAR(512) DEFAULT NULL, PRIMARY KEY (`author_id`), + FULLTEXT KEY (`author_name`), FOREIGN KEY (`author_codelet`) REFERENCES `codelet` (`codelet_id`) ON DELETE CASCADE ON UPDATE CASCADE From ad3de0615fdd0fbf5310dd4354abb6daa162e0dc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 24 Apr 2014 14:38:33 -0400 Subject: [PATCH 09/42] Fix some typos in the schema. 
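
The main typo fixed here is the full-text key on `code`, which pointed at
a nonexistent `codelet_code` column; with it referencing `code_code`, the
index can back searches via MATCH ... AGAINST. A minimal sketch of
exercising it from Python with oursql (the connection details and the
search term are illustrative, and InnoDB full-text indexes assume MySQL
5.6 or newer):

    import oursql

    conn = oursql.connect(db="bitshift", read_default_file=".my.cnf",
                          autoping=True, autoreconnect=True)
    query = """SELECT codelet_name, codelet_url
               FROM codelets
               JOIN code ON codelet_code_id = code_id
               WHERE MATCH(code_code) AGAINST (? IN NATURAL LANGUAGE MODE)"""
    with conn.cursor() as cursor:
        # qmark-style placeholders, as used elsewhere in bitshift
        cursor.execute(query, ("quicksort",))
        results = cursor.fetchall()
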
--- schema.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/schema.sql b/schema.sql index df77720..15979be 100644 --- a/schema.sql +++ b/schema.sql @@ -16,7 +16,7 @@ CREATE TABLE `code` ( `code_code` MEDIUMTEXT NOT NULL, PRIMARY KEY (`code_id`), KEY (`code_hash`), - FULLTEXT KEY (`codelet_code`) + FULLTEXT KEY (`code_code`) ) ENGINE=InnoDB; CREATE TABLE `codelets` ( @@ -50,7 +50,7 @@ CREATE TABLE `authors` ( PRIMARY KEY (`author_id`), FULLTEXT KEY (`author_name`), FOREIGN KEY (`author_codelet`) - REFERENCES `codelet` (`codelet_id`) + REFERENCES `codelets` (`codelet_id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; @@ -64,7 +64,7 @@ CREATE TABLE `symbols` ( PRIMARY KEY (`symbol_id`), KEY (`symbol_type`, `symbol_name`(32)), FOREIGN KEY (`symbol_codelet`) - REFERENCES `codelet` (`codelet_id`) + REFERENCES `codelets` (`codelet_id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; @@ -85,6 +85,6 @@ CREATE TABLE `cache_data` ( REFERENCES `cache` (`cache_id`) ON DELETE CASCADE ON UPDATE CASCADE, FOREIGN KEY (`cdata_codelet`) - REFERENCES `codelet` (`codelet_id`) + REFERENCES `codelets` (`codelet_id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; From 54bca5894f9f0866538292f40593f99e61eeae97 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 27 Apr 2014 00:47:13 -0400 Subject: [PATCH 10/42] Move database stuff to a subpackage; updates. --- bitshift/{database.py => database/__init__.py} | 39 ++++++++++++++++++++++---- schema.sql => bitshift/database/schema.sql | 0 2 files changed, 34 insertions(+), 5 deletions(-) rename bitshift/{database.py => database/__init__.py} (55%) rename schema.sql => bitshift/database/schema.sql (100%) diff --git a/bitshift/database.py b/bitshift/database/__init__.py similarity index 55% rename from bitshift/database.py rename to bitshift/database/__init__.py index 02aa38e..4ed7a02 100644 --- a/bitshift/database.py +++ b/bitshift/database/__init__.py @@ -1,12 +1,16 @@ """ -Module with classes and functions to handle communication with the MySQL +Subpackage with classes and functions to handle communication with the MySQL database backend, which manages the search index. """ +import os + import mmh3 import oursql -# from .languages import ... +# from ..languages import ... + +__all__ = ["Database"] class Database(object): """Represents the MySQL database.""" @@ -16,7 +20,9 @@ class Database(object): def _connect(self): """Establish a connection to the database.""" - self._conn = oursql.connect() + default_file = os.path.join(os.path.dirname(__file__), ".my.cnf") + self._conn = oursql.connect(read_default_file=default_file, + autoping=True, autoreconnect=True) def search(self, query, page=1): """ @@ -48,5 +54,28 @@ class Database(object): :param codelet: The codelet to insert. 
:type codelet: :py:class:`.Codelet` """ - # code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] - pass + query = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" + + cursor.execute(query, ()) + + # codelet_id -- auto_increment used here + codelet_name + codelet_code_id + codelet_lang + codelet_origin + codelet_url + codelet_rank + codelet_date_created + codelet_date_modified + + # codelet fields + codelet.name + codelet.code + codelet.filename + codelet.language + codelet.authors + codelet.code_url + codelet.date_created + codelet.date_modified + + code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] diff --git a/schema.sql b/bitshift/database/schema.sql similarity index 100% rename from schema.sql rename to bitshift/database/schema.sql From 0d0a74f9dfd7fa382f2dcdb02256246e062d0450 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 27 Apr 2014 23:43:32 -0400 Subject: [PATCH 11/42] Some more work on db stuff. --- bitshift/database/__init__.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 4ed7a02..9a54ef2 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -20,7 +20,8 @@ class Database(object): def _connect(self): """Establish a connection to the database.""" - default_file = os.path.join(os.path.dirname(__file__), ".my.cnf") + root = os.path.dirname(os.path.abspath(__file__)) + default_file = os.path.join(root, ".my.cnf") self._conn = oursql.connect(read_default_file=default_file, autoping=True, autoreconnect=True) @@ -54,9 +55,18 @@ class Database(object): :param codelet: The codelet to insert. :type codelet: :py:class:`.Codelet` """ - query = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" + frag_size = 16384 # 16 kB + query_slt1 = """SELECT code_id, LEFT(code_code, {0}) + FROM code WHERE code_hash = ?""".format(frag_size) + query_ins1 = "INSERT INTO code VALUES (?, ?)" + query_ins2 = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" + query_ins3 = "INSERT INTO authors VALUES", " (?, ?, ?)" + query_ins4 = "INSERT INTO symbols VALUES", " (?, ?, ?, ?, ?)" - cursor.execute(query, ()) + # LAST_INSERT_ID() + + code_id = None + code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] # codelet_id -- auto_increment used here codelet_name @@ -78,4 +88,14 @@ class Database(object): codelet.date_created codelet.date_modified - code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] + with self._conn.cursor() as cursor: + # Retrieve the ID of the source code if it's already in the DB: + cursor.execute(query_slt1, (code_hash,)) + for c_id, c_code_frag in cursor.fetchall(): + if c_code_frag == codelet.code[:frag_size]: + code_id = c_id + break + + # If the source code isn't already in the DB, add it: + if not code_id: + cursor.execute() From 22d6b625474f535d53adef652bd4d6e3397af04e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 28 Apr 2014 14:05:45 -0400 Subject: [PATCH 12/42] Update schema to v2; database updates. 
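
The new `version` table and the MIGRATIONS list give the schema an
explicit version that client code checks when it connects. A short sketch
of the intended usage, assuming a reachable MySQL instance and the
.my.cnf credentials file the module reads:

    from bitshift.database import Database

    # Normal application code; raises RuntimeError if the `version` table
    # reports a schema older than migration.VERSION:
    db = Database()
    db.close()

    # Apply any pending migrations instead, equivalent to running
    # `python -m bitshift.database.migration` from the project root:
    db = Database(migrate=True)
    db.close()
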
--- bitshift/database/__init__.py | 62 +++++++++++++++++++++++++++--------------- bitshift/database/migration.py | 23 ++++++++++++++++ bitshift/database/schema.sql | 13 +++++++-- 3 files changed, 73 insertions(+), 25 deletions(-) create mode 100644 bitshift/database/migration.py diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 9a54ef2..50486b6 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -8,15 +8,16 @@ import os import mmh3 import oursql -# from ..languages import ... +from .migration import VERSION, MIGRATIONS __all__ = ["Database"] class Database(object): """Represents the MySQL database.""" - def __init__(self): + def __init__(self, migrate=False): self._connect() + self._check_version(migrate) def _connect(self): """Establish a connection to the database.""" @@ -25,6 +26,33 @@ class Database(object): self._conn = oursql.connect(read_default_file=default_file, autoping=True, autoreconnect=True) + def _migrate(self, cursor, current): + """Migrate the database to the latest schema version.""" + for version in xrange(current, VERSION): + for query in MIGRATIONS[version - 1]: + cursor.execute(query) + + def _check_version(self, migrate): + """Check the database schema version and respond accordingly. + + If the schema is out of date, migrate if *migrate* is True, else raise + an exception. + """ + with self._conn.cursor() as cursor: + cursor.execute("SELECT version FROM version") + version = cursor.fetchone()[0] + if version < VERSION: + if migrate: + self._migrate(cursor, version) + else: + err = "Database schema out of date. " \ + "Run `python -m bitshift.database.migration`." + raise RuntimeError(err) + + def close(self): + """Disconnect from the database.""" + self._conn.close() + def search(self, query, page=1): """ Search the database for a query and return the *n*\ th page of results. @@ -55,19 +83,14 @@ class Database(object): :param codelet: The codelet to insert. :type codelet: :py:class:`.Codelet` """ - frag_size = 16384 # 16 kB - query_slt1 = """SELECT code_id, LEFT(code_code, {0}) - FROM code WHERE code_hash = ?""".format(frag_size) - query_ins1 = "INSERT INTO code VALUES (?, ?)" - query_ins2 = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" - query_ins3 = "INSERT INTO authors VALUES", " (?, ?, ?)" - query_ins4 = "INSERT INTO symbols VALUES", " (?, ?, ?, ?, ?)" + query1 = """INSERT INTO code VALUES (?, ?) 
+ ON DUPLICATE KEY UPDATE code_id=code_id""" + query2 = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" + query3 = "INSERT INTO authors VALUES", " (?, ?, ?)" + query4 = "INSERT INTO symbols VALUES", " (?, ?, ?, ?, ?)" # LAST_INSERT_ID() - code_id = None - code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] - # codelet_id -- auto_increment used here codelet_name codelet_code_id @@ -88,14 +111,9 @@ class Database(object): codelet.date_created codelet.date_modified + ####################################################################### + + code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] + with self._conn.cursor() as cursor: - # Retrieve the ID of the source code if it's already in the DB: - cursor.execute(query_slt1, (code_hash,)) - for c_id, c_code_frag in cursor.fetchall(): - if c_code_frag == codelet.code[:frag_size]: - code_id = c_id - break - - # If the source code isn't already in the DB, add it: - if not code_id: - cursor.execute() + cursor.execute(query1, (code_id, codelet.code)) diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py new file mode 100644 index 0000000..c9fdd39 --- /dev/null +++ b/bitshift/database/migration.py @@ -0,0 +1,23 @@ +""" +Contains information about database schema versions, and SQL queries to update +between them. +""" + +VERSION = 2 + +MIGRATIONS = [ + # 1 -> 2 + [ + # drop index on code_hash + "ALTER TABLE code DROP COLUMN code_hash", + # change code_id to BIGINT NOT NULL, + # add key on codelets to codelet_lang + # add symbol_end_row INT UNSIGNED NOT NULL + # add symbol_end_col INT UNSIGNED NOT NULL + ] +] + +if __name__ == "__main__": + from . import Database + + Database(migrate=True).close() diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 15979be..159f85a 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,6 +1,12 @@ +-- Schema version 2 + CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; +CREATE TABLE `version` ( + `version` INT UNSIGNED NOT NULL +) ENGINE=InnoDB; + CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, `origin_name` VARCHAR(64) NOT NULL, @@ -11,11 +17,9 @@ CREATE TABLE `origins` ( ) ENGINE=InnoDB; CREATE TABLE `code` ( - `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `code_hash` BIGINT NOT NULL, + `code_id` BIGINT NOT NULL, `code_code` MEDIUMTEXT NOT NULL, PRIMARY KEY (`code_id`), - KEY (`code_hash`), FULLTEXT KEY (`code_code`) ) ENGINE=InnoDB; @@ -31,6 +35,7 @@ CREATE TABLE `codelets` ( `codelet_date_modified` DATETIME DEFAULT NULL, PRIMARY KEY (`codelet_id`), FULLTEXT KEY (`codelet_name`), + KEY (`codelet_lang`), KEY (`codelet_rank`), KEY (`codelet_date_created`), KEY (`codelet_date_modified`), @@ -61,6 +66,8 @@ CREATE TABLE `symbols` ( `symbol_name` VARCHAR(512) NOT NULL, `symbol_row` INT UNSIGNED NOT NULL, `symbol_col` INT UNSIGNED NOT NULL, + `symbol_end_row` INT UNSIGNED NOT NULL, + `symbol_end_col` INT UNSIGNED NOT NULL, PRIMARY KEY (`symbol_id`), KEY (`symbol_type`, `symbol_name`(32)), FOREIGN KEY (`symbol_codelet`) From a5cc3537cbec154f7e819f76a870812abddb010b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Apr 2014 12:42:11 -0400 Subject: [PATCH 13/42] Credits. --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0fe39d0..8ca31d7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ bitshift ======== -bitshift is a semantic search engine for source code. 
+bitshift is a semantic search engine for source code developed by Benjamin +Attal, Ben Kurtovic, and Severyn Kozak. Branches -------- From 0b655daaff3cdd41f48b96fe34f786f10deed56a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Apr 2014 13:19:02 -0400 Subject: [PATCH 14/42] Finish migration to v2. --- bitshift/database/__init__.py | 2 ++ bitshift/database/migration.py | 21 +++++++++++++++------ bitshift/database/schema.sql | 3 ++- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 50486b6..14f7575 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -29,8 +29,10 @@ class Database(object): def _migrate(self, cursor, current): """Migrate the database to the latest schema version.""" for version in xrange(current, VERSION): + print "Migrating to %d..." % version + 1 for query in MIGRATIONS[version - 1]: cursor.execute(query) + cursor.execute("UPDATE version SET version = ?", (version + 1,)) def _check_version(self, migrate): """Check the database schema version and respond accordingly. diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index c9fdd39..2ea9666 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -8,12 +8,21 @@ VERSION = 2 MIGRATIONS = [ # 1 -> 2 [ - # drop index on code_hash - "ALTER TABLE code DROP COLUMN code_hash", - # change code_id to BIGINT NOT NULL, - # add key on codelets to codelet_lang - # add symbol_end_row INT UNSIGNED NOT NULL - # add symbol_end_col INT UNSIGNED NOT NULL + """ALTER TABLE `codelets` + DROP FOREIGN KEY `codelets_ibfk_1`""", + """ALTER TABLE `code` + DROP KEY `code_hash`, + DROP COLUMN `code_hash`, + MODIFY COLUMN `code_id` BIGINT NOT NULL""", + """ALTER TABLE `codelets` + MODIFY COLUMN `codelet_code_id` BIGINT NOT NULL, + ADD KEY (`codelet_lang`), + ADD FOREIGN KEY (`codelet_code_id`) + REFERENCES `code` (`code_id`) + ON DELETE RESTRICT ON UPDATE CASCADE""", + """ALTER TABLE `symbols` + ADD COLUMN `symbol_end_row` INT UNSIGNED NOT NULL, + ADD COLUMN `symbol_end_col` INT UNSIGNED NOT NULL""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 159f85a..56a2d85 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -6,6 +6,7 @@ USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; +INSERT INTO `version` VALUES (2); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, @@ -26,7 +27,7 @@ CREATE TABLE `code` ( CREATE TABLE `codelets` ( `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `codelet_name` VARCHAR(300) NOT NULL, - `codelet_code_id` BIGINT UNSIGNED NOT NULL, + `codelet_code_id` BIGINT NOT NULL, `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, `codelet_origin` TINYINT UNSIGNED NOT NULL, `codelet_url` VARCHAR(512) NOT NULL, From 821a6ae4f1a30c2b8b4575c408145f8b34877206 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 30 Apr 2014 14:44:31 -0400 Subject: [PATCH 15/42] DB -> v3 for symbol->code assoc vs. ->codelet (fixes #13) --- bitshift/database/migration.py | 13 +++++++++++-- bitshift/database/schema.sql | 10 +++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index 2ea9666..caf3020 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -3,7 +3,7 @@ Contains information about database schema versions, and SQL queries to update between them. 
""" -VERSION = 2 +VERSION = 3 MIGRATIONS = [ # 1 -> 2 @@ -17,12 +17,21 @@ MIGRATIONS = [ """ALTER TABLE `codelets` MODIFY COLUMN `codelet_code_id` BIGINT NOT NULL, ADD KEY (`codelet_lang`), - ADD FOREIGN KEY (`codelet_code_id`) + ADD CONSTRAINT `codelets_ibfk_1` FOREIGN KEY (`codelet_code_id`) REFERENCES `code` (`code_id`) ON DELETE RESTRICT ON UPDATE CASCADE""", """ALTER TABLE `symbols` ADD COLUMN `symbol_end_row` INT UNSIGNED NOT NULL, ADD COLUMN `symbol_end_col` INT UNSIGNED NOT NULL""" + ], + # 2 -> 3 + [ + """ALTER TABLE `symbols` + DROP FOREIGN KEY `symbols_ibfk_1`, + CHANGE COLUMN `symbol_codelet` `symbol_code` BIGINT NOT NULL, + ADD CONSTRAINT `symbols_ibfk_1` FOREIGN KEY (`symbol_code`) + REFERENCES `code` (`code_id`) + ON DELETE CASCADE ON UPDATE CASCADE""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 56a2d85..99b9e42 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,4 +1,4 @@ --- Schema version 2 +-- Schema version 3 CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; @@ -6,7 +6,7 @@ USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; -INSERT INTO `version` VALUES (2); +INSERT INTO `version` VALUES (3); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, @@ -62,7 +62,7 @@ CREATE TABLE `authors` ( CREATE TABLE `symbols` ( `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `symbol_codelet` BIGINT UNSIGNED NOT NULL, + `symbol_code` BIGINT NOT NULL, `symbol_type` TINYINT UNSIGNED NOT NULL, `symbol_name` VARCHAR(512) NOT NULL, `symbol_row` INT UNSIGNED NOT NULL, @@ -71,8 +71,8 @@ CREATE TABLE `symbols` ( `symbol_end_col` INT UNSIGNED NOT NULL, PRIMARY KEY (`symbol_id`), KEY (`symbol_type`, `symbol_name`(32)), - FOREIGN KEY (`symbol_codelet`) - REFERENCES `codelets` (`codelet_id`) + FOREIGN KEY (`symbol_code`) + REFERENCES `code` (`code_id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; From e3a838220c7394e0985e627a4d7c090ba09e6bb2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 30 Apr 2014 14:44:45 -0400 Subject: [PATCH 16/42] Flesh out most of Database.insert(). --- bitshift/database/__init__.py | 44 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 14f7575..03a5c2c 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -87,35 +87,25 @@ class Database(object): """ query1 = """INSERT INTO code VALUES (?, ?) 
ON DUPLICATE KEY UPDATE code_id=code_id""" - query2 = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" - query3 = "INSERT INTO authors VALUES", " (?, ?, ?)" - query4 = "INSERT INTO symbols VALUES", " (?, ?, ?, ?, ?)" - - # LAST_INSERT_ID() - - # codelet_id -- auto_increment used here - codelet_name - codelet_code_id - codelet_lang - codelet_origin - codelet_url - codelet_rank - codelet_date_created - codelet_date_modified - - # codelet fields - codelet.name - codelet.code - codelet.filename - codelet.language - codelet.authors - codelet.code_url - codelet.date_created - codelet.date_modified - - ####################################################################### + query2 = """INSERT INTO codelets VALUES + (?, ?, ?, ?, ?, ?, ?, ?)""" + query3 = "SELECT LAST_INSERT_ID()" + query4 = "INSERT INTO authors VALUES (?, ?, ?)" + query5 = "INSERT INTO symbols VALUES (?, ?, ?, ?, ?, ?, ?)" code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] + origin, url = decompose(codelet.url) ## TODO: create decompose() function with self._conn.cursor() as cursor: cursor.execute(query1, (code_id, codelet.code)) + cursor.execute(query2, (codelet.name, code_id, codelet.language, + origin, url, codelet.rank, + codelet.date_created, + codelet.date_modified)) + cursor.execute(query3) + codelet_id = cursor.fetchone()[0] + authors = [(codelet_id, a.name, a.url) for a in codelet.authors] ## TODO: check author fields (is it a tuple?) + cursor.executemany(query4, authors) + if code_id is new: ## TODO: check for this properly + symbols = [(code_id, sym.type, sym.name, sym.row, sym.col, sym.end_row, sym.end_col) for sym in codelet.symbols] # TODO: check symbol fields (dict?) + cursor.executemany(query5, symbols) From 97b0644bf01932ba32863999226ae1ade7cd8fee Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 2 May 2014 14:40:00 -0400 Subject: [PATCH 17/42] Database to v4: split off symbol_locations table. --- bitshift/database/migration.py | 23 ++++++++++++++++++++++- bitshift/database/schema.sql | 22 ++++++++++++++++------ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index caf3020..e0ec762 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -3,7 +3,7 @@ Contains information about database schema versions, and SQL queries to update between them. 
""" -VERSION = 3 +VERSION = 4 MIGRATIONS = [ # 1 -> 2 @@ -32,6 +32,27 @@ MIGRATIONS = [ ADD CONSTRAINT `symbols_ibfk_1` FOREIGN KEY (`symbol_code`) REFERENCES `code` (`code_id`) ON DELETE CASCADE ON UPDATE CASCADE""" + ], + # 3 -> 4 + [ + """ALTER TABLE `symbols` + DROP COLUMN `symbol_row`, + DROP COLUMN `symbol_col`, + DROP COLUMN `symbol_end_row`, + DROP COLUMN `symbol_end_col`""", + """CREATE TABLE `symbol_locations` ( + `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `sloc_symbol` BIGINT UNSIGNED NOT NULL, + `sloc_type` TINYINT UNSIGNED NOT NULL, + `sloc_row` INT UNSIGNED NOT NULL, + `sloc_col` INT UNSIGNED NOT NULL, + `sloc_end_row` INT UNSIGNED NOT NULL, + `sloc_end_col` INT UNSIGNED NOT NULL, + PRIMARY KEY (`sloc_id`), + FOREIGN KEY (`sloc_symbol`) + REFERENCES `symbols` (`symbol_id`) + ON DELETE CASCADE ON UPDATE CASCADE + ) ENGINE=InnoDB""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 99b9e42..79dad45 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,4 +1,4 @@ --- Schema version 3 +-- Schema version 4 CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; @@ -6,7 +6,7 @@ USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; -INSERT INTO `version` VALUES (3); +INSERT INTO `version` VALUES (4); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, @@ -65,10 +65,6 @@ CREATE TABLE `symbols` ( `symbol_code` BIGINT NOT NULL, `symbol_type` TINYINT UNSIGNED NOT NULL, `symbol_name` VARCHAR(512) NOT NULL, - `symbol_row` INT UNSIGNED NOT NULL, - `symbol_col` INT UNSIGNED NOT NULL, - `symbol_end_row` INT UNSIGNED NOT NULL, - `symbol_end_col` INT UNSIGNED NOT NULL, PRIMARY KEY (`symbol_id`), KEY (`symbol_type`, `symbol_name`(32)), FOREIGN KEY (`symbol_code`) @@ -76,6 +72,20 @@ CREATE TABLE `symbols` ( ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; +CREATE TABLE `symbol_locations` ( + `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `sloc_symbol` BIGINT UNSIGNED NOT NULL, + `sloc_type` TINYINT UNSIGNED NOT NULL, + `sloc_row` INT UNSIGNED NOT NULL, + `sloc_col` INT UNSIGNED NOT NULL, + `sloc_end_row` INT UNSIGNED NOT NULL, + `sloc_end_col` INT UNSIGNED NOT NULL, + PRIMARY KEY (`sloc_id`), + FOREIGN KEY (`sloc_symbol`) + REFERENCES `symbols` (`symbol_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; + CREATE TABLE `cache` ( `cache_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, `cache_hash` BIGINT NOT NULL, From d2aef2829e5edf11c2e392ce14436c5e452af42f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 2 May 2014 14:40:52 -0400 Subject: [PATCH 18/42] Finish database insertion, except for origins. --- bitshift/database/__init__.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 03a5c2c..bc4b451 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -51,6 +51,23 @@ class Database(object): "Run `python -m bitshift.database.migration`." 
raise RuntimeError(err) + def _decompose_url(self, url): + """Break up a URL into an origin (with a URL base) and a suffix.""" + pass ## TODO + + def _insert_symbols(self, cursor, code_id, sym_type, symbols): + """Insert a list of symbols of a given type into the database.""" + sym_types = ["functions", "classes", "variables"] + query1 = "INSERT INTO symbols VALUES (?, ?, ?)" + query2 = "INSERT INTO symbol_locations VALUES (?, ?, ?, ?, ?, ?)" + + for (name, decls, uses) in symbols: + cursor.execute(query1, (code_id, sym_types.index(sym_type), name)) + sym_id = cursor.lastrowid + params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] + + [tuple([sym_id, 1] + list(loc)) for loc in uses]) + cursor.executemany(query2, params) + def close(self): """Disconnect from the database.""" self._conn.close() @@ -89,23 +106,21 @@ class Database(object): ON DUPLICATE KEY UPDATE code_id=code_id""" query2 = """INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)""" - query3 = "SELECT LAST_INSERT_ID()" - query4 = "INSERT INTO authors VALUES (?, ?, ?)" - query5 = "INSERT INTO symbols VALUES (?, ?, ?, ?, ?, ?, ?)" + query3 = "INSERT INTO authors VALUES (?, ?, ?)" code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] - origin, url = decompose(codelet.url) ## TODO: create decompose() function + origin, url = self._decompose_url(codelet.url) with self._conn.cursor() as cursor: cursor.execute(query1, (code_id, codelet.code)) + new_code = cursor.rowcount == 1 cursor.execute(query2, (codelet.name, code_id, codelet.language, origin, url, codelet.rank, codelet.date_created, codelet.date_modified)) - cursor.execute(query3) - codelet_id = cursor.fetchone()[0] - authors = [(codelet_id, a.name, a.url) for a in codelet.authors] ## TODO: check author fields (is it a tuple?) - cursor.executemany(query4, authors) - if code_id is new: ## TODO: check for this properly - symbols = [(code_id, sym.type, sym.name, sym.row, sym.col, sym.end_row, sym.end_col) for sym in codelet.symbols] # TODO: check symbol fields (dict?) - cursor.executemany(query5, symbols) + codelet_id = cursor.lastrowid + authors = [(codelet_id, a[0], a[1]) for a in codelet.authors] + cursor.executemany(query3, authors) + if new_code: + for sym_type, symbols in codelet.symbols.iteritems(): + self._insert_symbols(cursor, code_id, sym_type, symbols) From d6ccdbd16d1db369801ebd7a12ba1bf90df5225a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 2 May 2014 22:43:16 -0400 Subject: [PATCH 19/42] Fix a couble Database bugs. 
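
The two bugs: _connect() now selects the `bitshift` database explicitly
and returns the connection object that __init__ stores, and the INSERT
statements pass DEFAULT for AUTO_INCREMENT keys so the placeholder count
matches the remaining columns. A small illustration of the second fix
(the bound values are made up):

    # Binding three values against the four-column authors table fails;
    # letting MySQL fill the AUTO_INCREMENT primary key fixes the arity:
    cursor.execute("INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)",
                   (codelet_id, "Jane Doe", "http://example.com/jdoe"))
    author_id = cursor.lastrowid  # id assigned in place of DEFAULT
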
--- bitshift/database/__init__.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index bc4b451..1a2b373 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -16,15 +16,15 @@ class Database(object): """Represents the MySQL database.""" def __init__(self, migrate=False): - self._connect() + self._conn = self._connect() self._check_version(migrate) def _connect(self): """Establish a connection to the database.""" root = os.path.dirname(os.path.abspath(__file__)) default_file = os.path.join(root, ".my.cnf") - self._conn = oursql.connect(read_default_file=default_file, - autoping=True, autoreconnect=True) + return oursql.connect(db="bitshift", read_default_file=default_file, + autoping=True, autoreconnect=True) def _migrate(self, cursor, current): """Migrate the database to the latest schema version.""" @@ -58,8 +58,9 @@ class Database(object): def _insert_symbols(self, cursor, code_id, sym_type, symbols): """Insert a list of symbols of a given type into the database.""" sym_types = ["functions", "classes", "variables"] - query1 = "INSERT INTO symbols VALUES (?, ?, ?)" - query2 = "INSERT INTO symbol_locations VALUES (?, ?, ?, ?, ?, ?)" + query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)" + query2 = """INSERT INTO symbol_locations VALUES + (DEFAULT, ?, ?, ?, ?, ?, ?)""" for (name, decls, uses) in symbols: cursor.execute(query1, (code_id, sym_types.index(sym_type), name)) @@ -105,8 +106,8 @@ class Database(object): query1 = """INSERT INTO code VALUES (?, ?) ON DUPLICATE KEY UPDATE code_id=code_id""" query2 = """INSERT INTO codelets VALUES - (?, ?, ?, ?, ?, ?, ?, ?)""" - query3 = "INSERT INTO authors VALUES (?, ?, ?)" + (DEFAULT, ?, ?, ?, ?, ?, ?, ?, ?)""" + query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)" code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] origin, url = self._decompose_url(codelet.url) From 950b6994f0abb83192065cedaeeef07bd1b5dd99 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 3 May 2014 17:50:16 -0400 Subject: [PATCH 20/42] Database to v5; finish Database.insert(). --- bitshift/database/__init__.py | 23 ++++++++++++++--------- bitshift/database/migration.py | 9 ++++++++- bitshift/database/schema.sql | 11 ++++++----- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 1a2b373..9b039ca 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -51,9 +51,15 @@ class Database(object): "Run `python -m bitshift.database.migration`." raise RuntimeError(err) - def _decompose_url(self, url): + def _decompose_url(self, cursor, url): """Break up a URL into an origin (with a URL base) and a suffix.""" - pass ## TODO + query = """SELECT origin_id, SUBSTR(?, LENGTH(origin_url_base)) + FROM origins WHERE origin_url_base IS NOT NULL + AND ? 
LIKE CONCAT(origin_url_base, "%")""" + + cursor.execute(query, (url, url)) + result = cursor.fetchone() + return result if result else (1, url) def _insert_symbols(self, cursor, code_id, sym_type, symbols): """Insert a list of symbols of a given type into the database.""" @@ -109,12 +115,14 @@ class Database(object): (DEFAULT, ?, ?, ?, ?, ?, ?, ?, ?)""" query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)" - code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] - origin, url = self._decompose_url(codelet.url) - with self._conn.cursor() as cursor: + code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] + origin, url = self._decompose_url(cursor, codelet.url) + cursor.execute(query1, (code_id, codelet.code)) - new_code = cursor.rowcount == 1 + if cursor.rowcount == 1: + for sym_type, symbols in codelet.symbols.iteritems(): + self._insert_symbols(cursor, code_id, sym_type, symbols) cursor.execute(query2, (codelet.name, code_id, codelet.language, origin, url, codelet.rank, codelet.date_created, @@ -122,6 +130,3 @@ class Database(object): codelet_id = cursor.lastrowid authors = [(codelet_id, a[0], a[1]) for a in codelet.authors] cursor.executemany(query3, authors) - if new_code: - for sym_type, symbols in codelet.symbols.iteritems(): - self._insert_symbols(cursor, code_id, sym_type, symbols) diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index e0ec762..743f906 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -3,7 +3,7 @@ Contains information about database schema versions, and SQL queries to update between them. """ -VERSION = 4 +VERSION = 5 MIGRATIONS = [ # 1 -> 2 @@ -53,6 +53,13 @@ MIGRATIONS = [ REFERENCES `symbols` (`symbol_id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB""" + ], + # 4 -> 5 + [ + """ALTER TABLE `origins` + MODIFY COLUMN `origin_name` VARCHAR(64) DEFAULT NULL, + MODIFY COLUMN `origin_url` VARCHAR(512) DEFAULT NULL, + MODIFY COLUMN `origin_url_base` VARCHAR(512) DEFAULT NULL""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 79dad45..50b4f9e 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,4 +1,4 @@ --- Schema version 4 +-- Schema version 5 CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; @@ -6,16 +6,17 @@ USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; -INSERT INTO `version` VALUES (4); +INSERT INTO `version` VALUES (5); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, - `origin_name` VARCHAR(64) NOT NULL, - `origin_url` VARCHAR(512) NOT NULL, - `origin_url_base` VARCHAR(512) NOT NULL, + `origin_name` VARCHAR(64) DEFAULT NULL, + `origin_url` VARCHAR(512) DEFAULT NULL, + `origin_url_base` VARCHAR(512) DEFAULT NULL, `origin_image` BLOB DEFAULT NULL, PRIMARY KEY (`origin_id`) ) ENGINE=InnoDB; +INSERT INTO `origins` VALUES (1, NULL, NULL, NULL, NULL); CREATE TABLE `code` ( `code_id` BIGINT NOT NULL, From ef73c043479f8cf899757981c6e4248665c8bae8 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sun, 13 Apr 2014 21:57:22 -0400 Subject: [PATCH 21/42] Add prototype repo-indexer script author_files.py. Add: author_files.py -add prototype script to output metadata about every file in a Git repository: filename, author names, dates of creation and modification. -lacking Sphinx documentation. 
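The output format is easiest to see from the consumer's side; a rough sketch
(not part of the script itself) of splitting one of its null-delimited lines
back into fields, using the sample values from the script's own docstring:

    line = "\x00".join(["socket_io.c", "John Doe", "Jane Doe",
                        "1384488690", "1384534626"])

    fields = line.split("\x00")
    filename = fields[0]                  # "socket_io.c"
    authors = fields[1:-2]                # ["John Doe", "Jane Doe"]
    time_created = int(fields[-2])        # 1384488690
    time_last_modified = int(fields[-1])  # 1384534626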
--- bitshift/author_files.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 bitshift/author_files.py diff --git a/bitshift/author_files.py b/bitshift/author_files.py new file mode 100644 index 0000000..ed9f2c8 --- /dev/null +++ b/bitshift/author_files.py @@ -0,0 +1,53 @@ +""" +Output author/date information about the latest files in a Git repository. + +When executed inside a Git archive, prints a single line of metadata for every +file in the work tree. A given line contains the file's filename, authors, +and Unix timestamps for the file's time of creation and last modification; the +separate entries are null-delimited. + +Sample output: + socket_io.c\x00John Doe Jane Doe\x001384488690\x001384534626 + # filename: socket_io.c + # Author Names: +""" + +import fileinput, subprocess + +git_log = subprocess.check_output("git --no-pager log --name-only \ + --pretty=format:'%n%n%an%n%at' -z", shell=True) + +commits = [] +for commit in git_log.split("\n\n"): + fields = commit.split("\n") + if len(fields) > 2: + commits.append({ + "author" : fields[0], + "timestamp" : int(fields[1]), + "filenames" : fields[2].split("\0")[:-2] + }) + + +tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if -f && \ + T }' $(find . -type d -name .git -prune -o -print)", shell=True) +tracked_files = [filename[2:] for filename in tracked_files.split("\n")[:-1]] + +file_authors = {} +for commit in commits: + for filename in commit["filenames"]: + if filename in tracked_files: + if filename not in file_authors.keys(): + file_authors[filename] = { + "authors" : [commit["author"]], + "timestamps" : [commit["timestamp"]] + } + else: + if commit["author"] not in file_authors[filename]["authors"]: + file_authors[filename]["authors"].append(commit["author"]) + file_authors[filename]["timestamps"].append(commit["timestamp"]) + +for filename in file_authors.keys(): + authors = "\0".join(file_authors[filename]["authors"]) + time_created = min(file_authors[filename]["timestamps"]) + time_last_modified = max(file_authors[filename]["timestamps"]) + print "%s\0%s\0%d\0%d" % (filename, authors, time_created, time_last_modified) From ef9c0609fed4c432f475a3bdd89b4b1ab062a3e3 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Mon, 14 Apr 2014 13:02:59 -0400 Subject: [PATCH 22/42] Mov author_files > git_inder, heavily refactor. Add: bitshift/crawler/crawler.py -add base crawler module -add github(), to index Github. Mod: bitshift/crawler/ -add package subdirectory for the crawler module, and any subsidiary modules (eg, git_indexer). bitshift/author_files.py > bitshift/crawler/git_indexer.py -rename the module to "git_indexer", to better reflect its use. -convert from stand-alone script to a module whose functions integrate cleanly with the rest of the application. -add all necessary, tested functions, with Sphinx documentation. --- bitshift/author_files.py | 53 ---------------- bitshift/crawler/crawler.py | 37 +++++++++++ bitshift/crawler/git_indexer.py | 134 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 53 deletions(-) delete mode 100644 bitshift/author_files.py create mode 100644 bitshift/crawler/crawler.py create mode 100644 bitshift/crawler/git_indexer.py diff --git a/bitshift/author_files.py b/bitshift/author_files.py deleted file mode 100644 index ed9f2c8..0000000 --- a/bitshift/author_files.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Output author/date information about the latest files in a Git repository. 
- -When executed inside a Git archive, prints a single line of metadata for every -file in the work tree. A given line contains the file's filename, authors, -and Unix timestamps for the file's time of creation and last modification; the -separate entries are null-delimited. - -Sample output: - socket_io.c\x00John Doe Jane Doe\x001384488690\x001384534626 - # filename: socket_io.c - # Author Names: -""" - -import fileinput, subprocess - -git_log = subprocess.check_output("git --no-pager log --name-only \ - --pretty=format:'%n%n%an%n%at' -z", shell=True) - -commits = [] -for commit in git_log.split("\n\n"): - fields = commit.split("\n") - if len(fields) > 2: - commits.append({ - "author" : fields[0], - "timestamp" : int(fields[1]), - "filenames" : fields[2].split("\0")[:-2] - }) - - -tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if -f && \ - T }' $(find . -type d -name .git -prune -o -print)", shell=True) -tracked_files = [filename[2:] for filename in tracked_files.split("\n")[:-1]] - -file_authors = {} -for commit in commits: - for filename in commit["filenames"]: - if filename in tracked_files: - if filename not in file_authors.keys(): - file_authors[filename] = { - "authors" : [commit["author"]], - "timestamps" : [commit["timestamp"]] - } - else: - if commit["author"] not in file_authors[filename]["authors"]: - file_authors[filename]["authors"].append(commit["author"]) - file_authors[filename]["timestamps"].append(commit["timestamp"]) - -for filename in file_authors.keys(): - authors = "\0".join(file_authors[filename]["authors"]) - time_created = min(file_authors[filename]["timestamps"]) - time_last_modified = max(file_authors[filename]["timestamps"]) - print "%s\0%s\0%d\0%d" % (filename, authors, time_created, time_last_modified) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py new file mode 100644 index 0000000..46cd54e --- /dev/null +++ b/bitshift/crawler/crawler.py @@ -0,0 +1,37 @@ +""" + +""" + +import requests, time + +import git_indexer + +# from .codelet import Codelet +# from .database import Database + +def github(): + """ + Query the GitHub API for data about every public repository. + + Pull all of GitHub's repositories by making calls to its API in a loop, + accessing a subsequent page of results via the "next" URL returned in an + API response header. Uses Severyn Kozak's (sevko) authentication + credentials. + """ + + next_api_url = "https://api.github.com/repositories" + authentication_params = { + "client_id" : "436cb884ae09be7f2a4e", + "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" + } + + while len(next_api_url) > 0: + response = requests.get(next_api_url, params=authentication_params) + + for repo in response.json(): + codelets = git_indexer.index_repository(repo["html_url"]) + + if int(response.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) + + next_api_url = requests.headers["link"].split(">")[0][1:] diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py new file mode 100644 index 0000000..a98c600 --- /dev/null +++ b/bitshift/crawler/git_indexer.py @@ -0,0 +1,134 @@ +""" +:synopsis: Index all the files in a Git repository. + +Clone a Git repository, and retrieve the following information about each file: +filename, contributor names, dates of creation and last modification, and the +file text. 
+""" + +import fileinput, subprocess, os + +def index_repository(repo_url): + """ + Generate metadata for every file in a Git repository. + + `git clone` the Git repository located at **repo_url**, and return metadata + about every one of non-binary (text) files in its if main branch (usually + *master*). + + :return: An array of metadata dictionaries. + .. code-block:: python + sample_returned_array = [ + { + "filename" : (str) "myfile" + "time_created" : (int) 1395939566, + "time_last_modified" : (int) 1396920409, + "source" : (str) "The source code of the file." + } + ] + """ + + repo_name = repo_url.split("/")[-1] + subprocess.call("git clone %s" % repo_url, shell=True) + os.chdir(repo_name) + + files_meta = [] + commits_meta = _get_commits_metadata() + for filename in commits_meta.keys(): + commits_meta[filename]["filename"] = filename + with open(filename, "r") as source_file: + commits_meta[filename]["source"] = source_file.read() + files_meta.append(commits_meta[filename]) + + os.chdir("..") + subprocess.call("rm -rf %s" % repo_name, shell=True) + return files_meta + +def _get_git_commits(): + """ + Return the current working directory's formatted commit data. + + Uses `git log` to generate metadata about every single file in the + repository's commit history. + + :return: The author, timestamp, and names of all modified files of every + commit. + .. code-block:: python + sample_returned_array = [ + { + "author" : (str) "author" + "timestamp" : (int) 1396919293, + "filename" : (str array) ["file1", "file2"] + } + ] + :rtype: dictionary + """ + + git_log = subprocess.check_output("git --no-pager log --name-only \ + --pretty=format:'%n%n%an%n%at' -z", shell=True) + + commits = [] + for commit in git_log.split("\n\n"): + fields = commit.split("\n") + if len(fields) > 2: + commits.append({ + "author" : fields[0], + "timestamp" : int(fields[1]), + "filenames" : fields[2].split("\0")[:-2] + }) + + return commits + +def _get_tracked_files(): + """ + Return a list of the filenames of all files in the Git repository. + + Get a list of the filenames of the non-binary (Perl heuristics used for + filetype identification) files currently inside the current working + directory's Git repository. + + :return: The filenames of all non-binary files. + :rtype: str array + """ + + tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if \ + -f && -T }' $(find . -type d -name .git -prune -o -print)", shell=True) + return [filename[2:] for filename in tracked_files.split("\n")[:-1]] + +def _get_commits_metadata(): + """ + Return a dictionary containing every tracked file's metadata. + + :return: A dictionary with author names, time of creation, and time of last + modification for every filename key. + .. 
code-block:: python + sample_returned_dict = { + "my_file" : { + "authors" : (str array) ["author1", "author2"], + "time_created" : (int) 1395939566, + "time_last_modified" : (int) 1396920409 + } + } + :rtype: dictionary + """ + + commits = _get_git_commits() + tracked_files = _get_tracked_files() + + files_meta = {} + for commit in commits: + for filename in commit["filenames"]: + if filename not in tracked_files: + continue + + if filename not in files_meta.keys(): + files_meta[filename] = { + "authors" : [commit["author"]], + "time_last_modified" : commit["timestamp"] + } + else: + if commit["author"] not in files_meta[filename]["authors"]: + files_meta[filename]["authors"].append(commit["author"]) + files_meta[filename]["time_created"] = commit["timestamp"] + + return files_meta From 77b448c3deaf980f1cddcee8986cf0c417a62a2c Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Mon, 14 Apr 2014 18:41:00 -0400 Subject: [PATCH 23/42] Mod Codelet, mov codelet creation from crawler. Add: bitshift/crawler/(crawler, git_indexer).py -move Codelet creation from the crawler to the git_indexer, in preparation for making crawling/indexing independent, threaded processes. Mod: bitshift/codelet.py -modify documentation for the author instance variable. --- bitshift/codelet.py | 18 ++++++----- bitshift/crawler/crawler.py | 8 +++-- bitshift/crawler/git_indexer.py | 66 +++++++++++++++++++++++++---------------- 3 files changed, 55 insertions(+), 37 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 08b0d36..87025e0 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -7,39 +7,41 @@ class Codelet(object): :ivar code: (str) A containing the raw source code. :ivar filename: (str, or None) The filename of the snippet. :ivar language: (str, or None) The inferred language of `code`. - :ivar author: (str, or None) The name of the code's author. - :ivar url: (str) The url of the (page containing the) source code. + :ivar authors: (array of str tuple) An array of tuples containing an + author's name and profile URL (on the service the code was pulled from). + :ivar code_url: (str) The url of the (page containing the) source code. :ivar date_created: (str, or None) The date the code was published. :ivar date_modified: (str, or None) The date the code was last modified. """ - def __init__(self, code, filename, author, language, code_url, author_url, + def __init__(self, name, code, filename, language, authors, code_url, date_created, date_modified): """ Create a Codelet instance. :param code: The raw source code. :param filename: The filename of the code, if any. - :param author: The author of the code. :param language: The inferred language. + :param authors: An array of tuples containing an author's name and + profile URL (on the service the code was pulled from). :param code_url: The url of the (page containing the) source code. :param date_created: The date the code was published. :param date_modified: The date the code was last modified. 
:type code: str :type filename: str, or None + :type authors: array of str tuples, or None :type language: str, or None - :type author: str, or None - :type url: str + :type code_url: str + :type author_urls: str array, or none :type date_created: str, or None :type date_modified: str, or None """ self.code = code self.filename = filename - self.author = author self.language = language + self.authors = authors self.code_url = code_url - self.author_url = author_url self.date_created = date_created self.date_modified = date_modified diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 46cd54e..1ca65d1 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -1,13 +1,15 @@ """ +:synopsis: Main crawler module, to oversee all site-specific crawlers. +...more info soon... """ import requests, time import git_indexer -# from .codelet import Codelet -# from .database import Database +from .codelet import Codelet +from .database import Database def github(): """ @@ -29,7 +31,7 @@ def github(): response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - codelets = git_indexer.index_repository(repo["html_url"]) + index_repository(repo["html_url"], framework) if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index a98c600..0c7ce75 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -1,48 +1,61 @@ """ :synopsis: Index all the files in a Git repository. -Clone a Git repository, and retrieve the following information about each file: -filename, contributor names, dates of creation and last modification, and the -file text. +...more info soon... """ import fileinput, subprocess, os -def index_repository(repo_url): +from .database import Database + +def index_repository(repo_url, framework_name): """ - Generate metadata for every file in a Git repository. + Insert a Codelet for every file in a Git repository. - `git clone` the Git repository located at **repo_url**, and return metadata - about every one of non-binary (text) files in its if main branch (usually + `git clone` the Git repository located at **repo_url**, and create a Codelet + for every one of non-binary (text) files in its if main branch (usually *master*). - - :return: An array of metadata dictionaries. - .. code-block:: python - sample_returned_array = [ - { - "filename" : (str) "myfile" - "time_created" : (int) 1395939566, - "time_last_modified" : (int) 1396920409, - "source" : (str) "The source code of the file." 
- } - ] """ repo_name = repo_url.split("/")[-1] subprocess.call("git clone %s" % repo_url, shell=True) os.chdir(repo_name) - files_meta = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): - commits_meta[filename]["filename"] = filename with open(filename, "r") as source_file: - commits_meta[filename]["source"] = source_file.read() - files_meta.append(commits_meta[filename]) + source = source_file.read() + + authors = [(author,) for author in commits_meta["authors"]] + codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, + None, authors, _generate_file_url(filename, repo_url), + framework_name, commits_meta["time_created"], + commits_meta["time_last_modified"]) + Database.insert(codelet) os.chdir("..") subprocess.call("rm -rf %s" % repo_name, shell=True) - return files_meta + +def _generate_file_url(filename, repo_url, framework_name): + """ + Return a url for a filename from a Git wrapper framework. + + :param filename: The path of the file. + :param repo_url: The url of the file's parent repository. + :param framework_name: The name of the framework the repository is from. + + :type filename: str + :type repo_url: str + :type framework_name: str + + :return: The file's full url on the given framework. + :rtype: str + """ + + if framework_name == "github": + default branch = subprocess.check_output("git branch --no-color", \ + shell=True)[2:-1] + return "%s/blob/%s/%s" % (repo_url, default_branch, filename) def _get_git_commits(): """ @@ -58,14 +71,15 @@ def _get_git_commits(): { "author" : (str) "author" "timestamp" : (int) 1396919293, - "filename" : (str array) ["file1", "file2"] + "filenames" : (str array) ["file1", "file2"] } ] :rtype: dictionary """ - git_log = subprocess.check_output("git --no-pager log --name-only \ - --pretty=format:'%n%n%an%n%at' -z", shell=True) + git_log_cmd = ("git --no-pager --no-color log --name-only " + "--pretty=format:'%n%n%an%n%at' -z") + git_log = subprocess.check_output(git_log_cmd, shell=True) commits = [] for commit in git_log.split("\n\n"): From 9fc4598001264b58245b5c78ef21b792d7e3385c Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Mon, 14 Apr 2014 21:21:58 -0400 Subject: [PATCH 24/42] Clean up crawler/, fix minor bugs. Add: bitshift/codelet.py -add name field to Codelet. bitshift/crawler/crawler.py -fix previously defunct code (which was committed at a point of incompletion) -- incorrect dictionary keys, etc.. -reformat some function calls' argument alignment to fit PEP standards. bitshift/crawler.py -add sleep() to ensure that an API query is made at regular intervals (determined by the GitHub API limit). --- bitshift/__init__.py | 2 +- bitshift/codelet.py | 9 ++++++--- bitshift/crawler/__init__.py | 6 ++++++ bitshift/crawler/crawler.py | 14 ++++++++++---- bitshift/crawler/git_indexer.py | 36 ++++++++++++++++++++++-------------- 5 files changed, 45 insertions(+), 22 deletions(-) create mode 100644 bitshift/crawler/__init__.py diff --git a/bitshift/__init__.py b/bitshift/__init__.py index 9a18c9b..78ca5e9 100644 --- a/bitshift/__init__.py +++ b/bitshift/__init__.py @@ -1 +1 @@ -from . import assets, codelet, config, database, parser, query +from . import assets, codelet, config, database, parser, query, crawler diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 87025e0..9568a4d 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -4,10 +4,11 @@ class Codelet(object): """ A source-code object with code metadata and composition analysis. 
+ :ivar name: (str) A suitable name for the codelet. :ivar code: (str) A containing the raw source code. :ivar filename: (str, or None) The filename of the snippet. :ivar language: (str, or None) The inferred language of `code`. - :ivar authors: (array of str tuple) An array of tuples containing an + :ivar authors: (array of str tuples) An array of tuples containing an author's name and profile URL (on the service the code was pulled from). :ivar code_url: (str) The url of the (page containing the) source code. :ivar date_created: (str, or None) The date the code was published. @@ -19,6 +20,7 @@ class Codelet(object): """ Create a Codelet instance. + :param name: The name of the codelet. :param code: The raw source code. :param filename: The filename of the code, if any. :param language: The inferred language. @@ -28,16 +30,17 @@ class Codelet(object): :param date_created: The date the code was published. :param date_modified: The date the code was last modified. + :type name: str :type code: str :type filename: str, or None - :type authors: array of str tuples, or None :type language: str, or None + :type authors: array of str tuples, or None :type code_url: str - :type author_urls: str array, or none :type date_created: str, or None :type date_modified: str, or None """ + self.name = name self.code = code self.filename = filename self.language = language diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py new file mode 100644 index 0000000..a518970 --- /dev/null +++ b/bitshift/crawler/__init__.py @@ -0,0 +1,6 @@ +import crawler + +__all__ = ["crawl"] + +def crawl(): + pass diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 1ca65d1..34f2819 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -8,8 +8,8 @@ import requests, time import git_indexer -from .codelet import Codelet -from .database import Database +from ..codelet import Codelet +from ..database import Database def github(): """ @@ -26,14 +26,20 @@ def github(): "client_id" : "436cb884ae09be7f2a4e", "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" } + api_request_interval = 5e3 / 60 ** 2 while len(next_api_url) > 0: + start_time = time.time() response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - index_repository(repo["html_url"], framework) + print repo["id"] if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) - next_api_url = requests.headers["link"].split(">")[0][1:] + next_api_url = response.headers["link"].split(">")[0][1:] + + sleep_time = api_request_interval - (time.time() - start_time) + if sleep_time > 0: + time.sleep(sleep_time) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index 0c7ce75..cc9082c 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -6,7 +6,8 @@ import fileinput, subprocess, os -from .database import Database +from ..database import Database +from ..codelet import Codelet def index_repository(repo_url, framework_name): """ @@ -21,20 +22,25 @@ def index_repository(repo_url, framework_name): subprocess.call("git clone %s" % repo_url, shell=True) os.chdir(repo_name) + codelets = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): with open(filename, "r") as source_file: source = source_file.read() - authors = [(author,) for author in commits_meta["authors"]] - codelet = Codelet("%s:%s" % (repo_name, filename), source, 
filename, - None, authors, _generate_file_url(filename, repo_url), - framework_name, commits_meta["time_created"], - commits_meta["time_last_modified"]) - Database.insert(codelet) + authors = [(author,) for author in commits_meta[filename]["authors"]] + codelets.append( + Codelet("%s:%s" % (repo_name, filename), source, filename, + None, authors, _generate_file_url(filename, repo_url, + framework_name), + commits_meta[filename]["time_created"], + commits_meta[filename]["time_last_modified"])) + + # Database.insert(codelet) os.chdir("..") subprocess.call("rm -rf %s" % repo_name, shell=True) + return codelets def _generate_file_url(filename, repo_url, framework_name): """ @@ -53,7 +59,7 @@ def _generate_file_url(filename, repo_url, framework_name): """ if framework_name == "github": - default branch = subprocess.check_output("git branch --no-color", \ + default_branch = subprocess.check_output("git branch --no-color", shell=True)[2:-1] return "%s/blob/%s/%s" % (repo_url, default_branch, filename) @@ -77,9 +83,9 @@ def _get_git_commits(): :rtype: dictionary """ - git_log_cmd = ("git --no-pager --no-color log --name-only " - "--pretty=format:'%n%n%an%n%at' -z") - git_log = subprocess.check_output(git_log_cmd, shell=True) + git_log = subprocess.check_output( + ("git --no-pager log --name-only" + " --pretty=format:'%n%n%an%n%at' -z"), shell=True) commits = [] for commit in git_log.split("\n\n"): @@ -105,8 +111,9 @@ def _get_tracked_files(): :rtype: str array """ - tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if \ - -f && -T }' $(find . -type d -name .git -prune -o -print)", shell=True) + tracked_files = subprocess.check_output( + ("perl -le 'for (@ARGV){ print if -f && -T }'" + " $(find . -type d -name .git -prune -o -print)"), shell=True) return [filename[2:] for filename in tracked_files.split("\n")[:-1]] def _get_commits_metadata(): @@ -138,7 +145,8 @@ def _get_commits_metadata(): if filename not in files_meta.keys(): files_meta[filename] = { "authors" : [commit["author"]], - "time_last_modified" : commit["timestamp"] + "time_last_modified" : commit["timestamp"], + "time_created" : commit["timestamp"] } else: if commit["author"] not in files_meta[filename]["authors"]: From c655d97f487b19224ca06a384b3f8c2a327cff9f Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Mon, 14 Apr 2014 22:09:05 -0400 Subject: [PATCH 25/42] Add class ChangeDir, amend unsafe subprocess. Add: bitshift/crawler/git_indexer.py -add ChangeDir class, a context-management wrapper for os.chdir(). -replace unsafe "rm -rf" subprocess call with shutil.rmtree() --- bitshift/crawler/git_indexer.py | 55 +++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index cc9082c..2ba1200 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -4,11 +4,47 @@ ...more info soon... """ -import fileinput, subprocess, os +import shutil, subprocess, os from ..database import Database from ..codelet import Codelet +GIT_CLONE_DIR = "/tmp" + +class ChangeDir(object): + """ + A wrapper class for os.chdir(), to map onto `with` and handle exceptions. + + :ivar new_path: (str) The path to change the current directory to. + :ivar old_path: (str) The path of the directory to return to. + """ + + def __init__(self, new_path): + """ + Construct the object. + + :param new_path: The directory to enter. 
+ + :type new_path: str + """ + + self.new_path = new_path + + def __enter__(self): + """ + Change the current working-directory to **new_path**. + """ + + self.old_path = os.getcwd() + os.chdir(self.new_path) + + def __exit__(self, etype, value, traceback): + """ + Change the current working-directory to **old_path**. + """ + + os.chdir(self.old_path) + def index_repository(repo_url, framework_name): """ Insert a Codelet for every file in a Git repository. @@ -19,9 +55,18 @@ def index_repository(repo_url, framework_name): """ repo_name = repo_url.split("/")[-1] - subprocess.call("git clone %s" % repo_url, shell=True) - os.chdir(repo_name) + codelets = [] + with ChangeDir(GIT_CLONE_DIR) as git_clone_dir: + subprocess.call("git clone %s" % repo_url, shell=True) + with ChangeDir(repo_name) as repository_dir: + codelets = _insert_repository_codelets(repo_url, repo_name, + framework_name) + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + + return codelets + +def _insert_repository_codelets(repo_url, repo_name, framework_name): codelets = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): @@ -36,10 +81,6 @@ def index_repository(repo_url, framework_name): commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"])) - # Database.insert(codelet) - - os.chdir("..") - subprocess.call("rm -rf %s" % repo_name, shell=True) return codelets def _generate_file_url(filename, repo_url, framework_name): From 97198ee523df5263f018b8bd581343832583dcc2 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Tue, 15 Apr 2014 09:40:11 -0400 Subject: [PATCH 26/42] Update Crawler documentation. Add: bitshift/crawler/git_indexer.py -add some missing docstrings, complete others. --- bitshift/crawler/git_indexer.py | 43 ++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index 2ba1200..8cd3ae3 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -1,7 +1,8 @@ """ :synopsis: Index all the files in a Git repository. -...more info soon... +.. todo:: + Add documentation, threaded Indexer class. """ import shutil, subprocess, os @@ -21,7 +22,7 @@ class ChangeDir(object): def __init__(self, new_path): """ - Construct the object. + Create a ChangeDir instance. :param new_path: The directory to enter. @@ -38,20 +39,32 @@ class ChangeDir(object): self.old_path = os.getcwd() os.chdir(self.new_path) - def __exit__(self, etype, value, traceback): + def __exit__(self, *exception): """ Change the current working-directory to **old_path**. + + :param exception: Various exception arguments passed by `with`. + + :type exception: varargs """ os.chdir(self.old_path) def index_repository(repo_url, framework_name): """ - Insert a Codelet for every file in a Git repository. + Clone and index (create and insert Codeletes for) a Git repository. - `git clone` the Git repository located at **repo_url**, and create a Codelet - for every one of non-binary (text) files in its if main branch (usually - *master*). + `git clone` the Git repository located at **repo_url**, call + _insert_repository_codelets, then remove said repository. + + :param repo_url: The url the Git repository was cloned from. + :param framework_name: The name of the framework the repository is from. + + :type repo_url: str + :type framework_name: str + + :return: Temporary: the new codelets, for testing purposes. 
+ :rtype: Codelet array """ repo_name = repo_url.split("/")[-1] @@ -67,6 +80,22 @@ def index_repository(repo_url, framework_name): return codelets def _insert_repository_codelets(repo_url, repo_name, framework_name): + """ + Create a Codelet for the files inside a Git repository. + + Create a new Codelet, and insert it into the Database singlet, for every + file inside the current working directory's default branch (usually + *master*). + + :param repo_url: The url the Git repository was cloned from. + :param repo_name: The name of the repository. + :param framework_name: The name of the framework the repository is from. + + :type repo_url: str + :type repo_name: str + :type framework_name: str + """ + codelets = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): From b7ccec05015cbd011a7ddaa7e2a69462d518af9e Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Tue, 15 Apr 2014 11:08:53 -0400 Subject: [PATCH 27/42] Add untested threaded indexer/crawler prototype. Additions are not tested and not yet documented. Add: crawler.py -add threaded GitHubCrawler class, which interacts with a GitIndexer via a Queue. git_indexer.py -add threaded GitIndexer class, which interacts with GitHubCrawler via a Queue. -rename context-manager ChangeDir class to _ChangeDir, because it's essentially "private". __init__.py -add body to crawl(), which creates instances of GitHubCrawler and GitIndexer and starts them. --- bitshift/crawler/__init__.py | 12 ++++++++++-- bitshift/crawler/crawler.py | 20 ++++++++++++++++---- bitshift/crawler/git_indexer.py | 28 ++++++++++++++++++++-------- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index a518970..f38a187 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -1,6 +1,14 @@ -import crawler +import Queue + +from bitshift.crawler import crawler +from bitshift.crawler import git_indexer __all__ = ["crawl"] def crawl(): - pass + repository_queue = Queue.Queue() + github_crawler = crawler.GitHubCrawler(repository_queue) + indexer = git_indexer.GitIndexer(repository_queue) + + for thread in [github_crawler, indexer]: + thread.start() diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 34f2819..fc1aadb 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -4,14 +4,22 @@ ...more info soon... """ -import requests, time +import requests, time, threading -import git_indexer +import bitshift.crawler.git_indexer from ..codelet import Codelet from ..database import Database -def github(): +class GitHubCrawler(threading.Thread): + def __init__(self, repository_queue): + self.repository_queue = repository_queue + super(GitHubCrawler, self).__init__() + + def run(): + _github() + +def _github(): """ Query the GitHub API for data about every public repository. @@ -33,7 +41,11 @@ def github(): response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - print repo["id"] + self.repository_queue.put({ + "url" : repo["html_url"], + "framework_name" : "GitHub" + }) + self.repository_queue.task_done() if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index 8cd3ae3..2268895 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -5,14 +5,26 @@ Add documentation, threaded Indexer class. 
""" -import shutil, subprocess, os +import os, shutil, subprocess, threading from ..database import Database from ..codelet import Codelet GIT_CLONE_DIR = "/tmp" -class ChangeDir(object): +class GitIndexer(threading.Thread): + def __init__(self, repository_queue): + self.repository_queue = repository_queue + super(GitIndexer, self).__init__() + + def run(self): + while True: + while self.repository_queue.empty(): + pass + new_repo = self.repository_queue.get() + _index_repository(new_repo["url"], new_repo["framework_name"]) + +class _ChangeDir(object): """ A wrapper class for os.chdir(), to map onto `with` and handle exceptions. @@ -22,7 +34,7 @@ class ChangeDir(object): def __init__(self, new_path): """ - Create a ChangeDir instance. + Create a _ChangeDir instance. :param new_path: The directory to enter. @@ -50,7 +62,7 @@ class ChangeDir(object): os.chdir(self.old_path) -def index_repository(repo_url, framework_name): +def _index_repository(repo_url, framework_name): """ Clone and index (create and insert Codeletes for) a Git repository. @@ -70,9 +82,9 @@ def index_repository(repo_url, framework_name): repo_name = repo_url.split("/")[-1] codelets = [] - with ChangeDir(GIT_CLONE_DIR) as git_clone_dir: + with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: subprocess.call("git clone %s" % repo_url, shell=True) - with ChangeDir(repo_name) as repository_dir: + with _ChangeDir(repo_name) as repository_dir: codelets = _insert_repository_codelets(repo_url, repo_name, framework_name) shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) @@ -128,7 +140,7 @@ def _generate_file_url(filename, repo_url, framework_name): :rtype: str """ - if framework_name == "github": + if framework_name == "GitHub": default_branch = subprocess.check_output("git branch --no-color", shell=True)[2:-1] return "%s/blob/%s/%s" % (repo_url, default_branch, filename) @@ -164,7 +176,7 @@ def _get_git_commits(): commits.append({ "author" : fields[0], "timestamp" : int(fields[1]), - "filenames" : fields[2].split("\0")[:-2] + "filenames" : fields[2].split("\x00")[:-2] }) return commits From b680756f8dba4f5ab3690f069f5520978846fc06 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Wed, 16 Apr 2014 13:32:04 -0400 Subject: [PATCH 28/42] Test crawler, complete documentation. Add, Fix: bitshift/crawler/ __init__.py -add module and crawl() docstrings. -add repository_queue size limit. crawler.py -account for time spent executing an API query in the run() loop sleep() interval. --- bitshift/crawler/__init__.py | 18 +++- bitshift/crawler/crawler.py | 106 +++++++++++++++++------- bitshift/crawler/{git_indexer.py => indexer.py} | 0 3 files changed, 91 insertions(+), 33 deletions(-) rename bitshift/crawler/{git_indexer.py => indexer.py} (100%) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index f38a187..6c13be9 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -1,3 +1,9 @@ +""" +:synopsis: Parent crawler module, which supervises all crawlers. + +Contains functions for initializing all subsidiary, threaded crawlers. +""" + import Queue from bitshift.crawler import crawler @@ -5,8 +11,18 @@ from bitshift.crawler import git_indexer __all__ = ["crawl"] +MAX_URL_QUEUE_SIZE = 5e3 + def crawl(): - repository_queue = Queue.Queue() + """ + Initialize all crawlers (and indexers). + + Start the: + 1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler` + 2. 
Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer` + """ + + repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) github_crawler = crawler.GitHubCrawler(repository_queue) indexer = git_indexer.GitIndexer(repository_queue) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index fc1aadb..5b0f600 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -12,46 +12,88 @@ from ..codelet import Codelet from ..database import Database class GitHubCrawler(threading.Thread): + """ + Crawler that retrieves links to all of GitHub's public repositories. + + GitHubCrawler is a threaded singleton that queries GitHub's API for URLs + to its public repositories, which it inserts into a :class:`Queue.Queue` + shared with :class:`bitshift.crawler.git_indexer.GitIndexer`. + + :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with + repository information retrieved by `GitHubCrawler`, and other Git + crawlers, to be processed by + :class:`bitshift.crawler.git_indexer.GitIndexer`. + """ + def __init__(self, repository_queue): + """ + Create an instance of the singleton `GitHubCrawler`. + + :param repository_queue: A queue containing dictionaries of repository + metadata retrieved by `GitHubCrawler`, meant to be processed by an + instance of :class:`bitshift.crawler.git_indexer.GitIndexer`. + + .. code-block:: python + sample_dict = { + "url" : "https://github.com/user/repo", + "name" : "repo", + "framework_name" : "GitHub" + } + + :type repository_queue: :class:`Queue.Queue` + """ + + self.repository_queue = repository_queue super(GitHubCrawler, self).__init__() - def run(): - _github() + def run(self): + """ + Query the GitHub API for data about every public repository. -def _github(): - """ - Query the GitHub API for data about every public repository. + Pull all of GitHub's repositories by making calls to its API in a loop, + accessing a subsequent page of results via the "next" URL returned in an + API response header. Uses Severyn Kozak's (sevko) authentication + credentials. + """ - Pull all of GitHub's repositories by making calls to its API in a loop, - accessing a subsequent page of results via the "next" URL returned in an - API response header. Uses Severyn Kozak's (sevko) authentication - credentials. 
- """ + next_api_url = "https://api.github.com/repositories" + authentication_params = { + "client_id" : "436cb884ae09be7f2a4e", + "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" + } + api_request_interval = 5e3 / 60 ** 2 - next_api_url = "https://api.github.com/repositories" - authentication_params = { - "client_id" : "436cb884ae09be7f2a4e", - "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" - } - api_request_interval = 5e3 / 60 ** 2 - - while len(next_api_url) > 0: - start_time = time.time() - response = requests.get(next_api_url, params=authentication_params) - - for repo in response.json(): - self.repository_queue.put({ - "url" : repo["html_url"], - "framework_name" : "GitHub" + while len(next_api_url) > 0: + # DEBUG + db.log.insert({ + "time" : str(time.time()).split(".")[0][-4:], + "qsize" : self.repository_queue.qsize() }) - self.repository_queue.task_done() - if int(response.headers["x-ratelimit-remaining"]) == 0: - time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) + start_time = time.time() + response = requests.get(next_api_url, params=authentication_params) + + for repo in response.json(): + logging.basicConfig(filename="crawler.log", level=logging.DEBUG) + logging.debug("crawler: %-20s: %-5s: %-5s: %s", + str(time.time()).split(".")[0], + self.repository_queue.qsize(), repo["id"], + repo["name"]) + while self.repository_queue.full(): + pass + self.repository_queue.put({ + "url" : repo["html_url"], + "name" : repo["html_url"].split("/")[-1], + "framework_name" : "GitHub" + }) + + if int(response.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(response.headers["x-ratelimit-reset"]) - + time.time()) - next_api_url = response.headers["link"].split(">")[0][1:] + next_api_url = response.headers["link"].split(">")[0][1:] - sleep_time = api_request_interval - (time.time() - start_time) - if sleep_time > 0: - time.sleep(sleep_time) + sleep_time = api_request_interval - (time.time() - start_time) + if sleep_time > 0: + time.sleep(sleep_time) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/indexer.py similarity index 100% rename from bitshift/crawler/git_indexer.py rename to bitshift/crawler/indexer.py From 627c848f208d65d62389482b3467e47279200ce0 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Wed, 16 Apr 2014 16:41:14 -0400 Subject: [PATCH 29/42] Add tested indexer. Add: bitshift/crawler/indexer.py -add _debug(). -add content to the module docstring; add documentation to GitIndexer, and the functions that were lacking it. -add another perl one-liner to supplement the `git clone` subprocess call, which terminates it after a set amount of time (should it have frozen) -- fixes a major bug that caused the entire indexer to hang. --- bitshift/crawler/__init__.py | 9 ++- bitshift/crawler/crawler.py | 25 ++------ bitshift/crawler/indexer.py | 149 +++++++++++++++++++++++++++++++------------ 3 files changed, 120 insertions(+), 63 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index 6c13be9..4875712 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -6,8 +6,7 @@ Contains functions for initializing all subsidiary, threaded crawlers. import Queue -from bitshift.crawler import crawler -from bitshift.crawler import git_indexer +from bitshift.crawler import crawler, indexer __all__ = ["crawl"] @@ -19,12 +18,12 @@ def crawl(): Start the: 1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler` - 2. 
Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer` + 2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer` """ repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) github_crawler = crawler.GitHubCrawler(repository_queue) - indexer = git_indexer.GitIndexer(repository_queue) + git_indexer = indexer.GitIndexer(repository_queue) - for thread in [github_crawler, indexer]: + for thread in [github_crawler, git_indexer]: thread.start() diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 5b0f600..8b9576d 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -1,12 +1,12 @@ """ :synopsis: Main crawler module, to oversee all site-specific crawlers. -...more info soon... +Contains all website/framework-specific Class crawlers. """ import requests, time, threading -import bitshift.crawler.git_indexer +import bitshift.crawler.indexer from ..codelet import Codelet from ..database import Database @@ -17,12 +17,12 @@ class GitHubCrawler(threading.Thread): GitHubCrawler is a threaded singleton that queries GitHub's API for URLs to its public repositories, which it inserts into a :class:`Queue.Queue` - shared with :class:`bitshift.crawler.git_indexer.GitIndexer`. + shared with :class:`bitshift.crawler.indexer.GitIndexer`. :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with repository information retrieved by `GitHubCrawler`, and other Git crawlers, to be processed by - :class:`bitshift.crawler.git_indexer.GitIndexer`. + :class:`bitshift.crawler.indexer.GitIndexer`. """ def __init__(self, repository_queue): @@ -31,7 +31,7 @@ class GitHubCrawler(threading.Thread): :param repository_queue: A queue containing dictionaries of repository metadata retrieved by `GitHubCrawler`, meant to be processed by an - instance of :class:`bitshift.crawler.git_indexer.GitIndexer`. + instance of :class:`bitshift.crawler.indexer.GitIndexer`. .. code-block:: python sample_dict = { @@ -43,7 +43,6 @@ class GitHubCrawler(threading.Thread): :type repository_queue: :class:`Queue.Queue` """ - self.repository_queue = repository_queue super(GitHubCrawler, self).__init__() @@ -65,26 +64,16 @@ class GitHubCrawler(threading.Thread): api_request_interval = 5e3 / 60 ** 2 while len(next_api_url) > 0: - # DEBUG - db.log.insert({ - "time" : str(time.time()).split(".")[0][-4:], - "qsize" : self.repository_queue.qsize() - }) - start_time = time.time() response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - logging.basicConfig(filename="crawler.log", level=logging.DEBUG) - logging.debug("crawler: %-20s: %-5s: %-5s: %s", - str(time.time()).split(".")[0], - self.repository_queue.qsize(), repo["id"], - repo["name"]) while self.repository_queue.full(): pass + self.repository_queue.put({ "url" : repo["html_url"], - "name" : repo["html_url"].split("/")[-1], + "name" : repo["name"], "framework_name" : "GitHub" }) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 2268895..f2a8bbf 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -1,28 +1,60 @@ """ -:synopsis: Index all the files in a Git repository. - -.. todo:: - Add documentation, threaded Indexer class. +:synopsis: Contains a singleton GitIndexer class, which clones and indexes git + repositories. 
""" -import os, shutil, subprocess, threading +import bs4, os, re, shutil, subprocess, threading from ..database import Database from ..codelet import Codelet -GIT_CLONE_DIR = "/tmp" +GIT_CLONE_DIR = "/tmp/bitshift" class GitIndexer(threading.Thread): + """ + A singleton Git repository indexer. + + `GitIndexer` clones and indexes the repositories at urls found by the + :mod:`bitshift.crawler.crawler` Git crawlers. + + :ivar repository_queue: (:class:`Queue.Queue`) A queue containing urls found + by the :mod:`bitshift.crawler.crawler` Git crawlers. + """ + def __init__(self, repository_queue): + """ + Create an instance of the singleton `GitIndexer`. + + :param repository_queue: see :attr:`GitIndexer.repository_queue` + + :type repository_queue: see :attr:`GitIndexer.repository_queue` + """ + self.repository_queue = repository_queue super(GitIndexer, self).__init__() def run(self): + """ + Retrieve new repository urls, clone, and index them. + + Blocks until new urls appear in :attr:`GitIndexer.repository_queue`, + then retrieves one, and attempts cloning/indexing it. Should any errors + occur, the new repository will be discarded and the crawler will + index the next in the queue. + """ + while True: while self.repository_queue.empty(): pass - new_repo = self.repository_queue.get() - _index_repository(new_repo["url"], new_repo["framework_name"]) + + repo = self.repository_queue.get() + self.repository_queue.task_done() + + try: + _index_repository(repo["url"], repo["name"], + repo["framework_name"]) + except: # desperate times -- will be modified later + pass class _ChangeDir(object): """ @@ -62,7 +94,7 @@ class _ChangeDir(object): os.chdir(self.old_path) -def _index_repository(repo_url, framework_name): +def _index_repository(repo_url, repo_name, framework_name): """ Clone and index (create and insert Codeletes for) a Git repository. @@ -70,32 +102,30 @@ def _index_repository(repo_url, framework_name): _insert_repository_codelets, then remove said repository. :param repo_url: The url the Git repository was cloned from. + :param repo_name: The name of the repository. :param framework_name: The name of the framework the repository is from. :type repo_url: str + :type repo_name: str :type framework_name: str - - :return: Temporary: the new codelets, for testing purposes. - :rtype: Codelet array """ - repo_name = repo_url.split("/")[-1] - codelets = [] + GIT_CLONE_TIMEOUT = 60 with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: - subprocess.call("git clone %s" % repo_url, shell=True) + if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \ + clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0: + return + with _ChangeDir(repo_name) as repository_dir: - codelets = _insert_repository_codelets(repo_url, repo_name, - framework_name) + _insert_repository_codelets(repo_url, repo_name, framework_name) shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) - return codelets - def _insert_repository_codelets(repo_url, repo_name, framework_name): """ - Create a Codelet for the files inside a Git repository. + Create and insert a Codelet for the files inside a Git repository. - Create a new Codelet, and insert it into the Database singlet, for every + Create a new Codelet, and insert it into the Database singleton, for every file inside the current working directory's default branch (usually *master*). 
@@ -108,21 +138,27 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name): :type framework_name: str """ - codelets = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): with open(filename, "r") as source_file: - source = source_file.read() + source = _decode(source_file.read()) + if source is None: + return - authors = [(author,) for author in commits_meta[filename]["authors"]] - codelets.append( - Codelet("%s:%s" % (repo_name, filename), source, filename, + authors = [(_decode(author),) for author in \ + commits_meta[filename]["authors"]] + codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, None, authors, _generate_file_url(filename, repo_url, - framework_name), + framework_name), commits_meta[filename]["time_created"], - commits_meta[filename]["time_last_modified"])) + commits_meta[filename]["time_last_modified"]) - return codelets + db.codelets.insert({ + "name" : codelet.name, + "authors" : codelet.authors + }) + + # Database.insert(codelet) def _generate_file_url(filename, repo_url, framework_name): """ @@ -142,7 +178,7 @@ def _generate_file_url(filename, repo_url, framework_name): if framework_name == "GitHub": default_branch = subprocess.check_output("git branch --no-color", - shell=True)[2:-1] + shell=True)[2:-1] return "%s/blob/%s/%s" % (repo_url, default_branch, filename) def _get_git_commits(): @@ -165,8 +201,7 @@ def _get_git_commits(): :rtype: dictionary """ - git_log = subprocess.check_output( - ("git --no-pager log --name-only" + git_log = subprocess.check_output(("git --no-pager log --name-only" " --pretty=format:'%n%n%an%n%at' -z"), shell=True) commits = [] @@ -183,24 +218,34 @@ def _get_git_commits(): def _get_tracked_files(): """ - Return a list of the filenames of all files in the Git repository. + Return a list of the filenames of all valuable files in the Git repository. Get a list of the filenames of the non-binary (Perl heuristics used for filetype identification) files currently inside the current working - directory's Git repository. + directory's Git repository. Then, weed out any boilerplate/non-code files + that match the regex rules in GIT_IGNORE_FILES. - :return: The filenames of all non-binary files. + :return: The filenames of all index-worthy non-binary files. :rtype: str array """ - tracked_files = subprocess.check_output( - ("perl -le 'for (@ARGV){ print if -f && -T }'" - " $(find . -type d -name .git -prune -o -print)"), shell=True) - return [filename[2:] for filename in tracked_files.split("\n")[:-1]] + GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"] + + tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \ + -f && -T }' $(find . -type d -name .git -prune -o -print)"), + shell=True).split("\n")[:-1] + + valuable_files = [] + for filename in tracked_files: + filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE) + for pattern in GIT_IGNORE_FILES]) + if not filename_match: + valuable_files.append(filename[2:]) + return valuable_files def _get_commits_metadata(): """ - Return a dictionary containing every tracked file's metadata. + Return a dictionary containing every valuable tracked file's metadata. :return: A dictionary with author names, time of creation, and time of last modification for every filename key. @@ -236,3 +281,27 @@ def _get_commits_metadata(): files_meta[filename]["time_created"] = commit["timestamp"] return files_meta + +def _decode(raw): + """ + Return a decoded a raw string. + + :param raw: The string to string. 
+ + :type raw: (str) + + :return: If the original encoding is successfully inferenced, return the + decoded string. + :rtype: str, or None + + .. warning:: + The raw string's original encoding is identified by heuristics which + can, and occasionally will, fail. Decoding will then fail, and None + will be returned. + """ + + try: + return raw.decode(bs4.BeautifulSoup(raw).original_encoding) + + except (UnicodeDecodeError, UserWarning): + return None From f4b28e617856e02d5f77570fa3dc66c1828063c6 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Thu, 17 Apr 2014 09:05:28 -0400 Subject: [PATCH 30/42] Add file-ext regex rules, exception handlers. Add: bitshift/crawler/indexer.py -add two `try: except: pass` blocks, one to _decode() and another to GitIndexer.run(); bad practice, but GitIndexer has numerous unreliable moving parts that can throw too many unforseeable exceptions. Only current viable option. -add file-extension regex ignore rules (for text, markdown, etc. files) to _get_tracked_files(). --- bitshift/crawler/indexer.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index f2a8bbf..50dbe8c 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -31,6 +31,10 @@ class GitIndexer(threading.Thread): """ self.repository_queue = repository_queue + + if not os.path.exists(GIT_CLONE_DIR): + os.makedirs(GIT_CLONE_DIR) + super(GitIndexer, self).__init__() def run(self): @@ -53,7 +57,7 @@ class GitIndexer(threading.Thread): try: _index_repository(repo["url"], repo["name"], repo["framework_name"]) - except: # desperate times -- will be modified later + except: pass class _ChangeDir(object): @@ -110,16 +114,19 @@ def _index_repository(repo_url, repo_name, framework_name): :type framework_name: str """ - GIT_CLONE_TIMEOUT = 60 + GIT_CLONE_TIMEOUT = 600 with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \ clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0: + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) return with _ChangeDir(repo_name) as repository_dir: _insert_repository_codelets(repo_url, repo_name, framework_name) - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) def _insert_repository_codelets(repo_url, repo_name, framework_name): """ @@ -153,11 +160,6 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name): commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"]) - db.codelets.insert({ - "name" : codelet.name, - "authors" : codelet.authors - }) - # Database.insert(codelet) def _generate_file_url(filename, repo_url, framework_name): @@ -230,6 +232,8 @@ def _get_tracked_files(): """ GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"] + GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?", + "md(wn|t[e]?xt)?", "rst"] tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \ -f && -T }' $(find . 
-type d -name .git -prune -o -print)"), @@ -239,7 +243,11 @@ def _get_tracked_files(): for filename in tracked_files: filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE) for pattern in GIT_IGNORE_FILES]) - if not filename_match: + extension = filename.split(".")[-1] + extension_match = any([re.match(pattern, filename, flags=re.IGNORECASE) + for pattern in GIT_IGNORE_EXTENSIONS]) + + if not (filename_match or extension_match): valuable_files.append(filename[2:]) return valuable_files @@ -301,7 +309,8 @@ def _decode(raw): """ try: - return raw.decode(bs4.BeautifulSoup(raw).original_encoding) + encoding = bs4.BeautifulSoup(raw).original_encoding + return raw.decode(encoding) if encoding is not None else None - except (UnicodeDecodeError, UserWarning): + except: return None From 755dce6ae3ca2be4f72e16b09eb9fa6ef9614420 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Thu, 17 Apr 2014 09:53:27 -0400 Subject: [PATCH 31/42] Add logging to crawler/indexer. Add: bitshift/crawler/(__init__, crawler, indexer).py -add `logging` module to all `bitshift.crawler` modules, for some basic diagnostic output. --- bitshift/crawler/__init__.py | 11 ++++++++--- bitshift/crawler/crawler.py | 7 +++++-- bitshift/crawler/indexer.py | 26 ++++++++++++++++---------- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index 4875712..39a1a28 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -4,14 +4,12 @@ Contains functions for initializing all subsidiary, threaded crawlers. """ -import Queue +import logging, Queue from bitshift.crawler import crawler, indexer __all__ = ["crawl"] -MAX_URL_QUEUE_SIZE = 5e3 - def crawl(): """ Initialize all crawlers (and indexers). @@ -21,6 +19,13 @@ def crawl(): 2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer` """ + MAX_URL_QUEUE_SIZE = 5e3 + DEBUG_FILE = "crawler.log" + + logging.basicConfig(filename=DEBUG_FILE, + format="%(asctime)s:\t%(threadName)s:\t%(message)s", + level=logging.DEBUG) + repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) github_crawler = crawler.GitHubCrawler(repository_queue) git_indexer = indexer.GitIndexer(repository_queue) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 8b9576d..edd8eaf 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -4,7 +4,7 @@ Contains all website/framework-specific Class crawlers. """ -import requests, time, threading +import logging, requests, time, threading import bitshift.crawler.indexer @@ -44,7 +44,8 @@ class GitHubCrawler(threading.Thread): """ self.repository_queue = repository_queue - super(GitHubCrawler, self).__init__() + logging.info("Starting.") + super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): """ @@ -66,6 +67,8 @@ class GitHubCrawler(threading.Thread): while len(next_api_url) > 0: start_time = time.time() response = requests.get(next_api_url, params=authentication_params) + logging.info("API call made. Limit remaining: %s." % + response.headers["x-ratelimit-remaining"]) for repo in response.json(): while self.repository_queue.full(): diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 50dbe8c..b1e8e34 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,7 @@ repositories. 
""" -import bs4, os, re, shutil, subprocess, threading +import bs4, logging, os, re, shutil, subprocess, threading from ..database import Database from ..codelet import Codelet @@ -35,7 +35,8 @@ class GitIndexer(threading.Thread): if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) - super(GitIndexer, self).__init__() + logging.info("Starting.") + super(GitIndexer, self).__init__(name=self.__class__.__name__) def run(self): """ @@ -53,12 +54,8 @@ class GitIndexer(threading.Thread): repo = self.repository_queue.get() self.repository_queue.task_done() - - try: - _index_repository(repo["url"], repo["name"], - repo["framework_name"]) - except: - pass + _index_repository(repo["url"], repo["name"], + repo["framework_name"]) class _ChangeDir(object): """ @@ -116,15 +113,23 @@ def _index_repository(repo_url, repo_name, framework_name): GIT_CLONE_TIMEOUT = 600 + logging.info("Indexing repository %s." % repo_url) with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \ clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0: + logging.debug("_index_repository(): Cloning %s failed." % repo_url) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) return with _ChangeDir(repo_name) as repository_dir: - _insert_repository_codelets(repo_url, repo_name, framework_name) + try: + _insert_repository_codelets(repo_url, repo_name, + framework_name) + except Exception as exception: + logging.warning("%s: _insert_repository_codelets" + " failed %s." % (exception, repo_url)) + pass shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) @@ -312,5 +317,6 @@ def _decode(raw): encoding = bs4.BeautifulSoup(raw).original_encoding return raw.decode(encoding) if encoding is not None else None - except: + except Exception as exception: + logging.warning("_debug(): %s", exception) return None From 3ce399adbf5ebae2fcff017c8c680e21be31d4a7 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Thu, 17 Apr 2014 14:05:12 -0400 Subject: [PATCH 32/42] Add threaded cloner, GitRepository class (#7). Add: bitshift/crawler/ (crawler, indexer).py -add a 'time.sleep()' call whenever a thread is blocking on items in a Queue, to prevent excessive polling (which hogs system resources). indexer.py -move 'git clone' functionality from the 'GitIndexer' singleton to a separate, threaded '_GitCloner'. -'crawler.GitHubCrawler' now shares a "clone" queue with '_GitCloner', which shares an "index" queue with 'GitIndexer'. -both indexing and cloning are time-intensive processes, so this improvement should (hypothetically) boost performance. -add `GitRepository` class, instances of which are passed around in the queues. --- bitshift/crawler/crawler.py | 51 ++++++------ bitshift/crawler/indexer.py | 187 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 169 insertions(+), 69 deletions(-) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index edd8eaf..8509c6d 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -6,7 +6,7 @@ Contains all website/framework-specific Class crawlers. import logging, requests, time, threading -import bitshift.crawler.indexer +from bitshift.crawler import indexer from ..codelet import Codelet from ..database import Database @@ -19,31 +19,22 @@ class GitHubCrawler(threading.Thread): to its public repositories, which it inserts into a :class:`Queue.Queue` shared with :class:`bitshift.crawler.indexer.GitIndexer`. 
- :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with - repository information retrieved by `GitHubCrawler`, and other Git - crawlers, to be processed by + :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository` + with repository metadata retrieved by :class:`GitHubCrawler`, and other + Git crawlers, to be processed by :class:`bitshift.crawler.indexer.GitIndexer`. """ - def __init__(self, repository_queue): + def __init__(self, clone_queue): """ Create an instance of the singleton `GitHubCrawler`. - :param repository_queue: A queue containing dictionaries of repository - metadata retrieved by `GitHubCrawler`, meant to be processed by an - instance of :class:`bitshift.crawler.indexer.GitIndexer`. + :param clone_queue: see :attr:`self.clone_queue` - .. code-block:: python - sample_dict = { - "url" : "https://github.com/user/repo", - "name" : "repo", - "framework_name" : "GitHub" - } - - :type repository_queue: :class:`Queue.Queue` + :type clone_queue: see :attr:`self.clone_queue` """ - self.repository_queue = repository_queue + self.clone_queue = clone_queue logging.info("Starting.") super(GitHubCrawler, self).__init__(name=self.__class__.__name__) @@ -54,7 +45,8 @@ class GitHubCrawler(threading.Thread): Pull all of GitHub's repositories by making calls to its API in a loop, accessing a subsequent page of results via the "next" URL returned in an API response header. Uses Severyn Kozak's (sevko) authentication - credentials. + credentials. For every new repository, a :class:`GitRepository` is + inserted into :attr:`self.clone_queue`. """ next_api_url = "https://api.github.com/repositories" @@ -67,18 +59,21 @@ class GitHubCrawler(threading.Thread): while len(next_api_url) > 0: start_time = time.time() response = requests.get(next_api_url, params=authentication_params) - logging.info("API call made. Limit remaining: %s." % - response.headers["x-ratelimit-remaining"]) + + queue_percent_full = (float(self.clone_queue.qsize()) / + self.clone_queue.maxsize) * 100 + logging.info("API call made. Limit remaining: %s. Queue-size: (%d" + "%%) %d/%d" % (response.headers["x-ratelimit-remaining"], + queue_percent_full, self.clone_queue.qsize(), + self.clone_queue.maxsize)) for repo in response.json(): - while self.repository_queue.full(): - pass - - self.repository_queue.put({ - "url" : repo["html_url"], - "name" : repo["name"], - "framework_name" : "GitHub" - }) + while self.clone_queue.full(): + time.sleep(1) + + self.clone_queue.put(indexer.GitRepository( + repo["html_url"], repo["full_name"].replace("/", ""), + "GitHub")) if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index b1e8e34..7e82bb5 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,59 +3,171 @@ repositories. """ -import bs4, logging, os, re, shutil, subprocess, threading +import bs4, logging, os, Queue, re, shutil, subprocess, time, threading from ..database import Database from ..codelet import Codelet +import pymongo #debug +db = pymongo.MongoClient().bitshift #debug + GIT_CLONE_DIR = "/tmp/bitshift" +THREAD_QUEUE_SLEEP = 0.5 + +class GitRepository(object): + """ + A representation of a Git repository's metadata. + + :ivar url: (str) The repository's url. + :ivar name: (str) The name of the repository. + :ivar framework_name: (str) The name of the online Git framework that the + repository belongs to (eg, GitHub, BitBucket). 
+ """ + + def __init__(self, url, name, framework_name): + """ + Create a GitRepository instance. + + :param url: see :attr:`GitRepository.url` + :param name: see :attr:`GitRepository.name` + :param framework_name: see :attr:`GitRepository.framework_name` + + :type url: str + :type name: str + :type framework_name: str + """ + + self.url = url + self.name = name + self.framework_name = framework_name class GitIndexer(threading.Thread): """ A singleton Git repository indexer. - `GitIndexer` clones and indexes the repositories at urls found by the - :mod:`bitshift.crawler.crawler` Git crawlers. + :class:`GitIndexer` indexes the repositories cloned by the + :class:`_GitCloner` singleton. - :ivar repository_queue: (:class:`Queue.Queue`) A queue containing urls found - by the :mod:`bitshift.crawler.crawler` Git crawlers. + :ivar index_queue: (:class:`Queue.Queue`) A queue containing + :class:`GitRepository` objects for every new repository succesfully + cloned by :class:`_GitCloner`, which are to be indexed. + :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner, + which feeds :class:`GitIndexer`. """ - def __init__(self, repository_queue): + def __init__(self, clone_queue): """ Create an instance of the singleton `GitIndexer`. - :param repository_queue: see :attr:`GitIndexer.repository_queue` + :param clone_queue: see :attr:`self.index_queue` - :type repository_queue: see :attr:`GitIndexer.repository_queue` + :type index_queue: see :attr:`self.index_queue` """ - self.repository_queue = repository_queue + MAX_INDEX_QUEUE_SIZE = 10 + + logging.info("Starting.") + self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) + self.git_cloner = _GitCloner(clone_queue, self.index_queue) + self.git_cloner.start() if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) - logging.info("Starting.") super(GitIndexer, self).__init__(name=self.__class__.__name__) def run(self): """ - Retrieve new repository urls, clone, and index them. + Retrieve metadata about newly cloned repositories and index them. + + Blocks until new repositories appear in :attr:`self.index_queue`, then + retrieves one, and attempts indexing it. Should any errors occur, the + new repository will be discarded and the indexer will index the next in + the queue. + """ + + while True: + while self.index_queue.empty(): + logging.warning("Empty.") + time.sleep(THREAD_QUEUE_SLEEP) + + repo = self.index_queue.get() + self.index_queue.task_done() + _index_repository(repo.url, repo.name, repo.framework_name) + +class _GitCloner(threading.Thread): + """ + A singleton Git repository cloner. + + :ivar clone_queue: (:class:`Queue.Queue`) see + :attr:`bitshift.crawler.crawler.GitHubCrawler.clone_queue`. + :ivar index_queue: (:class:`Queue.Queue`) see + :attr:`GitIndexer.index_queue`. + """ + + def __init__(self, clone_queue, index_queue): + """ + Create an instance of the singleton :class:`_GitCloner`. + + :param clone_queue: see :attr:`self.clone_queue` + :param index_queue: see :attr:`self.index_queue` + + :type clone_queue: see :attr:`self.clone_queue` + :type index_queue: see :attr:`self.index_queue` + """ + + self.clone_queue = clone_queue + self.index_queue = index_queue + super(_GitCloner, self).__init__(name=self.__class__.__name__) + + def run(self): + """ + Retrieve metadata about newly crawled repositories and clone them. - Blocks until new urls appear in :attr:`GitIndexer.repository_queue`, - then retrieves one, and attempts cloning/indexing it. 
Should any errors - occur, the new repository will be discarded and the crawler will - index the next in the queue. + Blocks until new :class:`GitRepository` appear in + :attr:`self.clone_queue`, then attempts cloning them. If + succcessful, the cloned repository is added to :attr:`self.index_queue` + for the `GitIndexer` to clone; otherwise, it is discarded. """ while True: - while self.repository_queue.empty(): - pass + while self.clone_queue.empty(): + time.sleep(THREAD_QUEUE_SLEEP) + repo = self.clone_queue.get() + self.clone_queue.task_done() + self._clone_repository(repo) - repo = self.repository_queue.get() - self.repository_queue.task_done() - _index_repository(repo["url"], repo["name"], - repo["framework_name"]) + def _clone_repository(self, repo): + """ + Attempt cloning a Git repository. + + :param repo: Metadata about the repository to clone. + + :type repo: :class:`GitRepository` + """ + + GIT_CLONE_TIMEOUT = 500 + + queue_percent_full = (float(self.index_queue.qsize()) / + self.index_queue.maxsize) * 100 + logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url, + queue_percent_full, self.index_queue.qsize(), + self.index_queue.maxsize)) + + with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: + if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git" + " clone %s %s" % (GIT_CLONE_TIMEOUT, repo.url, repo.name), + shell=True) != 0: + logging.debug("_clone_repository(): Cloning %s failed." % + repo.url) + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) + return + + while self.index_queue.full(): + time.sleep(THREAD_QUEUE_SLEEP) + + self.index_queue.put(repo) class _ChangeDir(object): """ @@ -111,27 +223,17 @@ def _index_repository(repo_url, repo_name, framework_name): :type framework_name: str """ - GIT_CLONE_TIMEOUT = 600 - logging.info("Indexing repository %s." % repo_url) - with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: - if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \ - clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0: - logging.debug("_index_repository(): Cloning %s failed." % repo_url) - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) - return - - with _ChangeDir(repo_name) as repository_dir: - try: - _insert_repository_codelets(repo_url, repo_name, - framework_name) - except Exception as exception: - logging.warning("%s: _insert_repository_codelets" - " failed %s." % (exception, repo_url)) - pass - - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: + try: + _insert_repository_codelets(repo_url, repo_name, + framework_name) + except Exception as exception: + logging.warning("%s: _insert_repository_codelets failed %s." % + (exception, repo_url)) + + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) def _insert_repository_codelets(repo_url, repo_name, framework_name): """ @@ -164,6 +266,9 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name): framework_name), commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"]) + db.codelets.insert({ + "name" : codelet.name + }) # Database.insert(codelet) From 6718650a8c4ef72d31e4f1dc071bc12cad50adb9 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 18 Apr 2014 12:01:06 -0400 Subject: [PATCH 33/42] First part of #8 fix. 
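The clone timeout in `_clone_repository()` works by wrapping the shell command in Perl's `alarm`; factored out, the same guard looks roughly like this (a sketch assuming a POSIX shell and `perl` on the PATH, since Python 2's `subprocess` has no timeout argument):

.. code-block:: python

    # Sketch: run a shell command with a hard timeout by delegating to
    # `perl -e 'alarm shift @ARGV; exec @ARGV'`, as the indexer does above.
    import subprocess

    def call_with_timeout(command, timeout_seconds):
        wrapped = "perl -e 'alarm shift @ARGV; exec @ARGV' %d %s" % (
                timeout_seconds, command)
        return subprocess.call(wrapped, shell=True)

    # e.g.: call_with_timeout("git clone --single-branch <url> /tmp/bitshift/<name>", 500)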
Add: bitshift/crawler/indexer.py -Add 'pkill git' to the 'git clone' subprocess in '_clone_repository()', to kill hanging remotes -- it's un-Pythonic, but, thus far, the only method that's proved successful. The RAM problem still persists; the latest dry-run lasted 01:11:00 before terminating due to a lack of allocatable memory. -Add exception names to `logging` messages. bitshift/assets -Update 'tag()' docstring to current 'bitshift' standards (add a ':type' and ':rtype:' field). --- bitshift/assets.py | 3 ++ bitshift/crawler/indexer.py | 74 +++++++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/bitshift/assets.py b/bitshift/assets.py index 5d15304..b4f597b 100644 --- a/bitshift/assets.py +++ b/bitshift/assets.py @@ -15,8 +15,11 @@ def tag(filename): :param filename: The filename of the asset to create a tag for. + :type filename: str + :return: A string containing a `` tag for JS files, and a `` for CSS files. + :rtype: str """ file_ext = filename.split(".")[-1] diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 7e82bb5..563f369 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -8,9 +8,6 @@ import bs4, logging, os, Queue, re, shutil, subprocess, time, threading from ..database import Database from ..codelet import Codelet -import pymongo #debug -db = pymongo.MongoClient().bitshift #debug - GIT_CLONE_DIR = "/tmp/bitshift" THREAD_QUEUE_SLEEP = 0.5 @@ -88,7 +85,6 @@ class GitIndexer(threading.Thread): while True: while self.index_queue.empty(): - logging.warning("Empty.") time.sleep(THREAD_QUEUE_SLEEP) repo = self.index_queue.get() @@ -154,20 +150,20 @@ class _GitCloner(threading.Thread): queue_percent_full, self.index_queue.qsize(), self.index_queue.maxsize)) - with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: - if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git" - " clone %s %s" % (GIT_CLONE_TIMEOUT, repo.url, repo.name), - shell=True) != 0: - logging.debug("_clone_repository(): Cloning %s failed." % - repo.url) - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) - return - - while self.index_queue.full(): - time.sleep(THREAD_QUEUE_SLEEP) + command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone" + " --single-branch %s %s/%s || pkill -f git") + if subprocess.call(command % (GIT_CLONE_TIMEOUT, repo.url, + GIT_CLONE_DIR, repo.name), shell=True) != 0: + logging.warning("_clone_repository(): Cloning %s failed." % + repo.url) + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) + return - self.index_queue.put(repo) + while self.index_queue.full(): + time.sleep(THREAD_QUEUE_SLEEP) + + self.index_queue.put(repo) class _ChangeDir(object): """ @@ -229,8 +225,9 @@ def _index_repository(repo_url, repo_name, framework_name): _insert_repository_codelets(repo_url, repo_name, framework_name) except Exception as exception: - logging.warning("%s: _insert_repository_codelets failed %s." 
% - (exception, repo_url)) + logging.warning( + "_insert_repository_codelets() failed: %s: %s: %s" % + (exception.__class__.__name__, exception, repo_url)) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) @@ -254,10 +251,15 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name): commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): - with open(filename, "r") as source_file: - source = _decode(source_file.read()) - if source is None: - return + try: + with open(filename, "r") as source_file: + source = _decode(source_file.read()) + if source is None: + return + except IOError as exception: + logging.warning( + "_insert_repository_codelets() failed: %s: %s: %s" % + (exception.__class__.__name__, exception, repo_url)) authors = [(_decode(author),) for author in \ commits_meta[filename]["authors"]] @@ -266,9 +268,6 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name): framework_name), commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"]) - db.codelets.insert({ - "name" : codelet.name - }) # Database.insert(codelet) @@ -284,14 +283,24 @@ def _generate_file_url(filename, repo_url, framework_name): :type repo_url: str :type framework_name: str - :return: The file's full url on the given framework. - :rtype: str + :return: The file's full url on the given framework, if successfully + derived. + :rtype: str, or None + + .. warning:: + `git branch` will occasionally fail, and, seeing as its a crucial + component of GitHub's repository file urls, None will be returned. """ if framework_name == "GitHub": - default_branch = subprocess.check_output("git branch --no-color", - shell=True)[2:-1] - return "%s/blob/%s/%s" % (repo_url, default_branch, filename) + try: + default_branch = subprocess.check_output("git branch --no-color", + shell=True)[2:-1] + return "%s/blob/%s/%s" % (repo_url, default_branch, filename) + except CalledProcessError as exception: + logging.warning("_generate_file_url(): %s: %s", + exception.__class__.name, exception) + return None def _get_git_commits(): """ @@ -423,5 +432,6 @@ def _decode(raw): return raw.decode(encoding) if encoding is not None else None except Exception as exception: - logging.warning("_debug(): %s", exception) + logging.warning("_decode(): %s: %s", exception.__class__.__name__, + exception) return None From 93ed68645d760d36a2eb169ed22c4fec1c99a129 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 18 Apr 2014 21:31:10 -0400 Subject: [PATCH 34/42] Add partially integrated BitbucketCrawler(). Add: bitshift/crawler/ __init__.py -Initialize 'BitbucketCrawler()' singleton. -Instantiate all thread instances on-the-fly in a 'threads' array, as opposed to individual named variables. crawler.py -Add 'BitbucketCrawler()', to crawl Bitbucket for repositories. -Not entirely tested for proper functionality. -The Bitbucket framework is not yet accounted for in 'indexer._generate_file_url()'. --- bitshift/crawler/crawler.py | 72 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 6 deletions(-) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 8509c6d..347fd9a 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -15,14 +15,13 @@ class GitHubCrawler(threading.Thread): """ Crawler that retrieves links to all of GitHub's public repositories. 
- GitHubCrawler is a threaded singleton that queries GitHub's API for URLs + GitHubCrawler is a threaded singleton that queries GitHub's API for urls to its public repositories, which it inserts into a :class:`Queue.Queue` - shared with :class:`bitshift.crawler.indexer.GitIndexer`. + shared with :class:`indexer.GitIndexer`. :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository` - with repository metadata retrieved by :class:`GitHubCrawler`, and other - Git crawlers, to be processed by - :class:`bitshift.crawler.indexer.GitIndexer`. + with repository metadata retrieved by :class:`GitHubCrawler`, and other Git + crawlers, to be processed by :class:`indexer.GitIndexer`. """ def __init__(self, clone_queue): @@ -35,7 +34,7 @@ class GitHubCrawler(threading.Thread): """ self.clone_queue = clone_queue - logging.info("Starting.") + logging.info("Starting %s." % self.__class__.__name__) super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -84,3 +83,64 @@ class GitHubCrawler(threading.Thread): sleep_time = api_request_interval - (time.time() - start_time) if sleep_time > 0: time.sleep(sleep_time) + +class BitbucketCrawler(threading.Thread): + """ + Crawler that retrieves links to all of Bitbucket's public repositories. + + BitbucketCrawler is a threaded singleton that queries Bitbucket's API for + urls to its public repositories, and inserts them as + :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with + :class:`indexer.GitIndexer`. + + :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert + :class:`indexer.GitRepository` repository urls into. + """ + + def __init__(self, clone_queue): + """ + Create an instance of the singleton `BitbucketCrawler`. + + :param clone_queue: see :attr:`self.clone_queue` + + :type clone_queue: see :attr:`self.clone_queue` + """ + + self.clone_queue = clone_queue + logging.info("Starting %s." % self.__class__.__name__) + super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) + + def run(self): + """ + Query the Bitbucket API for data about every public repository. + + Query the Bitbucket API's "/repositories" endpoint and read its + paginated responses in a loop; any "git" repositories have their + clone-urls and names inserted into a :class:`indexer.GitRepository` in + :attr:`self.clone_queue`. + """ + + next_api_url = "https://api.bitbucket.org/2.0/repositories" + + while True: + response = requests.get(next_api_url).json() + + queue_percent_full = (float(self.clone_queue.qsize()) / + self.clone_queue.maxsize) * 100 + logging.info("API call made. Queue-size: (%d%%) %d/%d" % ( + queue_percent_full, self.clone_queue.qsize(), + self.clone_queue.maxsize)) + + for repo in response["values"]: + if repo["scm"] == "git": + while self.clone_queue.full(): + time.sleep(1) + + clone_links = repo["links"]["clone"] + clone_url = (clone[0]["href"] if clone[0]["name"] == "https" + else clone[1]["href"]) + links.append("clone_url") + self.clone_queue.put(indexer.GitRepository( + clone_url, repo["full_name"], "Bitbucket")) + + next_api_url = response["next"] From 2954161747106b000d6e1a70ed2f1e32bf46cad6 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 18 Apr 2014 21:31:10 -0400 Subject: [PATCH 35/42] Add partially integrated BitbucketCrawler(). Add: bitshift/crawler/ __init__.py -Initialize 'BitbucketCrawler()' singleton. -Instantiate all thread instances on-the-fly in a 'threads' array, as opposed to individual named variables. 
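The pagination and clone-link selection performed by `BitbucketCrawler.run()` (a variable-name slip in which a later patch in this series corrects) can be summarized in isolation; a rough sketch against the response shape shown above:

.. code-block:: python

    # Sketch: walk Bitbucket's paginated /2.0/repositories feed, yielding the
    # https clone url and full name of every Git (non-Mercurial) repository.
    import requests

    def bitbucket_git_repos():
        next_url = "https://api.bitbucket.org/2.0/repositories"
        while next_url:
            page = requests.get(next_url).json()
            for repo in page.get("values", []):
                if repo["scm"] != "git":
                    continue
                https = [link["href"] for link in repo["links"]["clone"]
                         if link["name"] == "https"]
                if https:
                    yield https[0], repo["full_name"]
            next_url = page.get("next")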
crawler.py -Add 'BitbucketCrawler()', to crawl Bitbucket for repositories. -Not entirely tested for proper functionality. -The Bitbucket framework is not yet accounted for in 'indexer._generate_file_url()'. --- bitshift/crawler/__init__.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index 39a1a28..75e8b61 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -15,20 +15,22 @@ def crawl(): Initialize all crawlers (and indexers). Start the: - 1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler` - 2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer` + 1. GitHub crawler, :class:`crawler.GitHubCrawler`. + 2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`. + 3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`. """ MAX_URL_QUEUE_SIZE = 5e3 DEBUG_FILE = "crawler.log" logging.basicConfig(filename=DEBUG_FILE, - format="%(asctime)s:\t%(threadName)s:\t%(message)s", + format="%(levelname)s %(asctime)s:\t%(threadName)s:\t%(message)s", level=logging.DEBUG) - repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) - github_crawler = crawler.GitHubCrawler(repository_queue) - git_indexer = indexer.GitIndexer(repository_queue) + repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) + threads = [crawler.GitHubCrawler(repo_clone_queue), + crawler.BitbucketCrawler(repo_clone_queue), + indexer.GitIndexer(repo_clone_queue)] - for thread in [github_crawler, git_indexer]: + for thread in threads: thread.start() From f38772760b6dbe46410ca87407c7dab919079c3f Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 19 Apr 2014 15:33:21 -0400 Subject: [PATCH 36/42] Remove some subprocesses, comment out logging. Add: bitshift/crawler/ (crawler, indexer).py -comment out all logging statements, as they may be causing a memory leak (the crawler is meant to run perpetually, meaning that, depending on how the `logging` module is implemented, it may be accumulating logged strings in memory.) bitshift/crawler/indexer.py -make `_index_repository()` and `_index_repository_codelets()` functions of the `GitIndexer` class. -replace `_get_tracked_files()` subprocess call, which found the files in a Git repository and removed any that were non-ASCII, with a pure Python solution. -add `_is_ascii()`. --- bitshift/crawler/crawler.py | 18 +-- bitshift/crawler/indexer.py | 269 ++++++++++++++++++++++++++++---------------- 2 files changed, 181 insertions(+), 106 deletions(-) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 347fd9a..10dd961 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -34,7 +34,7 @@ class GitHubCrawler(threading.Thread): """ self.clone_queue = clone_queue - logging.info("Starting %s." % self.__class__.__name__) + # logging.info("Starting %s." % self.__class__.__name__) super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -61,10 +61,10 @@ class GitHubCrawler(threading.Thread): queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 - logging.info("API call made. Limit remaining: %s. Queue-size: (%d" - "%%) %d/%d" % (response.headers["x-ratelimit-remaining"], - queue_percent_full, self.clone_queue.qsize(), - self.clone_queue.maxsize)) + # logging.info("API call made. Limit remaining: %s. 
Queue-size: (%d" + # "%%) %d/%d" % (response.headers["x-ratelimit-remaining"], + # queue_percent_full, self.clone_queue.qsize(), + # self.clone_queue.maxsize)) for repo in response.json(): while self.clone_queue.full(): @@ -107,7 +107,7 @@ class BitbucketCrawler(threading.Thread): """ self.clone_queue = clone_queue - logging.info("Starting %s." % self.__class__.__name__) + # logging.info("Starting %s." % self.__class__.__name__) super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -127,9 +127,9 @@ class BitbucketCrawler(threading.Thread): queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 - logging.info("API call made. Queue-size: (%d%%) %d/%d" % ( - queue_percent_full, self.clone_queue.qsize(), - self.clone_queue.maxsize)) + # logging.info("API call made. Queue-size: (%d%%) %d/%d" % ( + # queue_percent_full, self.clone_queue.qsize(), + # self.clone_queue.maxsize)) for repo in response["values"]: if repo["scm"] == "git": diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 563f369..3bff3e7 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,7 @@ repositories. """ -import bs4, logging, os, Queue, re, shutil, subprocess, time, threading +import bs4, logging, os, Queue, re, shutil, string, subprocess, time, threading from ..database import Database from ..codelet import Codelet @@ -63,10 +63,12 @@ class GitIndexer(threading.Thread): MAX_INDEX_QUEUE_SIZE = 10 - logging.info("Starting.") + # logging.info("Starting.") + self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.git_cloner = _GitCloner(clone_queue, self.index_queue) self.git_cloner.start() + self.codelet_count = 0 #debug if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) @@ -89,14 +91,91 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - _index_repository(repo.url, repo.name, repo.framework_name) + self._index_repository(repo.url, repo.name, repo.framework_name) + + def _index_repository(self, repo_url, repo_name, framework_name): + """ + Clone and index (create and insert Codeletes for) a Git repository. + + `git clone` the Git repository located at **repo_url**, call + _insert_repository_codelets, then remove said repository. + + :param repo_url: The url the Git repository was cloned from. + :param repo_name: The name of the repository. + :param framework_name: The name of the framework the repository is from. + + :type repo_url: str + :type repo_name: str + :type framework_name: str + """ + + # logging.info("Indexing repository %s." % repo_url) + with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: + try: + self._insert_repository_codelets(repo_url, repo_name, + framework_name) + except Exception as exception: + # logging.warning( + # "_insert_repository_codelets() failed: %s: %s: %s" % + # (exception.__class__.__name__, exception, repo_url)) + pass + + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + + def _insert_repository_codelets(self, repo_url, repo_name, framework_name): + """ + Create and insert a Codelet for the files inside a Git repository. + + Create a new Codelet, and insert it into the Database singleton, for every + file inside the current working directory's default branch (usually + *master*). + + :param repo_url: The url the Git repository was cloned from. + :param repo_name: The name of the repository. 
+ :param framework_name: The name of the framework the repository is from. + + :type repo_url: str + :type repo_name: str + :type framework_name: str + """ + + commits_meta = _get_commits_metadata() + for filename in commits_meta.keys(): + try: + with open(filename, "r") as source_file: + source = _decode(source_file.read()) + if source is None: + return + except IOError as exception: + # logging.warning( + # "_insert_repository_codelets() failed: %s: %s: %s" % + # (exception.__class__.__name__, exception, repo_url)) + pass + + authors = [(_decode(author),) for author in \ + commits_meta[filename]["authors"]] + codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, + None, authors, _generate_file_url(filename, repo_url, + framework_name), + commits_meta[filename]["time_created"], + commits_meta[filename]["time_last_modified"]) + + self.codelet_count += 1 #debug + if self.codelet_count % 500 == 0: #debug + logging.info("Number of codelets indexed: %d.", self.codelet_count) #debug + + # Database.insert(codelet) class _GitCloner(threading.Thread): """ A singleton Git repository cloner. + Clones the repositories crawled by :class:`crawler.GitHubCrawler` for + :class:`GitIndexer` to index. + :ivar clone_queue: (:class:`Queue.Queue`) see - :attr:`bitshift.crawler.crawler.GitHubCrawler.clone_queue`. + :attr:`crawler.GitHubCrawler.clone_queue`. :ivar index_queue: (:class:`Queue.Queue`) see :attr:`GitIndexer.index_queue`. """ @@ -112,6 +191,8 @@ class _GitCloner(threading.Thread): :type index_queue: see :attr:`self.index_queue` """ + # logging.info("Starting.") + self.clone_queue = clone_queue self.index_queue = index_queue super(_GitCloner, self).__init__(name=self.__class__.__name__) @@ -146,16 +227,29 @@ class _GitCloner(threading.Thread): queue_percent_full = (float(self.index_queue.qsize()) / self.index_queue.maxsize) * 100 - logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url, - queue_percent_full, self.index_queue.qsize(), - self.index_queue.maxsize)) + # logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url, + # queue_percent_full, self.index_queue.qsize(), + # self.index_queue.maxsize)) + exit_code = None command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone" " --single-branch %s %s/%s || pkill -f git") - if subprocess.call(command % (GIT_CLONE_TIMEOUT, repo.url, - GIT_CLONE_DIR, repo.name), shell=True) != 0: - logging.warning("_clone_repository(): Cloning %s failed." % - repo.url) + + while exit_code is None: + try: + exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT, + repo.url, GIT_CLONE_DIR, repo.name), shell=True) + except: + # logging.warning("_clone_repository() failed: %s: %s", + # exception.__class__.__name__, exception) + time.sleep(1) + continue + else: + break + + if exit_code != 0: + # logging.warning("_clone_repository(): Cloning %s failed." % + # repo.url) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) return @@ -203,74 +297,6 @@ class _ChangeDir(object): os.chdir(self.old_path) -def _index_repository(repo_url, repo_name, framework_name): - """ - Clone and index (create and insert Codeletes for) a Git repository. - - `git clone` the Git repository located at **repo_url**, call - _insert_repository_codelets, then remove said repository. - - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. 
- - :type repo_url: str - :type repo_name: str - :type framework_name: str - """ - - logging.info("Indexing repository %s." % repo_url) - with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: - try: - _insert_repository_codelets(repo_url, repo_name, - framework_name) - except Exception as exception: - logging.warning( - "_insert_repository_codelets() failed: %s: %s: %s" % - (exception.__class__.__name__, exception, repo_url)) - - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) - -def _insert_repository_codelets(repo_url, repo_name, framework_name): - """ - Create and insert a Codelet for the files inside a Git repository. - - Create a new Codelet, and insert it into the Database singleton, for every - file inside the current working directory's default branch (usually - *master*). - - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. - - :type repo_url: str - :type repo_name: str - :type framework_name: str - """ - - commits_meta = _get_commits_metadata() - for filename in commits_meta.keys(): - try: - with open(filename, "r") as source_file: - source = _decode(source_file.read()) - if source is None: - return - except IOError as exception: - logging.warning( - "_insert_repository_codelets() failed: %s: %s: %s" % - (exception.__class__.__name__, exception, repo_url)) - - authors = [(_decode(author),) for author in \ - commits_meta[filename]["authors"]] - codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, - None, authors, _generate_file_url(filename, repo_url, - framework_name), - commits_meta[filename]["time_created"], - commits_meta[filename]["time_last_modified"]) - - # Database.insert(codelet) - def _generate_file_url(filename, repo_url, framework_name): """ Return a url for a filename from a Git wrapper framework. @@ -288,19 +314,25 @@ def _generate_file_url(filename, repo_url, framework_name): :rtype: str, or None .. warning:: - `git branch` will occasionally fail, and, seeing as its a crucial - component of GitHub's repository file urls, None will be returned. + Various Git subprocesses will occasionally fail, and, seeing as the + information they provide is a crucial component of some repository file + urls, None may be returned. 
""" - if framework_name == "GitHub": - try: - default_branch = subprocess.check_output("git branch --no-color", - shell=True)[2:-1] - return "%s/blob/%s/%s" % (repo_url, default_branch, filename) - except CalledProcessError as exception: - logging.warning("_generate_file_url(): %s: %s", - exception.__class__.name, exception) - return None + try: + if framework_name == "GitHub": + default_branch = subprocess.check_output("git branch" + " --no-color", shell=True)[2:-1] + return ("%s/blob/%s/%s" % (repo_url, default_branch, + filename)).replace("//", "/") + elif framework_name == "Bitbucket": + commit_hash = subprocess.check_output("git rev-parse HEAD", + shell=True).replace("\n", "") + return ("%s/src/%s/%s" % (repo_url, commit_hash, + filename)).replace("//", "/") + except subprocess.CalledProcessError as exception: + # logging.warning("_generate_file_url() failed: %s", exception) + return None def _get_git_commits(): """ @@ -354,12 +386,15 @@ def _get_tracked_files(): GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?", "md(wn|t[e]?xt)?", "rst"] - tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \ - -f && -T }' $(find . -type d -name .git -prune -o -print)"), - shell=True).split("\n")[:-1] + files = [] + for dirname, subdir_names, filenames in os.walk("."): + for filename in filenames: + path = os.path.join(dirname, filename) + if _is_ascii(path): + files.append(path) valuable_files = [] - for filename in tracked_files: + for filename in files: filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE) for pattern in GIT_IGNORE_FILES]) extension = filename.split(".")[-1] @@ -431,7 +466,47 @@ def _decode(raw): encoding = bs4.BeautifulSoup(raw).original_encoding return raw.decode(encoding) if encoding is not None else None - except Exception as exception: - logging.warning("_decode(): %s: %s", exception.__class__.__name__, - exception) + except (LookupError, UnicodeDecodeError, UserWarning) as exception: + # logging.warning("_decode() failed: %s: %s", + # exception.__class__.__name__, exception) return None + +def _is_ascii(filename): + """ + Heuristically determine whether a file is ASCII text or binary. + + If a portion of the file contains null bytes, or the percentage of bytes + that aren't ASCII is greater than 30%, then the file is concluded to be + binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T` + operator, and is the de-facto method for in : passdetermining whether a + file is ASCII. + + :param filename: The path of the file to test. + + :type filename: str + + :return: Whether the file is probably ASCII. + :rtype: Boolean + """ + + try: + with open(filename) as source: + file_snippet = source.read(512) + + if not file_snippet: + return True + + ascii_characters = "".join(map(chr, range(32, 127)) + + list("\n\r\t\b")) + null_trans = string.maketrans("", "") + + if "\0" in file_snippet: + return False + + non_ascii = file_snippet.translate(null_trans, ascii_characters) + return not float(len(non_ascii)) / len(file_snippet) > 0.30 + + except IOError as exception: + # logging.warning("_is_ascii() failed: %s: %s", + # exception.__class__.__name__, exception) + return False From ad7ce9d9cf1b5f267efae2832d26749c47b52609 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Tue, 29 Apr 2014 12:53:49 -0400 Subject: [PATCH 37/42] Commit latest crawler, continue fix of #8. Add: bitshift/crawler/*.py -Remove use of the `logging` module, which appeared to be causing a memory leak even with log-file rotation. 
--- app.py | 4 ++- bitshift/crawler/__init__.py | 7 +----- bitshift/crawler/crawler.py | 30 +++++++++++----------- bitshift/crawler/indexer.py | 60 ++++++++++++++++++-------------------------- setup.py | 2 +- 5 files changed, 45 insertions(+), 58 deletions(-) diff --git a/app.py b/app.py index c4083c9..6a77b97 100644 --- a/app.py +++ b/app.py @@ -5,7 +5,9 @@ Module to contain all the project's Flask server plumbing. from flask import Flask from flask import render_template, session +from bitshift import assets from bitshift.query import parse_query +from bitshift.crawler import crawl app = Flask(__name__) app.config.from_object("bitshift.config") @@ -25,4 +27,4 @@ def search(query): pass if __name__ == "__main__": - app.run() + crawl() diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index 75e8b61..b4ad922 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -4,7 +4,7 @@ Contains functions for initializing all subsidiary, threaded crawlers. """ -import logging, Queue +import os, Queue from bitshift.crawler import crawler, indexer @@ -21,11 +21,6 @@ def crawl(): """ MAX_URL_QUEUE_SIZE = 5e3 - DEBUG_FILE = "crawler.log" - - logging.basicConfig(filename=DEBUG_FILE, - format="%(levelname)s %(asctime)s:\t%(threadName)s:\t%(message)s", - level=logging.DEBUG) repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) threads = [crawler.GitHubCrawler(repo_clone_queue), diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 10dd961..6196a13 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -4,7 +4,7 @@ Contains all website/framework-specific Class crawlers. """ -import logging, requests, time, threading +import requests, time, threading from bitshift.crawler import indexer @@ -34,7 +34,6 @@ class GitHubCrawler(threading.Thread): """ self.clone_queue = clone_queue - # logging.info("Starting %s." % self.__class__.__name__) super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -57,14 +56,15 @@ class GitHubCrawler(threading.Thread): while len(next_api_url) > 0: start_time = time.time() - response = requests.get(next_api_url, params=authentication_params) + + try: + response = requests.get(next_api_url, + params=authentication_params) + except ConnectionError as exception: + continue queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 - # logging.info("API call made. Limit remaining: %s. Queue-size: (%d" - # "%%) %d/%d" % (response.headers["x-ratelimit-remaining"], - # queue_percent_full, self.clone_queue.qsize(), - # self.clone_queue.maxsize)) for repo in response.json(): while self.clone_queue.full(): @@ -107,7 +107,6 @@ class BitbucketCrawler(threading.Thread): """ self.clone_queue = clone_queue - # logging.info("Starting %s." % self.__class__.__name__) super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -123,13 +122,14 @@ class BitbucketCrawler(threading.Thread): next_api_url = "https://api.bitbucket.org/2.0/repositories" while True: - response = requests.get(next_api_url).json() + try: + response = requests.get(next_api_url).json() + except ConnectionError as exception: + time.sleep(0.5) + continue queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 - # logging.info("API call made. 
Queue-size: (%d%%) %d/%d" % ( - # queue_percent_full, self.clone_queue.qsize(), - # self.clone_queue.maxsize)) for repo in response["values"]: if repo["scm"] == "git": @@ -137,10 +137,12 @@ class BitbucketCrawler(threading.Thread): time.sleep(1) clone_links = repo["links"]["clone"] - clone_url = (clone[0]["href"] if clone[0]["name"] == "https" - else clone[1]["href"]) + clone_url = (clone_links[0]["href"] if + clone_links[0]["name"] == "https" else + clone_links[1]["href"]) links.append("clone_url") self.clone_queue.put(indexer.GitRepository( clone_url, repo["full_name"], "Bitbucket")) next_api_url = response["next"] + time.sleep(0.2) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 3bff3e7..d2ef907 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,7 @@ repositories. """ -import bs4, logging, os, Queue, re, shutil, string, subprocess, time, threading +import bs4, os, Queue, re, shutil, string, subprocess, time, threading from ..database import Database from ..codelet import Codelet @@ -63,12 +63,9 @@ class GitIndexer(threading.Thread): MAX_INDEX_QUEUE_SIZE = 10 - # logging.info("Starting.") - self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.git_cloner = _GitCloner(clone_queue, self.index_queue) self.git_cloner.start() - self.codelet_count = 0 #debug if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) @@ -91,7 +88,10 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - self._index_repository(repo.url, repo.name, repo.framework_name) + try: + self._index_repository(repo.url, repo.name, repo.framework_name) + except Exception as exception: + pass def _index_repository(self, repo_url, repo_name, framework_name): """ @@ -109,15 +109,11 @@ class GitIndexer(threading.Thread): :type framework_name: str """ - # logging.info("Indexing repository %s." 
% repo_url) with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: try: self._insert_repository_codelets(repo_url, repo_name, framework_name) except Exception as exception: - # logging.warning( - # "_insert_repository_codelets() failed: %s: %s: %s" % - # (exception.__class__.__name__, exception, repo_url)) pass if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): @@ -141,17 +137,18 @@ class GitIndexer(threading.Thread): """ commits_meta = _get_commits_metadata() + if commits_meta is None: + return + for filename in commits_meta.keys(): try: - with open(filename, "r") as source_file: + source = "" + with open(filename) as source_file: source = _decode(source_file.read()) if source is None: - return + continue except IOError as exception: - # logging.warning( - # "_insert_repository_codelets() failed: %s: %s: %s" % - # (exception.__class__.__name__, exception, repo_url)) - pass + continue authors = [(_decode(author),) for author in \ commits_meta[filename]["authors"]] @@ -161,10 +158,6 @@ class GitIndexer(threading.Thread): commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"]) - self.codelet_count += 1 #debug - if self.codelet_count % 500 == 0: #debug - logging.info("Number of codelets indexed: %d.", self.codelet_count) #debug - # Database.insert(codelet) class _GitCloner(threading.Thread): @@ -191,8 +184,6 @@ class _GitCloner(threading.Thread): :type index_queue: see :attr:`self.index_queue` """ - # logging.info("Starting.") - self.clone_queue = clone_queue self.index_queue = index_queue super(_GitCloner, self).__init__(name=self.__class__.__name__) @@ -212,7 +203,11 @@ class _GitCloner(threading.Thread): time.sleep(THREAD_QUEUE_SLEEP) repo = self.clone_queue.get() self.clone_queue.task_done() - self._clone_repository(repo) + + try: + self._clone_repository(repo) + except Exception as exception: + pass def _clone_repository(self, repo): """ @@ -227,29 +222,27 @@ class _GitCloner(threading.Thread): queue_percent_full = (float(self.index_queue.qsize()) / self.index_queue.maxsize) * 100 - # logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url, - # queue_percent_full, self.index_queue.qsize(), - # self.index_queue.maxsize)) exit_code = None command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone" " --single-branch %s %s/%s || pkill -f git") + command_attempt = 0 while exit_code is None: try: exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT, repo.url, GIT_CLONE_DIR, repo.name), shell=True) - except: - # logging.warning("_clone_repository() failed: %s: %s", - # exception.__class__.__name__, exception) + except Exception as exception: time.sleep(1) - continue + command_attempt += 1 + if command_attempt == 20: + break + else: + continue else: break if exit_code != 0: - # logging.warning("_clone_repository(): Cloning %s failed." 
% - # repo.url) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) return @@ -331,7 +324,6 @@ def _generate_file_url(filename, repo_url, framework_name): return ("%s/src/%s/%s" % (repo_url, commit_hash, filename)).replace("//", "/") except subprocess.CalledProcessError as exception: - # logging.warning("_generate_file_url() failed: %s", exception) return None def _get_git_commits(): @@ -467,8 +459,6 @@ def _decode(raw): return raw.decode(encoding) if encoding is not None else None except (LookupError, UnicodeDecodeError, UserWarning) as exception: - # logging.warning("_decode() failed: %s: %s", - # exception.__class__.__name__, exception) return None def _is_ascii(filename): @@ -507,6 +497,4 @@ def _is_ascii(filename): return not float(len(non_ascii)) / len(file_snippet) > 0.30 except IOError as exception: - # logging.warning("_is_ascii() failed: %s: %s", - # exception.__class__.__name__, exception) return False diff --git a/setup.py b/setup.py index 1faa5b9..0f9fc84 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( version = "0.1", packages = find_packages(), install_requires = ["Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", - "BeautifulSoup>=3.2.1"], + "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift" From 1b2739f8c4439219d18a5f4f3d9bd02d3360ef85 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Wed, 30 Apr 2014 15:20:15 -0400 Subject: [PATCH 38/42] Add GitHub repo star count, simple logging. Add: bitshift/crawler/crawler.py -add `_get_repo_stars()` to `GitHubCrawler`, which queries the GitHub API for the number of a stars that a given repository has. -log the `next_api_url` every time it's generated by `GitHubCrawler` and `BitbucketCrawler` to two respective log-files. --- bitshift/crawler/crawler.py | 51 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 6196a13..e4b4929 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -24,6 +24,11 @@ class GitHubCrawler(threading.Thread): crawlers, to be processed by :class:`indexer.GitIndexer`. """ + AUTHENTICATION = { + "client_id" : "436cb884ae09be7f2a4e", + "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" + } + def __init__(self, clone_queue): """ Create an instance of the singleton `GitHubCrawler`. 
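Stripped of queueing and logging, the GitHub half of the crawl loop is a paginated walk of `/repositories` that honors the rate-limit headers; a bare-bones sketch using the same endpoint and headers referenced above (the client id and secret are placeholders):

.. code-block:: python

    # Sketch: page through GitHub's /repositories endpoint, sleeping whenever
    # the remaining rate-limit quota hits zero, and following the "next" link
    # advertised in the Link response header.
    import time, requests

    def github_repo_pages(client_id, client_secret):
        url = "https://api.github.com/repositories"
        params = {"client_id": client_id, "client_secret": client_secret}
        while url:
            resp = requests.get(url, params=params)
            yield resp.json()
            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                time.sleep(max(0,
                        int(resp.headers["x-ratelimit-reset"]) - time.time()))
            url = resp.headers.get("link", "").split(">")[0][1:]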
@@ -48,10 +53,6 @@ class GitHubCrawler(threading.Thread): """ next_api_url = "https://api.github.com/repositories" - authentication_params = { - "client_id" : "436cb884ae09be7f2a4e", - "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" - } api_request_interval = 5e3 / 60 ** 2 while len(next_api_url) > 0: @@ -59,7 +60,7 @@ class GitHubCrawler(threading.Thread): try: response = requests.get(next_api_url, - params=authentication_params) + params=self.AUTHENTICATION) except ConnectionError as exception: continue @@ -76,14 +77,49 @@ class GitHubCrawler(threading.Thread): if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - - time.time()) + time.time()) next_api_url = response.headers["link"].split(">")[0][1:] + with open(".github_api.log", "w") as log_file: + log_file.write("%s\n" % next_api_url) sleep_time = api_request_interval - (time.time() - start_time) if sleep_time > 0: time.sleep(sleep_time) + def _get_repo_stars(self, repo_name): + """ + Return the number of stargazers for a repository. + + Queries the GitHub API for the number of stargazers for a given + repository, and blocks if the query limit is exceeded. + + :param repo_name: The name of the repository, in + `username/repository_name` format. + + :type repo_name: str + + :return: The number of stargazers for the repository. + :rtype: int + """ + + API_URL = "https://api.github.com/search/repositories" + + + params = self.AUTHENTICATION + params["q"] = "repo:%s" % repo_name + + resp = requests.get(API_URL, + params=params, + headers={ + "Accept" : "application/vnd.github.preview" + }) + + if int(resp.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(resp.headers["x-ratelimit-reset"]) - time.time()) + + return int(resp.json()["items"][0]["stargazers_count"]) + class BitbucketCrawler(threading.Thread): """ Crawler that retrieves links to all of Bitbucket's public repositories. @@ -145,4 +181,7 @@ class BitbucketCrawler(threading.Thread): clone_url, repo["full_name"], "Bitbucket")) next_api_url = response["next"] + with open(".bitbucket_api.log", "w") as log_file: + log_file.write("%s\n" % next_api_url) + time.sleep(0.2) From 6762c1fa3db340f96e06ee7f5ab371c20decd2e3 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 3 May 2014 15:06:03 -0400 Subject: [PATCH 39/42] Re-add logging, rem file filters. Add: bitshift/ __init__.py -add `_configure_logging()`, which sets up a more robust logging infrastructure than was previously used: log files are rotated once per hour, and have some additional formatting rules. (crawler, indexer).py -add hierarchically-descending loggers to individual threaded classes (`GitHubCrawler`, `GitIndexer`, etc.); add logging calls. indexer.py -remove file filtering regex matches from `_get_tracked_files()`, as non-code files will be discarded by the parsers. --- bitshift/crawler/__init__.py | 25 +++++++++- bitshift/crawler/crawler.py | 46 +++++++++++++---- bitshift/crawler/indexer.py | 116 ++++++++++++++++++++----------------------- 3 files changed, 114 insertions(+), 73 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index b4ad922..cfec64c 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -4,7 +4,7 @@ Contains functions for initializing all subsidiary, threaded crawlers. """ -import os, Queue +import logging, logging.handlers, os, Queue from bitshift.crawler import crawler, indexer @@ -20,6 +20,8 @@ def crawl(): 3. 
Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`. """ + _configure_logging() + MAX_URL_QUEUE_SIZE = 5e3 repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) @@ -29,3 +31,24 @@ def crawl(): for thread in threads: thread.start() + +def _configure_logging(): + LOG_FILE_DIR = "log" + + if not os.path.exists(LOG_FILE_DIR): + os.mkdir(LOG_FILE_DIR) + + logging.getLogger("requests").setLevel(logging.WARNING) + + formatter = logging.Formatter( + fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s" + " %(message)s"), datefmt="%y-%m-%d %H:%M:%S") + + handler = logging.handlers.TimedRotatingFileHandler( + "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1, + backupCount=20) + handler.setFormatter(formatter) + + root_logger = logging.getLogger() + root_logger.addHandler(handler) + root_logger.setLevel(logging.NOTSET) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index e4b4929..785ac61 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -4,7 +4,7 @@ Contains all website/framework-specific Class crawlers. """ -import requests, time, threading +import logging, requests, time, threading from bitshift.crawler import indexer @@ -22,6 +22,7 @@ class GitHubCrawler(threading.Thread): :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository` with repository metadata retrieved by :class:`GitHubCrawler`, and other Git crawlers, to be processed by :class:`indexer.GitIndexer`. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. """ AUTHENTICATION = { @@ -39,6 +40,9 @@ class GitHubCrawler(threading.Thread): """ self.clone_queue = clone_queue + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -61,11 +65,17 @@ class GitHubCrawler(threading.Thread): try: response = requests.get(next_api_url, params=self.AUTHENTICATION) - except ConnectionError as exception: + except ConnectionError as excep: + self._logger.warning("API %s call failed: %s: %s", + next_api_url, excep.__class__.__name__, excep) + time.sleep(0.5) continue queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 + self._logger.info("API call made. Queue size: %d/%d, %d%%." % + ((self.clone_queue.qsize(), self.clone_queue.maxsize, + queue_percent_full))) for repo in response.json(): while self.clone_queue.full(): @@ -73,15 +83,15 @@ class GitHubCrawler(threading.Thread): self.clone_queue.put(indexer.GitRepository( repo["html_url"], repo["full_name"].replace("/", ""), - "GitHub")) + "GitHub", + #self._get_repo_stars(repo["full_name"])) + 0)) if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) next_api_url = response.headers["link"].split(">")[0][1:] - with open(".github_api.log", "w") as log_file: - log_file.write("%s\n" % next_api_url) sleep_time = api_request_interval - (time.time() - start_time) if sleep_time > 0: @@ -105,7 +115,6 @@ class GitHubCrawler(threading.Thread): API_URL = "https://api.github.com/search/repositories" - params = self.AUTHENTICATION params["q"] = "repo:%s" % repo_name @@ -116,9 +125,18 @@ class GitHubCrawler(threading.Thread): }) if int(resp.headers["x-ratelimit-remaining"]) == 0: - time.sleep(int(resp.headers["x-ratelimit-reset"]) - time.time()) + sleep_time = int(resp.headers["x-ratelimit-reset"]) - time.time() + if sleep_time > 0: + logging.info("API quota exceeded. 
Sleep time: %d." % sleep_time) + time.sleep(sleep_time) - return int(resp.json()["items"][0]["stargazers_count"]) + if "items" not in resp.json() or len(resp.json()["items"]) == 0: + self._logger.critical("No API result: %s. Result: %s" % (resp.url, + str(resp.json()))) + return 0 + else: + rank = float(resp.json()["items"][0]["stargazers_count"]) / 1000 + return rank if rank < 1.0 else 1.0 class BitbucketCrawler(threading.Thread): """ @@ -131,6 +149,7 @@ class BitbucketCrawler(threading.Thread): :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert :class:`indexer.GitRepository` repository urls into. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. """ def __init__(self, clone_queue): @@ -143,6 +162,9 @@ class BitbucketCrawler(threading.Thread): """ self.clone_queue = clone_queue + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -162,10 +184,15 @@ class BitbucketCrawler(threading.Thread): response = requests.get(next_api_url).json() except ConnectionError as exception: time.sleep(0.5) + self._logger.warning("API %s call failed: %s: %s", + next_api_url, excep.__class__.__name__, excep) continue queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 + self._logger.info("API call made. Queue size: %d/%d, %d%%." % + ((self.clone_queue.qsize(), self.clone_queue.maxsize, + queue_percent_full))) for repo in response["values"]: if repo["scm"] == "git": @@ -181,7 +208,4 @@ class BitbucketCrawler(threading.Thread): clone_url, repo["full_name"], "Bitbucket")) next_api_url = response["next"] - with open(".bitbucket_api.log", "w") as log_file: - log_file.write("%s\n" % next_api_url) - time.sleep(0.2) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index d2ef907..69c579c 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,8 @@ repositories. """ -import bs4, os, Queue, re, shutil, string, subprocess, time, threading +import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\ + threading from ..database import Database from ..codelet import Codelet @@ -11,6 +12,9 @@ from ..codelet import Codelet GIT_CLONE_DIR = "/tmp/bitshift" THREAD_QUEUE_SLEEP = 0.5 +import pymongo #debug +db = pymongo.MongoClient().bitshift #debug + class GitRepository(object): """ A representation of a Git repository's metadata. @@ -19,24 +23,29 @@ class GitRepository(object): :ivar name: (str) The name of the repository. :ivar framework_name: (str) The name of the online Git framework that the repository belongs to (eg, GitHub, BitBucket). + :ivar rank: (float) The rank of the repository, as assigned by + :class:`crawler.GitHubCrawler`. """ - def __init__(self, url, name, framework_name): + def __init__(self, url, name, framework_name, rank): """ Create a GitRepository instance. :param url: see :attr:`GitRepository.url` :param name: see :attr:`GitRepository.name` :param framework_name: see :attr:`GitRepository.framework_name` + :param rank: see :attr:`GitRepository.rank` :type url: str :type name: str :type framework_name: str + :type rank: float """ self.url = url self.name = name self.framework_name = framework_name + self.rank = rank class GitIndexer(threading.Thread): """ @@ -50,6 +59,7 @@ class GitIndexer(threading.Thread): cloned by :class:`_GitCloner`, which are to be indexed. 
:ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner, which feeds :class:`GitIndexer`. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. """ def __init__(self, clone_queue): @@ -66,6 +76,9 @@ class GitIndexer(threading.Thread): self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.git_cloner = _GitCloner(clone_queue, self.index_queue) self.git_cloner.start() + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) @@ -88,52 +101,43 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - try: - self._index_repository(repo.url, repo.name, repo.framework_name) - except Exception as exception: - pass + # try: + self._index_repository(repo) + # except Exception as excep: + # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) - def _index_repository(self, repo_url, repo_name, framework_name): + def _index_repository(self, repo): """ Clone and index (create and insert Codeletes for) a Git repository. - `git clone` the Git repository located at **repo_url**, call - _insert_repository_codelets, then remove said repository. + `git clone` the Git repository located at **repo.url**, call + `_insert_repository_codelets()`, then remove said repository. - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. + :param repo_url: The metadata of the repository to be indexed. - :type repo_url: str - :type repo_name: str - :type framework_name: str + :type repo_url: :class:`GitRepository` """ - with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: - try: - self._insert_repository_codelets(repo_url, repo_name, - framework_name) - except Exception as exception: - pass + with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)) as repository_dir: + # try: + self._insert_repository_codelets(repo) + # except Exception as excep: + # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) - def _insert_repository_codelets(self, repo_url, repo_name, framework_name): + def _insert_repository_codelets(self, repo): """ Create and insert a Codelet for the files inside a Git repository. - Create a new Codelet, and insert it into the Database singleton, for every - file inside the current working directory's default branch (usually - *master*). + Create a new Codelet, and insert it into the Database singleton, for + every file inside the current working directory's default branch + (usually *master*). - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. + :param repo_url: The metadata of the repository to be indexed. 
- :type repo_url: str - :type repo_name: str - :type framework_name: str + :type repo_url: :class:`GitRepository` """ commits_meta = _get_commits_metadata() @@ -142,7 +146,6 @@ class GitIndexer(threading.Thread): for filename in commits_meta.keys(): try: - source = "" with open(filename) as source_file: source = _decode(source_file.read()) if source is None: @@ -152,13 +155,14 @@ class GitIndexer(threading.Thread): authors = [(_decode(author),) for author in \ commits_meta[filename]["authors"]] - codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, - None, authors, _generate_file_url(filename, repo_url, - framework_name), + codelet = Codelet("%s:%s" % (repo.name, filename), source, filename, + None, authors, _generate_file_url(filename, + repo.url, repo.framework_name), commits_meta[filename]["time_created"], - commits_meta[filename]["time_last_modified"]) + commits_meta[filename]["time_last_modified"], + repo.rank) - # Database.insert(codelet) + db.codelets.insert(codelet.__dict__) #debug class _GitCloner(threading.Thread): """ @@ -171,6 +175,7 @@ class _GitCloner(threading.Thread): :attr:`crawler.GitHubCrawler.clone_queue`. :ivar index_queue: (:class:`Queue.Queue`) see :attr:`GitIndexer.index_queue`. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. """ def __init__(self, clone_queue, index_queue): @@ -186,6 +191,9 @@ class _GitCloner(threading.Thread): self.clone_queue = clone_queue self.index_queue = index_queue + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") super(_GitCloner, self).__init__(name=self.__class__.__name__) def run(self): @@ -339,11 +347,11 @@ def _get_git_commits(): sample_returned_array = [ { "author" : (str) "author" - "timestamp" : (int) 1396919293, + "timestamp" : (`datetime.datetime`) , "filenames" : (str array) ["file1", "file2"] } ] - :rtype: dictionary + :rtype: array of dictionaries """ git_log = subprocess.check_output(("git --no-pager log --name-only" @@ -355,7 +363,7 @@ def _get_git_commits(): if len(fields) > 2: commits.append({ "author" : fields[0], - "timestamp" : int(fields[1]), + "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), "filenames" : fields[2].split("\x00")[:-2] }) @@ -374,28 +382,14 @@ def _get_tracked_files(): :rtype: str array """ - GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"] - GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?", - "md(wn|t[e]?xt)?", "rst"] - files = [] for dirname, subdir_names, filenames in os.walk("."): for filename in filenames: path = os.path.join(dirname, filename) if _is_ascii(path): - files.append(path) - - valuable_files = [] - for filename in files: - filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE) - for pattern in GIT_IGNORE_FILES]) - extension = filename.split(".")[-1] - extension_match = any([re.match(pattern, filename, flags=re.IGNORECASE) - for pattern in GIT_IGNORE_EXTENSIONS]) + files.append(path[2:]) - if not (filename_match or extension_match): - valuable_files.append(filename[2:]) - return valuable_files + return files def _get_commits_metadata(): """ @@ -407,11 +401,11 @@ def _get_commits_metadata(): sample_returned_dict = { "my_file" : { "authors" : (str array) ["author1", "author2"], - "time_created" : (int) 1395939566, - "time_last_modified" : (int) 1396920409 + "time_created" : (`datetime.datetime`) , + "time_last_modified" : (`datetime.datetime`) } } - :rtype: dictionary + :rtype: dictionary of dictionaries """ commits = 
_get_git_commits() From d142f1fd55dc180900dd564810e94464f8debbb0 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 3 May 2014 15:22:29 -0400 Subject: [PATCH 40/42] Complete Crawler. Close #15, #14, #11, #8. Several of the closed issues were addressed partly in previous commits; definitively close them with this, for the moment, final update to the crawler package. Ref: bitshift/crawler/indexer.py -move all `GitIndexer` specific functions (eg, `_decode`, `_is_ascii()`)from the global scope to the class definition. --- bitshift/codelet.py | 53 +++--- bitshift/crawler/indexer.py | 417 ++++++++++++++++++++++---------------------- 2 files changed, 236 insertions(+), 234 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 9568a4d..453ace0 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -7,37 +7,43 @@ class Codelet(object): :ivar name: (str) A suitable name for the codelet. :ivar code: (str) A containing the raw source code. :ivar filename: (str, or None) The filename of the snippet. - :ivar language: (str, or None) The inferred language of `code`. - :ivar authors: (array of str tuples) An array of tuples containing an - author's name and profile URL (on the service the code was pulled from). + :ivar language: (int, or None) The inferred language of `code`. + :ivar authors: (array of tuples (str, str or None)) An array of tuples + containing an author's name and profile URL (on the service the code + was pulled from). :ivar code_url: (str) The url of the (page containing the) source code. - :ivar date_created: (str, or None) The date the code was published. - :ivar date_modified: (str, or None) The date the code was last modified. + :ivar date_created: (:class:`datetime.datetime`, or None) The date the code + was published. + :ivar date_modified: (:class:`datetime.datetime`, or None) The date the + code was last modified. + :ivar rank: (float) A quanitification of the source code's quality, as + per available ratings (stars, forks, upvotes, etc.). """ def __init__(self, name, code, filename, language, authors, code_url, - date_created, date_modified): + date_created, date_modified, rank): """ Create a Codelet instance. - :param name: The name of the codelet. - :param code: The raw source code. - :param filename: The filename of the code, if any. - :param language: The inferred language. - :param authors: An array of tuples containing an author's name and - profile URL (on the service the code was pulled from). - :param code_url: The url of the (page containing the) source code. - :param date_created: The date the code was published. - :param date_modified: The date the code was last modified. 
+ :param name: see :attr:`self.name` + :param code: see :attr:`self.code` + :param filename: see :attr:`self.filename` + :param language: see :attr:`self.language` + :param authors: see :attr:`self.authors` + :param code_url: see :attr:`self.code_url` + :param date_created: see :attr:`self.date_created` + :param date_modified: see :attr:`self.date_modified` + :param rank: see :attr:`self.rank` - :type name: str - :type code: str - :type filename: str, or None - :type language: str, or None - :type authors: array of str tuples, or None - :type code_url: str - :type date_created: str, or None - :type date_modified: str, or None + :type name: see :attr:`self.name` + :type code: see :attr:`self.code` + :type filename: see :attr:`self.filename` + :type language: see :attr:`self.language` + :type authors: see :attr:`self.authors` + :type code_url: see :attr:`self.code_url` + :type date_created: see :attr:`self.date_created` + :type date_modified: see :attr:`self.date_modified` + :type rank: see :attr:`self.rank` """ self.name = name @@ -48,3 +54,4 @@ class Codelet(object): self.code_url = code_url self.date_created = date_created self.date_modified = date_modified + self.rank = rank diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 69c579c..c1c77ad 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -12,9 +12,6 @@ from ..codelet import Codelet GIT_CLONE_DIR = "/tmp/bitshift" THREAD_QUEUE_SLEEP = 0.5 -import pymongo #debug -db = pymongo.MongoClient().bitshift #debug - class GitRepository(object): """ A representation of a Git repository's metadata. @@ -101,10 +98,10 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - # try: - self._index_repository(repo) - # except Exception as excep: - # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) + try: + self._index_repository(repo) + except Exception as excep: + self._logger.warning("%s: %s.", excep.__class__.__name__, excep) def _index_repository(self, repo): """ @@ -119,10 +116,10 @@ class GitIndexer(threading.Thread): """ with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)) as repository_dir: - # try: - self._insert_repository_codelets(repo) - # except Exception as excep: - # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) + try: + self._insert_repository_codelets(repo) + except Exception as excep: + self._logger.warning("%s: %s.", excep.__class__.__name__, excep) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) @@ -140,29 +137,222 @@ class GitIndexer(threading.Thread): :type repo_url: :class:`GitRepository` """ - commits_meta = _get_commits_metadata() + commits_meta = self._get_commits_metadata() if commits_meta is None: return for filename in commits_meta.keys(): try: with open(filename) as source_file: - source = _decode(source_file.read()) + source = self._decode(source_file.read()) if source is None: continue except IOError as exception: continue - authors = [(_decode(author),) for author in \ + authors = [(self._decode(author), None) for author in \ commits_meta[filename]["authors"]] codelet = Codelet("%s:%s" % (repo.name, filename), source, filename, - None, authors, _generate_file_url(filename, + None, authors, self._generate_file_url(filename, repo.url, repo.framework_name), commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"], repo.rank) - db.codelets.insert(codelet.__dict__) #debug + def _generate_file_url(self, 
filename, repo_url, framework_name): + """ + Return a url for a filename from a Git wrapper framework. + + :param filename: The path of the file. + :param repo_url: The url of the file's parent repository. + :param framework_name: The name of the framework the repository is from. + + :type filename: str + :type repo_url: str + :type framework_name: str + + :return: The file's full url on the given framework, if successfully + derived. + :rtype: str, or None + + .. warning:: + Various Git subprocesses will occasionally fail, and, seeing as the + information they provide is a crucial component of some repository file + urls, None may be returned. + """ + + try: + if framework_name == "GitHub": + default_branch = subprocess.check_output("git branch" + " --no-color", shell=True)[2:-1] + return ("%s/blob/%s/%s" % (repo_url, default_branch, + filename)).replace("//", "/") + elif framework_name == "Bitbucket": + commit_hash = subprocess.check_output("git rev-parse HEAD", + shell=True).replace("\n", "") + return ("%s/src/%s/%s" % (repo_url, commit_hash, + filename)).replace("//", "/") + except subprocess.CalledProcessError as exception: + return None + + def _get_git_commits(self): + """ + Return the current working directory's formatted commit data. + + Uses `git log` to generate metadata about every single file in the + repository's commit history. + + :return: The author, timestamp, and names of all modified files of every + commit. + .. code-block:: python + sample_returned_array = [ + { + "author" : (str) "author" + "timestamp" : (`datetime.datetime`) , + "filenames" : (str array) ["file1", "file2"] + } + ] + :rtype: array of dictionaries + """ + + git_log = subprocess.check_output(("git --no-pager log --name-only" + " --pretty=format:'%n%n%an%n%at' -z"), shell=True) + + commits = [] + for commit in git_log.split("\n\n"): + fields = commit.split("\n") + if len(fields) > 2: + commits.append({ + "author" : fields[0], + "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), + "filenames" : fields[2].split("\x00")[:-2] + }) + + return commits + + def _get_tracked_files(self): + """ + Return a list of the filenames of all valuable files in the Git repository. + + Get a list of the filenames of the non-binary (Perl heuristics used for + filetype identification) files currently inside the current working + directory's Git repository. Then, weed out any boilerplate/non-code files + that match the regex rules in GIT_IGNORE_FILES. + + :return: The filenames of all index-worthy non-binary files. + :rtype: str array + """ + + files = [] + for dirname, subdir_names, filenames in os.walk("."): + for filename in filenames: + path = os.path.join(dirname, filename) + if self._is_ascii(path): + files.append(path[2:]) + + return files + + def _get_commits_metadata(self): + """ + Return a dictionary containing every valuable tracked file's metadata. + + :return: A dictionary with author names, time of creation, and time of last + modification for every filename key. + .. 
code-block:: python + sample_returned_dict = { + "my_file" : { + "authors" : (str array) ["author1", "author2"], + "time_created" : (`datetime.datetime`) , + "time_last_modified" : (`datetime.datetime`) + } + } + :rtype: dictionary of dictionaries + """ + + commits = self._get_git_commits() + tracked_files = self._get_tracked_files() + + files_meta = {} + for commit in commits: + for filename in commit["filenames"]: + if filename not in tracked_files: + continue + + if filename not in files_meta.keys(): + files_meta[filename] = { + "authors" : [commit["author"]], + "time_last_modified" : commit["timestamp"], + "time_created" : commit["timestamp"] + } + else: + if commit["author"] not in files_meta[filename]["authors"]: + files_meta[filename]["authors"].append(commit["author"]) + files_meta[filename]["time_created"] = commit["timestamp"] + + return files_meta + + def _decode(self, raw): + """ + Return a decoded a raw string. + + :param raw: The string to string. + + :type raw: (str) + + :return: If the original encoding is successfully inferenced, return the + decoded string. + :rtype: str, or None + + .. warning:: + The raw string's original encoding is identified by heuristics which + can, and occasionally will, fail. Decoding will then fail, and None + will be returned. + """ + + try: + encoding = bs4.BeautifulSoup(raw).original_encoding + return raw.decode(encoding) if encoding is not None else None + + except (LookupError, UnicodeDecodeError, UserWarning) as exception: + return None + + def _is_ascii(self, filename): + """ + Heuristically determine whether a file is ASCII text or binary. + + If a portion of the file contains null bytes, or the percentage of bytes + that aren't ASCII is greater than 30%, then the file is concluded to be + binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T` + operator, and is the de-facto method for in : passdetermining whether a + file is ASCII. + + :param filename: The path of the file to test. + + :type filename: str + + :return: Whether the file is probably ASCII. + :rtype: Boolean + """ + + try: + with open(filename) as source: + file_snippet = source.read(512) + + if not file_snippet: + return True + + ascii_characters = "".join(map(chr, range(32, 127)) + + list("\n\r\t\b")) + null_trans = string.maketrans("", "") + + if "\0" in file_snippet: + return False + + non_ascii = file_snippet.translate(null_trans, ascii_characters) + return not float(len(non_ascii)) / len(file_snippet) > 0.30 + + except IOError as exception: + return False class _GitCloner(threading.Thread): """ @@ -297,198 +487,3 @@ class _ChangeDir(object): """ os.chdir(self.old_path) - -def _generate_file_url(filename, repo_url, framework_name): - """ - Return a url for a filename from a Git wrapper framework. - - :param filename: The path of the file. - :param repo_url: The url of the file's parent repository. - :param framework_name: The name of the framework the repository is from. - - :type filename: str - :type repo_url: str - :type framework_name: str - - :return: The file's full url on the given framework, if successfully - derived. - :rtype: str, or None - - .. warning:: - Various Git subprocesses will occasionally fail, and, seeing as the - information they provide is a crucial component of some repository file - urls, None may be returned. 
- """ - - try: - if framework_name == "GitHub": - default_branch = subprocess.check_output("git branch" - " --no-color", shell=True)[2:-1] - return ("%s/blob/%s/%s" % (repo_url, default_branch, - filename)).replace("//", "/") - elif framework_name == "Bitbucket": - commit_hash = subprocess.check_output("git rev-parse HEAD", - shell=True).replace("\n", "") - return ("%s/src/%s/%s" % (repo_url, commit_hash, - filename)).replace("//", "/") - except subprocess.CalledProcessError as exception: - return None - -def _get_git_commits(): - """ - Return the current working directory's formatted commit data. - - Uses `git log` to generate metadata about every single file in the - repository's commit history. - - :return: The author, timestamp, and names of all modified files of every - commit. - .. code-block:: python - sample_returned_array = [ - { - "author" : (str) "author" - "timestamp" : (`datetime.datetime`) , - "filenames" : (str array) ["file1", "file2"] - } - ] - :rtype: array of dictionaries - """ - - git_log = subprocess.check_output(("git --no-pager log --name-only" - " --pretty=format:'%n%n%an%n%at' -z"), shell=True) - - commits = [] - for commit in git_log.split("\n\n"): - fields = commit.split("\n") - if len(fields) > 2: - commits.append({ - "author" : fields[0], - "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), - "filenames" : fields[2].split("\x00")[:-2] - }) - - return commits - -def _get_tracked_files(): - """ - Return a list of the filenames of all valuable files in the Git repository. - - Get a list of the filenames of the non-binary (Perl heuristics used for - filetype identification) files currently inside the current working - directory's Git repository. Then, weed out any boilerplate/non-code files - that match the regex rules in GIT_IGNORE_FILES. - - :return: The filenames of all index-worthy non-binary files. - :rtype: str array - """ - - files = [] - for dirname, subdir_names, filenames in os.walk("."): - for filename in filenames: - path = os.path.join(dirname, filename) - if _is_ascii(path): - files.append(path[2:]) - - return files - -def _get_commits_metadata(): - """ - Return a dictionary containing every valuable tracked file's metadata. - - :return: A dictionary with author names, time of creation, and time of last - modification for every filename key. - .. code-block:: python - sample_returned_dict = { - "my_file" : { - "authors" : (str array) ["author1", "author2"], - "time_created" : (`datetime.datetime`) , - "time_last_modified" : (`datetime.datetime`) - } - } - :rtype: dictionary of dictionaries - """ - - commits = _get_git_commits() - tracked_files = _get_tracked_files() - - files_meta = {} - for commit in commits: - for filename in commit["filenames"]: - if filename not in tracked_files: - continue - - if filename not in files_meta.keys(): - files_meta[filename] = { - "authors" : [commit["author"]], - "time_last_modified" : commit["timestamp"], - "time_created" : commit["timestamp"] - } - else: - if commit["author"] not in files_meta[filename]["authors"]: - files_meta[filename]["authors"].append(commit["author"]) - files_meta[filename]["time_created"] = commit["timestamp"] - - return files_meta - -def _decode(raw): - """ - Return a decoded a raw string. - - :param raw: The string to string. - - :type raw: (str) - - :return: If the original encoding is successfully inferenced, return the - decoded string. - :rtype: str, or None - - .. 
warning:: - The raw string's original encoding is identified by heuristics which - can, and occasionally will, fail. Decoding will then fail, and None - will be returned. - """ - - try: - encoding = bs4.BeautifulSoup(raw).original_encoding - return raw.decode(encoding) if encoding is not None else None - - except (LookupError, UnicodeDecodeError, UserWarning) as exception: - return None - -def _is_ascii(filename): - """ - Heuristically determine whether a file is ASCII text or binary. - - If a portion of the file contains null bytes, or the percentage of bytes - that aren't ASCII is greater than 30%, then the file is concluded to be - binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T` - operator, and is the de-facto method for in : passdetermining whether a - file is ASCII. - - :param filename: The path of the file to test. - - :type filename: str - - :return: Whether the file is probably ASCII. - :rtype: Boolean - """ - - try: - with open(filename) as source: - file_snippet = source.read(512) - - if not file_snippet: - return True - - ascii_characters = "".join(map(chr, range(32, 127)) + - list("\n\r\t\b")) - null_trans = string.maketrans("", "") - - if "\0" in file_snippet: - return False - - non_ascii = file_snippet.translate(null_trans, ascii_characters) - return not float(len(non_ascii)) / len(file_snippet) > 0.30 - - except IOError as exception: - return False From 7c5c9fc7e1c99c1d67146570c43e60d0b04c899f Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 3 May 2014 22:20:12 -0400 Subject: [PATCH 41/42] Add GitHub stars, Bitbucket watchers; close #14. Add: bitshift/crawler/crawler.py -Add more efficient method of querying GitHub's API for stargazer counts, by batching 25 repositories per request. -Add watcher counts for Bitbucket repositories, by querying the Bitbucket API once per repository (inefficient, but the API in question isn't sufficiently robust to accommodate a better approach, and Git repositories surface so infrequently that there shouldn't be any query limit problems). 
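For reference, a condensed sketch of the batched star lookup (identifier
names mirror the implementation in crawler.py below; rate-limit handling and
error recovery are omitted, and the standalone function wrapper is purely
illustrative, not part of the patch):

    import requests

    API_URL = "https://api.github.com/search/repositories"
    REPOS_PER_QUERY = 25

    def get_repository_stars(repo_names, authentication):
        """Map `username/repo` names to ranks in [0, 1], 25 repos per call."""
        repo_stars = {}
        for ind in xrange(0, len(repo_names), REPOS_PER_QUERY):
            names = repo_names[ind:ind + REPOS_PER_QUERY]
            query_url = "%s?q=%s" % (API_URL,
                    "+".join("repo:%s" % name for name in names))
            resp = requests.get(query_url, params=authentication,
                    headers={"Accept" : "application/vnd.github.preview"})
            for repo in resp.json().get("items", []):
                # 1,000 or more stargazers saturates the rank at 1.0.
                rank = float(repo["stargazers_count"]) / 1000
                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0
        for name in repo_names:
            # Repositories missing from the search results get a middling rank.
            repo_stars.setdefault(name, 0.5)
        return repo_stars
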
--- bitshift/crawler/__init__.py | 1 + bitshift/crawler/crawler.py | 111 +++++++++++++++++++++++++++---------------- 2 files changed, 71 insertions(+), 41 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index cfec64c..73b1c22 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -39,6 +39,7 @@ def _configure_logging(): os.mkdir(LOG_FILE_DIR) logging.getLogger("requests").setLevel(logging.WARNING) + logging.getLogger("urllib3").setLevel(logging.WARNING) formatter = logging.Formatter( fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s" diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 785ac61..9501bd0 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -63,8 +63,7 @@ class GitHubCrawler(threading.Thread): start_time = time.time() try: - response = requests.get(next_api_url, - params=self.AUTHENTICATION) + resp = requests.get(next_api_url, params=self.AUTHENTICATION) except ConnectionError as excep: self._logger.warning("API %s call failed: %s: %s", next_api_url, excep.__class__.__name__, excep) @@ -77,66 +76,84 @@ class GitHubCrawler(threading.Thread): ((self.clone_queue.qsize(), self.clone_queue.maxsize, queue_percent_full))) - for repo in response.json(): + repo_names = [repo["full_name"] for repo in resp.json()] + repo_stars = self._get_repositories_stars(repo_names) + + for repo in resp.json(): while self.clone_queue.full(): time.sleep(1) self.clone_queue.put(indexer.GitRepository( repo["html_url"], repo["full_name"].replace("/", ""), - "GitHub", - #self._get_repo_stars(repo["full_name"])) - 0)) + "GitHub", repo_stars[repo["full_name"]])) - if int(response.headers["x-ratelimit-remaining"]) == 0: - time.sleep(int(response.headers["x-ratelimit-reset"]) - + if int(resp.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(resp.headers["x-ratelimit-reset"]) - time.time()) - next_api_url = response.headers["link"].split(">")[0][1:] + next_api_url = resp.headers["link"].split(">")[0][1:] sleep_time = api_request_interval - (time.time() - start_time) if sleep_time > 0: time.sleep(sleep_time) - def _get_repo_stars(self, repo_name): + def _get_repositories_stars(self, repo_names): """ - Return the number of stargazers for a repository. + Return the number of stargazers for several repositories. - Queries the GitHub API for the number of stargazers for a given - repository, and blocks if the query limit is exceeded. + Queries the GitHub API for the number of stargazers for any given + repositories, and blocks if the query limit is exceeded. - :param repo_name: The name of the repository, in + :param repo_names: An array of repository names, in `username/repository_name` format. - :type repo_name: str - - :return: The number of stargazers for the repository. - :rtype: int - """ - - API_URL = "https://api.github.com/search/repositories" + :type repo_names: str - params = self.AUTHENTICATION - params["q"] = "repo:%s" % repo_name + :return: A dictionary with repository name keys, and corresponding + stargazer count values. - resp = requests.get(API_URL, - params=params, - headers={ - "Accept" : "application/vnd.github.preview" - }) + Example dictionary: + .. code-block:: python + { + "user/repository" : 100 + } - if int(resp.headers["x-ratelimit-remaining"]) == 0: - sleep_time = int(resp.headers["x-ratelimit-reset"]) - time.time() - if sleep_time > 0: - logging.info("API quota exceeded. Sleep time: %d." 
% sleep_time) - time.sleep(sleep_time) + :rtype: dictionary + """ - if "items" not in resp.json() or len(resp.json()["items"]) == 0: - self._logger.critical("No API result: %s. Result: %s" % (resp.url, - str(resp.json()))) - return 0 - else: - rank = float(resp.json()["items"][0]["stargazers_count"]) / 1000 - return rank if rank < 1.0 else 1.0 + API_URL = "https://api.github.com/search/repositories" + REPOS_PER_QUERY = 25 + + repo_stars = {} + for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in + xrange(0, len(repo_names), REPOS_PER_QUERY)]: + query_url = "%s?q=%s" % (API_URL, + "+".join("repo:%s" % name for name in names)) + + params = self.AUTHENTICATION + resp = requests.get(query_url, + params=params, + headers={ + "Accept" : "application/vnd.github.preview" + }) + + if int(resp.headers["x-ratelimit-remaining"]) == 0: + sleep_time = int(resp.headers["x-ratelimit-reset"]) - \ + time.time() + 1 + if sleep_time > 0: + logging.info("API quota exceeded. Sleep time: %d." % + sleep_time) + time.sleep(sleep_time) + + for repo in resp.json()["items"]: + rank = float(repo["stargazers_count"]) / 1000 + repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0 + + for name in repo_names: + if name not in repo_stars: + repo_stars[name] = 0.5 + + return repo_stars class BitbucketCrawler(threading.Thread): """ @@ -204,8 +221,20 @@ class BitbucketCrawler(threading.Thread): clone_links[0]["name"] == "https" else clone_links[1]["href"]) links.append("clone_url") + + try: + watchers = requests.get( + repo["links"]["watchers"]["href"]) + rank = len(watchers.json()["values"]) / 100 + except ConnectionError as exception: + time.sleep(0.5) + self._logger.warning("API %s call failed: %s: %s", + next_api_url, excep.__class__.__name__, excep) + continue + self.clone_queue.put(indexer.GitRepository( - clone_url, repo["full_name"], "Bitbucket")) + clone_url, repo["full_name"], "Bitbucket"), + rank if rank < 1.0 else 1.0) next_api_url = response["next"] time.sleep(0.2) From 56f23e682a24c3b199cc7add1447cf4130ba2657 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 4 May 2014 01:18:30 -0400 Subject: [PATCH 42/42] Database to v6; flesh out a lot of Database.search(). --- bitshift/database/__init__.py | 65 ++++++++++++++++++++++++++++-------------- bitshift/database/migration.py | 30 ++++++++++++++++++- bitshift/database/schema.sql | 23 +++++++++------ 3 files changed, 86 insertions(+), 32 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 9b039ca..75f39da 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -51,10 +51,15 @@ class Database(object): "Run `python -m bitshift.database.migration`." raise RuntimeError(err) + def _get_codelets_from_ids(self, cursor, ids): + """Return a list of Codelet objects given a list of codelet IDs.""" + raise NotImplementedError() ## TODO + def _decompose_url(self, cursor, url): """Break up a URL into an origin (with a URL base) and a suffix.""" query = """SELECT origin_id, SUBSTR(?, LENGTH(origin_url_base)) - FROM origins WHERE origin_url_base IS NOT NULL + FROM origins + WHERE origin_url_base IS NOT NULL AND ? LIKE CONCAT(origin_url_base, "%")""" cursor.execute(query, (url, url)) @@ -88,19 +93,35 @@ class Database(object): :param page: The result page to display. :type page: int - :return: A list of search results. - :rtype: list of :py:class:`.Codelet`\ s + :return: The total number of results, and the *n*\ th page of results. 
+ :rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s) """ - # search for cache_hash = mmh3.hash(query.serialize() + str(page)) - # cache HIT: - # update cache_last_used - # return codelets - # cache MISS: - # build complex search query - # fetch codelets - # cache results - # return codelets - pass + query1 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp + FROM cache + INNER JOIN cache_data ON cache_id = cdata_cache + WHERE cache_id = ?""" + query2 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)" + query3 = "INSERT INTO cache_data VALUES (?, ?)" + + cache_id = mmh3.hash64(str(page) + ":" + query.serialize())[0] + + with self._conn.cursor() as cursor: + cursor.execute(query1, (cache_id,)) + results = cursor.fetchall() + if results: # Cache hit + num_results = results[0][1] * (10 ** results[0][2]) + ids = [res[0] for res in results] + else: # Cache miss + ## TODO: build and execute search query + results = cursor.fetchall() + ids = NotImplemented ## TODO: extract ids from results + num_results = NotImplemented ## TODO: num if results else 0 + num_exp = max(len(str(num_results)) - 3, 0) + num_results = int(round(num_results, -num_exp)) + num_mnt = num_results / (10 ** num_exp) + cursor.execute(query2, (cache_id, num_mnt, num_exp)) + cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) + return (num_results, self._get_codelets_from_ids(cursor, ids)) def insert(self, codelet): """ @@ -109,23 +130,23 @@ class Database(object): :param codelet: The codelet to insert. :type codelet: :py:class:`.Codelet` """ - query1 = """INSERT INTO code VALUES (?, ?) + query1 = """INSERT INTO code VALUES (?, ?, ?) ON DUPLICATE KEY UPDATE code_id=code_id""" query2 = """INSERT INTO codelets VALUES - (DEFAULT, ?, ?, ?, ?, ?, ?, ?, ?)""" + (DEFAULT, ?, ?, ?, ?, ?, ?, ?)""" query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)" - with self._conn.cursor() as cursor: - code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] - origin, url = self._decompose_url(cursor, codelet.url) + hash_key = str(codelet.language) + ":" + codelet.code.encode("utf8") + code_id = mmh3.hash64(hash_key)[0] - cursor.execute(query1, (code_id, codelet.code)) + with self._conn.cursor() as cursor: + cursor.execute(query1, (code_id, codelet.language, codelet.code)) if cursor.rowcount == 1: for sym_type, symbols in codelet.symbols.iteritems(): self._insert_symbols(cursor, code_id, sym_type, symbols) - cursor.execute(query2, (codelet.name, code_id, codelet.language, - origin, url, codelet.rank, - codelet.date_created, + origin, url = self._decompose_url(cursor, codelet.url) + cursor.execute(query2, (codelet.name, code_id, origin, url, + codelet.rank, codelet.date_created, codelet.date_modified)) codelet_id = cursor.lastrowid authors = [(codelet_id, a[0], a[1]) for a in codelet.authors] diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index 743f906..24f744a 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -3,7 +3,7 @@ Contains information about database schema versions, and SQL queries to update between them. 
""" -VERSION = 5 +VERSION = 6 MIGRATIONS = [ # 1 -> 2 @@ -60,6 +60,34 @@ MIGRATIONS = [ MODIFY COLUMN `origin_name` VARCHAR(64) DEFAULT NULL, MODIFY COLUMN `origin_url` VARCHAR(512) DEFAULT NULL, MODIFY COLUMN `origin_url_base` VARCHAR(512) DEFAULT NULL""" + ], + # 5 -> 6 + [ + """ALTER TABLE `code` + ADD COLUMN `code_lang` SMALLINT UNSIGNED DEFAULT NULL + AFTER `code_id`, + ADD KEY (`code_lang`)""", + """ALTER TABLE `codelets` + DROP KEY `codelet_lang`, + DROP COLUMN `codelet_lang`""", + """ALTER TABLE `cache_data` + DROP FOREIGN KEY `cache_data_ibfk_1`""", + """ALTER TABLE `cache` + MODIFY COLUMN `cache_id` BIGINT NOT NULL, + DROP COLUMN `cache_hash`, + DROP COLUMN `cache_last_used`, + MODIFY COLUMN `cache_count_mnt` SMALLINT UNSIGNED NOT NULL""", + """ALTER TABLE `cache_data` + MODIFY COLUMN `cdata_cache` BIGINT NOT NULL, + ADD PRIMARY KEY (`cdata_cache`, `cdata_codelet`), + ADD CONSTRAINT `cache_data_ibfk_1` FOREIGN KEY (`cdata_codelet`) + REFERENCES `codelets` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE""", + """CREATE EVENT `flush_cache` + ON SCHEDULE EVERY 1 HOUR + DO + DELETE FROM `cache` + WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 50b4f9e..8634416 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,4 +1,4 @@ --- Schema version 5 +-- Schema version 6 CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; @@ -6,7 +6,7 @@ USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; -INSERT INTO `version` VALUES (5); +INSERT INTO `version` VALUES (6); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, @@ -20,8 +20,10 @@ INSERT INTO `origins` VALUES (1, NULL, NULL, NULL, NULL); CREATE TABLE `code` ( `code_id` BIGINT NOT NULL, + `code_lang` SMALLINT UNSIGNED DEFAULT NULL, `code_code` MEDIUMTEXT NOT NULL, PRIMARY KEY (`code_id`), + KEY (`code_lang`), FULLTEXT KEY (`code_code`) ) ENGINE=InnoDB; @@ -29,7 +31,6 @@ CREATE TABLE `codelets` ( `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `codelet_name` VARCHAR(300) NOT NULL, `codelet_code_id` BIGINT NOT NULL, - `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, `codelet_origin` TINYINT UNSIGNED NOT NULL, `codelet_url` VARCHAR(512) NOT NULL, `codelet_rank` FLOAT NOT NULL, @@ -37,7 +38,6 @@ CREATE TABLE `codelets` ( `codelet_date_modified` DATETIME DEFAULT NULL, PRIMARY KEY (`codelet_id`), FULLTEXT KEY (`codelet_name`), - KEY (`codelet_lang`), KEY (`codelet_rank`), KEY (`codelet_date_created`), KEY (`codelet_date_modified`), @@ -88,18 +88,17 @@ CREATE TABLE `symbol_locations` ( ) ENGINE=InnoDB; CREATE TABLE `cache` ( - `cache_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, - `cache_hash` BIGINT NOT NULL, - `cache_count_mnt` TINYINT UNSIGNED NOT NULL, + `cache_id` BIGINT NOT NULL, + `cache_count_mnt` SMALLINT UNSIGNED NOT NULL, `cache_count_exp` TINYINT UNSIGNED NOT NULL, `cache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - `cache_last_used` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (`cache_id`) ) ENGINE=InnoDB; CREATE TABLE `cache_data` ( - `cdata_cache` INT UNSIGNED NOT NULL, + `cdata_cache` BIGINT NOT NULL, `cdata_codelet` BIGINT UNSIGNED NOT NULL, + PRIMARY KEY (`cdata_cache`, `cdata_codelet`), FOREIGN KEY (`cdata_cache`) REFERENCES `cache` (`cache_id`) ON DELETE CASCADE ON UPDATE CASCADE, @@ -107,3 +106,9 @@ CREATE TABLE `cache_data` ( REFERENCES `codelets` (`codelet_id`) ON DELETE 
CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; + +CREATE EVENT `flush_cache` + ON SCHEDULE EVERY 1 HOUR + DO + DELETE FROM `cache` + WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);
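
As a closing note on the new cache tables: `Database.search()` stores an
approximate result count as a mantissa/exponent pair in `cache_count_mnt` and
`cache_count_exp`, keeping roughly three significant digits. A minimal sketch
of that round-trip (the helper names are illustrative and not part of the
patch; the arithmetic follows the search() code above):

    def encode_count(num_results):
        """Reduce a result count to (mantissa, exponent), ~3 significant digits."""
        num_exp = max(len(str(num_results)) - 3, 0)
        num_mnt = int(round(num_results, -num_exp)) / (10 ** num_exp)
        return num_mnt, num_exp

    def decode_count(num_mnt, num_exp):
        """Recover the approximate count stored in the cache row."""
        return num_mnt * (10 ** num_exp)

    # encode_count(123456) -> (123, 3); decode_count(123, 3) -> 123000
    # encode_count(42)     -> (42, 0);  decode_count(42, 0)  -> 42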