From 6a4ba580ed024ada5efcfe2149d28b4f4d992d3d Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 11 Apr 2014 12:43:34 -0400 Subject: [PATCH 01/42] Add Codelet, crawler dependencies to setup. Add: bitshift/codelet.py -add Codelet class with constructor. README.md -add SASS stylesheet documentation --- README.md | 7 ++++++ bitshift/assets.py | 3 +-- bitshift/codelet.py | 46 +++++++++++++++++++++++++++++++------- docs/source/api/bitshift.query.rst | 11 +++++++++ docs/source/api/bitshift.rst | 43 ++++++++++++++++++++++++++--------- setup.py | 3 ++- 6 files changed, 91 insertions(+), 22 deletions(-) create mode 100644 docs/source/api/bitshift.query.rst diff --git a/README.md b/README.md index 3cb81a1..0fe39d0 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,11 @@ Branches - `feature/*`: individual components of the project with untested, likely horribly broken code - branch off from and merge into `develop` when done +Style +----- +bitshift uses [SASS][SASS] for styling; compile the stylesheets to CSS with +`sass --watch static/sass/:static/css`. + Documentation ------------- @@ -24,3 +29,5 @@ new modules or packages, but *not* when adding functions or changing docstrings), run `sphinx-apidoc -fo docs/source/api bitshift` from the project root. Note that this will revert any custom changes made to the files in `docs/source/api`, so you might want to update them by hand instead. + +[SASS]: http://sass-lang.com/guide diff --git a/bitshift/assets.py b/bitshift/assets.py index 90564d2..5d15304 100644 --- a/bitshift/assets.py +++ b/bitshift/assets.py @@ -1,6 +1,5 @@ """ -.. module:: assets - :synopsis: Helper functions for use inside the project's Jinja templates. +:synopsis: Helper functions for use inside the project's Jinja templates. """ from flask import Markup diff --git a/bitshift/codelet.py b/bitshift/codelet.py index df81294..5c8ec40 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -1,13 +1,43 @@ __all__ = ["Codelet"] class Codelet(object): - ## object to store the following (it doesn't need to do anything with it): - ## author name, URL, date created/modified, language, source code itself - ## for VCS: project name, file in project - ## also: list of functions, etc (associations data) + """ + A source-code object with code metadata and composition analysis. - ## DICTIONARY MAPPING STRINGS REPRESENTING ASSOCIATION TYPE WITH DICTIONARIES - ## MAPPING ASSOCIATION NAMES WITH TUPLES REPRESENTING THEIR PLACE IN THE FILE - ## STORED AS TWO INTEGERS REPRESENTING THE ROW AND THE COLUMN + :ivar code: (string) A containing the raw source code. + :ivar language: (string) The inferred language of `code`. + :ivar author: (string) The + :ivar url: The url of the (page containing the) source code. + :ivar date_created: The date the code was published. + :ivar date_modified: The date the code was last modified. + """ - ## {"functions": {"foo": (12, 13), "bar": (53, 3)}} + def __init__(self, code, author, language, code_url, author_url, + date_created, date_modified): + """ + Create a Codelet instance. + + :param code: The raw source code. + :param author: The author of the code. + :param language: The inferred language. + :param code_url: The url of the (page containing the) source code. + :param author_url: The url of the code author's public profile on the + framework said code was retrieved from. + :param date_created: The date the code was published. + :param date_modified: The date the code was last modified. 
+ + :type code: string + :type language: string + :type author: string + :type url: string + :type date_created: string + :type date_modified: string + """ + + self.code = code + self.author = author + self.language = language + self.code_url = code_url + self.author_url = author_url + self.date_created = date_created + self.date_modified = date_modified diff --git a/docs/source/api/bitshift.query.rst b/docs/source/api/bitshift.query.rst new file mode 100644 index 0000000..35b39a6 --- /dev/null +++ b/docs/source/api/bitshift.query.rst @@ -0,0 +1,11 @@ +query Package +============= + +:mod:`query` Package +-------------------- + +.. automodule:: bitshift.query + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/api/bitshift.rst b/docs/source/api/bitshift.rst index a5f0898..1b1c703 100644 --- a/docs/source/api/bitshift.rst +++ b/docs/source/api/bitshift.rst @@ -1,30 +1,51 @@ -bitshift package +bitshift Package ================ -Submodules ----------- +:mod:`bitshift` Package +----------------------- -bitshift.assets module ----------------------- +.. automodule:: bitshift.__init__ + :members: + :undoc-members: + :show-inheritance: + +:mod:`assets` Module +-------------------- .. automodule:: bitshift.assets :members: :undoc-members: :show-inheritance: -bitshift.config module ----------------------- +:mod:`codelet` Module +--------------------- -.. automodule:: bitshift.config +.. automodule:: bitshift.codelet :members: :undoc-members: :show-inheritance: +:mod:`config` Module +-------------------- -Module contents ---------------- +.. automodule:: bitshift.config + :members: + :undoc-members: + :show-inheritance: + +:mod:`database` Module +---------------------- -.. automodule:: bitshift +.. automodule:: bitshift.database :members: :undoc-members: :show-inheritance: + +Subpackages +----------- + +.. toctree:: + + bitshift.parser + bitshift.query + diff --git a/setup.py b/setup.py index 0ec5f77..1faa5b9 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,8 @@ setup( name = "bitshift", version = "0.1", packages = find_packages(), - install_requires = ["Flask>=0.10.1", "pygments>=1.6"], + install_requires = ["Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", + "BeautifulSoup>=3.2.1"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift" From 20b518fccc730b0891229a02b43d0cf5cac4a683 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 11 Apr 2014 13:03:03 -0400 Subject: [PATCH 02/42] Minor refactor of codelet. Add: bitshift/codelet.py -complete docstrings, add filename to Codelet constructor. --- bitshift/codelet.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 5c8ec40..08b0d36 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -4,37 +4,39 @@ class Codelet(object): """ A source-code object with code metadata and composition analysis. - :ivar code: (string) A containing the raw source code. - :ivar language: (string) The inferred language of `code`. - :ivar author: (string) The - :ivar url: The url of the (page containing the) source code. - :ivar date_created: The date the code was published. - :ivar date_modified: The date the code was last modified. + :ivar code: (str) A containing the raw source code. + :ivar filename: (str, or None) The filename of the snippet. + :ivar language: (str, or None) The inferred language of `code`. + :ivar author: (str, or None) The name of the code's author. 
+ :ivar url: (str) The url of the (page containing the) source code. + :ivar date_created: (str, or None) The date the code was published. + :ivar date_modified: (str, or None) The date the code was last modified. """ - def __init__(self, code, author, language, code_url, author_url, - date_created, date_modified): + def __init__(self, code, filename, author, language, code_url, author_url, + date_created, date_modified): """ Create a Codelet instance. :param code: The raw source code. + :param filename: The filename of the code, if any. :param author: The author of the code. :param language: The inferred language. :param code_url: The url of the (page containing the) source code. - :param author_url: The url of the code author's public profile on the - framework said code was retrieved from. :param date_created: The date the code was published. :param date_modified: The date the code was last modified. - :type code: string - :type language: string - :type author: string - :type url: string - :type date_created: string - :type date_modified: string + :type code: str + :type filename: str, or None + :type language: str, or None + :type author: str, or None + :type url: str + :type date_created: str, or None + :type date_modified: str, or None """ self.code = code + self.filename = filename self.author = author self.language = language self.code_url = code_url From 962dd9aef55a50a5ffa395dc78e897158157b27d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 14 Apr 2014 12:02:23 -0400 Subject: [PATCH 03/42] Docstrings for Database methods; oursql dependency. --- app.py | 9 ++++++--- bitshift/database.py | 21 +++++++++++++++++++++ setup.py | 2 +- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/app.py b/app.py index c4083c9..2e3b0c8 100644 --- a/app.py +++ b/app.py @@ -5,6 +5,7 @@ Module to contain all the project's Flask server plumbing. from flask import Flask from flask import render_template, session +from bitshift.database import Database from bitshift.query import parse_query app = Flask(__name__) @@ -12,7 +13,9 @@ app.config.from_object("bitshift.config") app_env = app.jinja_env app_env.line_statement_prefix = "=" -app_env.globals.update(assets = assets) +app_env.globals.update(assets=assets) + +database = Database() @app.route("/") def index(): @@ -20,8 +23,8 @@ def index(): @app.route("/search/") def search(query): - ## tree = parse_query(query) - ## database.search(tree) + tree = parse_query(query) + database.search(tree) pass if __name__ == "__main__": diff --git a/bitshift/database.py b/bitshift/database.py index b8995ee..36b984e 100644 --- a/bitshift/database.py +++ b/bitshift/database.py @@ -16,3 +16,24 @@ class Database(object): def _create(self): pass + + def search(self, query): + """ + Search the database. + + :param query: The query to search for. + :type query: :py:class:`~.query.tree.Tree` + + :return: A list of search results. + :rtype: list of :py:class:`.Codelet`\ s + """ + pass + + def insert(self, codelet): + """ + Insert a codelet into the database. + + :param codelet: The codelet to insert. 
+ :type codelet: :py:class:`.Codelet` + """ + pass diff --git a/setup.py b/setup.py index 1faa5b9..5fa1189 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( version = "0.1", packages = find_packages(), install_requires = ["Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", - "BeautifulSoup>=3.2.1"], + "BeautifulSoup>=3.2.1", "oursql>=0.9.3.1"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift" From 085fd62704c1ee5d9b88daef4f5992082e9c56dc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 15 Apr 2014 00:38:12 -0400 Subject: [PATCH 04/42] Database schema, hashing module, some other things. --- .gitignore | 1 + bitshift/database.py | 10 +++++----- schema.sql | 23 +++++++++++++++++++++++ setup.py | 5 +++-- 4 files changed, 32 insertions(+), 7 deletions(-) create mode 100644 schema.sql diff --git a/.gitignore b/.gitignore index 6a014f5..7e00121 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .sass-cache .DS_Store +.my.cnf # github premade rules *.py[cod] diff --git a/bitshift/database.py b/bitshift/database.py index 36b984e..647fe55 100644 --- a/bitshift/database.py +++ b/bitshift/database.py @@ -3,19 +3,18 @@ Module with classes and functions to handle communication with the MySQL database backend, which manages the search index. """ +import mmh3 import oursql class Database(object): """Represents the MySQL database.""" def __init__(self): - pass + self._connect() def _connect(self): - pass - - def _create(self): - pass + """Establish a connection to the database.""" + self._conn = oursql.connect() def search(self, query): """ @@ -36,4 +35,5 @@ class Database(object): :param codelet: The codelet to insert. :type codelet: :py:class:`.Codelet` """ + # code_hash = mmh3.hash64(codelet.code)[0] pass diff --git a/schema.sql b/schema.sql new file mode 100644 index 0000000..3cb915c --- /dev/null +++ b/schema.sql @@ -0,0 +1,23 @@ +CREATE DATABASE bitshift DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; +USE `bitshift`; + +CREATE TABLE codelets ( + `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `codelet_name` VARCHAR(512) NOT NULL, + `codelet_code_id` BIGINT UNSIGNED NOT NULL, + `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, + `codelet_origin` TINYINT UNSIGNED DEFAULT NULL, + `codelet_url` VARCHAR(512) NOT NULL, + `codelet_date_created` DATETIME DEFAULT NULL, + `codelet_date_modified` DATETIME DEFAULT NULL, + PRIMARY KEY (`codelet_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +CREATE TABLE code ( + `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `code_hash` BIGINT NOT NULL, + `code_code` MEDIUMTEXT NOT NULL, + PRIMARY KEY (`code_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +-- separate tables: authors, symbols, caches, search indices diff --git a/setup.py b/setup.py index 5fa1189..97441b7 100644 --- a/setup.py +++ b/setup.py @@ -4,8 +4,9 @@ setup( name = "bitshift", version = "0.1", packages = find_packages(), - install_requires = ["Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", - "BeautifulSoup>=3.2.1", "oursql>=0.9.3.1"], + install_requires = [ + "Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", + "BeautifulSoup>=3.2.1", "oursql>=0.9.3.1", "mmh3>=2.3"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift" From bc3b9e7587e40579bfceeb448c8260a554d87854 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 17 Apr 2014 17:33:14 -0400 Subject: [PATCH 05/42] Some more 
database design work. --- bitshift/database.py | 13 ++++++++-- bitshift/query/__init__.py | 2 ++ schema.sql | 65 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/bitshift/database.py b/bitshift/database.py index 647fe55..07c71c2 100644 --- a/bitshift/database.py +++ b/bitshift/database.py @@ -16,16 +16,25 @@ class Database(object): """Establish a connection to the database.""" self._conn = oursql.connect() - def search(self, query): + def search(self, query, page=1): """ - Search the database. + Search the database for a query and return the *n*\ th page of results. :param query: The query to search for. :type query: :py:class:`~.query.tree.Tree` + :param page: The result page to display. + :type page: int :return: A list of search results. :rtype: list of :py:class:`.Codelet`\ s """ + # query tree hash + page -> cached? + # cache HIT: + # if qcache_created is too old: invalidate cache, goto cache MISS + # update qcache_last_used + # parse qcache_results, fetch codelets + # cache MISS: + # build complex search query pass def insert(self, codelet): diff --git a/bitshift/query/__init__.py b/bitshift/query/__init__.py index 7d6e0d5..6971c04 100644 --- a/bitshift/query/__init__.py +++ b/bitshift/query/__init__.py @@ -6,4 +6,6 @@ __all__ = ["parse_query"] def parse_query(query): # gets a string, returns a Tree + # TODO: note: resultant Trees should be normalized so that "foo OR bar" + # and "bar OR foo" result in equivalent trees pass diff --git a/schema.sql b/schema.sql index 3cb915c..d49fc6e 100644 --- a/schema.sql +++ b/schema.sql @@ -1,23 +1,68 @@ -CREATE DATABASE bitshift DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; +CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; -CREATE TABLE codelets ( +CREATE TABLE `languages` ( + `language_id` SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `language_name` VARCHAR(64) NOT NULL, + PRIMARY KEY (`language_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +CREATE TABLE `origins` ( + `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `origin_name` VARCHAR(64) NOT NULL, + `origin_url` VARCHAR(512) NOT NULL, + `origin_url_base` VARCHAR(512) NOT NULL, + `origin_image` TINYBLOB DEFAULT NULL, -- TODO: verify size (<64kB) + PRIMARY KEY (`origin_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +CREATE TABLE `codelets` ( `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, `codelet_name` VARCHAR(512) NOT NULL, `codelet_code_id` BIGINT UNSIGNED NOT NULL, - `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, - `codelet_origin` TINYINT UNSIGNED DEFAULT NULL, + `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, -- TODO: needs index + `codelet_origin` TINYINT UNSIGNED NOT NULL, `codelet_url` VARCHAR(512) NOT NULL, - `codelet_date_created` DATETIME DEFAULT NULL, - `codelet_date_modified` DATETIME DEFAULT NULL, + `codelet_date_created` DATETIME DEFAULT NULL, -- TODO: needs index + `codelet_date_modified` DATETIME DEFAULT NULL, -- TODO: needs index PRIMARY KEY (`codelet_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; -CREATE TABLE code ( +CREATE TABLE `code` ( `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, - `code_hash` BIGINT NOT NULL, - `code_code` MEDIUMTEXT NOT NULL, + `code_hash` BIGINT NOT NULL, -- TODO: needs index + `code_code` MEDIUMTEXT NOT NULL, -- TODO: verify size (16mB?) 
PRIMARY KEY (`code_id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; --- separate tables: authors, symbols, caches, search indices +CREATE TABLE `authors` ( + `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `author_codelet` BIGINT UNSIGNED NOT NULL, -- TODO: foreign index? + `author_name` VARCHAR(128) NOT NULL, -- TODO: needs index + `author_url` VARCHAR(512) DEFAULT NULL, + PRIMARY KEY (`author_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +CREATE TABLE `symbols` ( + `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `symbol_codelet` BIGINT UNSIGNED NOT NULL, -- TODO: foreign index? + `symbol_type` TINYINT UNSIGNED NOT NULL, -- TODO: multi-column index? + `symbol_name` VARCHAR(512) NOT NULL, -- TODO: needs index + `symbol_row` INT UNSIGNED NOT NULL, + `symbol_col` INT UNSIGNED NOT NULL, + PRIMARY KEY (`symbol_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +CREATE TABLE `query_cache` ( + `qcache_id` INT NOT NULL UNIQUE, + `qcache_query` VARCHAR(512) NOT NULL, + `qcache_results` BLOB NOT NULL, -- TODO: verify; perhaps use some kind of array + `qcache_page` TINYINT UNSIGNED NOT NULL, + `qcache_count_mnt` TINYINT UNSIGNED NOT NULL, + `qcache_count_exp` TINYINT UNSIGNED NOT NULL, + `qcache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- TODO: verify + `qcache_last_used` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- TODO: verify + PRIMARY KEY (`cache_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + +-- TODO: full-text search index table From 1cbe669c0247446fba178c07d3f8daf86e73e5ca Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 17 Apr 2014 19:25:42 -0400 Subject: [PATCH 06/42] More work on db schema; all except FTS indices. --- bitshift/database.py | 12 +++--- schema.sql | 108 ++++++++++++++++++++++++++++++++------------------- 2 files changed, 74 insertions(+), 46 deletions(-) diff --git a/bitshift/database.py b/bitshift/database.py index 07c71c2..b86b05a 100644 --- a/bitshift/database.py +++ b/bitshift/database.py @@ -28,13 +28,15 @@ class Database(object): :return: A list of search results. :rtype: list of :py:class:`.Codelet`\ s """ - # query tree hash + page -> cached? + # search for cache_hash = mmh3.hash(query.serialize() + str(page)) # cache HIT: - # if qcache_created is too old: invalidate cache, goto cache MISS - # update qcache_last_used - # parse qcache_results, fetch codelets + # update cache_last_used + # return codelets # cache MISS: # build complex search query + # fetch codelets + # cache results + # return codelets pass def insert(self, codelet): @@ -44,5 +46,5 @@ class Database(object): :param codelet: The codelet to insert. 
:type codelet: :py:class:`.Codelet` """ - # code_hash = mmh3.hash64(codelet.code)[0] + # code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] pass diff --git a/schema.sql b/schema.sql index d49fc6e..21c9c07 100644 --- a/schema.sql +++ b/schema.sql @@ -2,67 +2,93 @@ CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; CREATE TABLE `languages` ( - `language_id` SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `language_id` SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, `language_name` VARCHAR(64) NOT NULL, PRIMARY KEY (`language_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; +) ENGINE=InnoDB; CREATE TABLE `origins` ( - `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, + `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, `origin_name` VARCHAR(64) NOT NULL, `origin_url` VARCHAR(512) NOT NULL, `origin_url_base` VARCHAR(512) NOT NULL, - `origin_image` TINYBLOB DEFAULT NULL, -- TODO: verify size (<64kB) + `origin_image` BLOB DEFAULT NULL, PRIMARY KEY (`origin_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; +) ENGINE=InnoDB; + +CREATE TABLE `code` ( + `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `code_hash` BIGINT NOT NULL, + `code_code` MEDIUMTEXT NOT NULL, -- TODO: full-text search index + PRIMARY KEY (`code_id`), + KEY (`code_hash`) +) ENGINE=InnoDB; CREATE TABLE `codelets` ( - `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, - `codelet_name` VARCHAR(512) NOT NULL, + `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `codelet_name` VARCHAR(300) NOT NULL, -- TODO: full-text search index `codelet_code_id` BIGINT UNSIGNED NOT NULL, - `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, -- TODO: needs index + `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, `codelet_origin` TINYINT UNSIGNED NOT NULL, `codelet_url` VARCHAR(512) NOT NULL, - `codelet_date_created` DATETIME DEFAULT NULL, -- TODO: needs index - `codelet_date_modified` DATETIME DEFAULT NULL, -- TODO: needs index - PRIMARY KEY (`codelet_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; - -CREATE TABLE `code` ( - `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, - `code_hash` BIGINT NOT NULL, -- TODO: needs index - `code_code` MEDIUMTEXT NOT NULL, -- TODO: verify size (16mB?) - PRIMARY KEY (`code_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + `codelet_date_created` DATETIME DEFAULT NULL, + `codelet_date_modified` DATETIME DEFAULT NULL, + PRIMARY KEY (`codelet_id`), + KEY (`codelet_date_created`), + KEY (`codelet_date_modified`), + FOREIGN KEY (`codelet_code_id`) + REFERENCES `code` (`code_id`) + ON DELETE RESTRICT ON UPDATE CASCADE, + FOREIGN KEY (`codelet_lang`) + REFERENCES `languages` (`language_id`) + ON DELETE RESTRICT ON UPDATE CASCADE, + FOREIGN KEY (`codelet_origin`) + REFERENCES `origins` (`origin_id`) + ON DELETE RESTRICT ON UPDATE CASCADE +) ENGINE=InnoDB; CREATE TABLE `authors` ( - `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, - `author_codelet` BIGINT UNSIGNED NOT NULL, -- TODO: foreign index? 
- `author_name` VARCHAR(128) NOT NULL, -- TODO: needs index + `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `author_codelet` BIGINT UNSIGNED NOT NULL, + `author_name` VARCHAR(128) NOT NULL, -- TODO: full-text search index `author_url` VARCHAR(512) DEFAULT NULL, - PRIMARY KEY (`author_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + PRIMARY KEY (`author_id`), + FOREIGN KEY (`author_codelet`) + REFERENCES `codelet` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; CREATE TABLE `symbols` ( - `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT UNIQUE, - `symbol_codelet` BIGINT UNSIGNED NOT NULL, -- TODO: foreign index? - `symbol_type` TINYINT UNSIGNED NOT NULL, -- TODO: multi-column index? - `symbol_name` VARCHAR(512) NOT NULL, -- TODO: needs index + `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `symbol_codelet` BIGINT UNSIGNED NOT NULL, + `symbol_type` TINYINT UNSIGNED NOT NULL, + `symbol_name` VARCHAR(512) NOT NULL, `symbol_row` INT UNSIGNED NOT NULL, `symbol_col` INT UNSIGNED NOT NULL, - PRIMARY KEY (`symbol_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; + PRIMARY KEY (`symbol_id`), + KEY (`symbol_type`, `symbol_name`(32)), + FOREIGN KEY (`symbol_codelet`) + REFERENCES `codelet` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; -CREATE TABLE `query_cache` ( - `qcache_id` INT NOT NULL UNIQUE, - `qcache_query` VARCHAR(512) NOT NULL, - `qcache_results` BLOB NOT NULL, -- TODO: verify; perhaps use some kind of array - `qcache_page` TINYINT UNSIGNED NOT NULL, - `qcache_count_mnt` TINYINT UNSIGNED NOT NULL, - `qcache_count_exp` TINYINT UNSIGNED NOT NULL, - `qcache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- TODO: verify - `qcache_last_used` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, -- TODO: verify +CREATE TABLE `cache` ( + `cache_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, + `cache_hash` BIGINT NOT NULL, + `cache_count_mnt` TINYINT UNSIGNED NOT NULL, + `cache_count_exp` TINYINT UNSIGNED NOT NULL, + `cache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + `cache_last_used` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (`cache_id`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; +) ENGINE=InnoDB; --- TODO: full-text search index table +CREATE TABLE `cache_data` ( + `cdata_cache` INT UNSIGNED NOT NULL, + `cdata_codelet` BIGINT UNSIGNED NOT NULL, + FOREIGN KEY (`cdata_cache`) + REFERENCES `cache` (`cache_id`) + ON DELETE CASCADE ON UPDATE CASCADE, + FOREIGN KEY (`cdata_codelet`) + REFERENCES `codelet` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; From 75b243f6853f224593c6aff1153ea9a74f768ba4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 17 Apr 2014 20:33:14 -0400 Subject: [PATCH 07/42] Remove languages table; add indexed field for codelet rank. --- bitshift/database.py | 2 ++ schema.sql | 11 ++--------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/bitshift/database.py b/bitshift/database.py index b86b05a..02aa38e 100644 --- a/bitshift/database.py +++ b/bitshift/database.py @@ -6,6 +6,8 @@ database backend, which manages the search index. import mmh3 import oursql +# from .languages import ... 
+ class Database(object): """Represents the MySQL database.""" diff --git a/schema.sql b/schema.sql index 21c9c07..a76f8f8 100644 --- a/schema.sql +++ b/schema.sql @@ -1,12 +1,6 @@ CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; -CREATE TABLE `languages` ( - `language_id` SMALLINT UNSIGNED NOT NULL AUTO_INCREMENT, - `language_name` VARCHAR(64) NOT NULL, - PRIMARY KEY (`language_id`) -) ENGINE=InnoDB; - CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, `origin_name` VARCHAR(64) NOT NULL, @@ -31,17 +25,16 @@ CREATE TABLE `codelets` ( `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, `codelet_origin` TINYINT UNSIGNED NOT NULL, `codelet_url` VARCHAR(512) NOT NULL, + `codelet_rank` FLOAT NOT NULL, `codelet_date_created` DATETIME DEFAULT NULL, `codelet_date_modified` DATETIME DEFAULT NULL, PRIMARY KEY (`codelet_id`), + KEY (`codelet_rank`), KEY (`codelet_date_created`), KEY (`codelet_date_modified`), FOREIGN KEY (`codelet_code_id`) REFERENCES `code` (`code_id`) ON DELETE RESTRICT ON UPDATE CASCADE, - FOREIGN KEY (`codelet_lang`) - REFERENCES `languages` (`language_id`) - ON DELETE RESTRICT ON UPDATE CASCADE, FOREIGN KEY (`codelet_origin`) REFERENCES `origins` (`origin_id`) ON DELETE RESTRICT ON UPDATE CASCADE From fb4e0d5916d6e6edcae9e5c6ef6cedb55ed9725f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 18 Apr 2014 02:16:42 -0400 Subject: [PATCH 08/42] FULLTEXT KEYs where appropriate. --- schema.sql | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/schema.sql b/schema.sql index a76f8f8..df77720 100644 --- a/schema.sql +++ b/schema.sql @@ -13,14 +13,15 @@ CREATE TABLE `origins` ( CREATE TABLE `code` ( `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `code_hash` BIGINT NOT NULL, - `code_code` MEDIUMTEXT NOT NULL, -- TODO: full-text search index + `code_code` MEDIUMTEXT NOT NULL, PRIMARY KEY (`code_id`), - KEY (`code_hash`) + KEY (`code_hash`), + FULLTEXT KEY (`codelet_code`) ) ENGINE=InnoDB; CREATE TABLE `codelets` ( `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `codelet_name` VARCHAR(300) NOT NULL, -- TODO: full-text search index + `codelet_name` VARCHAR(300) NOT NULL, `codelet_code_id` BIGINT UNSIGNED NOT NULL, `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, `codelet_origin` TINYINT UNSIGNED NOT NULL, @@ -29,6 +30,7 @@ CREATE TABLE `codelets` ( `codelet_date_created` DATETIME DEFAULT NULL, `codelet_date_modified` DATETIME DEFAULT NULL, PRIMARY KEY (`codelet_id`), + FULLTEXT KEY (`codelet_name`), KEY (`codelet_rank`), KEY (`codelet_date_created`), KEY (`codelet_date_modified`), @@ -43,9 +45,10 @@ CREATE TABLE `codelets` ( CREATE TABLE `authors` ( `author_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `author_codelet` BIGINT UNSIGNED NOT NULL, - `author_name` VARCHAR(128) NOT NULL, -- TODO: full-text search index + `author_name` VARCHAR(128) NOT NULL, `author_url` VARCHAR(512) DEFAULT NULL, PRIMARY KEY (`author_id`), + FULLTEXT KEY (`author_name`), FOREIGN KEY (`author_codelet`) REFERENCES `codelet` (`codelet_id`) ON DELETE CASCADE ON UPDATE CASCADE From ad3de0615fdd0fbf5310dd4354abb6daa162e0dc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 24 Apr 2014 14:38:33 -0400 Subject: [PATCH 09/42] Fix some typos in the schema. 
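
The main typo fixed here is the full-text key on `code`, which pointed at
a nonexistent `codelet_code` column; with it referencing `code_code`, the
index can back searches via MATCH ... AGAINST. A minimal sketch of
exercising it from Python with oursql (the connection details and the
search term are illustrative, and InnoDB full-text indexes assume MySQL
5.6 or newer):

    import oursql

    conn = oursql.connect(db="bitshift", read_default_file=".my.cnf",
                          autoping=True, autoreconnect=True)
    query = """SELECT codelet_name, codelet_url
               FROM codelets
               JOIN code ON codelet_code_id = code_id
               WHERE MATCH(code_code) AGAINST (? IN NATURAL LANGUAGE MODE)"""
    with conn.cursor() as cursor:
        # qmark-style placeholders, as used elsewhere in bitshift
        cursor.execute(query, ("quicksort",))
        results = cursor.fetchall()
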
--- schema.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/schema.sql b/schema.sql index df77720..15979be 100644 --- a/schema.sql +++ b/schema.sql @@ -16,7 +16,7 @@ CREATE TABLE `code` ( `code_code` MEDIUMTEXT NOT NULL, PRIMARY KEY (`code_id`), KEY (`code_hash`), - FULLTEXT KEY (`codelet_code`) + FULLTEXT KEY (`code_code`) ) ENGINE=InnoDB; CREATE TABLE `codelets` ( @@ -50,7 +50,7 @@ CREATE TABLE `authors` ( PRIMARY KEY (`author_id`), FULLTEXT KEY (`author_name`), FOREIGN KEY (`author_codelet`) - REFERENCES `codelet` (`codelet_id`) + REFERENCES `codelets` (`codelet_id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; @@ -64,7 +64,7 @@ CREATE TABLE `symbols` ( PRIMARY KEY (`symbol_id`), KEY (`symbol_type`, `symbol_name`(32)), FOREIGN KEY (`symbol_codelet`) - REFERENCES `codelet` (`codelet_id`) + REFERENCES `codelets` (`codelet_id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; @@ -85,6 +85,6 @@ CREATE TABLE `cache_data` ( REFERENCES `cache` (`cache_id`) ON DELETE CASCADE ON UPDATE CASCADE, FOREIGN KEY (`cdata_codelet`) - REFERENCES `codelet` (`codelet_id`) + REFERENCES `codelets` (`codelet_id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; From 54bca5894f9f0866538292f40593f99e61eeae97 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 27 Apr 2014 00:47:13 -0400 Subject: [PATCH 10/42] Move database stuff to a subpackage; updates. --- bitshift/{database.py => database/__init__.py} | 39 ++++++++++++++++++++++---- schema.sql => bitshift/database/schema.sql | 0 2 files changed, 34 insertions(+), 5 deletions(-) rename bitshift/{database.py => database/__init__.py} (55%) rename schema.sql => bitshift/database/schema.sql (100%) diff --git a/bitshift/database.py b/bitshift/database/__init__.py similarity index 55% rename from bitshift/database.py rename to bitshift/database/__init__.py index 02aa38e..4ed7a02 100644 --- a/bitshift/database.py +++ b/bitshift/database/__init__.py @@ -1,12 +1,16 @@ """ -Module with classes and functions to handle communication with the MySQL +Subpackage with classes and functions to handle communication with the MySQL database backend, which manages the search index. """ +import os + import mmh3 import oursql -# from .languages import ... +# from ..languages import ... + +__all__ = ["Database"] class Database(object): """Represents the MySQL database.""" @@ -16,7 +20,9 @@ class Database(object): def _connect(self): """Establish a connection to the database.""" - self._conn = oursql.connect() + default_file = os.path.join(os.path.dirname(__file__), ".my.cnf") + self._conn = oursql.connect(read_default_file=default_file, + autoping=True, autoreconnect=True) def search(self, query, page=1): """ @@ -48,5 +54,28 @@ class Database(object): :param codelet: The codelet to insert. 
:type codelet: :py:class:`.Codelet` """ - # code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] - pass + query = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" + + cursor.execute(query, ()) + + # codelet_id -- auto_increment used here + codelet_name + codelet_code_id + codelet_lang + codelet_origin + codelet_url + codelet_rank + codelet_date_created + codelet_date_modified + + # codelet fields + codelet.name + codelet.code + codelet.filename + codelet.language + codelet.authors + codelet.code_url + codelet.date_created + codelet.date_modified + + code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] diff --git a/schema.sql b/bitshift/database/schema.sql similarity index 100% rename from schema.sql rename to bitshift/database/schema.sql From 0d0a74f9dfd7fa382f2dcdb02256246e062d0450 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 27 Apr 2014 23:43:32 -0400 Subject: [PATCH 11/42] Some more work on db stuff. --- bitshift/database/__init__.py | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 4ed7a02..9a54ef2 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -20,7 +20,8 @@ class Database(object): def _connect(self): """Establish a connection to the database.""" - default_file = os.path.join(os.path.dirname(__file__), ".my.cnf") + root = os.path.dirname(os.path.abspath(__file__)) + default_file = os.path.join(root, ".my.cnf") self._conn = oursql.connect(read_default_file=default_file, autoping=True, autoreconnect=True) @@ -54,9 +55,18 @@ class Database(object): :param codelet: The codelet to insert. :type codelet: :py:class:`.Codelet` """ - query = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" + frag_size = 16384 # 16 kB + query_slt1 = """SELECT code_id, LEFT(code_code, {0}) + FROM code WHERE code_hash = ?""".format(frag_size) + query_ins1 = "INSERT INTO code VALUES (?, ?)" + query_ins2 = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" + query_ins3 = "INSERT INTO authors VALUES", " (?, ?, ?)" + query_ins4 = "INSERT INTO symbols VALUES", " (?, ?, ?, ?, ?)" - cursor.execute(query, ()) + # LAST_INSERT_ID() + + code_id = None + code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] # codelet_id -- auto_increment used here codelet_name @@ -78,4 +88,14 @@ class Database(object): codelet.date_created codelet.date_modified - code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] + with self._conn.cursor() as cursor: + # Retrieve the ID of the source code if it's already in the DB: + cursor.execute(query_slt1, (code_hash,)) + for c_id, c_code_frag in cursor.fetchall(): + if c_code_frag == codelet.code[:frag_size]: + code_id = c_id + break + + # If the source code isn't already in the DB, add it: + if not code_id: + cursor.execute() From 22d6b625474f535d53adef652bd4d6e3397af04e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 28 Apr 2014 14:05:45 -0400 Subject: [PATCH 12/42] Update schema to v2; database updates. 
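
The new `version` table and the MIGRATIONS list give the schema an
explicit version that client code checks when it connects. A short sketch
of the intended usage, assuming a reachable MySQL instance and the
.my.cnf credentials file the module reads:

    from bitshift.database import Database

    # Normal application code; raises RuntimeError if the `version` table
    # reports a schema older than migration.VERSION:
    db = Database()
    db.close()

    # Apply any pending migrations instead, equivalent to running
    # `python -m bitshift.database.migration` from the project root:
    db = Database(migrate=True)
    db.close()
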
--- bitshift/database/__init__.py | 62 +++++++++++++++++++++++++++--------------- bitshift/database/migration.py | 23 ++++++++++++++++ bitshift/database/schema.sql | 13 +++++++-- 3 files changed, 73 insertions(+), 25 deletions(-) create mode 100644 bitshift/database/migration.py diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 9a54ef2..50486b6 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -8,15 +8,16 @@ import os import mmh3 import oursql -# from ..languages import ... +from .migration import VERSION, MIGRATIONS __all__ = ["Database"] class Database(object): """Represents the MySQL database.""" - def __init__(self): + def __init__(self, migrate=False): self._connect() + self._check_version(migrate) def _connect(self): """Establish a connection to the database.""" @@ -25,6 +26,33 @@ class Database(object): self._conn = oursql.connect(read_default_file=default_file, autoping=True, autoreconnect=True) + def _migrate(self, cursor, current): + """Migrate the database to the latest schema version.""" + for version in xrange(current, VERSION): + for query in MIGRATIONS[version - 1]: + cursor.execute(query) + + def _check_version(self, migrate): + """Check the database schema version and respond accordingly. + + If the schema is out of date, migrate if *migrate* is True, else raise + an exception. + """ + with self._conn.cursor() as cursor: + cursor.execute("SELECT version FROM version") + version = cursor.fetchone()[0] + if version < VERSION: + if migrate: + self._migrate(cursor, version) + else: + err = "Database schema out of date. " \ + "Run `python -m bitshift.database.migration`." + raise RuntimeError(err) + + def close(self): + """Disconnect from the database.""" + self._conn.close() + def search(self, query, page=1): """ Search the database for a query and return the *n*\ th page of results. @@ -55,19 +83,14 @@ class Database(object): :param codelet: The codelet to insert. :type codelet: :py:class:`.Codelet` """ - frag_size = 16384 # 16 kB - query_slt1 = """SELECT code_id, LEFT(code_code, {0}) - FROM code WHERE code_hash = ?""".format(frag_size) - query_ins1 = "INSERT INTO code VALUES (?, ?)" - query_ins2 = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" - query_ins3 = "INSERT INTO authors VALUES", " (?, ?, ?)" - query_ins4 = "INSERT INTO symbols VALUES", " (?, ?, ?, ?, ?)" + query1 = """INSERT INTO code VALUES (?, ?) 
+ ON DUPLICATE KEY UPDATE code_id=code_id""" + query2 = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" + query3 = "INSERT INTO authors VALUES", " (?, ?, ?)" + query4 = "INSERT INTO symbols VALUES", " (?, ?, ?, ?, ?)" # LAST_INSERT_ID() - code_id = None - code_hash = mmh3.hash64(codelet.code.encode("utf8"))[0] - # codelet_id -- auto_increment used here codelet_name codelet_code_id @@ -88,14 +111,9 @@ class Database(object): codelet.date_created codelet.date_modified + ####################################################################### + + code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] + with self._conn.cursor() as cursor: - # Retrieve the ID of the source code if it's already in the DB: - cursor.execute(query_slt1, (code_hash,)) - for c_id, c_code_frag in cursor.fetchall(): - if c_code_frag == codelet.code[:frag_size]: - code_id = c_id - break - - # If the source code isn't already in the DB, add it: - if not code_id: - cursor.execute() + cursor.execute(query1, (code_id, codelet.code)) diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py new file mode 100644 index 0000000..c9fdd39 --- /dev/null +++ b/bitshift/database/migration.py @@ -0,0 +1,23 @@ +""" +Contains information about database schema versions, and SQL queries to update +between them. +""" + +VERSION = 2 + +MIGRATIONS = [ + # 1 -> 2 + [ + # drop index on code_hash + "ALTER TABLE code DROP COLUMN code_hash", + # change code_id to BIGINT NOT NULL, + # add key on codelets to codelet_lang + # add symbol_end_row INT UNSIGNED NOT NULL + # add symbol_end_col INT UNSIGNED NOT NULL + ] +] + +if __name__ == "__main__": + from . import Database + + Database(migrate=True).close() diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 15979be..159f85a 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,6 +1,12 @@ +-- Schema version 2 + CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; +CREATE TABLE `version` ( + `version` INT UNSIGNED NOT NULL +) ENGINE=InnoDB; + CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, `origin_name` VARCHAR(64) NOT NULL, @@ -11,11 +17,9 @@ CREATE TABLE `origins` ( ) ENGINE=InnoDB; CREATE TABLE `code` ( - `code_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `code_hash` BIGINT NOT NULL, + `code_id` BIGINT NOT NULL, `code_code` MEDIUMTEXT NOT NULL, PRIMARY KEY (`code_id`), - KEY (`code_hash`), FULLTEXT KEY (`code_code`) ) ENGINE=InnoDB; @@ -31,6 +35,7 @@ CREATE TABLE `codelets` ( `codelet_date_modified` DATETIME DEFAULT NULL, PRIMARY KEY (`codelet_id`), FULLTEXT KEY (`codelet_name`), + KEY (`codelet_lang`), KEY (`codelet_rank`), KEY (`codelet_date_created`), KEY (`codelet_date_modified`), @@ -61,6 +66,8 @@ CREATE TABLE `symbols` ( `symbol_name` VARCHAR(512) NOT NULL, `symbol_row` INT UNSIGNED NOT NULL, `symbol_col` INT UNSIGNED NOT NULL, + `symbol_end_row` INT UNSIGNED NOT NULL, + `symbol_end_col` INT UNSIGNED NOT NULL, PRIMARY KEY (`symbol_id`), KEY (`symbol_type`, `symbol_name`(32)), FOREIGN KEY (`symbol_codelet`) From a5cc3537cbec154f7e819f76a870812abddb010b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Apr 2014 12:42:11 -0400 Subject: [PATCH 13/42] Credits. --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 0fe39d0..8ca31d7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ bitshift ======== -bitshift is a semantic search engine for source code. 
+bitshift is a semantic search engine for source code developed by Benjamin +Attal, Ben Kurtovic, and Severyn Kozak. Branches -------- From 0b655daaff3cdd41f48b96fe34f786f10deed56a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 29 Apr 2014 13:19:02 -0400 Subject: [PATCH 14/42] Finish migration to v2. --- bitshift/database/__init__.py | 2 ++ bitshift/database/migration.py | 21 +++++++++++++++------ bitshift/database/schema.sql | 3 ++- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 50486b6..14f7575 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -29,8 +29,10 @@ class Database(object): def _migrate(self, cursor, current): """Migrate the database to the latest schema version.""" for version in xrange(current, VERSION): + print "Migrating to %d..." % version + 1 for query in MIGRATIONS[version - 1]: cursor.execute(query) + cursor.execute("UPDATE version SET version = ?", (version + 1,)) def _check_version(self, migrate): """Check the database schema version and respond accordingly. diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index c9fdd39..2ea9666 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -8,12 +8,21 @@ VERSION = 2 MIGRATIONS = [ # 1 -> 2 [ - # drop index on code_hash - "ALTER TABLE code DROP COLUMN code_hash", - # change code_id to BIGINT NOT NULL, - # add key on codelets to codelet_lang - # add symbol_end_row INT UNSIGNED NOT NULL - # add symbol_end_col INT UNSIGNED NOT NULL + """ALTER TABLE `codelets` + DROP FOREIGN KEY `codelets_ibfk_1`""", + """ALTER TABLE `code` + DROP KEY `code_hash`, + DROP COLUMN `code_hash`, + MODIFY COLUMN `code_id` BIGINT NOT NULL""", + """ALTER TABLE `codelets` + MODIFY COLUMN `codelet_code_id` BIGINT NOT NULL, + ADD KEY (`codelet_lang`), + ADD FOREIGN KEY (`codelet_code_id`) + REFERENCES `code` (`code_id`) + ON DELETE RESTRICT ON UPDATE CASCADE""", + """ALTER TABLE `symbols` + ADD COLUMN `symbol_end_row` INT UNSIGNED NOT NULL, + ADD COLUMN `symbol_end_col` INT UNSIGNED NOT NULL""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 159f85a..56a2d85 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -6,6 +6,7 @@ USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; +INSERT INTO `version` VALUES (2); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, @@ -26,7 +27,7 @@ CREATE TABLE `code` ( CREATE TABLE `codelets` ( `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `codelet_name` VARCHAR(300) NOT NULL, - `codelet_code_id` BIGINT UNSIGNED NOT NULL, + `codelet_code_id` BIGINT NOT NULL, `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, `codelet_origin` TINYINT UNSIGNED NOT NULL, `codelet_url` VARCHAR(512) NOT NULL, From 821a6ae4f1a30c2b8b4575c408145f8b34877206 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 30 Apr 2014 14:44:31 -0400 Subject: [PATCH 15/42] DB -> v3 for symbol->code assoc vs. ->codelet (fixes #13) --- bitshift/database/migration.py | 13 +++++++++++-- bitshift/database/schema.sql | 10 +++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index 2ea9666..caf3020 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -3,7 +3,7 @@ Contains information about database schema versions, and SQL queries to update between them. 
""" -VERSION = 2 +VERSION = 3 MIGRATIONS = [ # 1 -> 2 @@ -17,12 +17,21 @@ MIGRATIONS = [ """ALTER TABLE `codelets` MODIFY COLUMN `codelet_code_id` BIGINT NOT NULL, ADD KEY (`codelet_lang`), - ADD FOREIGN KEY (`codelet_code_id`) + ADD CONSTRAINT `codelets_ibfk_1` FOREIGN KEY (`codelet_code_id`) REFERENCES `code` (`code_id`) ON DELETE RESTRICT ON UPDATE CASCADE""", """ALTER TABLE `symbols` ADD COLUMN `symbol_end_row` INT UNSIGNED NOT NULL, ADD COLUMN `symbol_end_col` INT UNSIGNED NOT NULL""" + ], + # 2 -> 3 + [ + """ALTER TABLE `symbols` + DROP FOREIGN KEY `symbols_ibfk_1`, + CHANGE COLUMN `symbol_codelet` `symbol_code` BIGINT NOT NULL, + ADD CONSTRAINT `symbols_ibfk_1` FOREIGN KEY (`symbol_code`) + REFERENCES `code` (`code_id`) + ON DELETE CASCADE ON UPDATE CASCADE""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 56a2d85..99b9e42 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,4 +1,4 @@ --- Schema version 2 +-- Schema version 3 CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; @@ -6,7 +6,7 @@ USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; -INSERT INTO `version` VALUES (2); +INSERT INTO `version` VALUES (3); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, @@ -62,7 +62,7 @@ CREATE TABLE `authors` ( CREATE TABLE `symbols` ( `symbol_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, - `symbol_codelet` BIGINT UNSIGNED NOT NULL, + `symbol_code` BIGINT NOT NULL, `symbol_type` TINYINT UNSIGNED NOT NULL, `symbol_name` VARCHAR(512) NOT NULL, `symbol_row` INT UNSIGNED NOT NULL, @@ -71,8 +71,8 @@ CREATE TABLE `symbols` ( `symbol_end_col` INT UNSIGNED NOT NULL, PRIMARY KEY (`symbol_id`), KEY (`symbol_type`, `symbol_name`(32)), - FOREIGN KEY (`symbol_codelet`) - REFERENCES `codelets` (`codelet_id`) + FOREIGN KEY (`symbol_code`) + REFERENCES `code` (`code_id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; From e3a838220c7394e0985e627a4d7c090ba09e6bb2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 30 Apr 2014 14:44:45 -0400 Subject: [PATCH 16/42] Flesh out most of Database.insert(). --- bitshift/database/__init__.py | 44 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 14f7575..03a5c2c 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -87,35 +87,25 @@ class Database(object): """ query1 = """INSERT INTO code VALUES (?, ?) 
ON DUPLICATE KEY UPDATE code_id=code_id""" - query2 = "INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)" - query3 = "INSERT INTO authors VALUES", " (?, ?, ?)" - query4 = "INSERT INTO symbols VALUES", " (?, ?, ?, ?, ?)" - - # LAST_INSERT_ID() - - # codelet_id -- auto_increment used here - codelet_name - codelet_code_id - codelet_lang - codelet_origin - codelet_url - codelet_rank - codelet_date_created - codelet_date_modified - - # codelet fields - codelet.name - codelet.code - codelet.filename - codelet.language - codelet.authors - codelet.code_url - codelet.date_created - codelet.date_modified - - ####################################################################### + query2 = """INSERT INTO codelets VALUES + (?, ?, ?, ?, ?, ?, ?, ?)""" + query3 = "SELECT LAST_INSERT_ID()" + query4 = "INSERT INTO authors VALUES (?, ?, ?)" + query5 = "INSERT INTO symbols VALUES (?, ?, ?, ?, ?, ?, ?)" code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] + origin, url = decompose(codelet.url) ## TODO: create decompose() function with self._conn.cursor() as cursor: cursor.execute(query1, (code_id, codelet.code)) + cursor.execute(query2, (codelet.name, code_id, codelet.language, + origin, url, codelet.rank, + codelet.date_created, + codelet.date_modified)) + cursor.execute(query3) + codelet_id = cursor.fetchone()[0] + authors = [(codelet_id, a.name, a.url) for a in codelet.authors] ## TODO: check author fields (is it a tuple?) + cursor.executemany(query4, authors) + if code_id is new: ## TODO: check for this properly + symbols = [(code_id, sym.type, sym.name, sym.row, sym.col, sym.end_row, sym.end_col) for sym in codelet.symbols] # TODO: check symbol fields (dict?) + cursor.executemany(query5, symbols) From 97b0644bf01932ba32863999226ae1ade7cd8fee Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 2 May 2014 14:40:00 -0400 Subject: [PATCH 17/42] Database to v4: split off symbol_locations table. --- bitshift/database/migration.py | 23 ++++++++++++++++++++++- bitshift/database/schema.sql | 22 ++++++++++++++++------ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index caf3020..e0ec762 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -3,7 +3,7 @@ Contains information about database schema versions, and SQL queries to update between them. 
""" -VERSION = 3 +VERSION = 4 MIGRATIONS = [ # 1 -> 2 @@ -32,6 +32,27 @@ MIGRATIONS = [ ADD CONSTRAINT `symbols_ibfk_1` FOREIGN KEY (`symbol_code`) REFERENCES `code` (`code_id`) ON DELETE CASCADE ON UPDATE CASCADE""" + ], + # 3 -> 4 + [ + """ALTER TABLE `symbols` + DROP COLUMN `symbol_row`, + DROP COLUMN `symbol_col`, + DROP COLUMN `symbol_end_row`, + DROP COLUMN `symbol_end_col`""", + """CREATE TABLE `symbol_locations` ( + `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `sloc_symbol` BIGINT UNSIGNED NOT NULL, + `sloc_type` TINYINT UNSIGNED NOT NULL, + `sloc_row` INT UNSIGNED NOT NULL, + `sloc_col` INT UNSIGNED NOT NULL, + `sloc_end_row` INT UNSIGNED NOT NULL, + `sloc_end_col` INT UNSIGNED NOT NULL, + PRIMARY KEY (`sloc_id`), + FOREIGN KEY (`sloc_symbol`) + REFERENCES `symbols` (`symbol_id`) + ON DELETE CASCADE ON UPDATE CASCADE + ) ENGINE=InnoDB""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 99b9e42..79dad45 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,4 +1,4 @@ --- Schema version 3 +-- Schema version 4 CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; @@ -6,7 +6,7 @@ USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; -INSERT INTO `version` VALUES (3); +INSERT INTO `version` VALUES (4); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, @@ -65,10 +65,6 @@ CREATE TABLE `symbols` ( `symbol_code` BIGINT NOT NULL, `symbol_type` TINYINT UNSIGNED NOT NULL, `symbol_name` VARCHAR(512) NOT NULL, - `symbol_row` INT UNSIGNED NOT NULL, - `symbol_col` INT UNSIGNED NOT NULL, - `symbol_end_row` INT UNSIGNED NOT NULL, - `symbol_end_col` INT UNSIGNED NOT NULL, PRIMARY KEY (`symbol_id`), KEY (`symbol_type`, `symbol_name`(32)), FOREIGN KEY (`symbol_code`) @@ -76,6 +72,20 @@ CREATE TABLE `symbols` ( ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; +CREATE TABLE `symbol_locations` ( + `sloc_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, + `sloc_symbol` BIGINT UNSIGNED NOT NULL, + `sloc_type` TINYINT UNSIGNED NOT NULL, + `sloc_row` INT UNSIGNED NOT NULL, + `sloc_col` INT UNSIGNED NOT NULL, + `sloc_end_row` INT UNSIGNED NOT NULL, + `sloc_end_col` INT UNSIGNED NOT NULL, + PRIMARY KEY (`sloc_id`), + FOREIGN KEY (`sloc_symbol`) + REFERENCES `symbols` (`symbol_id`) + ON DELETE CASCADE ON UPDATE CASCADE +) ENGINE=InnoDB; + CREATE TABLE `cache` ( `cache_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, `cache_hash` BIGINT NOT NULL, From d2aef2829e5edf11c2e392ce14436c5e452af42f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 2 May 2014 14:40:52 -0400 Subject: [PATCH 18/42] Finish database insertion, except for origins. --- bitshift/database/__init__.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 03a5c2c..bc4b451 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -51,6 +51,23 @@ class Database(object): "Run `python -m bitshift.database.migration`." 
raise RuntimeError(err) + def _decompose_url(self, url): + """Break up a URL into an origin (with a URL base) and a suffix.""" + pass ## TODO + + def _insert_symbols(self, cursor, code_id, sym_type, symbols): + """Insert a list of symbols of a given type into the database.""" + sym_types = ["functions", "classes", "variables"] + query1 = "INSERT INTO symbols VALUES (?, ?, ?)" + query2 = "INSERT INTO symbol_locations VALUES (?, ?, ?, ?, ?, ?)" + + for (name, decls, uses) in symbols: + cursor.execute(query1, (code_id, sym_types.index(sym_type), name)) + sym_id = cursor.lastrowid + params = ([tuple([sym_id, 0] + list(loc)) for loc in decls] + + [tuple([sym_id, 1] + list(loc)) for loc in uses]) + cursor.executemany(query2, params) + def close(self): """Disconnect from the database.""" self._conn.close() @@ -89,23 +106,21 @@ class Database(object): ON DUPLICATE KEY UPDATE code_id=code_id""" query2 = """INSERT INTO codelets VALUES (?, ?, ?, ?, ?, ?, ?, ?)""" - query3 = "SELECT LAST_INSERT_ID()" - query4 = "INSERT INTO authors VALUES (?, ?, ?)" - query5 = "INSERT INTO symbols VALUES (?, ?, ?, ?, ?, ?, ?)" + query3 = "INSERT INTO authors VALUES (?, ?, ?)" code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] - origin, url = decompose(codelet.url) ## TODO: create decompose() function + origin, url = self._decompose_url(codelet.url) with self._conn.cursor() as cursor: cursor.execute(query1, (code_id, codelet.code)) + new_code = cursor.rowcount == 1 cursor.execute(query2, (codelet.name, code_id, codelet.language, origin, url, codelet.rank, codelet.date_created, codelet.date_modified)) - cursor.execute(query3) - codelet_id = cursor.fetchone()[0] - authors = [(codelet_id, a.name, a.url) for a in codelet.authors] ## TODO: check author fields (is it a tuple?) - cursor.executemany(query4, authors) - if code_id is new: ## TODO: check for this properly - symbols = [(code_id, sym.type, sym.name, sym.row, sym.col, sym.end_row, sym.end_col) for sym in codelet.symbols] # TODO: check symbol fields (dict?) - cursor.executemany(query5, symbols) + codelet_id = cursor.lastrowid + authors = [(codelet_id, a[0], a[1]) for a in codelet.authors] + cursor.executemany(query3, authors) + if new_code: + for sym_type, symbols in codelet.symbols.iteritems(): + self._insert_symbols(cursor, code_id, sym_type, symbols) From d6ccdbd16d1db369801ebd7a12ba1bf90df5225a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 2 May 2014 22:43:16 -0400 Subject: [PATCH 19/42] Fix a couble Database bugs. 
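
The two bugs: _connect() now selects the `bitshift` database explicitly
and returns the connection object that __init__ stores, and the INSERT
statements pass DEFAULT for AUTO_INCREMENT keys so the placeholder count
matches the remaining columns. A small illustration of the second fix
(the bound values are made up):

    # Binding three values against the four-column authors table fails;
    # letting MySQL fill the AUTO_INCREMENT primary key fixes the arity:
    cursor.execute("INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)",
                   (codelet_id, "Jane Doe", "http://example.com/jdoe"))
    author_id = cursor.lastrowid  # id assigned in place of DEFAULT
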
--- bitshift/database/__init__.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index bc4b451..1a2b373 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -16,15 +16,15 @@ class Database(object): """Represents the MySQL database.""" def __init__(self, migrate=False): - self._connect() + self._conn = self._connect() self._check_version(migrate) def _connect(self): """Establish a connection to the database.""" root = os.path.dirname(os.path.abspath(__file__)) default_file = os.path.join(root, ".my.cnf") - self._conn = oursql.connect(read_default_file=default_file, - autoping=True, autoreconnect=True) + return oursql.connect(db="bitshift", read_default_file=default_file, + autoping=True, autoreconnect=True) def _migrate(self, cursor, current): """Migrate the database to the latest schema version.""" @@ -58,8 +58,9 @@ class Database(object): def _insert_symbols(self, cursor, code_id, sym_type, symbols): """Insert a list of symbols of a given type into the database.""" sym_types = ["functions", "classes", "variables"] - query1 = "INSERT INTO symbols VALUES (?, ?, ?)" - query2 = "INSERT INTO symbol_locations VALUES (?, ?, ?, ?, ?, ?)" + query1 = "INSERT INTO symbols VALUES (DEFAULT, ?, ?, ?)" + query2 = """INSERT INTO symbol_locations VALUES + (DEFAULT, ?, ?, ?, ?, ?, ?)""" for (name, decls, uses) in symbols: cursor.execute(query1, (code_id, sym_types.index(sym_type), name)) @@ -105,8 +106,8 @@ class Database(object): query1 = """INSERT INTO code VALUES (?, ?) ON DUPLICATE KEY UPDATE code_id=code_id""" query2 = """INSERT INTO codelets VALUES - (?, ?, ?, ?, ?, ?, ?, ?)""" - query3 = "INSERT INTO authors VALUES (?, ?, ?)" + (DEFAULT, ?, ?, ?, ?, ?, ?, ?, ?)""" + query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)" code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] origin, url = self._decompose_url(codelet.url) From 950b6994f0abb83192065cedaeeef07bd1b5dd99 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 3 May 2014 17:50:16 -0400 Subject: [PATCH 20/42] Database to v5; finish Database.insert(). --- bitshift/database/__init__.py | 23 ++++++++++++++--------- bitshift/database/migration.py | 9 ++++++++- bitshift/database/schema.sql | 11 ++++++----- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 1a2b373..9b039ca 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -51,9 +51,15 @@ class Database(object): "Run `python -m bitshift.database.migration`." raise RuntimeError(err) - def _decompose_url(self, url): + def _decompose_url(self, cursor, url): """Break up a URL into an origin (with a URL base) and a suffix.""" - pass ## TODO + query = """SELECT origin_id, SUBSTR(?, LENGTH(origin_url_base)) + FROM origins WHERE origin_url_base IS NOT NULL + AND ? 
LIKE CONCAT(origin_url_base, "%")""" + + cursor.execute(query, (url, url)) + result = cursor.fetchone() + return result if result else (1, url) def _insert_symbols(self, cursor, code_id, sym_type, symbols): """Insert a list of symbols of a given type into the database.""" @@ -109,12 +115,14 @@ class Database(object): (DEFAULT, ?, ?, ?, ?, ?, ?, ?, ?)""" query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)" - code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] - origin, url = self._decompose_url(codelet.url) - with self._conn.cursor() as cursor: + code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] + origin, url = self._decompose_url(cursor, codelet.url) + cursor.execute(query1, (code_id, codelet.code)) - new_code = cursor.rowcount == 1 + if cursor.rowcount == 1: + for sym_type, symbols in codelet.symbols.iteritems(): + self._insert_symbols(cursor, code_id, sym_type, symbols) cursor.execute(query2, (codelet.name, code_id, codelet.language, origin, url, codelet.rank, codelet.date_created, @@ -122,6 +130,3 @@ class Database(object): codelet_id = cursor.lastrowid authors = [(codelet_id, a[0], a[1]) for a in codelet.authors] cursor.executemany(query3, authors) - if new_code: - for sym_type, symbols in codelet.symbols.iteritems(): - self._insert_symbols(cursor, code_id, sym_type, symbols) diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index e0ec762..743f906 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -3,7 +3,7 @@ Contains information about database schema versions, and SQL queries to update between them. """ -VERSION = 4 +VERSION = 5 MIGRATIONS = [ # 1 -> 2 @@ -53,6 +53,13 @@ MIGRATIONS = [ REFERENCES `symbols` (`symbol_id`) ON DELETE CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB""" + ], + # 4 -> 5 + [ + """ALTER TABLE `origins` + MODIFY COLUMN `origin_name` VARCHAR(64) DEFAULT NULL, + MODIFY COLUMN `origin_url` VARCHAR(512) DEFAULT NULL, + MODIFY COLUMN `origin_url_base` VARCHAR(512) DEFAULT NULL""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 79dad45..50b4f9e 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,4 +1,4 @@ --- Schema version 4 +-- Schema version 5 CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; @@ -6,16 +6,17 @@ USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; -INSERT INTO `version` VALUES (4); +INSERT INTO `version` VALUES (5); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, - `origin_name` VARCHAR(64) NOT NULL, - `origin_url` VARCHAR(512) NOT NULL, - `origin_url_base` VARCHAR(512) NOT NULL, + `origin_name` VARCHAR(64) DEFAULT NULL, + `origin_url` VARCHAR(512) DEFAULT NULL, + `origin_url_base` VARCHAR(512) DEFAULT NULL, `origin_image` BLOB DEFAULT NULL, PRIMARY KEY (`origin_id`) ) ENGINE=InnoDB; +INSERT INTO `origins` VALUES (1, NULL, NULL, NULL, NULL); CREATE TABLE `code` ( `code_id` BIGINT NOT NULL, From ef73c043479f8cf899757981c6e4248665c8bae8 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sun, 13 Apr 2014 21:57:22 -0400 Subject: [PATCH 21/42] Add prototype repo-indexer script author_files.py. Add: author_files.py -add prototype script to output metadata about every file in a Git repository: filename, author names, dates of creation and modification. -lacking Sphinx documentation. 
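The output format is easiest to see from the consumer's side; a rough sketch
(not part of the script itself) of splitting one of its null-delimited lines
back into fields, using the sample values from the script's own docstring:

    line = "\x00".join(["socket_io.c", "John Doe", "Jane Doe",
                        "1384488690", "1384534626"])

    fields = line.split("\x00")
    filename = fields[0]                  # "socket_io.c"
    authors = fields[1:-2]                # ["John Doe", "Jane Doe"]
    time_created = int(fields[-2])        # 1384488690
    time_last_modified = int(fields[-1])  # 1384534626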
--- bitshift/author_files.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 bitshift/author_files.py diff --git a/bitshift/author_files.py b/bitshift/author_files.py new file mode 100644 index 0000000..ed9f2c8 --- /dev/null +++ b/bitshift/author_files.py @@ -0,0 +1,53 @@ +""" +Output author/date information about the latest files in a Git repository. + +When executed inside a Git archive, prints a single line of metadata for every +file in the work tree. A given line contains the file's filename, authors, +and Unix timestamps for the file's time of creation and last modification; the +separate entries are null-delimited. + +Sample output: + socket_io.c\x00John Doe Jane Doe\x001384488690\x001384534626 + # filename: socket_io.c + # Author Names: +""" + +import fileinput, subprocess + +git_log = subprocess.check_output("git --no-pager log --name-only \ + --pretty=format:'%n%n%an%n%at' -z", shell=True) + +commits = [] +for commit in git_log.split("\n\n"): + fields = commit.split("\n") + if len(fields) > 2: + commits.append({ + "author" : fields[0], + "timestamp" : int(fields[1]), + "filenames" : fields[2].split("\0")[:-2] + }) + + +tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if -f && \ + T }' $(find . -type d -name .git -prune -o -print)", shell=True) +tracked_files = [filename[2:] for filename in tracked_files.split("\n")[:-1]] + +file_authors = {} +for commit in commits: + for filename in commit["filenames"]: + if filename in tracked_files: + if filename not in file_authors.keys(): + file_authors[filename] = { + "authors" : [commit["author"]], + "timestamps" : [commit["timestamp"]] + } + else: + if commit["author"] not in file_authors[filename]["authors"]: + file_authors[filename]["authors"].append(commit["author"]) + file_authors[filename]["timestamps"].append(commit["timestamp"]) + +for filename in file_authors.keys(): + authors = "\0".join(file_authors[filename]["authors"]) + time_created = min(file_authors[filename]["timestamps"]) + time_last_modified = max(file_authors[filename]["timestamps"]) + print "%s\0%s\0%d\0%d" % (filename, authors, time_created, time_last_modified) From ef9c0609fed4c432f475a3bdd89b4b1ab062a3e3 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Mon, 14 Apr 2014 13:02:59 -0400 Subject: [PATCH 22/42] Mov author_files > git_inder, heavily refactor. Add: bitshift/crawler/crawler.py -add base crawler module -add github(), to index Github. Mod: bitshift/crawler/ -add package subdirectory for the crawler module, and any subsidiary modules (eg, git_indexer). bitshift/author_files.py > bitshift/crawler/git_indexer.py -rename the module to "git_indexer", to better reflect its use. -convert from stand-alone script to a module whose functions integrate cleanly with the rest of the application. -add all necessary, tested functions, with Sphinx documentation. --- bitshift/author_files.py | 53 ---------------- bitshift/crawler/crawler.py | 37 +++++++++++ bitshift/crawler/git_indexer.py | 134 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 53 deletions(-) delete mode 100644 bitshift/author_files.py create mode 100644 bitshift/crawler/crawler.py create mode 100644 bitshift/crawler/git_indexer.py diff --git a/bitshift/author_files.py b/bitshift/author_files.py deleted file mode 100644 index ed9f2c8..0000000 --- a/bitshift/author_files.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Output author/date information about the latest files in a Git repository. 
- -When executed inside a Git archive, prints a single line of metadata for every -file in the work tree. A given line contains the file's filename, authors, -and Unix timestamps for the file's time of creation and last modification; the -separate entries are null-delimited. - -Sample output: - socket_io.c\x00John Doe Jane Doe\x001384488690\x001384534626 - # filename: socket_io.c - # Author Names: -""" - -import fileinput, subprocess - -git_log = subprocess.check_output("git --no-pager log --name-only \ - --pretty=format:'%n%n%an%n%at' -z", shell=True) - -commits = [] -for commit in git_log.split("\n\n"): - fields = commit.split("\n") - if len(fields) > 2: - commits.append({ - "author" : fields[0], - "timestamp" : int(fields[1]), - "filenames" : fields[2].split("\0")[:-2] - }) - - -tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if -f && \ - T }' $(find . -type d -name .git -prune -o -print)", shell=True) -tracked_files = [filename[2:] for filename in tracked_files.split("\n")[:-1]] - -file_authors = {} -for commit in commits: - for filename in commit["filenames"]: - if filename in tracked_files: - if filename not in file_authors.keys(): - file_authors[filename] = { - "authors" : [commit["author"]], - "timestamps" : [commit["timestamp"]] - } - else: - if commit["author"] not in file_authors[filename]["authors"]: - file_authors[filename]["authors"].append(commit["author"]) - file_authors[filename]["timestamps"].append(commit["timestamp"]) - -for filename in file_authors.keys(): - authors = "\0".join(file_authors[filename]["authors"]) - time_created = min(file_authors[filename]["timestamps"]) - time_last_modified = max(file_authors[filename]["timestamps"]) - print "%s\0%s\0%d\0%d" % (filename, authors, time_created, time_last_modified) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py new file mode 100644 index 0000000..46cd54e --- /dev/null +++ b/bitshift/crawler/crawler.py @@ -0,0 +1,37 @@ +""" + +""" + +import requests, time + +import git_indexer + +# from .codelet import Codelet +# from .database import Database + +def github(): + """ + Query the GitHub API for data about every public repository. + + Pull all of GitHub's repositories by making calls to its API in a loop, + accessing a subsequent page of results via the "next" URL returned in an + API response header. Uses Severyn Kozak's (sevko) authentication + credentials. + """ + + next_api_url = "https://api.github.com/repositories" + authentication_params = { + "client_id" : "436cb884ae09be7f2a4e", + "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" + } + + while len(next_api_url) > 0: + response = requests.get(next_api_url, params=authentication_params) + + for repo in response.json(): + codelets = git_indexer.index_repository(repo["html_url"]) + + if int(response.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) + + next_api_url = requests.headers["link"].split(">")[0][1:] diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py new file mode 100644 index 0000000..a98c600 --- /dev/null +++ b/bitshift/crawler/git_indexer.py @@ -0,0 +1,134 @@ +""" +:synopsis: Index all the files in a Git repository. + +Clone a Git repository, and retrieve the following information about each file: +filename, contributor names, dates of creation and last modification, and the +file text. 
+""" + +import fileinput, subprocess, os + +def index_repository(repo_url): + """ + Generate metadata for every file in a Git repository. + + `git clone` the Git repository located at **repo_url**, and return metadata + about every one of non-binary (text) files in its if main branch (usually + *master*). + + :return: An array of metadata dictionaries. + .. code-block:: python + sample_returned_array = [ + { + "filename" : (str) "myfile" + "time_created" : (int) 1395939566, + "time_last_modified" : (int) 1396920409, + "source" : (str) "The source code of the file." + } + ] + """ + + repo_name = repo_url.split("/")[-1] + subprocess.call("git clone %s" % repo_url, shell=True) + os.chdir(repo_name) + + files_meta = [] + commits_meta = _get_commits_metadata() + for filename in commits_meta.keys(): + commits_meta[filename]["filename"] = filename + with open(filename, "r") as source_file: + commits_meta[filename]["source"] = source_file.read() + files_meta.append(commits_meta[filename]) + + os.chdir("..") + subprocess.call("rm -rf %s" % repo_name, shell=True) + return files_meta + +def _get_git_commits(): + """ + Return the current working directory's formatted commit data. + + Uses `git log` to generate metadata about every single file in the + repository's commit history. + + :return: The author, timestamp, and names of all modified files of every + commit. + .. code-block:: python + sample_returned_array = [ + { + "author" : (str) "author" + "timestamp" : (int) 1396919293, + "filename" : (str array) ["file1", "file2"] + } + ] + :rtype: dictionary + """ + + git_log = subprocess.check_output("git --no-pager log --name-only \ + --pretty=format:'%n%n%an%n%at' -z", shell=True) + + commits = [] + for commit in git_log.split("\n\n"): + fields = commit.split("\n") + if len(fields) > 2: + commits.append({ + "author" : fields[0], + "timestamp" : int(fields[1]), + "filenames" : fields[2].split("\0")[:-2] + }) + + return commits + +def _get_tracked_files(): + """ + Return a list of the filenames of all files in the Git repository. + + Get a list of the filenames of the non-binary (Perl heuristics used for + filetype identification) files currently inside the current working + directory's Git repository. + + :return: The filenames of all non-binary files. + :rtype: str array + """ + + tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if \ + -f && -T }' $(find . -type d -name .git -prune -o -print)", shell=True) + return [filename[2:] for filename in tracked_files.split("\n")[:-1]] + +def _get_commits_metadata(): + """ + Return a dictionary containing every tracked file's metadata. + + :return: A dictionary with author names, time of creation, and time of last + modification for every filename key. + .. 
code-block:: python + sample_returned_dict = { + "my_file" : { + "authors" : (str array) ["author1", "author2"], + "time_created" : (int) 1395939566, + "time_last_modified" : (int) 1396920409 + } + } + :rtype: dictionary + """ + + commits = _get_git_commits() + tracked_files = _get_tracked_files() + + files_meta = {} + for commit in commits: + for filename in commit["filenames"]: + if filename not in tracked_files: + continue + + if filename not in files_meta.keys(): + files_meta[filename] = { + "authors" : [commit["author"]], + "time_last_modified" : commit["timestamp"] + } + else: + if commit["author"] not in files_meta[filename]["authors"]: + files_meta[filename]["authors"].append(commit["author"]) + files_meta[filename]["time_created"] = commit["timestamp"] + + return files_meta From 77b448c3deaf980f1cddcee8986cf0c417a62a2c Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Mon, 14 Apr 2014 18:41:00 -0400 Subject: [PATCH 23/42] Mod Codelet, mov codelet creation from crawler. Add: bitshift/crawler/(crawler, git_indexer).py -move Codelet creation from the crawler to the git_indexer, in preparation for making crawling/indexing independent, threaded processes. Mod: bitshift/codelet.py -modify documentation for the author instance variable. --- bitshift/codelet.py | 18 ++++++----- bitshift/crawler/crawler.py | 8 +++-- bitshift/crawler/git_indexer.py | 66 +++++++++++++++++++++++++---------------- 3 files changed, 55 insertions(+), 37 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 08b0d36..87025e0 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -7,39 +7,41 @@ class Codelet(object): :ivar code: (str) A containing the raw source code. :ivar filename: (str, or None) The filename of the snippet. :ivar language: (str, or None) The inferred language of `code`. - :ivar author: (str, or None) The name of the code's author. - :ivar url: (str) The url of the (page containing the) source code. + :ivar authors: (array of str tuple) An array of tuples containing an + author's name and profile URL (on the service the code was pulled from). + :ivar code_url: (str) The url of the (page containing the) source code. :ivar date_created: (str, or None) The date the code was published. :ivar date_modified: (str, or None) The date the code was last modified. """ - def __init__(self, code, filename, author, language, code_url, author_url, + def __init__(self, name, code, filename, language, authors, code_url, date_created, date_modified): """ Create a Codelet instance. :param code: The raw source code. :param filename: The filename of the code, if any. - :param author: The author of the code. :param language: The inferred language. + :param authors: An array of tuples containing an author's name and + profile URL (on the service the code was pulled from). :param code_url: The url of the (page containing the) source code. :param date_created: The date the code was published. :param date_modified: The date the code was last modified. 
:type code: str :type filename: str, or None + :type authors: array of str tuples, or None :type language: str, or None - :type author: str, or None - :type url: str + :type code_url: str + :type author_urls: str array, or none :type date_created: str, or None :type date_modified: str, or None """ self.code = code self.filename = filename - self.author = author self.language = language + self.authors = authors self.code_url = code_url - self.author_url = author_url self.date_created = date_created self.date_modified = date_modified diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 46cd54e..1ca65d1 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -1,13 +1,15 @@ """ +:synopsis: Main crawler module, to oversee all site-specific crawlers. +...more info soon... """ import requests, time import git_indexer -# from .codelet import Codelet -# from .database import Database +from .codelet import Codelet +from .database import Database def github(): """ @@ -29,7 +31,7 @@ def github(): response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - codelets = git_indexer.index_repository(repo["html_url"]) + index_repository(repo["html_url"], framework) if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index a98c600..0c7ce75 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -1,48 +1,61 @@ """ :synopsis: Index all the files in a Git repository. -Clone a Git repository, and retrieve the following information about each file: -filename, contributor names, dates of creation and last modification, and the -file text. +...more info soon... """ import fileinput, subprocess, os -def index_repository(repo_url): +from .database import Database + +def index_repository(repo_url, framework_name): """ - Generate metadata for every file in a Git repository. + Insert a Codelet for every file in a Git repository. - `git clone` the Git repository located at **repo_url**, and return metadata - about every one of non-binary (text) files in its if main branch (usually + `git clone` the Git repository located at **repo_url**, and create a Codelet + for every one of non-binary (text) files in its if main branch (usually *master*). - - :return: An array of metadata dictionaries. - .. code-block:: python - sample_returned_array = [ - { - "filename" : (str) "myfile" - "time_created" : (int) 1395939566, - "time_last_modified" : (int) 1396920409, - "source" : (str) "The source code of the file." 
- } - ] """ repo_name = repo_url.split("/")[-1] subprocess.call("git clone %s" % repo_url, shell=True) os.chdir(repo_name) - files_meta = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): - commits_meta[filename]["filename"] = filename with open(filename, "r") as source_file: - commits_meta[filename]["source"] = source_file.read() - files_meta.append(commits_meta[filename]) + source = source_file.read() + + authors = [(author,) for author in commits_meta["authors"]] + codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, + None, authors, _generate_file_url(filename, repo_url), + framework_name, commits_meta["time_created"], + commits_meta["time_last_modified"]) + Database.insert(codelet) os.chdir("..") subprocess.call("rm -rf %s" % repo_name, shell=True) - return files_meta + +def _generate_file_url(filename, repo_url, framework_name): + """ + Return a url for a filename from a Git wrapper framework. + + :param filename: The path of the file. + :param repo_url: The url of the file's parent repository. + :param framework_name: The name of the framework the repository is from. + + :type filename: str + :type repo_url: str + :type framework_name: str + + :return: The file's full url on the given framework. + :rtype: str + """ + + if framework_name == "github": + default branch = subprocess.check_output("git branch --no-color", \ + shell=True)[2:-1] + return "%s/blob/%s/%s" % (repo_url, default_branch, filename) def _get_git_commits(): """ @@ -58,14 +71,15 @@ def _get_git_commits(): { "author" : (str) "author" "timestamp" : (int) 1396919293, - "filename" : (str array) ["file1", "file2"] + "filenames" : (str array) ["file1", "file2"] } ] :rtype: dictionary """ - git_log = subprocess.check_output("git --no-pager log --name-only \ - --pretty=format:'%n%n%an%n%at' -z", shell=True) + git_log_cmd = ("git --no-pager --no-color log --name-only " + "--pretty=format:'%n%n%an%n%at' -z") + git_log = subprocess.check_output(git_log_cmd, shell=True) commits = [] for commit in git_log.split("\n\n"): From 9fc4598001264b58245b5c78ef21b792d7e3385c Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Mon, 14 Apr 2014 21:21:58 -0400 Subject: [PATCH 24/42] Clean up crawler/, fix minor bugs. Add: bitshift/codelet.py -add name field to Codelet. bitshift/crawler/crawler.py -fix previously defunct code (which was committed at a point of incompletion) -- incorrect dictionary keys, etc.. -reformat some function calls' argument alignment to fit PEP standards. bitshift/crawler.py -add sleep() to ensure that an API query is made at regular intervals (determined by the GitHub API limit). --- bitshift/__init__.py | 2 +- bitshift/codelet.py | 9 ++++++--- bitshift/crawler/__init__.py | 6 ++++++ bitshift/crawler/crawler.py | 14 ++++++++++---- bitshift/crawler/git_indexer.py | 36 ++++++++++++++++++++++-------------- 5 files changed, 45 insertions(+), 22 deletions(-) create mode 100644 bitshift/crawler/__init__.py diff --git a/bitshift/__init__.py b/bitshift/__init__.py index 9a18c9b..78ca5e9 100644 --- a/bitshift/__init__.py +++ b/bitshift/__init__.py @@ -1 +1 @@ -from . import assets, codelet, config, database, parser, query +from . import assets, codelet, config, database, parser, query, crawler diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 87025e0..9568a4d 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -4,10 +4,11 @@ class Codelet(object): """ A source-code object with code metadata and composition analysis. 
+ :ivar name: (str) A suitable name for the codelet. :ivar code: (str) A containing the raw source code. :ivar filename: (str, or None) The filename of the snippet. :ivar language: (str, or None) The inferred language of `code`. - :ivar authors: (array of str tuple) An array of tuples containing an + :ivar authors: (array of str tuples) An array of tuples containing an author's name and profile URL (on the service the code was pulled from). :ivar code_url: (str) The url of the (page containing the) source code. :ivar date_created: (str, or None) The date the code was published. @@ -19,6 +20,7 @@ class Codelet(object): """ Create a Codelet instance. + :param name: The name of the codelet. :param code: The raw source code. :param filename: The filename of the code, if any. :param language: The inferred language. @@ -28,16 +30,17 @@ class Codelet(object): :param date_created: The date the code was published. :param date_modified: The date the code was last modified. + :type name: str :type code: str :type filename: str, or None - :type authors: array of str tuples, or None :type language: str, or None + :type authors: array of str tuples, or None :type code_url: str - :type author_urls: str array, or none :type date_created: str, or None :type date_modified: str, or None """ + self.name = name self.code = code self.filename = filename self.language = language diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py new file mode 100644 index 0000000..a518970 --- /dev/null +++ b/bitshift/crawler/__init__.py @@ -0,0 +1,6 @@ +import crawler + +__all__ = ["crawl"] + +def crawl(): + pass diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 1ca65d1..34f2819 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -8,8 +8,8 @@ import requests, time import git_indexer -from .codelet import Codelet -from .database import Database +from ..codelet import Codelet +from ..database import Database def github(): """ @@ -26,14 +26,20 @@ def github(): "client_id" : "436cb884ae09be7f2a4e", "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" } + api_request_interval = 5e3 / 60 ** 2 while len(next_api_url) > 0: + start_time = time.time() response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - index_repository(repo["html_url"], framework) + print repo["id"] if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) - next_api_url = requests.headers["link"].split(">")[0][1:] + next_api_url = response.headers["link"].split(">")[0][1:] + + sleep_time = api_request_interval - (time.time() - start_time) + if sleep_time > 0: + time.sleep(sleep_time) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index 0c7ce75..cc9082c 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -6,7 +6,8 @@ import fileinput, subprocess, os -from .database import Database +from ..database import Database +from ..codelet import Codelet def index_repository(repo_url, framework_name): """ @@ -21,20 +22,25 @@ def index_repository(repo_url, framework_name): subprocess.call("git clone %s" % repo_url, shell=True) os.chdir(repo_name) + codelets = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): with open(filename, "r") as source_file: source = source_file.read() - authors = [(author,) for author in commits_meta["authors"]] - codelet = Codelet("%s:%s" % (repo_name, filename), source, 
filename, - None, authors, _generate_file_url(filename, repo_url), - framework_name, commits_meta["time_created"], - commits_meta["time_last_modified"]) - Database.insert(codelet) + authors = [(author,) for author in commits_meta[filename]["authors"]] + codelets.append( + Codelet("%s:%s" % (repo_name, filename), source, filename, + None, authors, _generate_file_url(filename, repo_url, + framework_name), + commits_meta[filename]["time_created"], + commits_meta[filename]["time_last_modified"])) + + # Database.insert(codelet) os.chdir("..") subprocess.call("rm -rf %s" % repo_name, shell=True) + return codelets def _generate_file_url(filename, repo_url, framework_name): """ @@ -53,7 +59,7 @@ def _generate_file_url(filename, repo_url, framework_name): """ if framework_name == "github": - default branch = subprocess.check_output("git branch --no-color", \ + default_branch = subprocess.check_output("git branch --no-color", shell=True)[2:-1] return "%s/blob/%s/%s" % (repo_url, default_branch, filename) @@ -77,9 +83,9 @@ def _get_git_commits(): :rtype: dictionary """ - git_log_cmd = ("git --no-pager --no-color log --name-only " - "--pretty=format:'%n%n%an%n%at' -z") - git_log = subprocess.check_output(git_log_cmd, shell=True) + git_log = subprocess.check_output( + ("git --no-pager log --name-only" + " --pretty=format:'%n%n%an%n%at' -z"), shell=True) commits = [] for commit in git_log.split("\n\n"): @@ -105,8 +111,9 @@ def _get_tracked_files(): :rtype: str array """ - tracked_files = subprocess.check_output("perl -le 'for (@ARGV){ print if \ - -f && -T }' $(find . -type d -name .git -prune -o -print)", shell=True) + tracked_files = subprocess.check_output( + ("perl -le 'for (@ARGV){ print if -f && -T }'" + " $(find . -type d -name .git -prune -o -print)"), shell=True) return [filename[2:] for filename in tracked_files.split("\n")[:-1]] def _get_commits_metadata(): @@ -138,7 +145,8 @@ def _get_commits_metadata(): if filename not in files_meta.keys(): files_meta[filename] = { "authors" : [commit["author"]], - "time_last_modified" : commit["timestamp"] + "time_last_modified" : commit["timestamp"], + "time_created" : commit["timestamp"] } else: if commit["author"] not in files_meta[filename]["authors"]: From c655d97f487b19224ca06a384b3f8c2a327cff9f Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Mon, 14 Apr 2014 22:09:05 -0400 Subject: [PATCH 25/42] Add class ChangeDir, amend unsafe subprocess. Add: bitshift/crawler/git_indexer.py -add ChangeDir class, a context-management wrapper for os.chdir(). -replace unsafe "rm -rf" subprocess call with shutil.rmtree() --- bitshift/crawler/git_indexer.py | 55 +++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index cc9082c..2ba1200 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -4,11 +4,47 @@ ...more info soon... """ -import fileinput, subprocess, os +import shutil, subprocess, os from ..database import Database from ..codelet import Codelet +GIT_CLONE_DIR = "/tmp" + +class ChangeDir(object): + """ + A wrapper class for os.chdir(), to map onto `with` and handle exceptions. + + :ivar new_path: (str) The path to change the current directory to. + :ivar old_path: (str) The path of the directory to return to. + """ + + def __init__(self, new_path): + """ + Construct the object. + + :param new_path: The directory to enter. 
+ + :type new_path: str + """ + + self.new_path = new_path + + def __enter__(self): + """ + Change the current working-directory to **new_path**. + """ + + self.old_path = os.getcwd() + os.chdir(self.new_path) + + def __exit__(self, etype, value, traceback): + """ + Change the current working-directory to **old_path**. + """ + + os.chdir(self.old_path) + def index_repository(repo_url, framework_name): """ Insert a Codelet for every file in a Git repository. @@ -19,9 +55,18 @@ def index_repository(repo_url, framework_name): """ repo_name = repo_url.split("/")[-1] - subprocess.call("git clone %s" % repo_url, shell=True) - os.chdir(repo_name) + codelets = [] + with ChangeDir(GIT_CLONE_DIR) as git_clone_dir: + subprocess.call("git clone %s" % repo_url, shell=True) + with ChangeDir(repo_name) as repository_dir: + codelets = _insert_repository_codelets(repo_url, repo_name, + framework_name) + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + + return codelets + +def _insert_repository_codelets(repo_url, repo_name, framework_name): codelets = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): @@ -36,10 +81,6 @@ def index_repository(repo_url, framework_name): commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"])) - # Database.insert(codelet) - - os.chdir("..") - subprocess.call("rm -rf %s" % repo_name, shell=True) return codelets def _generate_file_url(filename, repo_url, framework_name): From 97198ee523df5263f018b8bd581343832583dcc2 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Tue, 15 Apr 2014 09:40:11 -0400 Subject: [PATCH 26/42] Update Crawler documentation. Add: bitshift/crawler/git_indexer.py -add some missing docstrings, complete others. --- bitshift/crawler/git_indexer.py | 43 ++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index 2ba1200..8cd3ae3 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -1,7 +1,8 @@ """ :synopsis: Index all the files in a Git repository. -...more info soon... +.. todo:: + Add documentation, threaded Indexer class. """ import shutil, subprocess, os @@ -21,7 +22,7 @@ class ChangeDir(object): def __init__(self, new_path): """ - Construct the object. + Create a ChangeDir instance. :param new_path: The directory to enter. @@ -38,20 +39,32 @@ class ChangeDir(object): self.old_path = os.getcwd() os.chdir(self.new_path) - def __exit__(self, etype, value, traceback): + def __exit__(self, *exception): """ Change the current working-directory to **old_path**. + + :param exception: Various exception arguments passed by `with`. + + :type exception: varargs """ os.chdir(self.old_path) def index_repository(repo_url, framework_name): """ - Insert a Codelet for every file in a Git repository. + Clone and index (create and insert Codeletes for) a Git repository. - `git clone` the Git repository located at **repo_url**, and create a Codelet - for every one of non-binary (text) files in its if main branch (usually - *master*). + `git clone` the Git repository located at **repo_url**, call + _insert_repository_codelets, then remove said repository. + + :param repo_url: The url the Git repository was cloned from. + :param framework_name: The name of the framework the repository is from. + + :type repo_url: str + :type framework_name: str + + :return: Temporary: the new codelets, for testing purposes. 
+ :rtype: Codelet array """ repo_name = repo_url.split("/")[-1] @@ -67,6 +80,22 @@ def index_repository(repo_url, framework_name): return codelets def _insert_repository_codelets(repo_url, repo_name, framework_name): + """ + Create a Codelet for the files inside a Git repository. + + Create a new Codelet, and insert it into the Database singlet, for every + file inside the current working directory's default branch (usually + *master*). + + :param repo_url: The url the Git repository was cloned from. + :param repo_name: The name of the repository. + :param framework_name: The name of the framework the repository is from. + + :type repo_url: str + :type repo_name: str + :type framework_name: str + """ + codelets = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): From b7ccec05015cbd011a7ddaa7e2a69462d518af9e Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Tue, 15 Apr 2014 11:08:53 -0400 Subject: [PATCH 27/42] Add untested threaded indexer/crawler prototype. Additions are not tested and not yet documented. Add: crawler.py -add threaded GitHubCrawler class, which interacts with a GitIndexer via a Queue. git_indexer.py -add threaded GitIndexer class, which interacts with GitHubCrawler via a Queue. -rename context-manager ChangeDir class to _ChangeDir, because it's essentially "private". __init__.py -add body to crawl(), which creates instances of GitHubCrawler and GitIndexer and starts them. --- bitshift/crawler/__init__.py | 12 ++++++++++-- bitshift/crawler/crawler.py | 20 ++++++++++++++++---- bitshift/crawler/git_indexer.py | 28 ++++++++++++++++++++-------- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index a518970..f38a187 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -1,6 +1,14 @@ -import crawler +import Queue + +from bitshift.crawler import crawler +from bitshift.crawler import git_indexer __all__ = ["crawl"] def crawl(): - pass + repository_queue = Queue.Queue() + github_crawler = crawler.GitHubCrawler(repository_queue) + indexer = git_indexer.GitIndexer(repository_queue) + + for thread in [github_crawler, indexer]: + thread.start() diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 34f2819..fc1aadb 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -4,14 +4,22 @@ ...more info soon... """ -import requests, time +import requests, time, threading -import git_indexer +import bitshift.crawler.git_indexer from ..codelet import Codelet from ..database import Database -def github(): +class GitHubCrawler(threading.Thread): + def __init__(self, repository_queue): + self.repository_queue = repository_queue + super(GitHubCrawler, self).__init__() + + def run(): + _github() + +def _github(): """ Query the GitHub API for data about every public repository. @@ -33,7 +41,11 @@ def github(): response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - print repo["id"] + self.repository_queue.put({ + "url" : repo["html_url"], + "framework_name" : "GitHub" + }) + self.repository_queue.task_done() if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/git_indexer.py index 8cd3ae3..2268895 100644 --- a/bitshift/crawler/git_indexer.py +++ b/bitshift/crawler/git_indexer.py @@ -5,14 +5,26 @@ Add documentation, threaded Indexer class. 
""" -import shutil, subprocess, os +import os, shutil, subprocess, threading from ..database import Database from ..codelet import Codelet GIT_CLONE_DIR = "/tmp" -class ChangeDir(object): +class GitIndexer(threading.Thread): + def __init__(self, repository_queue): + self.repository_queue = repository_queue + super(GitIndexer, self).__init__() + + def run(self): + while True: + while self.repository_queue.empty(): + pass + new_repo = self.repository_queue.get() + _index_repository(new_repo["url"], new_repo["framework_name"]) + +class _ChangeDir(object): """ A wrapper class for os.chdir(), to map onto `with` and handle exceptions. @@ -22,7 +34,7 @@ class ChangeDir(object): def __init__(self, new_path): """ - Create a ChangeDir instance. + Create a _ChangeDir instance. :param new_path: The directory to enter. @@ -50,7 +62,7 @@ class ChangeDir(object): os.chdir(self.old_path) -def index_repository(repo_url, framework_name): +def _index_repository(repo_url, framework_name): """ Clone and index (create and insert Codeletes for) a Git repository. @@ -70,9 +82,9 @@ def index_repository(repo_url, framework_name): repo_name = repo_url.split("/")[-1] codelets = [] - with ChangeDir(GIT_CLONE_DIR) as git_clone_dir: + with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: subprocess.call("git clone %s" % repo_url, shell=True) - with ChangeDir(repo_name) as repository_dir: + with _ChangeDir(repo_name) as repository_dir: codelets = _insert_repository_codelets(repo_url, repo_name, framework_name) shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) @@ -128,7 +140,7 @@ def _generate_file_url(filename, repo_url, framework_name): :rtype: str """ - if framework_name == "github": + if framework_name == "GitHub": default_branch = subprocess.check_output("git branch --no-color", shell=True)[2:-1] return "%s/blob/%s/%s" % (repo_url, default_branch, filename) @@ -164,7 +176,7 @@ def _get_git_commits(): commits.append({ "author" : fields[0], "timestamp" : int(fields[1]), - "filenames" : fields[2].split("\0")[:-2] + "filenames" : fields[2].split("\x00")[:-2] }) return commits From b680756f8dba4f5ab3690f069f5520978846fc06 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Wed, 16 Apr 2014 13:32:04 -0400 Subject: [PATCH 28/42] Test crawler, complete documentation. Add, Fix: bitshift/crawler/ __init__.py -add module and crawl() docstrings. -add repository_queue size limit. crawler.py -account for time spent executing an API query in the run() loop sleep() interval. --- bitshift/crawler/__init__.py | 18 +++- bitshift/crawler/crawler.py | 106 +++++++++++++++++------- bitshift/crawler/{git_indexer.py => indexer.py} | 0 3 files changed, 91 insertions(+), 33 deletions(-) rename bitshift/crawler/{git_indexer.py => indexer.py} (100%) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index f38a187..6c13be9 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -1,3 +1,9 @@ +""" +:synopsis: Parent crawler module, which supervises all crawlers. + +Contains functions for initializing all subsidiary, threaded crawlers. +""" + import Queue from bitshift.crawler import crawler @@ -5,8 +11,18 @@ from bitshift.crawler import git_indexer __all__ = ["crawl"] +MAX_URL_QUEUE_SIZE = 5e3 + def crawl(): - repository_queue = Queue.Queue() + """ + Initialize all crawlers (and indexers). + + Start the: + 1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler` + 2. 
Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer` + """ + + repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) github_crawler = crawler.GitHubCrawler(repository_queue) indexer = git_indexer.GitIndexer(repository_queue) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index fc1aadb..5b0f600 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -12,46 +12,88 @@ from ..codelet import Codelet from ..database import Database class GitHubCrawler(threading.Thread): + """ + Crawler that retrieves links to all of GitHub's public repositories. + + GitHubCrawler is a threaded singleton that queries GitHub's API for URLs + to its public repositories, which it inserts into a :class:`Queue.Queue` + shared with :class:`bitshift.crawler.git_indexer.GitIndexer`. + + :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with + repository information retrieved by `GitHubCrawler`, and other Git + crawlers, to be processed by + :class:`bitshift.crawler.git_indexer.GitIndexer`. + """ + def __init__(self, repository_queue): + """ + Create an instance of the singleton `GitHubCrawler`. + + :param repository_queue: A queue containing dictionaries of repository + metadata retrieved by `GitHubCrawler`, meant to be processed by an + instance of :class:`bitshift.crawler.git_indexer.GitIndexer`. + + .. code-block:: python + sample_dict = { + "url" : "https://github.com/user/repo", + "name" : "repo", + "framework_name" : "GitHub" + } + + :type repository_queue: :class:`Queue.Queue` + """ + + self.repository_queue = repository_queue super(GitHubCrawler, self).__init__() - def run(): - _github() + def run(self): + """ + Query the GitHub API for data about every public repository. -def _github(): - """ - Query the GitHub API for data about every public repository. + Pull all of GitHub's repositories by making calls to its API in a loop, + accessing a subsequent page of results via the "next" URL returned in an + API response header. Uses Severyn Kozak's (sevko) authentication + credentials. + """ - Pull all of GitHub's repositories by making calls to its API in a loop, - accessing a subsequent page of results via the "next" URL returned in an - API response header. Uses Severyn Kozak's (sevko) authentication - credentials. 
- """ + next_api_url = "https://api.github.com/repositories" + authentication_params = { + "client_id" : "436cb884ae09be7f2a4e", + "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" + } + api_request_interval = 5e3 / 60 ** 2 - next_api_url = "https://api.github.com/repositories" - authentication_params = { - "client_id" : "436cb884ae09be7f2a4e", - "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" - } - api_request_interval = 5e3 / 60 ** 2 - - while len(next_api_url) > 0: - start_time = time.time() - response = requests.get(next_api_url, params=authentication_params) - - for repo in response.json(): - self.repository_queue.put({ - "url" : repo["html_url"], - "framework_name" : "GitHub" + while len(next_api_url) > 0: + # DEBUG + db.log.insert({ + "time" : str(time.time()).split(".")[0][-4:], + "qsize" : self.repository_queue.qsize() }) - self.repository_queue.task_done() - if int(response.headers["x-ratelimit-remaining"]) == 0: - time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) + start_time = time.time() + response = requests.get(next_api_url, params=authentication_params) + + for repo in response.json(): + logging.basicConfig(filename="crawler.log", level=logging.DEBUG) + logging.debug("crawler: %-20s: %-5s: %-5s: %s", + str(time.time()).split(".")[0], + self.repository_queue.qsize(), repo["id"], + repo["name"]) + while self.repository_queue.full(): + pass + self.repository_queue.put({ + "url" : repo["html_url"], + "name" : repo["html_url"].split("/")[-1], + "framework_name" : "GitHub" + }) + + if int(response.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(response.headers["x-ratelimit-reset"]) - + time.time()) - next_api_url = response.headers["link"].split(">")[0][1:] + next_api_url = response.headers["link"].split(">")[0][1:] - sleep_time = api_request_interval - (time.time() - start_time) - if sleep_time > 0: - time.sleep(sleep_time) + sleep_time = api_request_interval - (time.time() - start_time) + if sleep_time > 0: + time.sleep(sleep_time) diff --git a/bitshift/crawler/git_indexer.py b/bitshift/crawler/indexer.py similarity index 100% rename from bitshift/crawler/git_indexer.py rename to bitshift/crawler/indexer.py From 627c848f208d65d62389482b3467e47279200ce0 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Wed, 16 Apr 2014 16:41:14 -0400 Subject: [PATCH 29/42] Add tested indexer. Add: bitshift/crawler/indexer.py -add _debug(). -add content to the module docstring; add documentation to GitIndexer, and the functions that were lacking it. -add another perl one-liner to supplement the `git clone` subprocess call, which terminates it after a set amount of time (should it have frozen) -- fixes a major bug that caused the entire indexer to hang. --- bitshift/crawler/__init__.py | 9 ++- bitshift/crawler/crawler.py | 25 ++------ bitshift/crawler/indexer.py | 149 +++++++++++++++++++++++++++++++------------ 3 files changed, 120 insertions(+), 63 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index 6c13be9..4875712 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -6,8 +6,7 @@ Contains functions for initializing all subsidiary, threaded crawlers. import Queue -from bitshift.crawler import crawler -from bitshift.crawler import git_indexer +from bitshift.crawler import crawler, indexer __all__ = ["crawl"] @@ -19,12 +18,12 @@ def crawl(): Start the: 1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler` - 2. 
Git indexer, :class:`bitshift.crawler.git_indexer.GitIndexer` + 2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer` """ repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) github_crawler = crawler.GitHubCrawler(repository_queue) - indexer = git_indexer.GitIndexer(repository_queue) + git_indexer = indexer.GitIndexer(repository_queue) - for thread in [github_crawler, indexer]: + for thread in [github_crawler, git_indexer]: thread.start() diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 5b0f600..8b9576d 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -1,12 +1,12 @@ """ :synopsis: Main crawler module, to oversee all site-specific crawlers. -...more info soon... +Contains all website/framework-specific Class crawlers. """ import requests, time, threading -import bitshift.crawler.git_indexer +import bitshift.crawler.indexer from ..codelet import Codelet from ..database import Database @@ -17,12 +17,12 @@ class GitHubCrawler(threading.Thread): GitHubCrawler is a threaded singleton that queries GitHub's API for URLs to its public repositories, which it inserts into a :class:`Queue.Queue` - shared with :class:`bitshift.crawler.git_indexer.GitIndexer`. + shared with :class:`bitshift.crawler.indexer.GitIndexer`. :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with repository information retrieved by `GitHubCrawler`, and other Git crawlers, to be processed by - :class:`bitshift.crawler.git_indexer.GitIndexer`. + :class:`bitshift.crawler.indexer.GitIndexer`. """ def __init__(self, repository_queue): @@ -31,7 +31,7 @@ class GitHubCrawler(threading.Thread): :param repository_queue: A queue containing dictionaries of repository metadata retrieved by `GitHubCrawler`, meant to be processed by an - instance of :class:`bitshift.crawler.git_indexer.GitIndexer`. + instance of :class:`bitshift.crawler.indexer.GitIndexer`. .. code-block:: python sample_dict = { @@ -43,7 +43,6 @@ class GitHubCrawler(threading.Thread): :type repository_queue: :class:`Queue.Queue` """ - self.repository_queue = repository_queue super(GitHubCrawler, self).__init__() @@ -65,26 +64,16 @@ class GitHubCrawler(threading.Thread): api_request_interval = 5e3 / 60 ** 2 while len(next_api_url) > 0: - # DEBUG - db.log.insert({ - "time" : str(time.time()).split(".")[0][-4:], - "qsize" : self.repository_queue.qsize() - }) - start_time = time.time() response = requests.get(next_api_url, params=authentication_params) for repo in response.json(): - logging.basicConfig(filename="crawler.log", level=logging.DEBUG) - logging.debug("crawler: %-20s: %-5s: %-5s: %s", - str(time.time()).split(".")[0], - self.repository_queue.qsize(), repo["id"], - repo["name"]) while self.repository_queue.full(): pass + self.repository_queue.put({ "url" : repo["html_url"], - "name" : repo["html_url"].split("/")[-1], + "name" : repo["name"], "framework_name" : "GitHub" }) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 2268895..f2a8bbf 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -1,28 +1,60 @@ """ -:synopsis: Index all the files in a Git repository. - -.. todo:: - Add documentation, threaded Indexer class. +:synopsis: Contains a singleton GitIndexer class, which clones and indexes git + repositories. 
""" -import os, shutil, subprocess, threading +import bs4, os, re, shutil, subprocess, threading from ..database import Database from ..codelet import Codelet -GIT_CLONE_DIR = "/tmp" +GIT_CLONE_DIR = "/tmp/bitshift" class GitIndexer(threading.Thread): + """ + A singleton Git repository indexer. + + `GitIndexer` clones and indexes the repositories at urls found by the + :mod:`bitshift.crawler.crawler` Git crawlers. + + :ivar repository_queue: (:class:`Queue.Queue`) A queue containing urls found + by the :mod:`bitshift.crawler.crawler` Git crawlers. + """ + def __init__(self, repository_queue): + """ + Create an instance of the singleton `GitIndexer`. + + :param repository_queue: see :attr:`GitIndexer.repository_queue` + + :type repository_queue: see :attr:`GitIndexer.repository_queue` + """ + self.repository_queue = repository_queue super(GitIndexer, self).__init__() def run(self): + """ + Retrieve new repository urls, clone, and index them. + + Blocks until new urls appear in :attr:`GitIndexer.repository_queue`, + then retrieves one, and attempts cloning/indexing it. Should any errors + occur, the new repository will be discarded and the crawler will + index the next in the queue. + """ + while True: while self.repository_queue.empty(): pass - new_repo = self.repository_queue.get() - _index_repository(new_repo["url"], new_repo["framework_name"]) + + repo = self.repository_queue.get() + self.repository_queue.task_done() + + try: + _index_repository(repo["url"], repo["name"], + repo["framework_name"]) + except: # desperate times -- will be modified later + pass class _ChangeDir(object): """ @@ -62,7 +94,7 @@ class _ChangeDir(object): os.chdir(self.old_path) -def _index_repository(repo_url, framework_name): +def _index_repository(repo_url, repo_name, framework_name): """ Clone and index (create and insert Codeletes for) a Git repository. @@ -70,32 +102,30 @@ def _index_repository(repo_url, framework_name): _insert_repository_codelets, then remove said repository. :param repo_url: The url the Git repository was cloned from. + :param repo_name: The name of the repository. :param framework_name: The name of the framework the repository is from. :type repo_url: str + :type repo_name: str :type framework_name: str - - :return: Temporary: the new codelets, for testing purposes. - :rtype: Codelet array """ - repo_name = repo_url.split("/")[-1] - codelets = [] + GIT_CLONE_TIMEOUT = 60 with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: - subprocess.call("git clone %s" % repo_url, shell=True) + if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \ + clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0: + return + with _ChangeDir(repo_name) as repository_dir: - codelets = _insert_repository_codelets(repo_url, repo_name, - framework_name) + _insert_repository_codelets(repo_url, repo_name, framework_name) shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) - return codelets - def _insert_repository_codelets(repo_url, repo_name, framework_name): """ - Create a Codelet for the files inside a Git repository. + Create and insert a Codelet for the files inside a Git repository. - Create a new Codelet, and insert it into the Database singlet, for every + Create a new Codelet, and insert it into the Database singleton, for every file inside the current working directory's default branch (usually *master*). 
@@ -108,21 +138,27 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name): :type framework_name: str """ - codelets = [] commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): with open(filename, "r") as source_file: - source = source_file.read() + source = _decode(source_file.read()) + if source is None: + return - authors = [(author,) for author in commits_meta[filename]["authors"]] - codelets.append( - Codelet("%s:%s" % (repo_name, filename), source, filename, + authors = [(_decode(author),) for author in \ + commits_meta[filename]["authors"]] + codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, None, authors, _generate_file_url(filename, repo_url, - framework_name), + framework_name), commits_meta[filename]["time_created"], - commits_meta[filename]["time_last_modified"])) + commits_meta[filename]["time_last_modified"]) - return codelets + db.codelets.insert({ + "name" : codelet.name, + "authors" : codelet.authors + }) + + # Database.insert(codelet) def _generate_file_url(filename, repo_url, framework_name): """ @@ -142,7 +178,7 @@ def _generate_file_url(filename, repo_url, framework_name): if framework_name == "GitHub": default_branch = subprocess.check_output("git branch --no-color", - shell=True)[2:-1] + shell=True)[2:-1] return "%s/blob/%s/%s" % (repo_url, default_branch, filename) def _get_git_commits(): @@ -165,8 +201,7 @@ def _get_git_commits(): :rtype: dictionary """ - git_log = subprocess.check_output( - ("git --no-pager log --name-only" + git_log = subprocess.check_output(("git --no-pager log --name-only" " --pretty=format:'%n%n%an%n%at' -z"), shell=True) commits = [] @@ -183,24 +218,34 @@ def _get_git_commits(): def _get_tracked_files(): """ - Return a list of the filenames of all files in the Git repository. + Return a list of the filenames of all valuable files in the Git repository. Get a list of the filenames of the non-binary (Perl heuristics used for filetype identification) files currently inside the current working - directory's Git repository. + directory's Git repository. Then, weed out any boilerplate/non-code files + that match the regex rules in GIT_IGNORE_FILES. - :return: The filenames of all non-binary files. + :return: The filenames of all index-worthy non-binary files. :rtype: str array """ - tracked_files = subprocess.check_output( - ("perl -le 'for (@ARGV){ print if -f && -T }'" - " $(find . -type d -name .git -prune -o -print)"), shell=True) - return [filename[2:] for filename in tracked_files.split("\n")[:-1]] + GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"] + + tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \ + -f && -T }' $(find . -type d -name .git -prune -o -print)"), + shell=True).split("\n")[:-1] + + valuable_files = [] + for filename in tracked_files: + filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE) + for pattern in GIT_IGNORE_FILES]) + if not filename_match: + valuable_files.append(filename[2:]) + return valuable_files def _get_commits_metadata(): """ - Return a dictionary containing every tracked file's metadata. + Return a dictionary containing every valuable tracked file's metadata. :return: A dictionary with author names, time of creation, and time of last modification for every filename key. @@ -236,3 +281,27 @@ def _get_commits_metadata(): files_meta[filename]["time_created"] = commit["timestamp"] return files_meta + +def _decode(raw): + """ + Return a decoded a raw string. + + :param raw: The string to string. 
+ + :type raw: (str) + + :return: If the original encoding is successfully inferenced, return the + decoded string. + :rtype: str, or None + + .. warning:: + The raw string's original encoding is identified by heuristics which + can, and occasionally will, fail. Decoding will then fail, and None + will be returned. + """ + + try: + return raw.decode(bs4.BeautifulSoup(raw).original_encoding) + + except (UnicodeDecodeError, UserWarning): + return None From f4b28e617856e02d5f77570fa3dc66c1828063c6 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Thu, 17 Apr 2014 09:05:28 -0400 Subject: [PATCH 30/42] Add file-ext regex rules, exception handlers. Add: bitshift/crawler/indexer.py -add two `try: except: pass` blocks, one to _decode() and another to GitIndexer.run(); bad practice, but GitIndexer has numerous unreliable moving parts that can throw too many unforseeable exceptions. Only current viable option. -add file-extension regex ignore rules (for text, markdown, etc. files) to _get_tracked_files(). --- bitshift/crawler/indexer.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index f2a8bbf..50dbe8c 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -31,6 +31,10 @@ class GitIndexer(threading.Thread): """ self.repository_queue = repository_queue + + if not os.path.exists(GIT_CLONE_DIR): + os.makedirs(GIT_CLONE_DIR) + super(GitIndexer, self).__init__() def run(self): @@ -53,7 +57,7 @@ class GitIndexer(threading.Thread): try: _index_repository(repo["url"], repo["name"], repo["framework_name"]) - except: # desperate times -- will be modified later + except: pass class _ChangeDir(object): @@ -110,16 +114,19 @@ def _index_repository(repo_url, repo_name, framework_name): :type framework_name: str """ - GIT_CLONE_TIMEOUT = 60 + GIT_CLONE_TIMEOUT = 600 with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \ clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0: + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) return with _ChangeDir(repo_name) as repository_dir: _insert_repository_codelets(repo_url, repo_name, framework_name) - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) def _insert_repository_codelets(repo_url, repo_name, framework_name): """ @@ -153,11 +160,6 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name): commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"]) - db.codelets.insert({ - "name" : codelet.name, - "authors" : codelet.authors - }) - # Database.insert(codelet) def _generate_file_url(filename, repo_url, framework_name): @@ -230,6 +232,8 @@ def _get_tracked_files(): """ GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"] + GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?", + "md(wn|t[e]?xt)?", "rst"] tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \ -f && -T }' $(find . 
-type d -name .git -prune -o -print)"), @@ -239,7 +243,11 @@ def _get_tracked_files(): for filename in tracked_files: filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE) for pattern in GIT_IGNORE_FILES]) - if not filename_match: + extension = filename.split(".")[-1] + extension_match = any([re.match(pattern, filename, flags=re.IGNORECASE) + for pattern in GIT_IGNORE_EXTENSIONS]) + + if not (filename_match or extension_match): valuable_files.append(filename[2:]) return valuable_files @@ -301,7 +309,8 @@ def _decode(raw): """ try: - return raw.decode(bs4.BeautifulSoup(raw).original_encoding) + encoding = bs4.BeautifulSoup(raw).original_encoding + return raw.decode(encoding) if encoding is not None else None - except (UnicodeDecodeError, UserWarning): + except: return None From 755dce6ae3ca2be4f72e16b09eb9fa6ef9614420 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Thu, 17 Apr 2014 09:53:27 -0400 Subject: [PATCH 31/42] Add logging to crawler/indexer. Add: bitshift/crawler/(__init__, crawler, indexer).py -add `logging` module to all `bitshift.crawler` modules, for some basic diagnostic output. --- bitshift/crawler/__init__.py | 11 ++++++++--- bitshift/crawler/crawler.py | 7 +++++-- bitshift/crawler/indexer.py | 26 ++++++++++++++++---------- 3 files changed, 29 insertions(+), 15 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index 4875712..39a1a28 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -4,14 +4,12 @@ Contains functions for initializing all subsidiary, threaded crawlers. """ -import Queue +import logging, Queue from bitshift.crawler import crawler, indexer __all__ = ["crawl"] -MAX_URL_QUEUE_SIZE = 5e3 - def crawl(): """ Initialize all crawlers (and indexers). @@ -21,6 +19,13 @@ def crawl(): 2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer` """ + MAX_URL_QUEUE_SIZE = 5e3 + DEBUG_FILE = "crawler.log" + + logging.basicConfig(filename=DEBUG_FILE, + format="%(asctime)s:\t%(threadName)s:\t%(message)s", + level=logging.DEBUG) + repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) github_crawler = crawler.GitHubCrawler(repository_queue) git_indexer = indexer.GitIndexer(repository_queue) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 8b9576d..edd8eaf 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -4,7 +4,7 @@ Contains all website/framework-specific Class crawlers. """ -import requests, time, threading +import logging, requests, time, threading import bitshift.crawler.indexer @@ -44,7 +44,8 @@ class GitHubCrawler(threading.Thread): """ self.repository_queue = repository_queue - super(GitHubCrawler, self).__init__() + logging.info("Starting.") + super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): """ @@ -66,6 +67,8 @@ class GitHubCrawler(threading.Thread): while len(next_api_url) > 0: start_time = time.time() response = requests.get(next_api_url, params=authentication_params) + logging.info("API call made. Limit remaining: %s." % + response.headers["x-ratelimit-remaining"]) for repo in response.json(): while self.repository_queue.full(): diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 50dbe8c..b1e8e34 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,7 @@ repositories. 
""" -import bs4, os, re, shutil, subprocess, threading +import bs4, logging, os, re, shutil, subprocess, threading from ..database import Database from ..codelet import Codelet @@ -35,7 +35,8 @@ class GitIndexer(threading.Thread): if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) - super(GitIndexer, self).__init__() + logging.info("Starting.") + super(GitIndexer, self).__init__(name=self.__class__.__name__) def run(self): """ @@ -53,12 +54,8 @@ class GitIndexer(threading.Thread): repo = self.repository_queue.get() self.repository_queue.task_done() - - try: - _index_repository(repo["url"], repo["name"], - repo["framework_name"]) - except: - pass + _index_repository(repo["url"], repo["name"], + repo["framework_name"]) class _ChangeDir(object): """ @@ -116,15 +113,23 @@ def _index_repository(repo_url, repo_name, framework_name): GIT_CLONE_TIMEOUT = 600 + logging.info("Indexing repository %s." % repo_url) with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \ clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0: + logging.debug("_index_repository(): Cloning %s failed." % repo_url) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) return with _ChangeDir(repo_name) as repository_dir: - _insert_repository_codelets(repo_url, repo_name, framework_name) + try: + _insert_repository_codelets(repo_url, repo_name, + framework_name) + except Exception as exception: + logging.warning("%s: _insert_repository_codelets" + " failed %s." % (exception, repo_url)) + pass shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) @@ -312,5 +317,6 @@ def _decode(raw): encoding = bs4.BeautifulSoup(raw).original_encoding return raw.decode(encoding) if encoding is not None else None - except: + except Exception as exception: + logging.warning("_debug(): %s", exception) return None From 3ce399adbf5ebae2fcff017c8c680e21be31d4a7 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Thu, 17 Apr 2014 14:05:12 -0400 Subject: [PATCH 32/42] Add threaded cloner, GitRepository class (#7). Add: bitshift/crawler/ (crawler, indexer).py -add a 'time.sleep()' call whenever a thread is blocking on items in a Queue, to prevent excessive polling (which hogs system resources). indexer.py -move 'git clone' functionality from the 'GitIndexer' singleton to a separate, threaded '_GitCloner'. -'crawler.GitHubCrawler' now shares a "clone" queue with '_GitCloner', which shares an "index" queue with 'GitIndexer'. -both indexing and cloning are time-intensive processes, so this improvement should (hypothetically) boost performance. -add `GitRepository` class, instances of which are passed around in the queues. --- bitshift/crawler/crawler.py | 51 ++++++------ bitshift/crawler/indexer.py | 187 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 169 insertions(+), 69 deletions(-) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index edd8eaf..8509c6d 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -6,7 +6,7 @@ Contains all website/framework-specific Class crawlers. import logging, requests, time, threading -import bitshift.crawler.indexer +from bitshift.crawler import indexer from ..codelet import Codelet from ..database import Database @@ -19,31 +19,22 @@ class GitHubCrawler(threading.Thread): to its public repositories, which it inserts into a :class:`Queue.Queue` shared with :class:`bitshift.crawler.indexer.GitIndexer`. 
- :ivar repository_queue: (:class:`Queue.Queue`) Contains dictionaries with - repository information retrieved by `GitHubCrawler`, and other Git - crawlers, to be processed by + :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository` + with repository metadata retrieved by :class:`GitHubCrawler`, and other + Git crawlers, to be processed by :class:`bitshift.crawler.indexer.GitIndexer`. """ - def __init__(self, repository_queue): + def __init__(self, clone_queue): """ Create an instance of the singleton `GitHubCrawler`. - :param repository_queue: A queue containing dictionaries of repository - metadata retrieved by `GitHubCrawler`, meant to be processed by an - instance of :class:`bitshift.crawler.indexer.GitIndexer`. + :param clone_queue: see :attr:`self.clone_queue` - .. code-block:: python - sample_dict = { - "url" : "https://github.com/user/repo", - "name" : "repo", - "framework_name" : "GitHub" - } - - :type repository_queue: :class:`Queue.Queue` + :type clone_queue: see :attr:`self.clone_queue` """ - self.repository_queue = repository_queue + self.clone_queue = clone_queue logging.info("Starting.") super(GitHubCrawler, self).__init__(name=self.__class__.__name__) @@ -54,7 +45,8 @@ class GitHubCrawler(threading.Thread): Pull all of GitHub's repositories by making calls to its API in a loop, accessing a subsequent page of results via the "next" URL returned in an API response header. Uses Severyn Kozak's (sevko) authentication - credentials. + credentials. For every new repository, a :class:`GitRepository` is + inserted into :attr:`self.clone_queue`. """ next_api_url = "https://api.github.com/repositories" @@ -67,18 +59,21 @@ class GitHubCrawler(threading.Thread): while len(next_api_url) > 0: start_time = time.time() response = requests.get(next_api_url, params=authentication_params) - logging.info("API call made. Limit remaining: %s." % - response.headers["x-ratelimit-remaining"]) + + queue_percent_full = (float(self.clone_queue.qsize()) / + self.clone_queue.maxsize) * 100 + logging.info("API call made. Limit remaining: %s. Queue-size: (%d" + "%%) %d/%d" % (response.headers["x-ratelimit-remaining"], + queue_percent_full, self.clone_queue.qsize(), + self.clone_queue.maxsize)) for repo in response.json(): - while self.repository_queue.full(): - pass - - self.repository_queue.put({ - "url" : repo["html_url"], - "name" : repo["name"], - "framework_name" : "GitHub" - }) + while self.clone_queue.full(): + time.sleep(1) + + self.clone_queue.put(indexer.GitRepository( + repo["html_url"], repo["full_name"].replace("/", ""), + "GitHub")) if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index b1e8e34..7e82bb5 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,59 +3,171 @@ repositories. """ -import bs4, logging, os, re, shutil, subprocess, threading +import bs4, logging, os, Queue, re, shutil, subprocess, time, threading from ..database import Database from ..codelet import Codelet +import pymongo #debug +db = pymongo.MongoClient().bitshift #debug + GIT_CLONE_DIR = "/tmp/bitshift" +THREAD_QUEUE_SLEEP = 0.5 + +class GitRepository(object): + """ + A representation of a Git repository's metadata. + + :ivar url: (str) The repository's url. + :ivar name: (str) The name of the repository. + :ivar framework_name: (str) The name of the online Git framework that the + repository belongs to (eg, GitHub, BitBucket). 
+ """ + + def __init__(self, url, name, framework_name): + """ + Create a GitRepository instance. + + :param url: see :attr:`GitRepository.url` + :param name: see :attr:`GitRepository.name` + :param framework_name: see :attr:`GitRepository.framework_name` + + :type url: str + :type name: str + :type framework_name: str + """ + + self.url = url + self.name = name + self.framework_name = framework_name class GitIndexer(threading.Thread): """ A singleton Git repository indexer. - `GitIndexer` clones and indexes the repositories at urls found by the - :mod:`bitshift.crawler.crawler` Git crawlers. + :class:`GitIndexer` indexes the repositories cloned by the + :class:`_GitCloner` singleton. - :ivar repository_queue: (:class:`Queue.Queue`) A queue containing urls found - by the :mod:`bitshift.crawler.crawler` Git crawlers. + :ivar index_queue: (:class:`Queue.Queue`) A queue containing + :class:`GitRepository` objects for every new repository succesfully + cloned by :class:`_GitCloner`, which are to be indexed. + :ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner, + which feeds :class:`GitIndexer`. """ - def __init__(self, repository_queue): + def __init__(self, clone_queue): """ Create an instance of the singleton `GitIndexer`. - :param repository_queue: see :attr:`GitIndexer.repository_queue` + :param clone_queue: see :attr:`self.index_queue` - :type repository_queue: see :attr:`GitIndexer.repository_queue` + :type index_queue: see :attr:`self.index_queue` """ - self.repository_queue = repository_queue + MAX_INDEX_QUEUE_SIZE = 10 + + logging.info("Starting.") + self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) + self.git_cloner = _GitCloner(clone_queue, self.index_queue) + self.git_cloner.start() if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) - logging.info("Starting.") super(GitIndexer, self).__init__(name=self.__class__.__name__) def run(self): """ - Retrieve new repository urls, clone, and index them. + Retrieve metadata about newly cloned repositories and index them. + + Blocks until new repositories appear in :attr:`self.index_queue`, then + retrieves one, and attempts indexing it. Should any errors occur, the + new repository will be discarded and the indexer will index the next in + the queue. + """ + + while True: + while self.index_queue.empty(): + logging.warning("Empty.") + time.sleep(THREAD_QUEUE_SLEEP) + + repo = self.index_queue.get() + self.index_queue.task_done() + _index_repository(repo.url, repo.name, repo.framework_name) + +class _GitCloner(threading.Thread): + """ + A singleton Git repository cloner. + + :ivar clone_queue: (:class:`Queue.Queue`) see + :attr:`bitshift.crawler.crawler.GitHubCrawler.clone_queue`. + :ivar index_queue: (:class:`Queue.Queue`) see + :attr:`GitIndexer.index_queue`. + """ + + def __init__(self, clone_queue, index_queue): + """ + Create an instance of the singleton :class:`_GitCloner`. + + :param clone_queue: see :attr:`self.clone_queue` + :param index_queue: see :attr:`self.index_queue` + + :type clone_queue: see :attr:`self.clone_queue` + :type index_queue: see :attr:`self.index_queue` + """ + + self.clone_queue = clone_queue + self.index_queue = index_queue + super(_GitCloner, self).__init__(name=self.__class__.__name__) + + def run(self): + """ + Retrieve metadata about newly crawled repositories and clone them. - Blocks until new urls appear in :attr:`GitIndexer.repository_queue`, - then retrieves one, and attempts cloning/indexing it. 
Should any errors - occur, the new repository will be discarded and the crawler will - index the next in the queue. + Blocks until new :class:`GitRepository` appear in + :attr:`self.clone_queue`, then attempts cloning them. If + succcessful, the cloned repository is added to :attr:`self.index_queue` + for the `GitIndexer` to clone; otherwise, it is discarded. """ while True: - while self.repository_queue.empty(): - pass + while self.clone_queue.empty(): + time.sleep(THREAD_QUEUE_SLEEP) + repo = self.clone_queue.get() + self.clone_queue.task_done() + self._clone_repository(repo) - repo = self.repository_queue.get() - self.repository_queue.task_done() - _index_repository(repo["url"], repo["name"], - repo["framework_name"]) + def _clone_repository(self, repo): + """ + Attempt cloning a Git repository. + + :param repo: Metadata about the repository to clone. + + :type repo: :class:`GitRepository` + """ + + GIT_CLONE_TIMEOUT = 500 + + queue_percent_full = (float(self.index_queue.qsize()) / + self.index_queue.maxsize) * 100 + logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url, + queue_percent_full, self.index_queue.qsize(), + self.index_queue.maxsize)) + + with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: + if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git" + " clone %s %s" % (GIT_CLONE_TIMEOUT, repo.url, repo.name), + shell=True) != 0: + logging.debug("_clone_repository(): Cloning %s failed." % + repo.url) + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) + return + + while self.index_queue.full(): + time.sleep(THREAD_QUEUE_SLEEP) + + self.index_queue.put(repo) class _ChangeDir(object): """ @@ -111,27 +223,17 @@ def _index_repository(repo_url, repo_name, framework_name): :type framework_name: str """ - GIT_CLONE_TIMEOUT = 600 - logging.info("Indexing repository %s." % repo_url) - with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: - if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git \ - clone %s" % (GIT_CLONE_TIMEOUT, repo_url), shell=True) != 0: - logging.debug("_index_repository(): Cloning %s failed." % repo_url) - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) - return - - with _ChangeDir(repo_name) as repository_dir: - try: - _insert_repository_codelets(repo_url, repo_name, - framework_name) - except Exception as exception: - logging.warning("%s: _insert_repository_codelets" - " failed %s." % (exception, repo_url)) - pass - - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: + try: + _insert_repository_codelets(repo_url, repo_name, + framework_name) + except Exception as exception: + logging.warning("%s: _insert_repository_codelets failed %s." % + (exception, repo_url)) + + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) def _insert_repository_codelets(repo_url, repo_name, framework_name): """ @@ -164,6 +266,9 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name): framework_name), commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"]) + db.codelets.insert({ + "name" : codelet.name + }) # Database.insert(codelet) From 6718650a8c4ef72d31e4f1dc071bc12cad50adb9 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 18 Apr 2014 12:01:06 -0400 Subject: [PATCH 33/42] First part of #8 fix. 
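The clone timeout in `_clone_repository()` works by wrapping the shell command in Perl's `alarm`; factored out, the same guard looks roughly like this (a sketch assuming a POSIX shell and `perl` on the PATH, since Python 2's `subprocess` has no timeout argument):

.. code-block:: python

    # Sketch: run a shell command with a hard timeout by delegating to
    # `perl -e 'alarm shift @ARGV; exec @ARGV'`, as the indexer does above.
    import subprocess

    def call_with_timeout(command, timeout_seconds):
        wrapped = "perl -e 'alarm shift @ARGV; exec @ARGV' %d %s" % (
                timeout_seconds, command)
        return subprocess.call(wrapped, shell=True)

    # e.g.: call_with_timeout("git clone --single-branch <url> /tmp/bitshift/<name>", 500)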
Add: bitshift/crawler/indexer.py -Add 'pkill git' to the 'git clone' subprocess in '_clone_repository()', to kill hanging remotes -- it's un-Pythonic, but, thus far, the only method that's proved successful. The RAM problem still persists; the latest dry-run lasted 01:11:00 before terminating due to a lack of allocatable memory. -Add exception names to `logging` messages. bitshift/assets -Update 'tag()' docstring to current 'bitshift' standards (add a ':type' and ':rtype:' field). --- bitshift/assets.py | 3 ++ bitshift/crawler/indexer.py | 74 +++++++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 32 deletions(-) diff --git a/bitshift/assets.py b/bitshift/assets.py index 5d15304..b4f597b 100644 --- a/bitshift/assets.py +++ b/bitshift/assets.py @@ -15,8 +15,11 @@ def tag(filename): :param filename: The filename of the asset to create a tag for. + :type filename: str + :return: A string containing a `` tag for JS files, and a `` for CSS files. + :rtype: str """ file_ext = filename.split(".")[-1] diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 7e82bb5..563f369 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -8,9 +8,6 @@ import bs4, logging, os, Queue, re, shutil, subprocess, time, threading from ..database import Database from ..codelet import Codelet -import pymongo #debug -db = pymongo.MongoClient().bitshift #debug - GIT_CLONE_DIR = "/tmp/bitshift" THREAD_QUEUE_SLEEP = 0.5 @@ -88,7 +85,6 @@ class GitIndexer(threading.Thread): while True: while self.index_queue.empty(): - logging.warning("Empty.") time.sleep(THREAD_QUEUE_SLEEP) repo = self.index_queue.get() @@ -154,20 +150,20 @@ class _GitCloner(threading.Thread): queue_percent_full, self.index_queue.qsize(), self.index_queue.maxsize)) - with _ChangeDir(GIT_CLONE_DIR) as git_clone_dir: - if subprocess.call("perl -e 'alarm shift @ARGV; exec @ARGV' %d git" - " clone %s %s" % (GIT_CLONE_TIMEOUT, repo.url, repo.name), - shell=True) != 0: - logging.debug("_clone_repository(): Cloning %s failed." % - repo.url) - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) - return - - while self.index_queue.full(): - time.sleep(THREAD_QUEUE_SLEEP) + command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone" + " --single-branch %s %s/%s || pkill -f git") + if subprocess.call(command % (GIT_CLONE_TIMEOUT, repo.url, + GIT_CLONE_DIR, repo.name), shell=True) != 0: + logging.warning("_clone_repository(): Cloning %s failed." % + repo.url) + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) + return - self.index_queue.put(repo) + while self.index_queue.full(): + time.sleep(THREAD_QUEUE_SLEEP) + + self.index_queue.put(repo) class _ChangeDir(object): """ @@ -229,8 +225,9 @@ def _index_repository(repo_url, repo_name, framework_name): _insert_repository_codelets(repo_url, repo_name, framework_name) except Exception as exception: - logging.warning("%s: _insert_repository_codelets failed %s." 
% - (exception, repo_url)) + logging.warning( + "_insert_repository_codelets() failed: %s: %s: %s" % + (exception.__class__.__name__, exception, repo_url)) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) @@ -254,10 +251,15 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name): commits_meta = _get_commits_metadata() for filename in commits_meta.keys(): - with open(filename, "r") as source_file: - source = _decode(source_file.read()) - if source is None: - return + try: + with open(filename, "r") as source_file: + source = _decode(source_file.read()) + if source is None: + return + except IOError as exception: + logging.warning( + "_insert_repository_codelets() failed: %s: %s: %s" % + (exception.__class__.__name__, exception, repo_url)) authors = [(_decode(author),) for author in \ commits_meta[filename]["authors"]] @@ -266,9 +268,6 @@ def _insert_repository_codelets(repo_url, repo_name, framework_name): framework_name), commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"]) - db.codelets.insert({ - "name" : codelet.name - }) # Database.insert(codelet) @@ -284,14 +283,24 @@ def _generate_file_url(filename, repo_url, framework_name): :type repo_url: str :type framework_name: str - :return: The file's full url on the given framework. - :rtype: str + :return: The file's full url on the given framework, if successfully + derived. + :rtype: str, or None + + .. warning:: + `git branch` will occasionally fail, and, seeing as its a crucial + component of GitHub's repository file urls, None will be returned. """ if framework_name == "GitHub": - default_branch = subprocess.check_output("git branch --no-color", - shell=True)[2:-1] - return "%s/blob/%s/%s" % (repo_url, default_branch, filename) + try: + default_branch = subprocess.check_output("git branch --no-color", + shell=True)[2:-1] + return "%s/blob/%s/%s" % (repo_url, default_branch, filename) + except CalledProcessError as exception: + logging.warning("_generate_file_url(): %s: %s", + exception.__class__.name, exception) + return None def _get_git_commits(): """ @@ -423,5 +432,6 @@ def _decode(raw): return raw.decode(encoding) if encoding is not None else None except Exception as exception: - logging.warning("_debug(): %s", exception) + logging.warning("_decode(): %s: %s", exception.__class__.__name__, + exception) return None From 93ed68645d760d36a2eb169ed22c4fec1c99a129 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 18 Apr 2014 21:31:10 -0400 Subject: [PATCH 34/42] Add partially integrated BitbucketCrawler(). Add: bitshift/crawler/ __init__.py -Initialize 'BitbucketCrawler()' singleton. -Instantiate all thread instances on-the-fly in a 'threads' array, as opposed to individual named variables. crawler.py -Add 'BitbucketCrawler()', to crawl Bitbucket for repositories. -Not entirely tested for proper functionality. -The Bitbucket framework is not yet accounted for in 'indexer._generate_file_url()'. --- bitshift/crawler/crawler.py | 72 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 6 deletions(-) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 8509c6d..347fd9a 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -15,14 +15,13 @@ class GitHubCrawler(threading.Thread): """ Crawler that retrieves links to all of GitHub's public repositories. 
- GitHubCrawler is a threaded singleton that queries GitHub's API for URLs + GitHubCrawler is a threaded singleton that queries GitHub's API for urls to its public repositories, which it inserts into a :class:`Queue.Queue` - shared with :class:`bitshift.crawler.indexer.GitIndexer`. + shared with :class:`indexer.GitIndexer`. :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository` - with repository metadata retrieved by :class:`GitHubCrawler`, and other - Git crawlers, to be processed by - :class:`bitshift.crawler.indexer.GitIndexer`. + with repository metadata retrieved by :class:`GitHubCrawler`, and other Git + crawlers, to be processed by :class:`indexer.GitIndexer`. """ def __init__(self, clone_queue): @@ -35,7 +34,7 @@ class GitHubCrawler(threading.Thread): """ self.clone_queue = clone_queue - logging.info("Starting.") + logging.info("Starting %s." % self.__class__.__name__) super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -84,3 +83,64 @@ class GitHubCrawler(threading.Thread): sleep_time = api_request_interval - (time.time() - start_time) if sleep_time > 0: time.sleep(sleep_time) + +class BitbucketCrawler(threading.Thread): + """ + Crawler that retrieves links to all of Bitbucket's public repositories. + + BitbucketCrawler is a threaded singleton that queries Bitbucket's API for + urls to its public repositories, and inserts them as + :class:`indexer.GitRepository` into a :class:`Queue.Queue` shared with + :class:`indexer.GitIndexer`. + + :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert + :class:`indexer.GitRepository` repository urls into. + """ + + def __init__(self, clone_queue): + """ + Create an instance of the singleton `BitbucketCrawler`. + + :param clone_queue: see :attr:`self.clone_queue` + + :type clone_queue: see :attr:`self.clone_queue` + """ + + self.clone_queue = clone_queue + logging.info("Starting %s." % self.__class__.__name__) + super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) + + def run(self): + """ + Query the Bitbucket API for data about every public repository. + + Query the Bitbucket API's "/repositories" endpoint and read its + paginated responses in a loop; any "git" repositories have their + clone-urls and names inserted into a :class:`indexer.GitRepository` in + :attr:`self.clone_queue`. + """ + + next_api_url = "https://api.bitbucket.org/2.0/repositories" + + while True: + response = requests.get(next_api_url).json() + + queue_percent_full = (float(self.clone_queue.qsize()) / + self.clone_queue.maxsize) * 100 + logging.info("API call made. Queue-size: (%d%%) %d/%d" % ( + queue_percent_full, self.clone_queue.qsize(), + self.clone_queue.maxsize)) + + for repo in response["values"]: + if repo["scm"] == "git": + while self.clone_queue.full(): + time.sleep(1) + + clone_links = repo["links"]["clone"] + clone_url = (clone[0]["href"] if clone[0]["name"] == "https" + else clone[1]["href"]) + links.append("clone_url") + self.clone_queue.put(indexer.GitRepository( + clone_url, repo["full_name"], "Bitbucket")) + + next_api_url = response["next"] From 2954161747106b000d6e1a70ed2f1e32bf46cad6 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Fri, 18 Apr 2014 21:31:10 -0400 Subject: [PATCH 35/42] Add partially integrated BitbucketCrawler(). Add: bitshift/crawler/ __init__.py -Initialize 'BitbucketCrawler()' singleton. -Instantiate all thread instances on-the-fly in a 'threads' array, as opposed to individual named variables. 
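The pagination and clone-link selection performed by `BitbucketCrawler.run()` (a variable-name slip in which a later patch in this series corrects) can be summarized in isolation; a rough sketch against the response shape shown above:

.. code-block:: python

    # Sketch: walk Bitbucket's paginated /2.0/repositories feed, yielding the
    # https clone url and full name of every Git (non-Mercurial) repository.
    import requests

    def bitbucket_git_repos():
        next_url = "https://api.bitbucket.org/2.0/repositories"
        while next_url:
            page = requests.get(next_url).json()
            for repo in page.get("values", []):
                if repo["scm"] != "git":
                    continue
                https = [link["href"] for link in repo["links"]["clone"]
                         if link["name"] == "https"]
                if https:
                    yield https[0], repo["full_name"]
            next_url = page.get("next")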
crawler.py -Add 'BitbucketCrawler()', to crawl Bitbucket for repositories. -Not entirely tested for proper functionality. -The Bitbucket framework is not yet accounted for in 'indexer._generate_file_url()'. --- bitshift/crawler/__init__.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index 39a1a28..75e8b61 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -15,20 +15,22 @@ def crawl(): Initialize all crawlers (and indexers). Start the: - 1. GitHub crawler, :class:`bitshift.crawler.crawler.GitHubCrawler` - 2. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer` + 1. GitHub crawler, :class:`crawler.GitHubCrawler`. + 2. Bitbucket crawler, :class:`crawler.BitbucketCrawler`. + 3. Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`. """ MAX_URL_QUEUE_SIZE = 5e3 DEBUG_FILE = "crawler.log" logging.basicConfig(filename=DEBUG_FILE, - format="%(asctime)s:\t%(threadName)s:\t%(message)s", + format="%(levelname)s %(asctime)s:\t%(threadName)s:\t%(message)s", level=logging.DEBUG) - repository_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) - github_crawler = crawler.GitHubCrawler(repository_queue) - git_indexer = indexer.GitIndexer(repository_queue) + repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) + threads = [crawler.GitHubCrawler(repo_clone_queue), + crawler.BitbucketCrawler(repo_clone_queue), + indexer.GitIndexer(repo_clone_queue)] - for thread in [github_crawler, git_indexer]: + for thread in threads: thread.start() From f38772760b6dbe46410ca87407c7dab919079c3f Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 19 Apr 2014 15:33:21 -0400 Subject: [PATCH 36/42] Remove some subprocesses, comment out logging. Add: bitshift/crawler/ (crawler, indexer).py -comment out all logging statements, as they may be causing a memory leak (the crawler is meant to run perpetually, meaning that, depending on how the `logging` module is implemented, it may be accumulating logged strings in memory.) bitshift/crawler/indexer.py -make `_index_repository()` and `_index_repository_codelets()` functions of the `GitIndexer` class. -replace `_get_tracked_files()` subprocess call, which found the files in a Git repository and removed any that were non-ASCII, with a pure Python solution. -add `_is_ascii()`. --- bitshift/crawler/crawler.py | 18 +-- bitshift/crawler/indexer.py | 269 ++++++++++++++++++++++++++++---------------- 2 files changed, 181 insertions(+), 106 deletions(-) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 347fd9a..10dd961 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -34,7 +34,7 @@ class GitHubCrawler(threading.Thread): """ self.clone_queue = clone_queue - logging.info("Starting %s." % self.__class__.__name__) + # logging.info("Starting %s." % self.__class__.__name__) super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -61,10 +61,10 @@ class GitHubCrawler(threading.Thread): queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 - logging.info("API call made. Limit remaining: %s. Queue-size: (%d" - "%%) %d/%d" % (response.headers["x-ratelimit-remaining"], - queue_percent_full, self.clone_queue.qsize(), - self.clone_queue.maxsize)) + # logging.info("API call made. Limit remaining: %s. 
Queue-size: (%d" + # "%%) %d/%d" % (response.headers["x-ratelimit-remaining"], + # queue_percent_full, self.clone_queue.qsize(), + # self.clone_queue.maxsize)) for repo in response.json(): while self.clone_queue.full(): @@ -107,7 +107,7 @@ class BitbucketCrawler(threading.Thread): """ self.clone_queue = clone_queue - logging.info("Starting %s." % self.__class__.__name__) + # logging.info("Starting %s." % self.__class__.__name__) super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -127,9 +127,9 @@ class BitbucketCrawler(threading.Thread): queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 - logging.info("API call made. Queue-size: (%d%%) %d/%d" % ( - queue_percent_full, self.clone_queue.qsize(), - self.clone_queue.maxsize)) + # logging.info("API call made. Queue-size: (%d%%) %d/%d" % ( + # queue_percent_full, self.clone_queue.qsize(), + # self.clone_queue.maxsize)) for repo in response["values"]: if repo["scm"] == "git": diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 563f369..3bff3e7 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,7 @@ repositories. """ -import bs4, logging, os, Queue, re, shutil, subprocess, time, threading +import bs4, logging, os, Queue, re, shutil, string, subprocess, time, threading from ..database import Database from ..codelet import Codelet @@ -63,10 +63,12 @@ class GitIndexer(threading.Thread): MAX_INDEX_QUEUE_SIZE = 10 - logging.info("Starting.") + # logging.info("Starting.") + self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.git_cloner = _GitCloner(clone_queue, self.index_queue) self.git_cloner.start() + self.codelet_count = 0 #debug if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) @@ -89,14 +91,91 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - _index_repository(repo.url, repo.name, repo.framework_name) + self._index_repository(repo.url, repo.name, repo.framework_name) + + def _index_repository(self, repo_url, repo_name, framework_name): + """ + Clone and index (create and insert Codeletes for) a Git repository. + + `git clone` the Git repository located at **repo_url**, call + _insert_repository_codelets, then remove said repository. + + :param repo_url: The url the Git repository was cloned from. + :param repo_name: The name of the repository. + :param framework_name: The name of the framework the repository is from. + + :type repo_url: str + :type repo_name: str + :type framework_name: str + """ + + # logging.info("Indexing repository %s." % repo_url) + with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: + try: + self._insert_repository_codelets(repo_url, repo_name, + framework_name) + except Exception as exception: + # logging.warning( + # "_insert_repository_codelets() failed: %s: %s: %s" % + # (exception.__class__.__name__, exception, repo_url)) + pass + + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + + def _insert_repository_codelets(self, repo_url, repo_name, framework_name): + """ + Create and insert a Codelet for the files inside a Git repository. + + Create a new Codelet, and insert it into the Database singleton, for every + file inside the current working directory's default branch (usually + *master*). + + :param repo_url: The url the Git repository was cloned from. + :param repo_name: The name of the repository. 
+ :param framework_name: The name of the framework the repository is from. + + :type repo_url: str + :type repo_name: str + :type framework_name: str + """ + + commits_meta = _get_commits_metadata() + for filename in commits_meta.keys(): + try: + with open(filename, "r") as source_file: + source = _decode(source_file.read()) + if source is None: + return + except IOError as exception: + # logging.warning( + # "_insert_repository_codelets() failed: %s: %s: %s" % + # (exception.__class__.__name__, exception, repo_url)) + pass + + authors = [(_decode(author),) for author in \ + commits_meta[filename]["authors"]] + codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, + None, authors, _generate_file_url(filename, repo_url, + framework_name), + commits_meta[filename]["time_created"], + commits_meta[filename]["time_last_modified"]) + + self.codelet_count += 1 #debug + if self.codelet_count % 500 == 0: #debug + logging.info("Number of codelets indexed: %d.", self.codelet_count) #debug + + # Database.insert(codelet) class _GitCloner(threading.Thread): """ A singleton Git repository cloner. + Clones the repositories crawled by :class:`crawler.GitHubCrawler` for + :class:`GitIndexer` to index. + :ivar clone_queue: (:class:`Queue.Queue`) see - :attr:`bitshift.crawler.crawler.GitHubCrawler.clone_queue`. + :attr:`crawler.GitHubCrawler.clone_queue`. :ivar index_queue: (:class:`Queue.Queue`) see :attr:`GitIndexer.index_queue`. """ @@ -112,6 +191,8 @@ class _GitCloner(threading.Thread): :type index_queue: see :attr:`self.index_queue` """ + # logging.info("Starting.") + self.clone_queue = clone_queue self.index_queue = index_queue super(_GitCloner, self).__init__(name=self.__class__.__name__) @@ -146,16 +227,29 @@ class _GitCloner(threading.Thread): queue_percent_full = (float(self.index_queue.qsize()) / self.index_queue.maxsize) * 100 - logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url, - queue_percent_full, self.index_queue.qsize(), - self.index_queue.maxsize)) + # logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url, + # queue_percent_full, self.index_queue.qsize(), + # self.index_queue.maxsize)) + exit_code = None command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone" " --single-branch %s %s/%s || pkill -f git") - if subprocess.call(command % (GIT_CLONE_TIMEOUT, repo.url, - GIT_CLONE_DIR, repo.name), shell=True) != 0: - logging.warning("_clone_repository(): Cloning %s failed." % - repo.url) + + while exit_code is None: + try: + exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT, + repo.url, GIT_CLONE_DIR, repo.name), shell=True) + except: + # logging.warning("_clone_repository() failed: %s: %s", + # exception.__class__.__name__, exception) + time.sleep(1) + continue + else: + break + + if exit_code != 0: + # logging.warning("_clone_repository(): Cloning %s failed." % + # repo.url) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) return @@ -203,74 +297,6 @@ class _ChangeDir(object): os.chdir(self.old_path) -def _index_repository(repo_url, repo_name, framework_name): - """ - Clone and index (create and insert Codeletes for) a Git repository. - - `git clone` the Git repository located at **repo_url**, call - _insert_repository_codelets, then remove said repository. - - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. 
- - :type repo_url: str - :type repo_name: str - :type framework_name: str - """ - - logging.info("Indexing repository %s." % repo_url) - with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: - try: - _insert_repository_codelets(repo_url, repo_name, - framework_name) - except Exception as exception: - logging.warning( - "_insert_repository_codelets() failed: %s: %s: %s" % - (exception.__class__.__name__, exception, repo_url)) - - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) - -def _insert_repository_codelets(repo_url, repo_name, framework_name): - """ - Create and insert a Codelet for the files inside a Git repository. - - Create a new Codelet, and insert it into the Database singleton, for every - file inside the current working directory's default branch (usually - *master*). - - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. - - :type repo_url: str - :type repo_name: str - :type framework_name: str - """ - - commits_meta = _get_commits_metadata() - for filename in commits_meta.keys(): - try: - with open(filename, "r") as source_file: - source = _decode(source_file.read()) - if source is None: - return - except IOError as exception: - logging.warning( - "_insert_repository_codelets() failed: %s: %s: %s" % - (exception.__class__.__name__, exception, repo_url)) - - authors = [(_decode(author),) for author in \ - commits_meta[filename]["authors"]] - codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, - None, authors, _generate_file_url(filename, repo_url, - framework_name), - commits_meta[filename]["time_created"], - commits_meta[filename]["time_last_modified"]) - - # Database.insert(codelet) - def _generate_file_url(filename, repo_url, framework_name): """ Return a url for a filename from a Git wrapper framework. @@ -288,19 +314,25 @@ def _generate_file_url(filename, repo_url, framework_name): :rtype: str, or None .. warning:: - `git branch` will occasionally fail, and, seeing as its a crucial - component of GitHub's repository file urls, None will be returned. + Various Git subprocesses will occasionally fail, and, seeing as the + information they provide is a crucial component of some repository file + urls, None may be returned. 
""" - if framework_name == "GitHub": - try: - default_branch = subprocess.check_output("git branch --no-color", - shell=True)[2:-1] - return "%s/blob/%s/%s" % (repo_url, default_branch, filename) - except CalledProcessError as exception: - logging.warning("_generate_file_url(): %s: %s", - exception.__class__.name, exception) - return None + try: + if framework_name == "GitHub": + default_branch = subprocess.check_output("git branch" + " --no-color", shell=True)[2:-1] + return ("%s/blob/%s/%s" % (repo_url, default_branch, + filename)).replace("//", "/") + elif framework_name == "Bitbucket": + commit_hash = subprocess.check_output("git rev-parse HEAD", + shell=True).replace("\n", "") + return ("%s/src/%s/%s" % (repo_url, commit_hash, + filename)).replace("//", "/") + except subprocess.CalledProcessError as exception: + # logging.warning("_generate_file_url() failed: %s", exception) + return None def _get_git_commits(): """ @@ -354,12 +386,15 @@ def _get_tracked_files(): GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?", "md(wn|t[e]?xt)?", "rst"] - tracked_files = subprocess.check_output(("perl -le 'for (@ARGV){ print if \ - -f && -T }' $(find . -type d -name .git -prune -o -print)"), - shell=True).split("\n")[:-1] + files = [] + for dirname, subdir_names, filenames in os.walk("."): + for filename in filenames: + path = os.path.join(dirname, filename) + if _is_ascii(path): + files.append(path) valuable_files = [] - for filename in tracked_files: + for filename in files: filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE) for pattern in GIT_IGNORE_FILES]) extension = filename.split(".")[-1] @@ -431,7 +466,47 @@ def _decode(raw): encoding = bs4.BeautifulSoup(raw).original_encoding return raw.decode(encoding) if encoding is not None else None - except Exception as exception: - logging.warning("_decode(): %s: %s", exception.__class__.__name__, - exception) + except (LookupError, UnicodeDecodeError, UserWarning) as exception: + # logging.warning("_decode() failed: %s: %s", + # exception.__class__.__name__, exception) return None + +def _is_ascii(filename): + """ + Heuristically determine whether a file is ASCII text or binary. + + If a portion of the file contains null bytes, or the percentage of bytes + that aren't ASCII is greater than 30%, then the file is concluded to be + binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T` + operator, and is the de-facto method for in : passdetermining whether a + file is ASCII. + + :param filename: The path of the file to test. + + :type filename: str + + :return: Whether the file is probably ASCII. + :rtype: Boolean + """ + + try: + with open(filename) as source: + file_snippet = source.read(512) + + if not file_snippet: + return True + + ascii_characters = "".join(map(chr, range(32, 127)) + + list("\n\r\t\b")) + null_trans = string.maketrans("", "") + + if "\0" in file_snippet: + return False + + non_ascii = file_snippet.translate(null_trans, ascii_characters) + return not float(len(non_ascii)) / len(file_snippet) > 0.30 + + except IOError as exception: + # logging.warning("_is_ascii() failed: %s: %s", + # exception.__class__.__name__, exception) + return False From ad7ce9d9cf1b5f267efae2832d26749c47b52609 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Tue, 29 Apr 2014 12:53:49 -0400 Subject: [PATCH 37/42] Commit latest crawler, continue fix of #8. Add: bitshift/crawler/*.py -Remove use of the `logging` module, which appeared to be causing a memory leak even with log-file rotation. 
--- app.py | 4 ++- bitshift/crawler/__init__.py | 7 +----- bitshift/crawler/crawler.py | 30 +++++++++++----------- bitshift/crawler/indexer.py | 60 ++++++++++++++++++-------------------------- setup.py | 2 +- 5 files changed, 45 insertions(+), 58 deletions(-) diff --git a/app.py b/app.py index c4083c9..6a77b97 100644 --- a/app.py +++ b/app.py @@ -5,7 +5,9 @@ Module to contain all the project's Flask server plumbing. from flask import Flask from flask import render_template, session +from bitshift import assets from bitshift.query import parse_query +from bitshift.crawler import crawl app = Flask(__name__) app.config.from_object("bitshift.config") @@ -25,4 +27,4 @@ def search(query): pass if __name__ == "__main__": - app.run() + crawl() diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index 75e8b61..b4ad922 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -4,7 +4,7 @@ Contains functions for initializing all subsidiary, threaded crawlers. """ -import logging, Queue +import os, Queue from bitshift.crawler import crawler, indexer @@ -21,11 +21,6 @@ def crawl(): """ MAX_URL_QUEUE_SIZE = 5e3 - DEBUG_FILE = "crawler.log" - - logging.basicConfig(filename=DEBUG_FILE, - format="%(levelname)s %(asctime)s:\t%(threadName)s:\t%(message)s", - level=logging.DEBUG) repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) threads = [crawler.GitHubCrawler(repo_clone_queue), diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 10dd961..6196a13 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -4,7 +4,7 @@ Contains all website/framework-specific Class crawlers. """ -import logging, requests, time, threading +import requests, time, threading from bitshift.crawler import indexer @@ -34,7 +34,6 @@ class GitHubCrawler(threading.Thread): """ self.clone_queue = clone_queue - # logging.info("Starting %s." % self.__class__.__name__) super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -57,14 +56,15 @@ class GitHubCrawler(threading.Thread): while len(next_api_url) > 0: start_time = time.time() - response = requests.get(next_api_url, params=authentication_params) + + try: + response = requests.get(next_api_url, + params=authentication_params) + except ConnectionError as exception: + continue queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 - # logging.info("API call made. Limit remaining: %s. Queue-size: (%d" - # "%%) %d/%d" % (response.headers["x-ratelimit-remaining"], - # queue_percent_full, self.clone_queue.qsize(), - # self.clone_queue.maxsize)) for repo in response.json(): while self.clone_queue.full(): @@ -107,7 +107,6 @@ class BitbucketCrawler(threading.Thread): """ self.clone_queue = clone_queue - # logging.info("Starting %s." % self.__class__.__name__) super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -123,13 +122,14 @@ class BitbucketCrawler(threading.Thread): next_api_url = "https://api.bitbucket.org/2.0/repositories" while True: - response = requests.get(next_api_url).json() + try: + response = requests.get(next_api_url).json() + except ConnectionError as exception: + time.sleep(0.5) + continue queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 - # logging.info("API call made. 
Queue-size: (%d%%) %d/%d" % ( - # queue_percent_full, self.clone_queue.qsize(), - # self.clone_queue.maxsize)) for repo in response["values"]: if repo["scm"] == "git": @@ -137,10 +137,12 @@ class BitbucketCrawler(threading.Thread): time.sleep(1) clone_links = repo["links"]["clone"] - clone_url = (clone[0]["href"] if clone[0]["name"] == "https" - else clone[1]["href"]) + clone_url = (clone_links[0]["href"] if + clone_links[0]["name"] == "https" else + clone_links[1]["href"]) links.append("clone_url") self.clone_queue.put(indexer.GitRepository( clone_url, repo["full_name"], "Bitbucket")) next_api_url = response["next"] + time.sleep(0.2) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 3bff3e7..d2ef907 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,7 @@ repositories. """ -import bs4, logging, os, Queue, re, shutil, string, subprocess, time, threading +import bs4, os, Queue, re, shutil, string, subprocess, time, threading from ..database import Database from ..codelet import Codelet @@ -63,12 +63,9 @@ class GitIndexer(threading.Thread): MAX_INDEX_QUEUE_SIZE = 10 - # logging.info("Starting.") - self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.git_cloner = _GitCloner(clone_queue, self.index_queue) self.git_cloner.start() - self.codelet_count = 0 #debug if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) @@ -91,7 +88,10 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - self._index_repository(repo.url, repo.name, repo.framework_name) + try: + self._index_repository(repo.url, repo.name, repo.framework_name) + except Exception as exception: + pass def _index_repository(self, repo_url, repo_name, framework_name): """ @@ -109,15 +109,11 @@ class GitIndexer(threading.Thread): :type framework_name: str """ - # logging.info("Indexing repository %s." 
% repo_url) with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: try: self._insert_repository_codelets(repo_url, repo_name, framework_name) except Exception as exception: - # logging.warning( - # "_insert_repository_codelets() failed: %s: %s: %s" % - # (exception.__class__.__name__, exception, repo_url)) pass if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): @@ -141,17 +137,18 @@ class GitIndexer(threading.Thread): """ commits_meta = _get_commits_metadata() + if commits_meta is None: + return + for filename in commits_meta.keys(): try: - with open(filename, "r") as source_file: + source = "" + with open(filename) as source_file: source = _decode(source_file.read()) if source is None: - return + continue except IOError as exception: - # logging.warning( - # "_insert_repository_codelets() failed: %s: %s: %s" % - # (exception.__class__.__name__, exception, repo_url)) - pass + continue authors = [(_decode(author),) for author in \ commits_meta[filename]["authors"]] @@ -161,10 +158,6 @@ class GitIndexer(threading.Thread): commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"]) - self.codelet_count += 1 #debug - if self.codelet_count % 500 == 0: #debug - logging.info("Number of codelets indexed: %d.", self.codelet_count) #debug - # Database.insert(codelet) class _GitCloner(threading.Thread): @@ -191,8 +184,6 @@ class _GitCloner(threading.Thread): :type index_queue: see :attr:`self.index_queue` """ - # logging.info("Starting.") - self.clone_queue = clone_queue self.index_queue = index_queue super(_GitCloner, self).__init__(name=self.__class__.__name__) @@ -212,7 +203,11 @@ class _GitCloner(threading.Thread): time.sleep(THREAD_QUEUE_SLEEP) repo = self.clone_queue.get() self.clone_queue.task_done() - self._clone_repository(repo) + + try: + self._clone_repository(repo) + except Exception as exception: + pass def _clone_repository(self, repo): """ @@ -227,29 +222,27 @@ class _GitCloner(threading.Thread): queue_percent_full = (float(self.index_queue.qsize()) / self.index_queue.maxsize) * 100 - # logging.info("Cloning %s. Queue-size: (%d%%) %d/%d" % (repo.url, - # queue_percent_full, self.index_queue.qsize(), - # self.index_queue.maxsize)) exit_code = None command = ("perl -e 'alarm shift @ARGV; exec @ARGV' %d git clone" " --single-branch %s %s/%s || pkill -f git") + command_attempt = 0 while exit_code is None: try: exit_code = subprocess.call(command % (GIT_CLONE_TIMEOUT, repo.url, GIT_CLONE_DIR, repo.name), shell=True) - except: - # logging.warning("_clone_repository() failed: %s: %s", - # exception.__class__.__name__, exception) + except Exception as exception: time.sleep(1) - continue + command_attempt += 1 + if command_attempt == 20: + break + else: + continue else: break if exit_code != 0: - # logging.warning("_clone_repository(): Cloning %s failed." 
% - # repo.url) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) return @@ -331,7 +324,6 @@ def _generate_file_url(filename, repo_url, framework_name): return ("%s/src/%s/%s" % (repo_url, commit_hash, filename)).replace("//", "/") except subprocess.CalledProcessError as exception: - # logging.warning("_generate_file_url() failed: %s", exception) return None def _get_git_commits(): @@ -467,8 +459,6 @@ def _decode(raw): return raw.decode(encoding) if encoding is not None else None except (LookupError, UnicodeDecodeError, UserWarning) as exception: - # logging.warning("_decode() failed: %s: %s", - # exception.__class__.__name__, exception) return None def _is_ascii(filename): @@ -507,6 +497,4 @@ def _is_ascii(filename): return not float(len(non_ascii)) / len(file_snippet) > 0.30 except IOError as exception: - # logging.warning("_is_ascii() failed: %s: %s", - # exception.__class__.__name__, exception) return False diff --git a/setup.py b/setup.py index 1faa5b9..0f9fc84 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( version = "0.1", packages = find_packages(), install_requires = ["Flask>=0.10.1", "pygments>=1.6", "requests>=2.2.0", - "BeautifulSoup>=3.2.1"], + "beautifulsoup4>=3.2.1", "oursql>=0.9.3.1"], author = "Benjamin Attal, Ben Kurtovic, Severyn Kozak", license = "MIT", url = "https://github.com/earwig/bitshift" From 1b2739f8c4439219d18a5f4f3d9bd02d3360ef85 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Wed, 30 Apr 2014 15:20:15 -0400 Subject: [PATCH 38/42] Add GitHub repo star count, simple logging. Add: bitshift/crawler/crawler.py -add `_get_repo_stars()` to `GitHubCrawler`, which queries the GitHub API for the number of a stars that a given repository has. -log the `next_api_url` every time it's generated by `GitHubCrawler` and `BitbucketCrawler` to two respective log-files. --- bitshift/crawler/crawler.py | 51 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 6196a13..e4b4929 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -24,6 +24,11 @@ class GitHubCrawler(threading.Thread): crawlers, to be processed by :class:`indexer.GitIndexer`. """ + AUTHENTICATION = { + "client_id" : "436cb884ae09be7f2a4e", + "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" + } + def __init__(self, clone_queue): """ Create an instance of the singleton `GitHubCrawler`. 
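Stripped of queueing and logging, the GitHub half of the crawl loop is a paginated walk of `/repositories` that honors the rate-limit headers; a bare-bones sketch using the same endpoint and headers referenced above (the client id and secret are placeholders):

.. code-block:: python

    # Sketch: page through GitHub's /repositories endpoint, sleeping whenever
    # the remaining rate-limit quota hits zero, and following the "next" link
    # advertised in the Link response header.
    import time, requests

    def github_repo_pages(client_id, client_secret):
        url = "https://api.github.com/repositories"
        params = {"client_id": client_id, "client_secret": client_secret}
        while url:
            resp = requests.get(url, params=params)
            yield resp.json()
            if int(resp.headers["x-ratelimit-remaining"]) == 0:
                time.sleep(max(0,
                        int(resp.headers["x-ratelimit-reset"]) - time.time()))
            url = resp.headers.get("link", "").split(">")[0][1:]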
@@ -48,10 +53,6 @@ class GitHubCrawler(threading.Thread): """ next_api_url = "https://api.github.com/repositories" - authentication_params = { - "client_id" : "436cb884ae09be7f2a4e", - "client_secret" : "8deeefbc2439409c5b7a092fd086772fe8b1f24e" - } api_request_interval = 5e3 / 60 ** 2 while len(next_api_url) > 0: @@ -59,7 +60,7 @@ class GitHubCrawler(threading.Thread): try: response = requests.get(next_api_url, - params=authentication_params) + params=self.AUTHENTICATION) except ConnectionError as exception: continue @@ -76,14 +77,49 @@ class GitHubCrawler(threading.Thread): if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - - time.time()) + time.time()) next_api_url = response.headers["link"].split(">")[0][1:] + with open(".github_api.log", "w") as log_file: + log_file.write("%s\n" % next_api_url) sleep_time = api_request_interval - (time.time() - start_time) if sleep_time > 0: time.sleep(sleep_time) + def _get_repo_stars(self, repo_name): + """ + Return the number of stargazers for a repository. + + Queries the GitHub API for the number of stargazers for a given + repository, and blocks if the query limit is exceeded. + + :param repo_name: The name of the repository, in + `username/repository_name` format. + + :type repo_name: str + + :return: The number of stargazers for the repository. + :rtype: int + """ + + API_URL = "https://api.github.com/search/repositories" + + + params = self.AUTHENTICATION + params["q"] = "repo:%s" % repo_name + + resp = requests.get(API_URL, + params=params, + headers={ + "Accept" : "application/vnd.github.preview" + }) + + if int(resp.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(resp.headers["x-ratelimit-reset"]) - time.time()) + + return int(resp.json()["items"][0]["stargazers_count"]) + class BitbucketCrawler(threading.Thread): """ Crawler that retrieves links to all of Bitbucket's public repositories. @@ -145,4 +181,7 @@ class BitbucketCrawler(threading.Thread): clone_url, repo["full_name"], "Bitbucket")) next_api_url = response["next"] + with open(".bitbucket_api.log", "w") as log_file: + log_file.write("%s\n" % next_api_url) + time.sleep(0.2) From 6762c1fa3db340f96e06ee7f5ab371c20decd2e3 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 3 May 2014 15:06:03 -0400 Subject: [PATCH 39/42] Re-add logging, rem file filters. Add: bitshift/ __init__.py -add `_configure_logging()`, which sets up a more robust logging infrastructure than was previously used: log files are rotated once per hour, and have some additional formatting rules. (crawler, indexer).py -add hierarchically-descending loggers to individual threaded classes (`GitHubCrawler`, `GitIndexer`, etc.); add logging calls. indexer.py -remove file filtering regex matches from `_get_tracked_files()`, as non-code files will be discarded by the parsers. --- bitshift/crawler/__init__.py | 25 +++++++++- bitshift/crawler/crawler.py | 46 +++++++++++++---- bitshift/crawler/indexer.py | 116 ++++++++++++++++++++----------------------- 3 files changed, 114 insertions(+), 73 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index b4ad922..cfec64c 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -4,7 +4,7 @@ Contains functions for initializing all subsidiary, threaded crawlers. """ -import os, Queue +import logging, logging.handlers, os, Queue from bitshift.crawler import crawler, indexer @@ -20,6 +20,8 @@ def crawl(): 3. 
Git indexer, :class:`bitshift.crawler.indexer.GitIndexer`. """ + _configure_logging() + MAX_URL_QUEUE_SIZE = 5e3 repo_clone_queue = Queue.Queue(maxsize=MAX_URL_QUEUE_SIZE) @@ -29,3 +31,24 @@ def crawl(): for thread in threads: thread.start() + +def _configure_logging(): + LOG_FILE_DIR = "log" + + if not os.path.exists(LOG_FILE_DIR): + os.mkdir(LOG_FILE_DIR) + + logging.getLogger("requests").setLevel(logging.WARNING) + + formatter = logging.Formatter( + fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s" + " %(message)s"), datefmt="%y-%m-%d %H:%M:%S") + + handler = logging.handlers.TimedRotatingFileHandler( + "%s/%s" % (LOG_FILE_DIR, "app.log"), when="H", interval=1, + backupCount=20) + handler.setFormatter(formatter) + + root_logger = logging.getLogger() + root_logger.addHandler(handler) + root_logger.setLevel(logging.NOTSET) diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index e4b4929..785ac61 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -4,7 +4,7 @@ Contains all website/framework-specific Class crawlers. """ -import requests, time, threading +import logging, requests, time, threading from bitshift.crawler import indexer @@ -22,6 +22,7 @@ class GitHubCrawler(threading.Thread): :ivar clone_queue: (:class:`Queue.Queue`) Contains :class:`GitRepository` with repository metadata retrieved by :class:`GitHubCrawler`, and other Git crawlers, to be processed by :class:`indexer.GitIndexer`. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. """ AUTHENTICATION = { @@ -39,6 +40,9 @@ class GitHubCrawler(threading.Thread): """ self.clone_queue = clone_queue + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") super(GitHubCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -61,11 +65,17 @@ class GitHubCrawler(threading.Thread): try: response = requests.get(next_api_url, params=self.AUTHENTICATION) - except ConnectionError as exception: + except ConnectionError as excep: + self._logger.warning("API %s call failed: %s: %s", + next_api_url, excep.__class__.__name__, excep) + time.sleep(0.5) continue queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 + self._logger.info("API call made. Queue size: %d/%d, %d%%." % + ((self.clone_queue.qsize(), self.clone_queue.maxsize, + queue_percent_full))) for repo in response.json(): while self.clone_queue.full(): @@ -73,15 +83,15 @@ class GitHubCrawler(threading.Thread): self.clone_queue.put(indexer.GitRepository( repo["html_url"], repo["full_name"].replace("/", ""), - "GitHub")) + "GitHub", + #self._get_repo_stars(repo["full_name"])) + 0)) if int(response.headers["x-ratelimit-remaining"]) == 0: time.sleep(int(response.headers["x-ratelimit-reset"]) - time.time()) next_api_url = response.headers["link"].split(">")[0][1:] - with open(".github_api.log", "w") as log_file: - log_file.write("%s\n" % next_api_url) sleep_time = api_request_interval - (time.time() - start_time) if sleep_time > 0: @@ -105,7 +115,6 @@ class GitHubCrawler(threading.Thread): API_URL = "https://api.github.com/search/repositories" - params = self.AUTHENTICATION params["q"] = "repo:%s" % repo_name @@ -116,9 +125,18 @@ class GitHubCrawler(threading.Thread): }) if int(resp.headers["x-ratelimit-remaining"]) == 0: - time.sleep(int(resp.headers["x-ratelimit-reset"]) - time.time()) + sleep_time = int(resp.headers["x-ratelimit-reset"]) - time.time() + if sleep_time > 0: + logging.info("API quota exceeded. 
Sleep time: %d." % sleep_time) + time.sleep(sleep_time) - return int(resp.json()["items"][0]["stargazers_count"]) + if "items" not in resp.json() or len(resp.json()["items"]) == 0: + self._logger.critical("No API result: %s. Result: %s" % (resp.url, + str(resp.json()))) + return 0 + else: + rank = float(resp.json()["items"][0]["stargazers_count"]) / 1000 + return rank if rank < 1.0 else 1.0 class BitbucketCrawler(threading.Thread): """ @@ -131,6 +149,7 @@ class BitbucketCrawler(threading.Thread): :ivar clone_queue: (:class:`Queue.Queue`) The shared queue to insert :class:`indexer.GitRepository` repository urls into. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. """ def __init__(self, clone_queue): @@ -143,6 +162,9 @@ class BitbucketCrawler(threading.Thread): """ self.clone_queue = clone_queue + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") super(BitbucketCrawler, self).__init__(name=self.__class__.__name__) def run(self): @@ -162,10 +184,15 @@ class BitbucketCrawler(threading.Thread): response = requests.get(next_api_url).json() except ConnectionError as exception: time.sleep(0.5) + self._logger.warning("API %s call failed: %s: %s", + next_api_url, excep.__class__.__name__, excep) continue queue_percent_full = (float(self.clone_queue.qsize()) / self.clone_queue.maxsize) * 100 + self._logger.info("API call made. Queue size: %d/%d, %d%%." % + ((self.clone_queue.qsize(), self.clone_queue.maxsize, + queue_percent_full))) for repo in response["values"]: if repo["scm"] == "git": @@ -181,7 +208,4 @@ class BitbucketCrawler(threading.Thread): clone_url, repo["full_name"], "Bitbucket")) next_api_url = response["next"] - with open(".bitbucket_api.log", "w") as log_file: - log_file.write("%s\n" % next_api_url) - time.sleep(0.2) diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index d2ef907..69c579c 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -3,7 +3,8 @@ repositories. """ -import bs4, os, Queue, re, shutil, string, subprocess, time, threading +import bs4, datetime, logging, os, Queue, re, shutil, string, subprocess, time,\ + threading from ..database import Database from ..codelet import Codelet @@ -11,6 +12,9 @@ from ..codelet import Codelet GIT_CLONE_DIR = "/tmp/bitshift" THREAD_QUEUE_SLEEP = 0.5 +import pymongo #debug +db = pymongo.MongoClient().bitshift #debug + class GitRepository(object): """ A representation of a Git repository's metadata. @@ -19,24 +23,29 @@ class GitRepository(object): :ivar name: (str) The name of the repository. :ivar framework_name: (str) The name of the online Git framework that the repository belongs to (eg, GitHub, BitBucket). + :ivar rank: (float) The rank of the repository, as assigned by + :class:`crawler.GitHubCrawler`. """ - def __init__(self, url, name, framework_name): + def __init__(self, url, name, framework_name, rank): """ Create a GitRepository instance. :param url: see :attr:`GitRepository.url` :param name: see :attr:`GitRepository.name` :param framework_name: see :attr:`GitRepository.framework_name` + :param rank: see :attr:`GitRepository.rank` :type url: str :type name: str :type framework_name: str + :type rank: float """ self.url = url self.name = name self.framework_name = framework_name + self.rank = rank class GitIndexer(threading.Thread): """ @@ -50,6 +59,7 @@ class GitIndexer(threading.Thread): cloned by :class:`_GitCloner`, which are to be indexed. 
:ivar git_cloner: (:class:`_GitCloner`) The corresponding repository cloner, which feeds :class:`GitIndexer`. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. """ def __init__(self, clone_queue): @@ -66,6 +76,9 @@ class GitIndexer(threading.Thread): self.index_queue = Queue.Queue(maxsize=MAX_INDEX_QUEUE_SIZE) self.git_cloner = _GitCloner(clone_queue, self.index_queue) self.git_cloner.start() + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") if not os.path.exists(GIT_CLONE_DIR): os.makedirs(GIT_CLONE_DIR) @@ -88,52 +101,43 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - try: - self._index_repository(repo.url, repo.name, repo.framework_name) - except Exception as exception: - pass + # try: + self._index_repository(repo) + # except Exception as excep: + # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) - def _index_repository(self, repo_url, repo_name, framework_name): + def _index_repository(self, repo): """ Clone and index (create and insert Codeletes for) a Git repository. - `git clone` the Git repository located at **repo_url**, call - _insert_repository_codelets, then remove said repository. + `git clone` the Git repository located at **repo.url**, call + `_insert_repository_codelets()`, then remove said repository. - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. + :param repo_url: The metadata of the repository to be indexed. - :type repo_url: str - :type repo_name: str - :type framework_name: str + :type repo_url: :class:`GitRepository` """ - with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo_name)) as repository_dir: - try: - self._insert_repository_codelets(repo_url, repo_name, - framework_name) - except Exception as exception: - pass + with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)) as repository_dir: + # try: + self._insert_repository_codelets(repo) + # except Exception as excep: + # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) - if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo_name)): - shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo_name)) + if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): + shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) - def _insert_repository_codelets(self, repo_url, repo_name, framework_name): + def _insert_repository_codelets(self, repo): """ Create and insert a Codelet for the files inside a Git repository. - Create a new Codelet, and insert it into the Database singleton, for every - file inside the current working directory's default branch (usually - *master*). + Create a new Codelet, and insert it into the Database singleton, for + every file inside the current working directory's default branch + (usually *master*). - :param repo_url: The url the Git repository was cloned from. - :param repo_name: The name of the repository. - :param framework_name: The name of the framework the repository is from. + :param repo_url: The metadata of the repository to be indexed. 
- :type repo_url: str - :type repo_name: str - :type framework_name: str + :type repo_url: :class:`GitRepository` """ commits_meta = _get_commits_metadata() @@ -142,7 +146,6 @@ class GitIndexer(threading.Thread): for filename in commits_meta.keys(): try: - source = "" with open(filename) as source_file: source = _decode(source_file.read()) if source is None: @@ -152,13 +155,14 @@ class GitIndexer(threading.Thread): authors = [(_decode(author),) for author in \ commits_meta[filename]["authors"]] - codelet = Codelet("%s:%s" % (repo_name, filename), source, filename, - None, authors, _generate_file_url(filename, repo_url, - framework_name), + codelet = Codelet("%s:%s" % (repo.name, filename), source, filename, + None, authors, _generate_file_url(filename, + repo.url, repo.framework_name), commits_meta[filename]["time_created"], - commits_meta[filename]["time_last_modified"]) + commits_meta[filename]["time_last_modified"], + repo.rank) - # Database.insert(codelet) + db.codelets.insert(codelet.__dict__) #debug class _GitCloner(threading.Thread): """ @@ -171,6 +175,7 @@ class _GitCloner(threading.Thread): :attr:`crawler.GitHubCrawler.clone_queue`. :ivar index_queue: (:class:`Queue.Queue`) see :attr:`GitIndexer.index_queue`. + :ivar _logger: (:class:`logging.Logger`) A class-specific logger object. """ def __init__(self, clone_queue, index_queue): @@ -186,6 +191,9 @@ class _GitCloner(threading.Thread): self.clone_queue = clone_queue self.index_queue = index_queue + self._logger = logging.getLogger("%s.%s" % + (__name__, self.__class__.__name__)) + self._logger.info("Starting.") super(_GitCloner, self).__init__(name=self.__class__.__name__) def run(self): @@ -339,11 +347,11 @@ def _get_git_commits(): sample_returned_array = [ { "author" : (str) "author" - "timestamp" : (int) 1396919293, + "timestamp" : (`datetime.datetime`) , "filenames" : (str array) ["file1", "file2"] } ] - :rtype: dictionary + :rtype: array of dictionaries """ git_log = subprocess.check_output(("git --no-pager log --name-only" @@ -355,7 +363,7 @@ def _get_git_commits(): if len(fields) > 2: commits.append({ "author" : fields[0], - "timestamp" : int(fields[1]), + "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), "filenames" : fields[2].split("\x00")[:-2] }) @@ -374,28 +382,14 @@ def _get_tracked_files(): :rtype: str array """ - GIT_IGNORE_FILES = [".*licen[cs]e.*", ".*readme.*"] - GIT_IGNORE_EXTENSIONS = ["t[e]?xt(ile)?", "m(ark)?down", "mkd[n]?", - "md(wn|t[e]?xt)?", "rst"] - files = [] for dirname, subdir_names, filenames in os.walk("."): for filename in filenames: path = os.path.join(dirname, filename) if _is_ascii(path): - files.append(path) - - valuable_files = [] - for filename in files: - filename_match = any([re.match(pattern, filename, flags=re.IGNORECASE) - for pattern in GIT_IGNORE_FILES]) - extension = filename.split(".")[-1] - extension_match = any([re.match(pattern, filename, flags=re.IGNORECASE) - for pattern in GIT_IGNORE_EXTENSIONS]) + files.append(path[2:]) - if not (filename_match or extension_match): - valuable_files.append(filename[2:]) - return valuable_files + return files def _get_commits_metadata(): """ @@ -407,11 +401,11 @@ def _get_commits_metadata(): sample_returned_dict = { "my_file" : { "authors" : (str array) ["author1", "author2"], - "time_created" : (int) 1395939566, - "time_last_modified" : (int) 1396920409 + "time_created" : (`datetime.datetime`) , + "time_last_modified" : (`datetime.datetime`) } } - :rtype: dictionary + :rtype: dictionary of dictionaries """ commits = 
_get_git_commits() From d142f1fd55dc180900dd564810e94464f8debbb0 Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 3 May 2014 15:22:29 -0400 Subject: [PATCH 40/42] Complete Crawler. Close #15, #14, #11, #8. Several of the closed issues were addressed partly in previous commits; definitively close them with this, for the moment, final update to the crawler package. Ref: bitshift/crawler/indexer.py -move all `GitIndexer` specific functions (eg, `_decode`, `_is_ascii()`)from the global scope to the class definition. --- bitshift/codelet.py | 53 +++--- bitshift/crawler/indexer.py | 417 ++++++++++++++++++++++---------------------- 2 files changed, 236 insertions(+), 234 deletions(-) diff --git a/bitshift/codelet.py b/bitshift/codelet.py index 9568a4d..453ace0 100644 --- a/bitshift/codelet.py +++ b/bitshift/codelet.py @@ -7,37 +7,43 @@ class Codelet(object): :ivar name: (str) A suitable name for the codelet. :ivar code: (str) A containing the raw source code. :ivar filename: (str, or None) The filename of the snippet. - :ivar language: (str, or None) The inferred language of `code`. - :ivar authors: (array of str tuples) An array of tuples containing an - author's name and profile URL (on the service the code was pulled from). + :ivar language: (int, or None) The inferred language of `code`. + :ivar authors: (array of tuples (str, str or None)) An array of tuples + containing an author's name and profile URL (on the service the code + was pulled from). :ivar code_url: (str) The url of the (page containing the) source code. - :ivar date_created: (str, or None) The date the code was published. - :ivar date_modified: (str, or None) The date the code was last modified. + :ivar date_created: (:class:`datetime.datetime`, or None) The date the code + was published. + :ivar date_modified: (:class:`datetime.datetime`, or None) The date the + code was last modified. + :ivar rank: (float) A quanitification of the source code's quality, as + per available ratings (stars, forks, upvotes, etc.). """ def __init__(self, name, code, filename, language, authors, code_url, - date_created, date_modified): + date_created, date_modified, rank): """ Create a Codelet instance. - :param name: The name of the codelet. - :param code: The raw source code. - :param filename: The filename of the code, if any. - :param language: The inferred language. - :param authors: An array of tuples containing an author's name and - profile URL (on the service the code was pulled from). - :param code_url: The url of the (page containing the) source code. - :param date_created: The date the code was published. - :param date_modified: The date the code was last modified. 
+ :param name: see :attr:`self.name` + :param code: see :attr:`self.code` + :param filename: see :attr:`self.filename` + :param language: see :attr:`self.language` + :param authors: see :attr:`self.authors` + :param code_url: see :attr:`self.code_url` + :param date_created: see :attr:`self.date_created` + :param date_modified: see :attr:`self.date_modified` + :param rank: see :attr:`self.rank` - :type name: str - :type code: str - :type filename: str, or None - :type language: str, or None - :type authors: array of str tuples, or None - :type code_url: str - :type date_created: str, or None - :type date_modified: str, or None + :type name: see :attr:`self.name` + :type code: see :attr:`self.code` + :type filename: see :attr:`self.filename` + :type language: see :attr:`self.language` + :type authors: see :attr:`self.authors` + :type code_url: see :attr:`self.code_url` + :type date_created: see :attr:`self.date_created` + :type date_modified: see :attr:`self.date_modified` + :type rank: see :attr:`self.rank` """ self.name = name @@ -48,3 +54,4 @@ class Codelet(object): self.code_url = code_url self.date_created = date_created self.date_modified = date_modified + self.rank = rank diff --git a/bitshift/crawler/indexer.py b/bitshift/crawler/indexer.py index 69c579c..c1c77ad 100644 --- a/bitshift/crawler/indexer.py +++ b/bitshift/crawler/indexer.py @@ -12,9 +12,6 @@ from ..codelet import Codelet GIT_CLONE_DIR = "/tmp/bitshift" THREAD_QUEUE_SLEEP = 0.5 -import pymongo #debug -db = pymongo.MongoClient().bitshift #debug - class GitRepository(object): """ A representation of a Git repository's metadata. @@ -101,10 +98,10 @@ class GitIndexer(threading.Thread): repo = self.index_queue.get() self.index_queue.task_done() - # try: - self._index_repository(repo) - # except Exception as excep: - # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) + try: + self._index_repository(repo) + except Exception as excep: + self._logger.warning("%s: %s.", excep.__class__.__name__, excep) def _index_repository(self, repo): """ @@ -119,10 +116,10 @@ class GitIndexer(threading.Thread): """ with _ChangeDir("%s/%s" % (GIT_CLONE_DIR, repo.name)) as repository_dir: - # try: - self._insert_repository_codelets(repo) - # except Exception as excep: - # self._logger.warning("%s: %s.", excep.__class__.__name__, excep) + try: + self._insert_repository_codelets(repo) + except Exception as excep: + self._logger.warning("%s: %s.", excep.__class__.__name__, excep) if os.path.isdir("%s/%s" % (GIT_CLONE_DIR, repo.name)): shutil.rmtree("%s/%s" % (GIT_CLONE_DIR, repo.name)) @@ -140,29 +137,222 @@ class GitIndexer(threading.Thread): :type repo_url: :class:`GitRepository` """ - commits_meta = _get_commits_metadata() + commits_meta = self._get_commits_metadata() if commits_meta is None: return for filename in commits_meta.keys(): try: with open(filename) as source_file: - source = _decode(source_file.read()) + source = self._decode(source_file.read()) if source is None: continue except IOError as exception: continue - authors = [(_decode(author),) for author in \ + authors = [(self._decode(author), None) for author in \ commits_meta[filename]["authors"]] codelet = Codelet("%s:%s" % (repo.name, filename), source, filename, - None, authors, _generate_file_url(filename, + None, authors, self._generate_file_url(filename, repo.url, repo.framework_name), commits_meta[filename]["time_created"], commits_meta[filename]["time_last_modified"], repo.rank) - db.codelets.insert(codelet.__dict__) #debug + def _generate_file_url(self, 
filename, repo_url, framework_name): + """ + Return a url for a filename from a Git wrapper framework. + + :param filename: The path of the file. + :param repo_url: The url of the file's parent repository. + :param framework_name: The name of the framework the repository is from. + + :type filename: str + :type repo_url: str + :type framework_name: str + + :return: The file's full url on the given framework, if successfully + derived. + :rtype: str, or None + + .. warning:: + Various Git subprocesses will occasionally fail, and, seeing as the + information they provide is a crucial component of some repository file + urls, None may be returned. + """ + + try: + if framework_name == "GitHub": + default_branch = subprocess.check_output("git branch" + " --no-color", shell=True)[2:-1] + return ("%s/blob/%s/%s" % (repo_url, default_branch, + filename)).replace("//", "/") + elif framework_name == "Bitbucket": + commit_hash = subprocess.check_output("git rev-parse HEAD", + shell=True).replace("\n", "") + return ("%s/src/%s/%s" % (repo_url, commit_hash, + filename)).replace("//", "/") + except subprocess.CalledProcessError as exception: + return None + + def _get_git_commits(self): + """ + Return the current working directory's formatted commit data. + + Uses `git log` to generate metadata about every single file in the + repository's commit history. + + :return: The author, timestamp, and names of all modified files of every + commit. + .. code-block:: python + sample_returned_array = [ + { + "author" : (str) "author" + "timestamp" : (`datetime.datetime`) , + "filenames" : (str array) ["file1", "file2"] + } + ] + :rtype: array of dictionaries + """ + + git_log = subprocess.check_output(("git --no-pager log --name-only" + " --pretty=format:'%n%n%an%n%at' -z"), shell=True) + + commits = [] + for commit in git_log.split("\n\n"): + fields = commit.split("\n") + if len(fields) > 2: + commits.append({ + "author" : fields[0], + "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), + "filenames" : fields[2].split("\x00")[:-2] + }) + + return commits + + def _get_tracked_files(self): + """ + Return a list of the filenames of all valuable files in the Git repository. + + Get a list of the filenames of the non-binary (Perl heuristics used for + filetype identification) files currently inside the current working + directory's Git repository. Then, weed out any boilerplate/non-code files + that match the regex rules in GIT_IGNORE_FILES. + + :return: The filenames of all index-worthy non-binary files. + :rtype: str array + """ + + files = [] + for dirname, subdir_names, filenames in os.walk("."): + for filename in filenames: + path = os.path.join(dirname, filename) + if self._is_ascii(path): + files.append(path[2:]) + + return files + + def _get_commits_metadata(self): + """ + Return a dictionary containing every valuable tracked file's metadata. + + :return: A dictionary with author names, time of creation, and time of last + modification for every filename key. + .. 
code-block:: python + sample_returned_dict = { + "my_file" : { + "authors" : (str array) ["author1", "author2"], + "time_created" : (`datetime.datetime`) , + "time_last_modified" : (`datetime.datetime`) + } + } + :rtype: dictionary of dictionaries + """ + + commits = self._get_git_commits() + tracked_files = self._get_tracked_files() + + files_meta = {} + for commit in commits: + for filename in commit["filenames"]: + if filename not in tracked_files: + continue + + if filename not in files_meta.keys(): + files_meta[filename] = { + "authors" : [commit["author"]], + "time_last_modified" : commit["timestamp"], + "time_created" : commit["timestamp"] + } + else: + if commit["author"] not in files_meta[filename]["authors"]: + files_meta[filename]["authors"].append(commit["author"]) + files_meta[filename]["time_created"] = commit["timestamp"] + + return files_meta + + def _decode(self, raw): + """ + Return a decoded a raw string. + + :param raw: The string to string. + + :type raw: (str) + + :return: If the original encoding is successfully inferenced, return the + decoded string. + :rtype: str, or None + + .. warning:: + The raw string's original encoding is identified by heuristics which + can, and occasionally will, fail. Decoding will then fail, and None + will be returned. + """ + + try: + encoding = bs4.BeautifulSoup(raw).original_encoding + return raw.decode(encoding) if encoding is not None else None + + except (LookupError, UnicodeDecodeError, UserWarning) as exception: + return None + + def _is_ascii(self, filename): + """ + Heuristically determine whether a file is ASCII text or binary. + + If a portion of the file contains null bytes, or the percentage of bytes + that aren't ASCII is greater than 30%, then the file is concluded to be + binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T` + operator, and is the de-facto method for in : passdetermining whether a + file is ASCII. + + :param filename: The path of the file to test. + + :type filename: str + + :return: Whether the file is probably ASCII. + :rtype: Boolean + """ + + try: + with open(filename) as source: + file_snippet = source.read(512) + + if not file_snippet: + return True + + ascii_characters = "".join(map(chr, range(32, 127)) + + list("\n\r\t\b")) + null_trans = string.maketrans("", "") + + if "\0" in file_snippet: + return False + + non_ascii = file_snippet.translate(null_trans, ascii_characters) + return not float(len(non_ascii)) / len(file_snippet) > 0.30 + + except IOError as exception: + return False class _GitCloner(threading.Thread): """ @@ -297,198 +487,3 @@ class _ChangeDir(object): """ os.chdir(self.old_path) - -def _generate_file_url(filename, repo_url, framework_name): - """ - Return a url for a filename from a Git wrapper framework. - - :param filename: The path of the file. - :param repo_url: The url of the file's parent repository. - :param framework_name: The name of the framework the repository is from. - - :type filename: str - :type repo_url: str - :type framework_name: str - - :return: The file's full url on the given framework, if successfully - derived. - :rtype: str, or None - - .. warning:: - Various Git subprocesses will occasionally fail, and, seeing as the - information they provide is a crucial component of some repository file - urls, None may be returned. 
- """ - - try: - if framework_name == "GitHub": - default_branch = subprocess.check_output("git branch" - " --no-color", shell=True)[2:-1] - return ("%s/blob/%s/%s" % (repo_url, default_branch, - filename)).replace("//", "/") - elif framework_name == "Bitbucket": - commit_hash = subprocess.check_output("git rev-parse HEAD", - shell=True).replace("\n", "") - return ("%s/src/%s/%s" % (repo_url, commit_hash, - filename)).replace("//", "/") - except subprocess.CalledProcessError as exception: - return None - -def _get_git_commits(): - """ - Return the current working directory's formatted commit data. - - Uses `git log` to generate metadata about every single file in the - repository's commit history. - - :return: The author, timestamp, and names of all modified files of every - commit. - .. code-block:: python - sample_returned_array = [ - { - "author" : (str) "author" - "timestamp" : (`datetime.datetime`) , - "filenames" : (str array) ["file1", "file2"] - } - ] - :rtype: array of dictionaries - """ - - git_log = subprocess.check_output(("git --no-pager log --name-only" - " --pretty=format:'%n%n%an%n%at' -z"), shell=True) - - commits = [] - for commit in git_log.split("\n\n"): - fields = commit.split("\n") - if len(fields) > 2: - commits.append({ - "author" : fields[0], - "timestamp" : datetime.datetime.fromtimestamp(int(fields[1])), - "filenames" : fields[2].split("\x00")[:-2] - }) - - return commits - -def _get_tracked_files(): - """ - Return a list of the filenames of all valuable files in the Git repository. - - Get a list of the filenames of the non-binary (Perl heuristics used for - filetype identification) files currently inside the current working - directory's Git repository. Then, weed out any boilerplate/non-code files - that match the regex rules in GIT_IGNORE_FILES. - - :return: The filenames of all index-worthy non-binary files. - :rtype: str array - """ - - files = [] - for dirname, subdir_names, filenames in os.walk("."): - for filename in filenames: - path = os.path.join(dirname, filename) - if _is_ascii(path): - files.append(path[2:]) - - return files - -def _get_commits_metadata(): - """ - Return a dictionary containing every valuable tracked file's metadata. - - :return: A dictionary with author names, time of creation, and time of last - modification for every filename key. - .. code-block:: python - sample_returned_dict = { - "my_file" : { - "authors" : (str array) ["author1", "author2"], - "time_created" : (`datetime.datetime`) , - "time_last_modified" : (`datetime.datetime`) - } - } - :rtype: dictionary of dictionaries - """ - - commits = _get_git_commits() - tracked_files = _get_tracked_files() - - files_meta = {} - for commit in commits: - for filename in commit["filenames"]: - if filename not in tracked_files: - continue - - if filename not in files_meta.keys(): - files_meta[filename] = { - "authors" : [commit["author"]], - "time_last_modified" : commit["timestamp"], - "time_created" : commit["timestamp"] - } - else: - if commit["author"] not in files_meta[filename]["authors"]: - files_meta[filename]["authors"].append(commit["author"]) - files_meta[filename]["time_created"] = commit["timestamp"] - - return files_meta - -def _decode(raw): - """ - Return a decoded a raw string. - - :param raw: The string to string. - - :type raw: (str) - - :return: If the original encoding is successfully inferenced, return the - decoded string. - :rtype: str, or None - - .. 
warning:: - The raw string's original encoding is identified by heuristics which - can, and occasionally will, fail. Decoding will then fail, and None - will be returned. - """ - - try: - encoding = bs4.BeautifulSoup(raw).original_encoding - return raw.decode(encoding) if encoding is not None else None - - except (LookupError, UnicodeDecodeError, UserWarning) as exception: - return None - -def _is_ascii(filename): - """ - Heuristically determine whether a file is ASCII text or binary. - - If a portion of the file contains null bytes, or the percentage of bytes - that aren't ASCII is greater than 30%, then the file is concluded to be - binary. This heuristic is used by the `file` utility, Perl's inbuilt `-T` - operator, and is the de-facto method for in : passdetermining whether a - file is ASCII. - - :param filename: The path of the file to test. - - :type filename: str - - :return: Whether the file is probably ASCII. - :rtype: Boolean - """ - - try: - with open(filename) as source: - file_snippet = source.read(512) - - if not file_snippet: - return True - - ascii_characters = "".join(map(chr, range(32, 127)) + - list("\n\r\t\b")) - null_trans = string.maketrans("", "") - - if "\0" in file_snippet: - return False - - non_ascii = file_snippet.translate(null_trans, ascii_characters) - return not float(len(non_ascii)) / len(file_snippet) > 0.30 - - except IOError as exception: - return False From 7c5c9fc7e1c99c1d67146570c43e60d0b04c899f Mon Sep 17 00:00:00 2001 From: Severyn Kozak Date: Sat, 3 May 2014 22:20:12 -0400 Subject: [PATCH 41/42] Add GitHub stars, Bitbucket watchers; close #14. Add: bitshift/crawler/crawler.py -Add more efficient method of querying GitHub's API for stargazer counts, by batching 25 repositories per request. -Add watcher counts for Bitbucket repositories, by querying the Bitbucket API once per repository (inefficient, but the API in question isn't sufficiently robust to accommodate a better approach, and Git repositories surface so infrequently that there shouldn't be any query limit problems). 
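For reference, a condensed sketch of the batched star lookup (identifier
names mirror the implementation in crawler.py below; rate-limit handling and
error recovery are omitted, and the standalone function wrapper is purely
illustrative, not part of the patch):

    import requests

    API_URL = "https://api.github.com/search/repositories"
    REPOS_PER_QUERY = 25

    def get_repository_stars(repo_names, authentication):
        """Map `username/repo` names to ranks in [0, 1], 25 repos per call."""
        repo_stars = {}
        for ind in xrange(0, len(repo_names), REPOS_PER_QUERY):
            names = repo_names[ind:ind + REPOS_PER_QUERY]
            query_url = "%s?q=%s" % (API_URL,
                    "+".join("repo:%s" % name for name in names))
            resp = requests.get(query_url, params=authentication,
                    headers={"Accept" : "application/vnd.github.preview"})
            for repo in resp.json().get("items", []):
                # 1,000 or more stargazers saturates the rank at 1.0.
                rank = float(repo["stargazers_count"]) / 1000
                repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0
        for name in repo_names:
            # Repositories missing from the search results get a middling rank.
            repo_stars.setdefault(name, 0.5)
        return repo_stars
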
--- bitshift/crawler/__init__.py | 1 + bitshift/crawler/crawler.py | 111 +++++++++++++++++++++++++++---------------- 2 files changed, 71 insertions(+), 41 deletions(-) diff --git a/bitshift/crawler/__init__.py b/bitshift/crawler/__init__.py index cfec64c..73b1c22 100644 --- a/bitshift/crawler/__init__.py +++ b/bitshift/crawler/__init__.py @@ -39,6 +39,7 @@ def _configure_logging(): os.mkdir(LOG_FILE_DIR) logging.getLogger("requests").setLevel(logging.WARNING) + logging.getLogger("urllib3").setLevel(logging.WARNING) formatter = logging.Formatter( fmt=("%(asctime)s %(levelname)s %(name)s %(funcName)s" diff --git a/bitshift/crawler/crawler.py b/bitshift/crawler/crawler.py index 785ac61..9501bd0 100644 --- a/bitshift/crawler/crawler.py +++ b/bitshift/crawler/crawler.py @@ -63,8 +63,7 @@ class GitHubCrawler(threading.Thread): start_time = time.time() try: - response = requests.get(next_api_url, - params=self.AUTHENTICATION) + resp = requests.get(next_api_url, params=self.AUTHENTICATION) except ConnectionError as excep: self._logger.warning("API %s call failed: %s: %s", next_api_url, excep.__class__.__name__, excep) @@ -77,66 +76,84 @@ class GitHubCrawler(threading.Thread): ((self.clone_queue.qsize(), self.clone_queue.maxsize, queue_percent_full))) - for repo in response.json(): + repo_names = [repo["full_name"] for repo in resp.json()] + repo_stars = self._get_repositories_stars(repo_names) + + for repo in resp.json(): while self.clone_queue.full(): time.sleep(1) self.clone_queue.put(indexer.GitRepository( repo["html_url"], repo["full_name"].replace("/", ""), - "GitHub", - #self._get_repo_stars(repo["full_name"])) - 0)) + "GitHub", repo_stars[repo["full_name"]])) - if int(response.headers["x-ratelimit-remaining"]) == 0: - time.sleep(int(response.headers["x-ratelimit-reset"]) - + if int(resp.headers["x-ratelimit-remaining"]) == 0: + time.sleep(int(resp.headers["x-ratelimit-reset"]) - time.time()) - next_api_url = response.headers["link"].split(">")[0][1:] + next_api_url = resp.headers["link"].split(">")[0][1:] sleep_time = api_request_interval - (time.time() - start_time) if sleep_time > 0: time.sleep(sleep_time) - def _get_repo_stars(self, repo_name): + def _get_repositories_stars(self, repo_names): """ - Return the number of stargazers for a repository. + Return the number of stargazers for several repositories. - Queries the GitHub API for the number of stargazers for a given - repository, and blocks if the query limit is exceeded. + Queries the GitHub API for the number of stargazers for any given + repositories, and blocks if the query limit is exceeded. - :param repo_name: The name of the repository, in + :param repo_names: An array of repository names, in `username/repository_name` format. - :type repo_name: str - - :return: The number of stargazers for the repository. - :rtype: int - """ - - API_URL = "https://api.github.com/search/repositories" + :type repo_names: str - params = self.AUTHENTICATION - params["q"] = "repo:%s" % repo_name + :return: A dictionary with repository name keys, and corresponding + stargazer count values. - resp = requests.get(API_URL, - params=params, - headers={ - "Accept" : "application/vnd.github.preview" - }) + Example dictionary: + .. code-block:: python + { + "user/repository" : 100 + } - if int(resp.headers["x-ratelimit-remaining"]) == 0: - sleep_time = int(resp.headers["x-ratelimit-reset"]) - time.time() - if sleep_time > 0: - logging.info("API quota exceeded. Sleep time: %d." 
% sleep_time) - time.sleep(sleep_time) + :rtype: dictionary + """ - if "items" not in resp.json() or len(resp.json()["items"]) == 0: - self._logger.critical("No API result: %s. Result: %s" % (resp.url, - str(resp.json()))) - return 0 - else: - rank = float(resp.json()["items"][0]["stargazers_count"]) / 1000 - return rank if rank < 1.0 else 1.0 + API_URL = "https://api.github.com/search/repositories" + REPOS_PER_QUERY = 25 + + repo_stars = {} + for names in [repo_names[ind:ind + REPOS_PER_QUERY] for ind in + xrange(0, len(repo_names), REPOS_PER_QUERY)]: + query_url = "%s?q=%s" % (API_URL, + "+".join("repo:%s" % name for name in names)) + + params = self.AUTHENTICATION + resp = requests.get(query_url, + params=params, + headers={ + "Accept" : "application/vnd.github.preview" + }) + + if int(resp.headers["x-ratelimit-remaining"]) == 0: + sleep_time = int(resp.headers["x-ratelimit-reset"]) - \ + time.time() + 1 + if sleep_time > 0: + logging.info("API quota exceeded. Sleep time: %d." % + sleep_time) + time.sleep(sleep_time) + + for repo in resp.json()["items"]: + rank = float(repo["stargazers_count"]) / 1000 + repo_stars[repo["full_name"]] = rank if rank < 1.0 else 1.0 + + for name in repo_names: + if name not in repo_stars: + repo_stars[name] = 0.5 + + return repo_stars class BitbucketCrawler(threading.Thread): """ @@ -204,8 +221,20 @@ class BitbucketCrawler(threading.Thread): clone_links[0]["name"] == "https" else clone_links[1]["href"]) links.append("clone_url") + + try: + watchers = requests.get( + repo["links"]["watchers"]["href"]) + rank = len(watchers.json()["values"]) / 100 + except ConnectionError as exception: + time.sleep(0.5) + self._logger.warning("API %s call failed: %s: %s", + next_api_url, excep.__class__.__name__, excep) + continue + self.clone_queue.put(indexer.GitRepository( - clone_url, repo["full_name"], "Bitbucket")) + clone_url, repo["full_name"], "Bitbucket"), + rank if rank < 1.0 else 1.0) next_api_url = response["next"] time.sleep(0.2) From 56f23e682a24c3b199cc7add1447cf4130ba2657 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 4 May 2014 01:18:30 -0400 Subject: [PATCH 42/42] Database to v6; flesh out a lot of Database.search(). --- bitshift/database/__init__.py | 65 ++++++++++++++++++++++++++++-------------- bitshift/database/migration.py | 30 ++++++++++++++++++- bitshift/database/schema.sql | 23 +++++++++------ 3 files changed, 86 insertions(+), 32 deletions(-) diff --git a/bitshift/database/__init__.py b/bitshift/database/__init__.py index 9b039ca..75f39da 100644 --- a/bitshift/database/__init__.py +++ b/bitshift/database/__init__.py @@ -51,10 +51,15 @@ class Database(object): "Run `python -m bitshift.database.migration`." raise RuntimeError(err) + def _get_codelets_from_ids(self, cursor, ids): + """Return a list of Codelet objects given a list of codelet IDs.""" + raise NotImplementedError() ## TODO + def _decompose_url(self, cursor, url): """Break up a URL into an origin (with a URL base) and a suffix.""" query = """SELECT origin_id, SUBSTR(?, LENGTH(origin_url_base)) - FROM origins WHERE origin_url_base IS NOT NULL + FROM origins + WHERE origin_url_base IS NOT NULL AND ? LIKE CONCAT(origin_url_base, "%")""" cursor.execute(query, (url, url)) @@ -88,19 +93,35 @@ class Database(object): :param page: The result page to display. :type page: int - :return: A list of search results. - :rtype: list of :py:class:`.Codelet`\ s + :return: The total number of results, and the *n*\ th page of results. 
+ :rtype: 2-tuple of (long, list of :py:class:`.Codelet`\ s) """ - # search for cache_hash = mmh3.hash(query.serialize() + str(page)) - # cache HIT: - # update cache_last_used - # return codelets - # cache MISS: - # build complex search query - # fetch codelets - # cache results - # return codelets - pass + query1 = """SELECT cdata_codelet, cache_count_mnt, cache_count_exp + FROM cache + INNER JOIN cache_data ON cache_id = cdata_cache + WHERE cache_id = ?""" + query2 = "INSERT INTO cache VALUES (?, ?, ?, DEFAULT)" + query3 = "INSERT INTO cache_data VALUES (?, ?)" + + cache_id = mmh3.hash64(str(page) + ":" + query.serialize())[0] + + with self._conn.cursor() as cursor: + cursor.execute(query1, (cache_id,)) + results = cursor.fetchall() + if results: # Cache hit + num_results = results[0][1] * (10 ** results[0][2]) + ids = [res[0] for res in results] + else: # Cache miss + ## TODO: build and execute search query + results = cursor.fetchall() + ids = NotImplemented ## TODO: extract ids from results + num_results = NotImplemented ## TODO: num if results else 0 + num_exp = max(len(str(num_results)) - 3, 0) + num_results = int(round(num_results, -num_exp)) + num_mnt = num_results / (10 ** num_exp) + cursor.execute(query2, (cache_id, num_mnt, num_exp)) + cursor.executemany(query3, [(cache_id, c_id) for c_id in ids]) + return (num_results, self._get_codelets_from_ids(cursor, ids)) def insert(self, codelet): """ @@ -109,23 +130,23 @@ class Database(object): :param codelet: The codelet to insert. :type codelet: :py:class:`.Codelet` """ - query1 = """INSERT INTO code VALUES (?, ?) + query1 = """INSERT INTO code VALUES (?, ?, ?) ON DUPLICATE KEY UPDATE code_id=code_id""" query2 = """INSERT INTO codelets VALUES - (DEFAULT, ?, ?, ?, ?, ?, ?, ?, ?)""" + (DEFAULT, ?, ?, ?, ?, ?, ?, ?)""" query3 = "INSERT INTO authors VALUES (DEFAULT, ?, ?, ?)" - with self._conn.cursor() as cursor: - code_id = mmh3.hash64(codelet.code.encode("utf8"))[0] - origin, url = self._decompose_url(cursor, codelet.url) + hash_key = str(codelet.language) + ":" + codelet.code.encode("utf8") + code_id = mmh3.hash64(hash_key)[0] - cursor.execute(query1, (code_id, codelet.code)) + with self._conn.cursor() as cursor: + cursor.execute(query1, (code_id, codelet.language, codelet.code)) if cursor.rowcount == 1: for sym_type, symbols in codelet.symbols.iteritems(): self._insert_symbols(cursor, code_id, sym_type, symbols) - cursor.execute(query2, (codelet.name, code_id, codelet.language, - origin, url, codelet.rank, - codelet.date_created, + origin, url = self._decompose_url(cursor, codelet.url) + cursor.execute(query2, (codelet.name, code_id, origin, url, + codelet.rank, codelet.date_created, codelet.date_modified)) codelet_id = cursor.lastrowid authors = [(codelet_id, a[0], a[1]) for a in codelet.authors] diff --git a/bitshift/database/migration.py b/bitshift/database/migration.py index 743f906..24f744a 100644 --- a/bitshift/database/migration.py +++ b/bitshift/database/migration.py @@ -3,7 +3,7 @@ Contains information about database schema versions, and SQL queries to update between them. 
""" -VERSION = 5 +VERSION = 6 MIGRATIONS = [ # 1 -> 2 @@ -60,6 +60,34 @@ MIGRATIONS = [ MODIFY COLUMN `origin_name` VARCHAR(64) DEFAULT NULL, MODIFY COLUMN `origin_url` VARCHAR(512) DEFAULT NULL, MODIFY COLUMN `origin_url_base` VARCHAR(512) DEFAULT NULL""" + ], + # 5 -> 6 + [ + """ALTER TABLE `code` + ADD COLUMN `code_lang` SMALLINT UNSIGNED DEFAULT NULL + AFTER `code_id`, + ADD KEY (`code_lang`)""", + """ALTER TABLE `codelets` + DROP KEY `codelet_lang`, + DROP COLUMN `codelet_lang`""", + """ALTER TABLE `cache_data` + DROP FOREIGN KEY `cache_data_ibfk_1`""", + """ALTER TABLE `cache` + MODIFY COLUMN `cache_id` BIGINT NOT NULL, + DROP COLUMN `cache_hash`, + DROP COLUMN `cache_last_used`, + MODIFY COLUMN `cache_count_mnt` SMALLINT UNSIGNED NOT NULL""", + """ALTER TABLE `cache_data` + MODIFY COLUMN `cdata_cache` BIGINT NOT NULL, + ADD PRIMARY KEY (`cdata_cache`, `cdata_codelet`), + ADD CONSTRAINT `cache_data_ibfk_1` FOREIGN KEY (`cdata_codelet`) + REFERENCES `codelets` (`codelet_id`) + ON DELETE CASCADE ON UPDATE CASCADE""", + """CREATE EVENT `flush_cache` + ON SCHEDULE EVERY 1 HOUR + DO + DELETE FROM `cache` + WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);""" ] ] diff --git a/bitshift/database/schema.sql b/bitshift/database/schema.sql index 50b4f9e..8634416 100644 --- a/bitshift/database/schema.sql +++ b/bitshift/database/schema.sql @@ -1,4 +1,4 @@ --- Schema version 5 +-- Schema version 6 CREATE DATABASE `bitshift` DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci; USE `bitshift`; @@ -6,7 +6,7 @@ USE `bitshift`; CREATE TABLE `version` ( `version` INT UNSIGNED NOT NULL ) ENGINE=InnoDB; -INSERT INTO `version` VALUES (5); +INSERT INTO `version` VALUES (6); CREATE TABLE `origins` ( `origin_id` TINYINT UNSIGNED NOT NULL AUTO_INCREMENT, @@ -20,8 +20,10 @@ INSERT INTO `origins` VALUES (1, NULL, NULL, NULL, NULL); CREATE TABLE `code` ( `code_id` BIGINT NOT NULL, + `code_lang` SMALLINT UNSIGNED DEFAULT NULL, `code_code` MEDIUMTEXT NOT NULL, PRIMARY KEY (`code_id`), + KEY (`code_lang`), FULLTEXT KEY (`code_code`) ) ENGINE=InnoDB; @@ -29,7 +31,6 @@ CREATE TABLE `codelets` ( `codelet_id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT, `codelet_name` VARCHAR(300) NOT NULL, `codelet_code_id` BIGINT NOT NULL, - `codelet_lang` SMALLINT UNSIGNED DEFAULT NULL, `codelet_origin` TINYINT UNSIGNED NOT NULL, `codelet_url` VARCHAR(512) NOT NULL, `codelet_rank` FLOAT NOT NULL, @@ -37,7 +38,6 @@ CREATE TABLE `codelets` ( `codelet_date_modified` DATETIME DEFAULT NULL, PRIMARY KEY (`codelet_id`), FULLTEXT KEY (`codelet_name`), - KEY (`codelet_lang`), KEY (`codelet_rank`), KEY (`codelet_date_created`), KEY (`codelet_date_modified`), @@ -88,18 +88,17 @@ CREATE TABLE `symbol_locations` ( ) ENGINE=InnoDB; CREATE TABLE `cache` ( - `cache_id` INT UNSIGNED NOT NULL AUTO_INCREMENT, - `cache_hash` BIGINT NOT NULL, - `cache_count_mnt` TINYINT UNSIGNED NOT NULL, + `cache_id` BIGINT NOT NULL, + `cache_count_mnt` SMALLINT UNSIGNED NOT NULL, `cache_count_exp` TINYINT UNSIGNED NOT NULL, `cache_created` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, - `cache_last_used` TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (`cache_id`) ) ENGINE=InnoDB; CREATE TABLE `cache_data` ( - `cdata_cache` INT UNSIGNED NOT NULL, + `cdata_cache` BIGINT NOT NULL, `cdata_codelet` BIGINT UNSIGNED NOT NULL, + PRIMARY KEY (`cdata_cache`, `cdata_codelet`), FOREIGN KEY (`cdata_cache`) REFERENCES `cache` (`cache_id`) ON DELETE CASCADE ON UPDATE CASCADE, @@ -107,3 +106,9 @@ CREATE TABLE `cache_data` ( REFERENCES `codelets` (`codelet_id`) ON DELETE 
CASCADE ON UPDATE CASCADE ) ENGINE=InnoDB; + +CREATE EVENT `flush_cache` + ON SCHEDULE EVERY 1 HOUR + DO + DELETE FROM `cache` + WHERE `cache_created` < DATE_SUB(NOW(), INTERVAL 1 DAY);
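
As a closing note on the new cache tables: `Database.search()` stores an
approximate result count as a mantissa/exponent pair in `cache_count_mnt` and
`cache_count_exp`, keeping roughly three significant digits. A minimal sketch
of that round-trip (the helper names are illustrative and not part of the
patch; the arithmetic follows the search() code above):

    def encode_count(num_results):
        """Reduce a result count to (mantissa, exponent), ~3 significant digits."""
        num_exp = max(len(str(num_results)) - 3, 0)
        num_mnt = int(round(num_results, -num_exp)) / (10 ** num_exp)
        return num_mnt, num_exp

    def decode_count(num_mnt, num_exp):
        """Recover the approximate count stored in the cache row."""
        return num_mnt * (10 ** num_exp)

    # encode_count(123456) -> (123, 3); decode_count(123, 3) -> 123000
    # encode_count(42)     -> (42, 0);  decode_count(42, 0)  -> 42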