Browse Source

Merge branch 'feature/win_builds' into develop

tags/v0.4.1
Ben Kurtovic 8 years ago
parent
commit
d6d3b45cd8
29 changed files with 2129 additions and 1473 deletions
  1. +8
    -0
      CHANGELOG
  2. +64
    -0
      appveyor.yml
  3. +13
    -2
      docs/changelog.rst
  4. +0
    -2
      mwparserfromhell/compat.py
  5. +1
    -3
      mwparserfromhell/definitions.py
  6. +125
    -0
      mwparserfromhell/parser/ctokenizer/common.h
  7. +105
    -0
      mwparserfromhell/parser/ctokenizer/contexts.h
  8. +78
    -0
      mwparserfromhell/parser/ctokenizer/tag_data.c
  9. +43
    -0
      mwparserfromhell/parser/ctokenizer/tag_data.h
  10. +232
    -0
      mwparserfromhell/parser/ctokenizer/textbuffer.c
  11. +36
    -0
      mwparserfromhell/parser/ctokenizer/textbuffer.h
  12. +182
    -874
      mwparserfromhell/parser/ctokenizer/tok_parse.c
  13. +35
    -0
      mwparserfromhell/parser/ctokenizer/tok_parse.h
  14. +345
    -0
      mwparserfromhell/parser/ctokenizer/tok_support.c
  15. +62
    -0
      mwparserfromhell/parser/ctokenizer/tok_support.h
  16. +310
    -0
      mwparserfromhell/parser/ctokenizer/tokenizer.c
  17. +111
    -0
      mwparserfromhell/parser/ctokenizer/tokenizer.h
  18. +111
    -0
      mwparserfromhell/parser/ctokenizer/tokens.c
  19. +69
    -0
      mwparserfromhell/parser/ctokenizer/tokens.h
  20. +0
    -367
      mwparserfromhell/parser/tokenizer.h
  21. +42
    -58
      mwparserfromhell/smart_list.py
  22. +3
    -0
      scripts/README
  23. +11
    -11
      scripts/release.sh
  24. +0
    -58
      scripts/win_build.py
  25. +43
    -0
      scripts/win_wrapper.cmd
  26. +38
    -59
      setup.py
  27. +41
    -26
      tests/_test_tokenizer.py
  28. +20
    -12
      tests/test_smart_list.py
  29. +1
    -1
      tests/tokenizer/text.mwtest

+ 8
- 0
CHANGELOG View File

@@ -8,9 +8,17 @@ v0.4.1 (unreleased):
includes when denoting tags, but not comments.
- Fixed the behavior of preserve_spacing in Template.add() and keep_field in
Template.remove() on parameters with hidden keys.
- Removed _ListProxy.detach(). SmartLists now use weak references and their
children are garbage-collected properly.
- Fixed parser bugs involving:
- templates with completely blank names;
- templates with newlines and comments.
- Heavy refactoring and fixes to the C tokenizer, including:
- corrected a design flaw in text handling, allowing for substantial speed
improvements when parsing long strings of plain text;
- implemented new Python 3.3 PEP 393 Unicode APIs.
- Fixed various bugs in SmartList, including one that was causing memory issues
on 64-bit builds of Python 2 on Windows.
- Fixed some bugs in the release scripts.

v0.4 (released May 23, 2015):


+ 64
- 0
appveyor.yml View File

@@ -0,0 +1,64 @@
# This config file is used by appveyor.com to build Windows release binaries

version: 0.4.1.dev0-b{build}

branches:
only:
- master

skip_tags: true

environment:
global:
# See: http://stackoverflow.com/a/13751649/163740
WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd"
PIP: "%WRAPPER% %PYTHON%\\Scripts\\pip.exe"
SETUPPY: "%WRAPPER% %PYTHON%\\python setup.py --with-extension"
PYPI_USERNAME: "earwigbot"
PYPI_PASSWORD:
secure: gOIcvPxSC2ujuhwOzwj3v8xjq3CCYd8keFWVnguLM+gcL0e02qshDHy7gwZZwj0+

matrix:
- PYTHON: "C:\\Python27"
PYTHON_VERSION: "2.7"
PYTHON_ARCH: "32"

- PYTHON: "C:\\Python27-x64"
PYTHON_VERSION: "2.7"
PYTHON_ARCH: "64"

- PYTHON: "C:\\Python33"
PYTHON_VERSION: "3.3"
PYTHON_ARCH: "32"

- PYTHON: "C:\\Python33-x64"
PYTHON_VERSION: "3.3"
PYTHON_ARCH: "64"

- PYTHON: "C:\\Python34"
PYTHON_VERSION: "3.4"
PYTHON_ARCH: "32"

- PYTHON: "C:\\Python34-x64"
PYTHON_VERSION: "3.4"
PYTHON_ARCH: "64"

install:
- "%PIP% install wheel twine"

build_script:
- "%SETUPPY% build"

test_script:
- "%SETUPPY% -q test"

after_test:
- "%SETUPPY% bdist_wheel"

on_success:
- "twine upload dist\\* -u %PYPI_USERNAME% -p %PYPI_PASSWORD%"

artifacts:
- path: dist\*

deploy: off

+ 13
- 2
docs/changelog.rst View File

@@ -13,13 +13,24 @@ Unreleased
- Added support for Python 3.5.
- ``<`` and ``>`` are now disallowed in wikilink titles and template names.
This includes when denoting tags, but not comments.
- Fixed the behavior of *preserve_spacing* in :func:`~.Template.add` and
*keep_field* in :func:`~.Template.remove` on parameters with hidden keys.
- Fixed the behavior of *preserve_spacing* in :meth:`.Template.add` and
*keep_field* in :meth:`.Template.remove` on parameters with hidden keys.
- Removed :meth:`._ListProxy.detach`. :class:`.SmartList`\ s now use weak
references and their children are garbage-collected properly.
- Fixed parser bugs involving:

- templates with completely blank names;
- templates with newlines and comments.

- Heavy refactoring and fixes to the C tokenizer, including:

- corrected a design flaw in text handling, allowing for substantial speed
improvements when parsing long strings of plain text;
- implemented new Python 3.3
`PEP 393 <https://www.python.org/dev/peps/pep-0393/>`_ Unicode APIs.

- Fixed various bugs in :class:`.SmartList`, including one that was causing
memory issues on 64-bit builds of Python 2 on Windows.
- Fixed some bugs in the release scripts.

v0.4


+ 0
- 2
mwparserfromhell/compat.py View File

@@ -18,14 +18,12 @@ if py3k:
bytes = bytes
str = str
range = range
maxsize = sys.maxsize
import html.entities as htmlentities

else:
bytes = str
str = unicode
range = xrange
maxsize = sys.maxint
import htmlentitydefs as htmlentities

del sys

+ 1
- 3
mwparserfromhell/definitions.py View File

@@ -81,10 +81,8 @@ def is_single_only(tag):
"""Return whether or not the given *tag* must exist without a close tag."""
return tag.lower() in SINGLE_ONLY

def is_scheme(scheme, slashes=True, reverse=False):
def is_scheme(scheme, slashes=True):
"""Return whether *scheme* is valid for external links."""
if reverse: # Convenience for C
scheme = scheme[::-1]
scheme = scheme.lower()
if slashes:
return scheme in URI_SCHEMES


+ 125
- 0
mwparserfromhell/parser/ctokenizer/common.h View File

@@ -0,0 +1,125 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html
#endif

#include <Python.h>
#include <structmember.h>
#include <bytesobject.h>

/* Compatibility macros */

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

#ifndef uint64_t
#define uint64_t unsigned PY_LONG_LONG
#endif

#define malloc PyObject_Malloc // XXX: yuck
#define realloc PyObject_Realloc
#define free PyObject_Free

/* Unicode support macros */

#if defined(IS_PY3K) && PY_MINOR_VERSION >= 3
#define PEP_393
#endif

#ifdef PEP_393
#define Unicode Py_UCS4
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
#else
#define Unicode Py_UNICODE
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromUnicode(&(chr), 1)
#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE
#endif

/* Error handling macros */

#define BAD_ROUTE self->route_state
#define BAD_ROUTE_CONTEXT self->route_context
#define FAIL_ROUTE(context) { \
self->route_state = 1; \
self->route_context = context; \
}
#define RESET_ROUTE() self->route_state = 0

/* Shared globals */

extern char** entitydefs;

extern PyObject* NOARGS;
extern PyObject* definitions;

/* Structs */

typedef struct {
Py_ssize_t capacity;
Py_ssize_t length;
#ifdef PEP_393
PyObject* object;
int kind;
void* data;
#else
Py_UNICODE* data;
#endif
} Textbuffer;

struct Stack {
PyObject* stack;
uint64_t context;
Textbuffer* textbuffer;
struct Stack* next;
};
typedef struct Stack Stack;

typedef struct {
PyObject* object; /* base PyUnicodeObject object */
Py_ssize_t length; /* length of object, in code points */
#ifdef PEP_393
int kind; /* object's kind value */
void* data; /* object's raw unicode buffer */
#else
Py_UNICODE* buf; /* object's internal buffer */
#endif
} TokenizerInput;

typedef struct {
PyObject_HEAD
TokenizerInput text; /* text to tokenize */
Stack* topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
int global; /* global context */
int depth; /* stack recursion depth */
int cycles; /* total number of stack recursions */
int route_state; /* whether a BadRoute has been triggered */
uint64_t route_context; /* context when the last BadRoute was triggered */
int skip_style_tags; /* temp fix for the sometimes broken tag parser */
} Tokenizer;

+ 105
- 0
mwparserfromhell/parser/ctokenizer/contexts.h View File

@@ -0,0 +1,105 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

/* Local contexts */

#define LC_TEMPLATE 0x0000000000000007
#define LC_TEMPLATE_NAME 0x0000000000000001
#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002
#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004

#define LC_ARGUMENT 0x0000000000000018
#define LC_ARGUMENT_NAME 0x0000000000000008
#define LC_ARGUMENT_DEFAULT 0x0000000000000010

#define LC_WIKILINK 0x0000000000000060
#define LC_WIKILINK_TITLE 0x0000000000000020
#define LC_WIKILINK_TEXT 0x0000000000000040

#define LC_EXT_LINK 0x0000000000000180
#define LC_EXT_LINK_URI 0x0000000000000080
#define LC_EXT_LINK_TITLE 0x0000000000000100

#define LC_HEADING 0x0000000000007E00
#define LC_HEADING_LEVEL_1 0x0000000000000200
#define LC_HEADING_LEVEL_2 0x0000000000000400
#define LC_HEADING_LEVEL_3 0x0000000000000800
#define LC_HEADING_LEVEL_4 0x0000000000001000
#define LC_HEADING_LEVEL_5 0x0000000000002000
#define LC_HEADING_LEVEL_6 0x0000000000004000

#define LC_TAG 0x0000000000078000
#define LC_TAG_OPEN 0x0000000000008000
#define LC_TAG_ATTR 0x0000000000010000
#define LC_TAG_BODY 0x0000000000020000
#define LC_TAG_CLOSE 0x0000000000040000

#define LC_STYLE 0x0000000000780000
#define LC_STYLE_ITALICS 0x0000000000080000
#define LC_STYLE_BOLD 0x0000000000100000
#define LC_STYLE_PASS_AGAIN 0x0000000000200000
#define LC_STYLE_SECOND_PASS 0x0000000000400000

#define LC_DLTERM 0x0000000000800000

#define LC_SAFETY_CHECK 0x000000007F000000
#define LC_HAS_TEXT 0x0000000001000000
#define LC_FAIL_ON_TEXT 0x0000000002000000
#define LC_FAIL_NEXT 0x0000000004000000
#define LC_FAIL_ON_LBRACE 0x0000000008000000
#define LC_FAIL_ON_RBRACE 0x0000000010000000
#define LC_FAIL_ON_EQUALS 0x0000000020000000
#define LC_HAS_TEMPLATE 0x0000000040000000

#define LC_TABLE 0x0000001F80000000
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000
#define LC_TABLE_OPEN 0x0000000080000000
#define LC_TABLE_CELL_OPEN 0x0000000100000000
#define LC_TABLE_CELL_STYLE 0x0000000200000000
#define LC_TABLE_ROW_OPEN 0x0000000400000000
#define LC_TABLE_TD_LINE 0x0000000800000000
#define LC_TABLE_TH_LINE 0x0000001000000000

/* Global contexts */

#define GL_HEADING 0x1

/* Aggregate contexts */

#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)

/* Tag contexts */

#define TAG_NAME 0x01
#define TAG_ATTR_READY 0x02
#define TAG_ATTR_NAME 0x04
#define TAG_ATTR_VALUE 0x08
#define TAG_QUOTED 0x10
#define TAG_NOTE_SPACE 0x20
#define TAG_NOTE_EQUALS 0x40
#define TAG_NOTE_QUOTE 0x80

+ 78
- 0
mwparserfromhell/parser/ctokenizer/tag_data.c View File

@@ -0,0 +1,78 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "tag_data.h"
#include "contexts.h"

/*
    Initialize a new TagData object.

    Allocates the struct and its three padding textbuffers. Returns NULL
    with a Python exception set on failure; any partially-constructed
    state is released before returning.
*/
TagData* TagData_new(TokenizerInput* text)
{
#define ALLOC_BUFFER(name) \
    name = Textbuffer_new(text); \
    if (!name) { \
        TagData_dealloc(self); \
        return NULL; \
    }

    TagData *self = malloc(sizeof(TagData));
    if (!self) {
        PyErr_NoMemory();
        return NULL;
    }
    self->context = TAG_NAME;
    /* NULL the buffer pointers before any ALLOC_BUFFER: malloc() does not
       zero memory, and TagData_dealloc() inspects all three pointers, so a
       failure in the first allocation would otherwise make dealloc read
       (and free) uninitialized garbage. */
    self->pad_first = NULL;
    self->pad_before_eq = NULL;
    self->pad_after_eq = NULL;
    ALLOC_BUFFER(self->pad_first)
    ALLOC_BUFFER(self->pad_before_eq)
    ALLOC_BUFFER(self->pad_after_eq)
    self->quoter = 0;
    self->reset = 0;
    return self;

#undef ALLOC_BUFFER
}

/*
    Deallocate the given TagData object.

    Safe on partially-constructed objects as long as unallocated buffer
    pointers are NULL; frees each padding buffer then the struct itself.
*/
void TagData_dealloc(TagData* self)
{
    if (self->pad_first)
        Textbuffer_dealloc(self->pad_first);
    if (self->pad_before_eq)
        Textbuffer_dealloc(self->pad_before_eq);
    if (self->pad_after_eq)
        Textbuffer_dealloc(self->pad_after_eq);
    free(self);
}

/*
    Clear the internal padding buffers of the given TagData object.

    Returns 0 on success, -1 if any buffer fails to reset.
*/
int TagData_reset_buffers(TagData* self)
{
    if (Textbuffer_reset(self->pad_first))
        return -1;
    if (Textbuffer_reset(self->pad_before_eq))
        return -1;
    if (Textbuffer_reset(self->pad_after_eq))
        return -1;
    return 0;
}

+ 43
- 0
mwparserfromhell/parser/ctokenizer/tag_data.h View File

@@ -0,0 +1,43 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"
#include "textbuffer.h"

/* Structs */

typedef struct {
uint64_t context;
Textbuffer* pad_first;
Textbuffer* pad_before_eq;
Textbuffer* pad_after_eq;
Unicode quoter;
Py_ssize_t reset;
} TagData;

/* Functions */

TagData* TagData_new(TokenizerInput*);
void TagData_dealloc(TagData*);
int TagData_reset_buffers(TagData*);

+ 232
- 0
mwparserfromhell/parser/ctokenizer/textbuffer.c View File

@@ -0,0 +1,232 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "textbuffer.h"

#define INITIAL_CAPACITY 32
#define RESIZE_FACTOR 2
#define CONCAT_EXTRA 32

/*
    Internal allocation function for textbuffers.

    Creates the initial backing store (INITIAL_CAPACITY slots, length 0)
    for *self*. On the PEP 393 path, 'maxchar' caps the widest code point
    the buffer must be able to hold; otherwise it is unused. Returns 0 on
    success and -1 on allocation failure.
*/
static int internal_alloc(Textbuffer* self, Unicode maxchar)
{
    self->capacity = INITIAL_CAPACITY;
    self->length = 0;

#ifdef PEP_393
    self->object = PyUnicode_New(self->capacity, maxchar);
    if (!self->object)
        return -1;
    self->kind = PyUnicode_KIND(self->object);
    self->data = PyUnicode_DATA(self->object);
#else
    (void) maxchar;  // Unused on the narrow/legacy-unicode path
    self->data = malloc(sizeof(Unicode) * self->capacity);
    if (!self->data)
        return -1;
#endif

    return 0;
}

/*
    Internal deallocation function for textbuffers.

    Releases only the backing store (unicode object or raw buffer), not
    the Textbuffer struct itself.
*/
static void internal_dealloc(Textbuffer* self)
{
#ifdef PEP_393
    Py_DECREF(self->object);
#else
    free(self->data);
#endif
}

/*
    Internal resize function.

    Grow (or shrink) the backing store to hold 'new_cap' code points,
    preserving the current contents. Returns 0 on success and -1 on
    failure (with a Python exception set), in which case the buffer is
    left unchanged and still valid.
*/
static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
{
#ifdef PEP_393
    PyObject *newobj;
    void *newdata;

    newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object));
    if (!newobj)
        return -1;
    newdata = PyUnicode_DATA(newobj);
    memcpy(newdata, self->data, self->length * self->kind);
    Py_DECREF(self->object);
    self->object = newobj;
    self->data = newdata;
#else
    /* Assign through a temporary: writing realloc's result straight back
       into self->data would leak the original buffer if realloc fails. */
    Unicode* newbuf = realloc(self->data, sizeof(Unicode) * new_cap);
    if (!newbuf) {
        PyErr_NoMemory();  // match the error-setting behavior of the PEP 393 path
        return -1;
    }
    self->data = newbuf;
#endif

    self->capacity = new_cap;
    return 0;
}

/*
    Create a new textbuffer object.

    'text' supplies the tokenizer input; on the PEP 393 path its maximum
    code point is used to size the buffer's character width so later writes
    never need to widen. Returns NULL with a MemoryError set on failure.
*/
Textbuffer* Textbuffer_new(TokenizerInput* text)
{
    Textbuffer* self = malloc(sizeof(Textbuffer));
    Unicode maxchar = 0;

#ifdef PEP_393
    maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);
#endif

    if (!self)
        goto fail_nomem;
    if (internal_alloc(self, maxchar) < 0)
        goto fail_dealloc;
    return self;

    fail_dealloc:
    free(self);
    fail_nomem:
    PyErr_NoMemory();
    return NULL;
}

/*
    Deallocate the given textbuffer: its backing store, then the struct.
*/
void Textbuffer_dealloc(Textbuffer* self)
{
    internal_dealloc(self);
    free(self);
}

/*
    Reset a textbuffer to its initial, empty state.

    The old backing store is released and a fresh one is allocated,
    keeping the same maximum code point on the PEP 393 path. Returns 0 on
    success and -1 on allocation failure (after which the buffer's data
    pointer is no longer valid).
*/
int Textbuffer_reset(Textbuffer* self)
{
    Unicode maxchar = 0;

#ifdef PEP_393
    maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);
#endif

    internal_dealloc(self);
    if (internal_alloc(self, maxchar))
        return -1;
    return 0;
}

/*
    Write a Unicode codepoint to the given textbuffer.

    Grows the buffer geometrically (by RESIZE_FACTOR) when full, giving
    amortized O(1) appends. Returns 0 on success, -1 on resize failure.
*/
int Textbuffer_write(Textbuffer* self, Unicode code)
{
    if (self->length >= self->capacity) {
        if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
            return -1;
    }

#ifdef PEP_393
    PyUnicode_WRITE(self->kind, self->data, self->length++, code);
#else
    self->data[self->length++] = code;
#endif

    return 0;
}

/*
    Read a Unicode codepoint from the given index of the given textbuffer.

    This function does not check for bounds; callers must ensure
    0 <= index < self->length.
*/
Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
{
#ifdef PEP_393
    return PyUnicode_READ(self->kind, self->data, index);
#else
    return self->data[index];
#endif
}

/*
    Return the contents of the textbuffer as a Python Unicode object.

    Returns a new reference, or NULL with an exception set on failure.
*/
PyObject* Textbuffer_render(Textbuffer* self)
{
#ifdef PEP_393
    return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
#else
    return PyUnicode_FromUnicode(self->data, self->length);
#endif
}

/*
    Concatenate the 'other' textbuffer onto the end of the given textbuffer.

    Resizes with CONCAT_EXTRA slack to amortize repeated concatenations.
    On the PEP 393 path both buffers are assumed to share the same storage
    kind (all buffers are created from the same input's max code point).
    Returns 0 on success, -1 on resize failure.
*/
int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
{
    Py_ssize_t newlen = self->length + other->length;

    if (newlen > self->capacity) {
        if (internal_resize(self, newlen + CONCAT_EXTRA) < 0)
            return -1;
    }

#ifdef PEP_393
    assert(self->kind == other->kind);
    // Offset is in bytes: kind is the per-character width in bytes.
    memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data,
           other->length * other->kind);
#else
    memcpy(self->data + self->length, other->data,
           other->length * sizeof(Unicode));
#endif

    self->length = newlen;
    return 0;
}

/*
    Reverse the contents of the given textbuffer, in place.
*/
void Textbuffer_reverse(Textbuffer* self)
{
    Py_ssize_t front = 0, back = self->length - 1;
    Unicode swap;

    // Walk inward from both ends, swapping pairs until the pointers meet.
    while (front < back) {
#ifdef PEP_393
        swap = PyUnicode_READ(self->kind, self->data, front);
        PyUnicode_WRITE(self->kind, self->data, front,
                        PyUnicode_READ(self->kind, self->data, back));
        PyUnicode_WRITE(self->kind, self->data, back, swap);
#else
        swap = self->data[front];
        self->data[front] = self->data[back];
        self->data[back] = swap;
#endif
        front++;
        back--;
    }
}

+ 36
- 0
mwparserfromhell/parser/ctokenizer/textbuffer.h View File

@@ -0,0 +1,36 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"

/* Functions */

Textbuffer* Textbuffer_new(TokenizerInput*);
void Textbuffer_dealloc(Textbuffer*);
int Textbuffer_reset(Textbuffer*);
int Textbuffer_write(Textbuffer*, Unicode);
Unicode Textbuffer_read(Textbuffer*, Py_ssize_t);
PyObject* Textbuffer_render(Textbuffer*);
int Textbuffer_concat(Textbuffer*, Textbuffer*);
void Textbuffer_reverse(Textbuffer*);

mwparserfromhell/parser/ctokenizer/tok_parse.c
File diff suppressed because it is too large
View File


+ 35
- 0
mwparserfromhell/parser/ctokenizer/tok_parse.h View File

@@ -0,0 +1,35 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"

static const char MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
'-', '!', '\n', '\0'};

#define NUM_MARKERS 19

/* Functions */

PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);

+ 345
- 0
mwparserfromhell/parser/ctokenizer/tok_support.c View File

@@ -0,0 +1,345 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "tok_support.h"
#include "textbuffer.h"
#include "tokens.h"

/*
    Add a new token stack, context, and textbuffer to the list.

    Pushes a fresh Stack frame on top of self->topstack and bumps the
    depth/cycles counters. Returns 0 on success, -1 with a Python
    exception set on failure (no partial frame is left behind).
*/
int Tokenizer_push(Tokenizer* self, uint64_t context)
{
    Stack* top = malloc(sizeof(Stack));

    if (!top) {
        PyErr_NoMemory();
        return -1;
    }
    top->stack = PyList_New(0);
    if (!top->stack) {
        // Don't leave a frame with a NULL token list; later appends would crash.
        free(top);
        return -1;
    }
    top->context = context;
    top->textbuffer = Textbuffer_new(&self->text);
    if (!top->textbuffer) {
        Py_DECREF(top->stack);
        free(top);
        return -1;
    }
    top->next = self->topstack;
    self->topstack = top;
    self->depth++;
    self->cycles++;
    return 0;
}

/*
    Push the textbuffer onto the stack as a Text node and clear it.

    No-op when the buffer is empty. Returns 0 on success, -1 with a
    Python exception set on failure.
*/
int Tokenizer_push_textbuffer(Tokenizer* self)
{
    PyObject *text, *kwargs, *token;
    Textbuffer* buffer = self->topstack->textbuffer;

    if (buffer->length == 0)
        return 0;
    text = Textbuffer_render(buffer);
    if (!text)
        return -1;
    kwargs = PyDict_New();
    if (!kwargs) {
        Py_DECREF(text);
        return -1;
    }
    PyDict_SetItemString(kwargs, "text", text);
    Py_DECREF(text);
    token = PyObject_Call(Text, NOARGS, kwargs);
    Py_DECREF(kwargs);
    if (!token)
        return -1;
    if (PyList_Append(self->topstack->stack, token)) {
        Py_DECREF(token);
        return -1;
    }
    Py_DECREF(token);
    if (Textbuffer_reset(buffer))
        return -1;
    return 0;
}

/*
    Pop and deallocate the top token stack/context/textbuffer.

    Unlinks the top Stack frame, releases its resources, and decrements
    the recursion depth counter. Assumes topstack is non-NULL.
*/
void Tokenizer_delete_top_of_stack(Tokenizer* self)
{
    Stack* top = self->topstack;

    Py_DECREF(top->stack);
    Textbuffer_dealloc(top->textbuffer);
    self->topstack = top->next;
    free(top);
    self->depth--;
}

/*
    Pop the current stack/context/textbuffer, returning the stack.

    Pending text is flushed into the stack first. Returns a new reference
    to the token list, or NULL with an exception set on failure.
*/
PyObject* Tokenizer_pop(Tokenizer* self)
{
    PyObject* stack;

    if (Tokenizer_push_textbuffer(self))
        return NULL;
    stack = self->topstack->stack;
    Py_INCREF(stack);
    Tokenizer_delete_top_of_stack(self);
    return stack;
}

/*
    Pop the current stack/context/textbuffer, returning the stack. We will
    also replace the underlying stack's context with the current stack's.

    Returns a new reference, or NULL with an exception set on failure.
*/
PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
{
    PyObject* stack;
    uint64_t context;

    if (Tokenizer_push_textbuffer(self))
        return NULL;
    stack = self->topstack->stack;
    Py_INCREF(stack);
    context = self->topstack->context;
    Tokenizer_delete_top_of_stack(self);
    self->topstack->context = context;
    return stack;
}

/*
    Fail the current tokenization route. Discards the current
    stack/context/textbuffer and sets the BAD_ROUTE flag.

    Always returns NULL so callers can write
    'return Tokenizer_fail_route(self);'.
*/
void* Tokenizer_fail_route(Tokenizer* self)
{
    uint64_t context = self->topstack->context;
    PyObject* stack = Tokenizer_pop(self);

    Py_XDECREF(stack);  // pop may have failed and returned NULL
    FAIL_ROUTE(context);
    return NULL;
}

/*
    Write a token to the current token stack.

    'token' is a token class; it is instantiated with no arguments. If
    'first' is nonzero the instance is prepended to the stack instead of
    appended. Returns 0 on success, -1 with an exception set on failure.
*/
int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first)
{
    PyObject* instance;

    if (Tokenizer_push_textbuffer(self))
        return -1;
    instance = PyObject_CallObject(token, NULL);
    if (!instance)
        return -1;
    if (first ? PyList_Insert(self->topstack->stack, 0, instance) :
                PyList_Append(self->topstack->stack, instance)) {
        Py_DECREF(instance);
        return -1;
    }
    Py_DECREF(instance);
    return 0;
}

/*
    Write a token to the current token stack, with kwargs. Steals a
    reference to kwargs.

    Like Tokenizer_emit_token(), but the token class is called with the
    given keyword arguments, which are released on every path. Returns 0
    on success, -1 with an exception set on failure.
*/
int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
                                PyObject* kwargs, int first)
{
    PyObject* instance;

    if (Tokenizer_push_textbuffer(self)) {
        Py_DECREF(kwargs);
        return -1;
    }
    instance = PyObject_Call(token, NOARGS, kwargs);
    if (!instance) {
        Py_DECREF(kwargs);
        return -1;
    }
    if (first ? PyList_Insert(self->topstack->stack, 0, instance):
                PyList_Append(self->topstack->stack, instance)) {
        Py_DECREF(instance);
        Py_DECREF(kwargs);
        return -1;
    }
    Py_DECREF(instance);
    Py_DECREF(kwargs);
    return 0;
}

/*
    Write a Unicode codepoint to the current textbuffer.

    Returns 0 on success, -1 on failure (propagated from Textbuffer_write).
*/
int Tokenizer_emit_char(Tokenizer* self, Unicode code)
{
    return Textbuffer_write(self->topstack->textbuffer, code);
}

/*
    Write a NUL-terminated string of text to the current textbuffer,
    one character at a time. Returns 0 on success, -1 on failure.
*/
int Tokenizer_emit_text(Tokenizer* self, const char* text)
{
    const char* cursor;

    for (cursor = text; *cursor; cursor++) {
        if (Tokenizer_emit_char(self, *cursor))
            return -1;
    }
    return 0;
}

/*
    Write the contents of another textbuffer to the current textbuffer,
    deallocating it in the process.

    'buffer' is consumed even when the concat fails. Returns 0 on
    success, -1 on failure.
*/
int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer)
{
    int retval = Textbuffer_concat(self->topstack->textbuffer, buffer);
    Textbuffer_dealloc(buffer);
    return retval;
}

/*
    Write a series of tokens to the current stack at once.

    If the first token in 'tokenlist' is a Text token, any pending text in
    the current textbuffer is merged into that token's .text attribute
    (prepended) instead of being emitted as a separate Text token; the
    buffer is then reset. Otherwise the buffer is flushed normally before
    the tokens are appended. Returns 0 on success, -1 on failure.
*/
int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
{
    int pushed = 0;
    PyObject *stack, *token, *left, *right, *text;
    Textbuffer* buffer;
    Py_ssize_t size;

    if (PyList_GET_SIZE(tokenlist) > 0) {
        token = PyList_GET_ITEM(tokenlist, 0);  // borrowed reference
        switch (PyObject_IsInstance(token, Text)) {
            case 0:  // not a Text token: fall through to the normal flush
                break;
            case 1: {
                pushed = 1;
                buffer = self->topstack->textbuffer;
                if (buffer->length == 0)
                    break;
                left = Textbuffer_render(buffer);
                if (!left)
                    return -1;
                // NOTE(review): 'left' appears to leak if this lookup fails.
                right = PyObject_GetAttrString(token, "text");
                if (!right)
                    return -1;
                text = PyUnicode_Concat(left, right);
                Py_DECREF(left);
                Py_DECREF(right);
                if (!text)
                    return -1;
                if (PyObject_SetAttrString(token, "text", text)) {
                    Py_DECREF(text);
                    return -1;
                }
                Py_DECREF(text);
                if (Textbuffer_reset(buffer))
                    return -1;
                break;
            }
            case -1:  // isinstance check itself raised
                return -1;
        }
    }
    if (!pushed) {
        if (Tokenizer_push_textbuffer(self))
            return -1;
    }
    stack = self->topstack->stack;
    size = PyList_GET_SIZE(stack);
    // Extend the stack in place with the whole token list.
    if (PyList_SetSlice(stack, size, size, tokenlist))
        return -1;
    return 0;
}

/*
    Pop the current stack, write text, and then write the stack. 'text' is
    a NULL-terminated array of chars.

    Used to recover from a failed route: the popped tokens are re-emitted
    after the literal text. Returns 0 on success, -1 on failure.
*/
int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
{
    PyObject* stack = Tokenizer_pop(self);

    if (Tokenizer_emit_text(self, text)) {
        // Tokenizer_pop can fail and return NULL; Py_DECREF(NULL) would
        // crash, so use the NULL-tolerant form.
        Py_XDECREF(stack);
        return -1;
    }
    if (stack) {
        if (PyList_GET_SIZE(stack) > 0) {
            if (Tokenizer_emit_all(self, stack)) {
                Py_DECREF(stack);
                return -1;
            }
        }
        Py_DECREF(stack);
    }
    self->head--;
    return 0;
}

/*
    Internal function to read the codepoint at the given index from the input.
    Under PEP 393 builds (Python >= 3.3) the flexible-storage unicode API is
    used; otherwise the legacy Py_UNICODE buffer is indexed directly.
    No bounds checking is done here; callers must validate *index*.
*/
static Unicode read_codepoint(TokenizerInput* text, Py_ssize_t index)
{
#ifdef PEP_393
    return PyUnicode_READ(text->kind, text->data, index);
#else
    return text->buf[index];
#endif
}

/*
    Read the codepoint at a relative offset ahead of the current head.
    Returns '\0' when the offset lies past the end of the input.
*/
Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
{
    Py_ssize_t index = self->head + delta;

    return (index >= self->text.length) ?
        '\0' : read_codepoint(&self->text, index);
}

/*
    Read the codepoint at a relative offset behind the current head.
    Returns '\0' when the offset would reach before the start of the input.
*/
Unicode Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
{
    if (delta > self->head)
        return '\0';
    return read_codepoint(&self->text, self->head - delta);
}

+ 62
- 0
mwparserfromhell/parser/ctokenizer/tok_support.h View File

@@ -0,0 +1,62 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"

/* Functions */

/* Stack management */

int Tokenizer_push(Tokenizer*, uint64_t);
int Tokenizer_push_textbuffer(Tokenizer*);
void Tokenizer_delete_top_of_stack(Tokenizer*);
PyObject* Tokenizer_pop(Tokenizer*);
PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
void* Tokenizer_fail_route(Tokenizer*);

/* Token and text emission */

int Tokenizer_emit_token(Tokenizer*, PyObject*, int);
int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int);
int Tokenizer_emit_char(Tokenizer*, Unicode);
int Tokenizer_emit_text(Tokenizer*, const char*);
int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*);
int Tokenizer_emit_all(Tokenizer*, PyObject*);
int Tokenizer_emit_text_then_stack(Tokenizer*, const char*);

/* Input reading */

Unicode Tokenizer_read(Tokenizer*, Py_ssize_t);
Unicode Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);

/* Macros */

/* Hard limits guarding against runaway recursion/iteration in the parser. */
#define MAX_DEPTH 40
#define MAX_CYCLES 100000

/* Whether it is currently safe to recurse into a nested structure. */
#define Tokenizer_CAN_RECURSE(self) \
    (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES)

/* Convenience wrappers around Tokenizer_emit_token(_kwargs); the "_first"
   variants insert at the beginning of the current stack instead of the end. */
#define Tokenizer_emit(self, token) \
    Tokenizer_emit_token(self, token, 0)
#define Tokenizer_emit_first(self, token) \
    Tokenizer_emit_token(self, token, 1)
#define Tokenizer_emit_kwargs(self, token, kwargs) \
    Tokenizer_emit_token_kwargs(self, token, kwargs, 0)
#define Tokenizer_emit_first_kwargs(self, token, kwargs) \
    Tokenizer_emit_token_kwargs(self, token, kwargs, 1)

+ 310
- 0
mwparserfromhell/parser/ctokenizer/tokenizer.c View File

@@ -0,0 +1,310 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "tokenizer.h"
#include "tok_parse.h"
#include "tokens.h"

/* Globals */

int route_state;
uint64_t route_context;

char** entitydefs;

PyObject* NOARGS;
PyObject* definitions;

static PyObject* ParserError;

/* Forward declarations */

static int load_exceptions(void);

/*
    Create a new tokenizer object; all field initialization happens later in
    Tokenizer_init.
*/
static PyObject*
Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
{
    return (PyObject*) type->tp_alloc(type, 0);
}

/*
    Deallocate the given tokenizer's text field by releasing the reference to
    the stored input object.  Py_XDECREF makes this safe when no input has
    been loaded yet (text->object may be NULL).
*/
static void dealloc_tokenizer_text(TokenizerInput* text)
{
    Py_XDECREF(text->object);
}

/*
    Deallocate the given tokenizer object: release the input text, then walk
    and free the entire chain of parse stacks.
*/
static void Tokenizer_dealloc(Tokenizer* self)
{
    Stack *current = self->topstack, *following;

    dealloc_tokenizer_text(&self->text);
    while (current) {
        following = current->next;
        Py_DECREF(current->stack);
        Textbuffer_dealloc(current->textbuffer);
        free(current);
        current = following;
    }
    Py_TYPE(self)->tp_free((PyObject*) self);
}

/*
    Initialize a tokenizer's text field to an empty state: no input loaded,
    zero length.  Py_None is stored as a placeholder for the input object.
*/
static void init_tokenizer_text(TokenizerInput* text)
{
    Py_INCREF(Py_None);
    text->object = Py_None;
    text->length = 0;
#ifdef PEP_393
    text->kind = PyUnicode_WCHAR_KIND;
    text->data = NULL;
#else
    text->buf = NULL;
#endif
}

/*
    Initialize a new tokenizer instance by setting instance attributes.
    The constructor accepts no arguments.
*/
static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
{
    static char* kwlist[] = {NULL};

    if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
        return -1;
    init_tokenizer_text(&self->text);
    self->topstack = NULL;
    self->head = 0;
    self->global = 0;
    self->depth = 0;
    self->cycles = 0;
    self->route_state = 0;
    self->route_context = 0;
    self->skip_style_tags = 0;
    return 0;
}

/*
    Load input text into the tokenizer.  Takes ownership of the caller's
    reference to *input*: it is stored in text->object and released by
    dealloc_tokenizer_text (which is also called here first to drop any
    previously loaded text).
*/
static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
{
    dealloc_tokenizer_text(text);  /* release any previously loaded input */
    text->object = input;

#ifdef PEP_393
    /* Force the canonical PEP 393 representation before caching kind/data. */
    if (PyUnicode_READY(input) < 0)
        return -1;
    text->kind = PyUnicode_KIND(input);
    text->data = PyUnicode_DATA(input);
#else
    text->buf = PyUnicode_AS_UNICODE(input);
#endif
    /* NOTE(review): PyUnicode_GET_LENGTH is Python 3 API; presumably aliased
       for Python 2 builds in common.h -- confirm. */
    text->length = PyUnicode_GET_LENGTH(input);
    return 0;
}

/*
    Build a list of tokens from a string of wikicode and return it.

    Accepts either a Unicode object or an encoded byte string, plus an
    optional starting context and skip_style_tags flag.  Returns NULL with a
    ParserError set if the tokenizer finishes in an inconsistent state.
*/
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
    PyObject *input, *tokens;
    uint64_t context = 0;
    /* bug fix: the "i" format unit stores a C int; parsing directly into a
       uint64_t only writes sizeof(int) bytes (wrong on big-endian). */
    int ctx_arg = 0;
    int skip_style_tags = 0;

    if (PyArg_ParseTuple(args, "U|ii", &input, &ctx_arg, &skip_style_tags)) {
        Py_INCREF(input);
        if (load_tokenizer_text(&self->text, input))
            return NULL;
    }
    else {
        const char *encoded;
        Py_ssize_t size;

        /* Failed to parse a Unicode object; try a string instead. */
        PyErr_Clear();
        if (!PyArg_ParseTuple(args, "s#|ii", &encoded, &size, &ctx_arg,
                              &skip_style_tags))
            return NULL;
        if (!(input = PyUnicode_FromStringAndSize(encoded, size)))
            return NULL;
        if (load_tokenizer_text(&self->text, input))
            return NULL;
    }
    context = (uint64_t) ctx_arg;

    self->head = self->global = self->depth = self->cycles = 0;
    self->skip_style_tags = skip_style_tags;
    tokens = Tokenizer_parse(self, context, 1);

    /* A NULL result without a pending exception, or a non-empty stack,
       means the tokenizer ended in an inconsistent internal state. */
    if ((!tokens && !PyErr_Occurred()) || self->topstack) {
        if (!ParserError) {
            if (load_exceptions())
                return NULL;
        }
        if (BAD_ROUTE) {
            RESET_ROUTE();
            PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
        }
        else if (self->topstack)
            PyErr_SetString(ParserError,
                            "C tokenizer exited with non-empty token stack");
        else
            PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
        return NULL;
    }
    return tokens;
}

/*
    Build the global entitydefs array -- a NULL-terminated list of HTML
    entity names -- from the standard library's entity definitions module.
    Returns 0 on success, -1 on failure.
*/
static int load_entities(void)
{
    PyObject *tempmod, *defmap, *deflist;
    unsigned numdefs, i;
#ifdef IS_PY3K
    PyObject *string;
#endif

    tempmod = PyImport_ImportModule(ENTITYDEFS_MODULE);
    if (!tempmod)
        return -1;
    defmap = PyObject_GetAttrString(tempmod, "entitydefs");
    Py_DECREF(tempmod);
    if (!defmap)
        return -1;
    deflist = PyDict_Keys(defmap);
    Py_DECREF(defmap);
    if (!deflist)
        return -1;
    /* bug fix: the count must come from deflist (the list of keys); the old
       code read PyList_GET_SIZE(defmap), which is a dict -- not a list --
       and whose reference had already been released above. */
    numdefs = (unsigned) PyList_GET_SIZE(deflist);
    entitydefs = calloc(numdefs + 1, sizeof(char*));
    if (!entitydefs) {
        Py_DECREF(deflist);
        return -1;
    }
    for (i = 0; i < numdefs; i++) {
#ifdef IS_PY3K
        /* The bytes object is deliberately never released: entitydefs[i]
           borrows its internal buffer, which must stay alive. */
        string = PyUnicode_AsASCIIString(PyList_GET_ITEM(deflist, i));
        if (!string) {
            Py_DECREF(deflist);
            return -1;
        }
        entitydefs[i] = PyBytes_AsString(string);
#else
        entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i));
#endif
        if (!entitydefs[i]) {
            Py_DECREF(deflist);
            return -1;
        }
    }
    Py_DECREF(deflist);
    return 0;
}

/*
    Import mwparserfromhell.parser.tokens and load the token classes into
    module-level globals (see load_tokens_from_module in tokens.c).
    Returns 0 on success, -1 on failure.
*/
static int load_tokens(void)
{
    PyObject *tempmod, *tokens,
             *globals = PyEval_GetGlobals(),
             *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1),
             *modname = IMPORT_NAME_FUNC("tokens");
    char *name = "mwparserfromhell.parser";

    if (!fromlist || !modname)
        return -1;
    PyList_SET_ITEM(fromlist, 0, modname);  /* steals the modname reference */
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
        return -1;
    tokens = PyObject_GetAttrString(tempmod, "tokens");
    Py_DECREF(tempmod);
    /* bug fix: previously a NULL result was passed straight on to
       load_tokens_from_module, which would have crashed */
    if (!tokens)
        return -1;
    load_tokens_from_module(tokens);
    Py_DECREF(tokens);
    return 0;
}

/*
    Import mwparserfromhell and store its definitions submodule in the
    module-level "definitions" global.  Returns 0 on success, -1 on failure.
*/
static int load_defs(void)
{
    PyObject *tempmod,
             *globals = PyEval_GetGlobals(),
             *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1),
             *modname = IMPORT_NAME_FUNC("definitions");
    char *name = "mwparserfromhell";

    if (!fromlist || !modname)
        return -1;
    PyList_SET_ITEM(fromlist, 0, modname);  /* steals the modname reference */
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
        return -1;
    definitions = PyObject_GetAttrString(tempmod, "definitions");
    Py_DECREF(tempmod);
    /* bug fix: report failure instead of returning 0 with definitions NULL */
    return definitions ? 0 : -1;
}

/*
    Import mwparserfromhell.parser and cache its ParserError exception class
    in the static ParserError global.  Returns 0 on success, -1 on failure.
*/
static int load_exceptions(void)
{
    PyObject *tempmod, *parsermod,
             *globals = PyEval_GetGlobals(),
             *locals = PyEval_GetLocals(),
             *fromlist = PyList_New(1),
             *modname = IMPORT_NAME_FUNC("parser");
    char *name = "mwparserfromhell";

    if (!fromlist || !modname)
        return -1;
    PyList_SET_ITEM(fromlist, 0, modname);  /* steals the modname reference */
    tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
    Py_DECREF(fromlist);
    if (!tempmod)
        return -1;
    parsermod = PyObject_GetAttrString(tempmod, "parser");
    Py_DECREF(tempmod);
    /* bug fix: a NULL parsermod was previously passed on to
       PyObject_GetAttrString, which would have crashed */
    if (!parsermod)
        return -1;
    ParserError = PyObject_GetAttrString(parsermod, "ParserError");
    Py_DECREF(parsermod);
    return ParserError ? 0 : -1;
}

/*
    Module initialization: ready and register the CTokenizer type on the
    _tokenizer module, then load the token classes, HTML entity definitions,
    and the tag-definitions helper.  Supports both Python 2 and 3 through the
    compatibility macros defined in tokenizer.h (INIT_FUNC_NAME,
    CREATE_MODULE, INIT_ERROR).
*/
PyMODINIT_FUNC INIT_FUNC_NAME(void)
{
    PyObject *module;

    TokenizerType.tp_new = PyType_GenericNew;
    if (PyType_Ready(&TokenizerType) < 0)
        INIT_ERROR;
    module = CREATE_MODULE;
    if (!module)
        INIT_ERROR;
    Py_INCREF(&TokenizerType);
    PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
    Py_INCREF(Py_True);
    /* Marker attribute so Python code can detect that the C tokenizer is
       in use (vs. the pure-Python fallback). */
    PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
    NOARGS = PyTuple_New(0);
    if (!NOARGS || load_entities() || load_tokens() || load_defs())
        INIT_ERROR;
#ifdef IS_PY3K
    return module;
#endif
}

+ 111
- 0
mwparserfromhell/parser/ctokenizer/tokenizer.h View File

@@ -0,0 +1,111 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"
#include "textbuffer.h"

/* Functions */

static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

/* Compatibility macros */

#ifdef IS_PY3K
/* Python 3: PyModule_Create-based init returning the module object. */
#define IMPORT_NAME_FUNC PyUnicode_FromString
#define CREATE_MODULE PyModule_Create(&module_def);
#define ENTITYDEFS_MODULE "html.entities"
#define INIT_FUNC_NAME PyInit__tokenizer
#define INIT_ERROR return NULL
#else
/* Python 2: Py_InitModule-based init; the init function returns void. */
#define IMPORT_NAME_FUNC PyBytes_FromString
#define CREATE_MODULE Py_InitModule("_tokenizer", NULL);
#define ENTITYDEFS_MODULE "htmlentitydefs"
#define INIT_FUNC_NAME init_tokenizer
#define INIT_ERROR return
#endif

/* Structs */

/* Methods exposed on the CTokenizer type. */
static PyMethodDef Tokenizer_methods[] = {
    {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
     "Build a list of tokens from a string of wikicode and return it."},
    {NULL}
};

/* No public members; the sentinel-only table is still required. */
static PyMemberDef Tokenizer_members[] = {
    {NULL}
};

/* Type object for CTokenizer. */
static PyTypeObject TokenizerType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "_tokenizer.CTokenizer",                                /* tp_name */
    sizeof(Tokenizer),                                      /* tp_basicsize */
    0,                                                      /* tp_itemsize */
    (destructor) Tokenizer_dealloc,                         /* tp_dealloc */
    0,                                                      /* tp_print */
    0,                                                      /* tp_getattr */
    0,                                                      /* tp_setattr */
    0,                                                      /* tp_compare */
    0,                                                      /* tp_repr */
    0,                                                      /* tp_as_number */
    0,                                                      /* tp_as_sequence */
    0,                                                      /* tp_as_mapping */
    0,                                                      /* tp_hash */
    0,                                                      /* tp_call */
    0,                                                      /* tp_str */
    0,                                                      /* tp_getattro */
    0,                                                      /* tp_setattro */
    0,                                                      /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                                     /* tp_flags */
    "Creates a list of tokens from a string of wikicode.",  /* tp_doc */
    0,                                                      /* tp_traverse */
    0,                                                      /* tp_clear */
    0,                                                      /* tp_richcompare */
    0,                                                      /* tp_weaklistoffset */
    0,                                                      /* tp_iter */
    0,                                                      /* tp_iternext */
    Tokenizer_methods,                                      /* tp_methods */
    Tokenizer_members,                                      /* tp_members */
    0,                                                      /* tp_getset */
    0,                                                      /* tp_base */
    0,                                                      /* tp_dict */
    0,                                                      /* tp_descr_get */
    0,                                                      /* tp_descr_set */
    0,                                                      /* tp_dictoffset */
    (initproc) Tokenizer_init,                              /* tp_init */
    0,                                                      /* tp_alloc */
    Tokenizer_new,                                          /* tp_new */
};

#ifdef IS_PY3K
/* Module definition (Python 3 only; Python 2 uses Py_InitModule). */
static PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_tokenizer",
    "Creates a list of tokens from a string of wikicode.",
    -1, NULL, NULL, NULL, NULL, NULL
};
#endif

+ 111
- 0
mwparserfromhell/parser/ctokenizer/tokens.c View File

@@ -0,0 +1,111 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#include "tokens.h"

/* Globals */

PyObject* Text;

PyObject* TemplateOpen;
PyObject* TemplateParamSeparator;
PyObject* TemplateParamEquals;
PyObject* TemplateClose;

PyObject* ArgumentOpen;
PyObject* ArgumentSeparator;
PyObject* ArgumentClose;

PyObject* WikilinkOpen;
PyObject* WikilinkSeparator;
PyObject* WikilinkClose;

PyObject* ExternalLinkOpen;
PyObject* ExternalLinkSeparator;
PyObject* ExternalLinkClose;

PyObject* HTMLEntityStart;
PyObject* HTMLEntityNumeric;
PyObject* HTMLEntityHex;
PyObject* HTMLEntityEnd;
PyObject* HeadingStart;
PyObject* HeadingEnd;

PyObject* CommentStart;
PyObject* CommentEnd;

PyObject* TagOpenOpen;
PyObject* TagAttrStart;
PyObject* TagAttrEquals;
PyObject* TagAttrQuote;
PyObject* TagCloseOpen;
PyObject* TagCloseSelfclose;
PyObject* TagOpenClose;
PyObject* TagCloseClose;

/*
    Load individual token classes into module-level globals from the given
    Python module object (mwparserfromhell.parser.tokens), using a
    table-driven lookup instead of one assignment statement per token.
*/
void load_tokens_from_module(PyObject* module)
{
    static const struct {
        PyObject** target;
        const char* attr;
    } token_table[] = {
        {&Text, "Text"},

        {&TemplateOpen, "TemplateOpen"},
        {&TemplateParamSeparator, "TemplateParamSeparator"},
        {&TemplateParamEquals, "TemplateParamEquals"},
        {&TemplateClose, "TemplateClose"},

        {&ArgumentOpen, "ArgumentOpen"},
        {&ArgumentSeparator, "ArgumentSeparator"},
        {&ArgumentClose, "ArgumentClose"},

        {&WikilinkOpen, "WikilinkOpen"},
        {&WikilinkSeparator, "WikilinkSeparator"},
        {&WikilinkClose, "WikilinkClose"},

        {&ExternalLinkOpen, "ExternalLinkOpen"},
        {&ExternalLinkSeparator, "ExternalLinkSeparator"},
        {&ExternalLinkClose, "ExternalLinkClose"},

        {&HTMLEntityStart, "HTMLEntityStart"},
        {&HTMLEntityNumeric, "HTMLEntityNumeric"},
        {&HTMLEntityHex, "HTMLEntityHex"},
        {&HTMLEntityEnd, "HTMLEntityEnd"},

        {&HeadingStart, "HeadingStart"},
        {&HeadingEnd, "HeadingEnd"},

        {&CommentStart, "CommentStart"},
        {&CommentEnd, "CommentEnd"},

        {&TagOpenOpen, "TagOpenOpen"},
        {&TagAttrStart, "TagAttrStart"},
        {&TagAttrEquals, "TagAttrEquals"},
        {&TagAttrQuote, "TagAttrQuote"},
        {&TagCloseOpen, "TagCloseOpen"},
        {&TagCloseSelfclose, "TagCloseSelfclose"},
        {&TagOpenClose, "TagOpenClose"},
        {&TagCloseClose, "TagCloseClose"},

        {NULL, NULL}
    };
    int i;

    for (i = 0; token_table[i].target; i++)
        *token_table[i].target =
            PyObject_GetAttrString(module, token_table[i].attr);
}

+ 69
- 0
mwparserfromhell/parser/ctokenizer/tokens.h View File

@@ -0,0 +1,69 @@
/*
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#pragma once

#include "common.h"

/* Token globals */

extern PyObject* Text;

extern PyObject* TemplateOpen;
extern PyObject* TemplateParamSeparator;
extern PyObject* TemplateParamEquals;
extern PyObject* TemplateClose;

extern PyObject* ArgumentOpen;
extern PyObject* ArgumentSeparator;
extern PyObject* ArgumentClose;

extern PyObject* WikilinkOpen;
extern PyObject* WikilinkSeparator;
extern PyObject* WikilinkClose;

extern PyObject* ExternalLinkOpen;
extern PyObject* ExternalLinkSeparator;
extern PyObject* ExternalLinkClose;

extern PyObject* HTMLEntityStart;
extern PyObject* HTMLEntityNumeric;
extern PyObject* HTMLEntityHex;
extern PyObject* HTMLEntityEnd;
extern PyObject* HeadingStart;
extern PyObject* HeadingEnd;

extern PyObject* CommentStart;
extern PyObject* CommentEnd;

extern PyObject* TagOpenOpen;
extern PyObject* TagAttrStart;
extern PyObject* TagAttrEquals;
extern PyObject* TagAttrQuote;
extern PyObject* TagCloseOpen;
extern PyObject* TagCloseSelfclose;
extern PyObject* TagOpenClose;
extern PyObject* TagCloseClose;

/* Functions */

void load_tokens_from_module(PyObject*);

+ 0
- 367
mwparserfromhell/parser/tokenizer.h View File

@@ -1,367 +0,0 @@
/*
Tokenizer Header File for MWParserFromHell
Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

#ifndef PY_SSIZE_T_CLEAN
#define PY_SSIZE_T_CLEAN
#endif

#include <Python.h>
#include <math.h>
#include <structmember.h>
#include <bytesobject.h>
#include <stdint.h>

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

#define malloc PyObject_Malloc
#define free PyObject_Free

#define DIGITS "0123456789"
#define HEXDIGITS "0123456789abcdefABCDEF"
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

static const char MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
'-', '!', '\n', '\0'};

#define NUM_MARKERS 19
#define TEXTBUFFER_BLOCKSIZE 1024
#define MAX_DEPTH 40
#define MAX_CYCLES 100000
#define MAX_BRACES 255
#define MAX_ENTITY_SIZE 8

static int route_state = 0;
static uint64_t route_context = 0;
#define BAD_ROUTE route_state
#define BAD_ROUTE_CONTEXT route_context
#define FAIL_ROUTE(context) route_state = 1; route_context = context
#define RESET_ROUTE() route_state = 0

static char** entitydefs;

static PyObject* EMPTY;
static PyObject* NOARGS;
static PyObject* ParserError;
static PyObject* definitions;


/* Tokens: */

static PyObject* Text;

static PyObject* TemplateOpen;
static PyObject* TemplateParamSeparator;
static PyObject* TemplateParamEquals;
static PyObject* TemplateClose;

static PyObject* ArgumentOpen;
static PyObject* ArgumentSeparator;
static PyObject* ArgumentClose;

static PyObject* WikilinkOpen;
static PyObject* WikilinkSeparator;
static PyObject* WikilinkClose;

static PyObject* ExternalLinkOpen;
static PyObject* ExternalLinkSeparator;
static PyObject* ExternalLinkClose;

static PyObject* HTMLEntityStart;
static PyObject* HTMLEntityNumeric;
static PyObject* HTMLEntityHex;
static PyObject* HTMLEntityEnd;
static PyObject* HeadingStart;
static PyObject* HeadingEnd;

static PyObject* CommentStart;
static PyObject* CommentEnd;

static PyObject* TagOpenOpen;
static PyObject* TagAttrStart;
static PyObject* TagAttrEquals;
static PyObject* TagAttrQuote;
static PyObject* TagCloseOpen;
static PyObject* TagCloseSelfclose;
static PyObject* TagOpenClose;
static PyObject* TagCloseClose;


/* Local contexts: */

#define LC_TEMPLATE 0x0000000000000007
#define LC_TEMPLATE_NAME 0x0000000000000001
#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002
#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004

#define LC_ARGUMENT 0x0000000000000018
#define LC_ARGUMENT_NAME 0x0000000000000008
#define LC_ARGUMENT_DEFAULT 0x0000000000000010

#define LC_WIKILINK 0x0000000000000060
#define LC_WIKILINK_TITLE 0x0000000000000020
#define LC_WIKILINK_TEXT 0x0000000000000040

#define LC_EXT_LINK 0x0000000000000180
#define LC_EXT_LINK_URI 0x0000000000000080
#define LC_EXT_LINK_TITLE 0x0000000000000100

#define LC_HEADING 0x0000000000007E00
#define LC_HEADING_LEVEL_1 0x0000000000000200
#define LC_HEADING_LEVEL_2 0x0000000000000400
#define LC_HEADING_LEVEL_3 0x0000000000000800
#define LC_HEADING_LEVEL_4 0x0000000000001000
#define LC_HEADING_LEVEL_5 0x0000000000002000
#define LC_HEADING_LEVEL_6 0x0000000000004000

#define LC_TAG 0x0000000000078000
#define LC_TAG_OPEN 0x0000000000008000
#define LC_TAG_ATTR 0x0000000000010000
#define LC_TAG_BODY 0x0000000000020000
#define LC_TAG_CLOSE 0x0000000000040000

#define LC_STYLE 0x0000000000780000
#define LC_STYLE_ITALICS 0x0000000000080000
#define LC_STYLE_BOLD 0x0000000000100000
#define LC_STYLE_PASS_AGAIN 0x0000000000200000
#define LC_STYLE_SECOND_PASS 0x0000000000400000

#define LC_DLTERM 0x0000000000800000

#define LC_SAFETY_CHECK 0x000000007F000000
#define LC_HAS_TEXT 0x0000000001000000
#define LC_FAIL_ON_TEXT 0x0000000002000000
#define LC_FAIL_NEXT 0x0000000004000000
#define LC_FAIL_ON_LBRACE 0x0000000008000000
#define LC_FAIL_ON_RBRACE 0x0000000010000000
#define LC_FAIL_ON_EQUALS 0x0000000020000000
#define LC_HAS_TEMPLATE 0x0000000040000000

#define LC_TABLE 0x0000001F80000000
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000
#define LC_TABLE_OPEN 0x0000000080000000
#define LC_TABLE_CELL_OPEN 0x0000000100000000
#define LC_TABLE_CELL_STYLE 0x0000000200000000
#define LC_TABLE_ROW_OPEN 0x0000000400000000
#define LC_TABLE_TD_LINE 0x0000000800000000
#define LC_TABLE_TH_LINE 0x0000001000000000

/* Global contexts: */

#define GL_HEADING 0x1

/* Aggregate contexts: */

#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)

/* Tag contexts: */

#define TAG_NAME 0x01
#define TAG_ATTR_READY 0x02
#define TAG_ATTR_NAME 0x04
#define TAG_ATTR_VALUE 0x08
#define TAG_QUOTED 0x10
#define TAG_NOTE_SPACE 0x20
#define TAG_NOTE_EQUALS 0x40
#define TAG_NOTE_QUOTE 0x80


/* Miscellaneous structs: */

struct Textbuffer {
Py_ssize_t size;
Py_UNICODE* data;
struct Textbuffer* prev;
struct Textbuffer* next;
};

struct Stack {
PyObject* stack;
uint64_t context;
struct Textbuffer* textbuffer;
struct Stack* next;
};

typedef struct {
PyObject* title;
int level;
} HeadingData;

typedef struct {
uint64_t context;
struct Textbuffer* pad_first;
struct Textbuffer* pad_before_eq;
struct Textbuffer* pad_after_eq;
Py_UNICODE quoter;
Py_ssize_t reset;
} TagData;

typedef struct Textbuffer Textbuffer;
typedef struct Stack Stack;


/* Tokenizer object definition: */

typedef struct {
PyObject_HEAD
PyObject* text; /* text to tokenize */
Stack* topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
Py_ssize_t length; /* length of text */
int global; /* global context */
int depth; /* stack recursion depth */
int cycles; /* total number of stack recursions */
int skip_style_tags; /* temporary fix for the sometimes broken tag parser */
} Tokenizer;


/* Macros related to Tokenizer functions: */

#define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta)))
#define Tokenizer_READ_BACKWARDS(self, delta) \
(*PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, delta)))
#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES)

#define Tokenizer_emit(self, token) Tokenizer_emit_token(self, token, 0)
#define Tokenizer_emit_first(self, token) Tokenizer_emit_token(self, token, 1)
#define Tokenizer_emit_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 0)
#define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1)


/* Macros for accessing definitions: */

#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL))
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL))
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL))
#define IS_SCHEME(scheme, slashes, reverse) \
(call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False))


/* Function prototypes: */

static Textbuffer* Textbuffer_new(void);
static void Textbuffer_dealloc(Textbuffer*);

static TagData* TagData_new(void);
static void TagData_dealloc(TagData*);

static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static int Tokenizer_parse_entity(Tokenizer*);
static int Tokenizer_parse_comment(Tokenizer*);
static int Tokenizer_handle_dl_term(Tokenizer*);
static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

static int load_exceptions(void);


/* Macros for Python 2/3 compatibility: */

#ifdef IS_PY3K
#define NEW_INT_FUNC PyLong_FromSsize_t
#define IMPORT_NAME_FUNC PyUnicode_FromString
#define CREATE_MODULE PyModule_Create(&module_def);
#define ENTITYDEFS_MODULE "html.entities"
#define INIT_FUNC_NAME PyInit__tokenizer
#define INIT_ERROR return NULL
#else
#define NEW_INT_FUNC PyInt_FromSsize_t
#define IMPORT_NAME_FUNC PyBytes_FromString
#define CREATE_MODULE Py_InitModule("_tokenizer", NULL);
#define ENTITYDEFS_MODULE "htmlentitydefs"
#define INIT_FUNC_NAME init_tokenizer
#define INIT_ERROR return
#endif


/* More structs for creating the Tokenizer type: */

static PyMethodDef Tokenizer_methods[] = {
{"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS,
"Build a list of tokens from a string of wikicode and return it."},
{NULL}
};

static PyMemberDef Tokenizer_members[] = {
{NULL}
};

static PyTypeObject TokenizerType = {
PyVarObject_HEAD_INIT(NULL, 0)
"_tokenizer.CTokenizer", /* tp_name */
sizeof(Tokenizer), /* tp_basicsize */
0, /* tp_itemsize */
(destructor) Tokenizer_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
0, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
0, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT, /* tp_flags */
"Creates a list of tokens from a string of wikicode.", /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
Tokenizer_methods, /* tp_methods */
Tokenizer_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc) Tokenizer_init, /* tp_init */
0, /* tp_alloc */
Tokenizer_new, /* tp_new */
};

#ifdef IS_PY3K
/* Python 3 module definition for "_tokenizer", consumed by the
   CREATE_MODULE macro above. m_size is -1: the module keeps state in
   globals and does not support multiple sub-interpreters. The remaining
   slots (methods, reload, traverse, clear, free) are unused. */
static PyModuleDef module_def = {
    PyModuleDef_HEAD_INIT,
    "_tokenizer",
    "Creates a list of tokens from a string of wikicode.",
    -1, NULL, NULL, NULL, NULL, NULL
};
#endif

+ 42
- 58
mwparserfromhell/smart_list.py View File

@@ -27,8 +27,10 @@ reflect changes made to the main list, and vice-versa.
"""

from __future__ import unicode_literals
from sys import maxsize
from weakref import ref

from .compat import maxsize, py3k
from .compat import py3k

__all__ = ["SmartList"]

@@ -45,16 +47,16 @@ def inheritdoc(method):
class _SliceNormalizerMixIn(object):
"""MixIn that provides a private method to normalize slices."""

def _normalize_slice(self, key):
def _normalize_slice(self, key, clamp=False):
"""Return a slice equivalent to the input *key*, standardized."""
if key.start is not None:
if key.start is None:
start = 0
else:
start = (len(self) + key.start) if key.start < 0 else key.start
if key.stop is None or key.stop == maxsize:
stop = len(self) if clamp else None
else:
start = 0
if key.stop is not None:
stop = (len(self) + key.stop) if key.stop < 0 else key.stop
else:
stop = maxsize
return slice(start, stop, key.step or 1)


@@ -80,13 +82,6 @@ class SmartList(_SliceNormalizerMixIn, list):
[2, 3, 4]
>>> parent
[0, 1, 2, 3, 4]

The parent needs to keep a list of its children in order to update them,
which prevents them from being garbage-collected. If you are keeping the
parent around for a while but creating many children, it is advisable to
call :meth:`._ListProxy.detach` when you're finished with them. Certain
parent methods, like :meth:`reverse` and :meth:`sort`, will do this
automatically.
"""

def __init__(self, iterable=None):
@@ -99,10 +94,11 @@ class SmartList(_SliceNormalizerMixIn, list):
def __getitem__(self, key):
if not isinstance(key, slice):
return super(SmartList, self).__getitem__(key)
key = self._normalize_slice(key)
key = self._normalize_slice(key, clamp=False)
sliceinfo = [key.start, key.stop, key.step]
child = _ListProxy(self, sliceinfo)
self._children[id(child)] = (child, sliceinfo)
child_ref = ref(child, self._delete_child)
self._children[id(child_ref)] = (child_ref, sliceinfo)
return child

def __setitem__(self, key, item):
@@ -110,20 +106,21 @@ class SmartList(_SliceNormalizerMixIn, list):
return super(SmartList, self).__setitem__(key, item)
item = list(item)
super(SmartList, self).__setitem__(key, item)
key = self._normalize_slice(key)
key = self._normalize_slice(key, clamp=True)
diff = len(item) + (key.start - key.stop) // key.step
if not diff:
return
values = self._children.values if py3k else self._children.itervalues
if diff:
for child, (start, stop, step) in values():
if start > key.stop:
self._children[id(child)][1][0] += diff
if stop >= key.stop and stop != maxsize:
self._children[id(child)][1][1] += diff
for child, (start, stop, step) in values():
if start > key.stop:
self._children[id(child)][1][0] += diff
if stop is not None and stop >= key.stop:
self._children[id(child)][1][1] += diff

def __delitem__(self, key):
super(SmartList, self).__delitem__(key)
if isinstance(key, slice):
key = self._normalize_slice(key)
key = self._normalize_slice(key, clamp=True)
else:
key = slice(key, key + 1, 1)
diff = (key.stop - key.start) // key.step
@@ -131,7 +128,7 @@ class SmartList(_SliceNormalizerMixIn, list):
for child, (start, stop, step) in values():
if start > key.start:
self._children[id(child)][1][0] -= diff
if stop >= key.stop and stop != maxsize:
if stop is not None and stop >= key.stop:
self._children[id(child)][1][1] -= diff

if not py3k:
@@ -154,10 +151,16 @@ class SmartList(_SliceNormalizerMixIn, list):
self.extend(other)
return self

def _delete_child(self, child_ref):
"""Remove a child reference that is about to be garbage-collected."""
del self._children[id(child_ref)]

def _detach_children(self):
"""Remove all children and give them independent parent copies."""
children = [val[0] for val in self._children.values()]
for child in children:
child.detach()
child()._parent = list(self)
self._children.clear()

@inheritdoc
def append(self, item):
@@ -226,7 +229,6 @@ class _ListProxy(_SliceNormalizerMixIn, list):
super(_ListProxy, self).__init__()
self._parent = parent
self._sliceinfo = sliceinfo
self._detached = False

def __repr__(self):
return repr(self._render())
@@ -273,24 +275,20 @@ class _ListProxy(_SliceNormalizerMixIn, list):

def __getitem__(self, key):
if isinstance(key, slice):
key = self._normalize_slice(key)
if key.stop == maxsize:
keystop = self._stop
else:
keystop = key.stop + self._start
adjusted = slice(key.start + self._start, keystop, key.step)
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
return self._parent[adjusted]
else:
return self._render()[key]

def __setitem__(self, key, item):
if isinstance(key, slice):
key = self._normalize_slice(key)
if key.stop == maxsize:
keystop = self._stop
else:
keystop = key.stop + self._start
adjusted = slice(key.start + self._start, keystop, key.step)
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
self._parent[adjusted] = item
else:
length = len(self)
@@ -302,12 +300,10 @@ class _ListProxy(_SliceNormalizerMixIn, list):

def __delitem__(self, key):
if isinstance(key, slice):
key = self._normalize_slice(key)
if key.stop == maxsize:
keystop = self._stop
else:
keystop = key.stop + self._start
adjusted = slice(key.start + self._start, keystop, key.step)
key = self._normalize_slice(key, clamp=True)
keystart = min(self._start + key.start, self._stop)
keystop = min(self._start + key.stop, self._stop)
adjusted = slice(keystart, keystop, key.step)
del self._parent[adjusted]
else:
length = len(self)
@@ -370,7 +366,7 @@ class _ListProxy(_SliceNormalizerMixIn, list):
@property
def _stop(self):
"""The ending index of this list, exclusive."""
if self._sliceinfo[1] == maxsize:
if self._sliceinfo[1] is None:
return len(self._parent)
return self._sliceinfo[1]

@@ -456,17 +452,5 @@ class _ListProxy(_SliceNormalizerMixIn, list):
item.sort(**kwargs)
self._parent[self._start:self._stop:self._step] = item

def detach(self):
"""Detach the child so it operates like a normal list.

This allows children to be properly garbage-collected if their parent
is being kept around for a long time. This method has no effect if the
child is already detached.
"""
if not self._detached:
self._parent._children.pop(id(self))
self._parent = list(self._parent)
self._detached = True


del inheritdoc

+ 3
- 0
scripts/README View File

@@ -0,0 +1,3 @@
This directory contains support files used for *developing* mwparserfromhell,
not running it. If you are looking for code examples, read the documentation
or explore the source code.

+ 11
- 11
scripts/release.sh View File

@@ -31,6 +31,13 @@ update_version() {
echo " done."
}

update_appveyor() {
filename="appveyor.yml"
echo -n "Updating $filename..."
sed -e "s/version: .*/version: $VERSION-b{build}/" -i "" $filename
echo " done."
}

update_changelog() {
filename="CHANGELOG"
echo -n "Updating $filename..."
@@ -67,25 +74,18 @@ do_git_stuff() {
}

upload_to_pypi() {
# TODO: check whether these commands give output
echo -n "PyPI: uploading source tarball and docs..."
python setup.py register sdist upload -s
python setup.py upload_docs
python setup.py -q register sdist upload -s
python setup.py -q upload_docs
echo " done."
}

windows_build() {
echo "PyPI: building/uploading Windows binaries..."
echo "*** Run in Windows: ./scripts/win_build.py"
echo "*** Press enter when done."
read
}

post_release() {
echo
echo "*** Release completed."
echo "*** Update: https://github.com/earwig/mwparserfromhell/releases/tag/v$VERSION"
echo "*** Verify: https://pypi.python.org/pypi/mwparserfromhell"
echo "*** Verify: https://ci.appveyor.com/project/earwig/mwparserfromhell"
echo "*** Verify: https://mwparserfromhell.readthedocs.org"
echo "*** Press enter to sanity-check the release."
read
@@ -153,11 +153,11 @@ cd "$SCRIPT_DIR/.."

check_git
update_version
update_appveyor
update_changelog
update_docs_changelog
do_git_stuff
upload_to_pypi
windows_build
post_release
test_release



+ 0
- 58
scripts/win_build.py View File

@@ -1,58 +0,0 @@
# Build requirements:
#
# Python 2.6-3.2: Visual C++ Express Edition 2008:
# http://go.microsoft.com/?linkid=7729279
#
# Python 3.3+: Visual C++ Express Edition 2010:
# http://go.microsoft.com/?linkid=9709949
#
# x64 builds: Microsoft Windows SDK for Windows 7 and .NET Framework 3.5 SP1:
# http://www.microsoft.com/en-us/download/details.aspx?id=3138
#
# Python interpreter, 2.6, 2.7, 3.2-3.4:
# https://www.python.org/downloads/
#
# Pip, setuptools, wheel:
# https://bootstrap.pypa.io/get-pip.py
# and run *for each* Python version:
# c:\pythonXX\python get-pip.py
# c:\pythonXX\scripts\pip install wheel
#
# Afterwards, run this script with any of the python interpreters (2.7 suggested)

from __future__ import print_function
import os
from subprocess import call, STDOUT

ENVIRONMENTS = ["26", "27", "32", "33", "34"]

def run(pyver, cmds):
cmd = [r"C:\Python%s\Python.exe" % pyver, "setup.py"] + cmds
print(" ".join(cmd), end=" ")

with open("%s%s.log" % (cmds[0], pyver), "w") as logfile:
retval = call(cmd, stdout=logfile, stderr=STDOUT, cwd="..")
if not retval:
print("[OK]")
else:
print("[FAILED (%i)]" % retval)
return retval

def main():
path = os.path.split(__file__)[0]
if path:
os.chdir(path)

print("Building Windows wheels for Python %s:" % ", ".join(ENVIRONMENTS))
for pyver in ENVIRONMENTS:
print()
try:
os.unlink("mwparserfromhell/parser/_tokenizer.pyd")
except OSError:
pass

if run(pyver, ["test"]) == 0:
run(pyver, ["bdist_wheel", "upload"]) # TODO: add "-s" to GPG sign

if __name__ == "__main__":
main()

+ 43
- 0
scripts/win_wrapper.cmd View File

@@ -0,0 +1,43 @@
:: To build extensions for 64 bit Python 3, we need to configure environment
:: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of:
:: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1)
::
:: To build extensions for 64 bit Python 2, we need to configure environment
:: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of:
:: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0)
::
:: 32 bit builds do not require specific environment configurations.
::
:: Note: this script needs to be run with the /E:ON and /V:ON flags for the
:: cmd interpreter, at least for (SDK v7.0)
::
:: More details at:
:: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows
:: http://stackoverflow.com/a/13751649/163740
::
:: Author: Olivier Grisel
:: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/
@ECHO OFF

SET COMMAND_TO_RUN=%*
SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows

SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%"
IF %MAJOR_PYTHON_VERSION% == "2" (
SET WINDOWS_SDK_VERSION="v7.0"
) ELSE IF %MAJOR_PYTHON_VERSION% == "3" (
SET WINDOWS_SDK_VERSION="v7.1"
) ELSE (
ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%"
EXIT 1
)

IF "%PYTHON_ARCH%"=="64" (
SET DISTUTILS_USE_SDK=1
SET MSSdk=1
"%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION%
"%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release
call %COMMAND_TO_RUN% || EXIT 1
) ELSE (
call %COMMAND_TO_RUN% || EXIT 1
)

+ 38
- 59
setup.py View File

@@ -21,17 +21,18 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import os
from __future__ import print_function
from distutils.errors import DistutilsError, CCompilerError
from glob import glob
from os import environ
import sys

if (sys.version_info[0] == 2 and sys.version_info[1] < 6) or \
(sys.version_info[1] == 3 and sys.version_info[1] < 2):
raise Exception("mwparserfromhell needs Python 2.6+ or 3.2+")

if sys.version_info >= (3, 0):
basestring = (str, )
if ((sys.version_info[0] == 2 and sys.version_info[1] < 6) or
(sys.version_info[1] == 3 and sys.version_info[1] < 2)):
raise RuntimeError("mwparserfromhell needs Python 2.6+ or 3.2+")

from setuptools import setup, find_packages, Extension
from setuptools.command.build_ext import build_ext

from mwparserfromhell import __version__
from mwparserfromhell.compat import py26, py3k
@@ -39,70 +40,48 @@ from mwparserfromhell.compat import py26, py3k
with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp:
long_docs = fp.read()

tokenizer = Extension("mwparserfromhell.parser._tokenizer",
sources=["mwparserfromhell/parser/tokenizer.c"],
depends=["mwparserfromhell/parser/tokenizer.h"])

use_extension = True
fallback = True

# Allow env var WITHOUT_EXTENSION and args --with[out]-extension
if '--without-extension' in sys.argv:
use_extension = False
elif '--with-extension' in sys.argv:
pass
elif os.environ.get('WITHOUT_EXTENSION', '0') == '1':
use_extension = False
# Allow env var WITHOUT_EXTENSION and args --with[out]-extension:

# Remove the command line argument as it isn't understood by
# setuptools/distutils
sys.argv = [arg for arg in sys.argv
if not arg.startswith('--with')
and not arg.endswith('-extension')]


def optional_compile_setup(func=setup, use_ext=use_extension,
*args, **kwargs):
"""
Wrap setup to allow optional compilation of extensions.
env_var = environ.get("WITHOUT_EXTENSION")
if "--without-extension" in sys.argv:
use_extension = False
elif "--with-extension" in sys.argv:
fallback = False
elif env_var is not None:
if env_var == "1":
use_extension = False
elif env_var == "0":
fallback = False

Falls back to pure python mode (no extensions)
if compilation of extensions fails.
"""
extensions = kwargs.get('ext_modules', None)
# Remove the command line argument as it isn't understood by setuptools:

if use_ext and extensions:
try:
func(*args, **kwargs)
return
except SystemExit as e:
assert(e.args)
if e.args[0] is False:
raise
elif isinstance(e.args[0], basestring):
if e.args[0].startswith('usage: '):
raise
else:
# Fallback to pure python mode
print('setup with extension failed: %s' % repr(e))
pass
except Exception as e:
print('setup with extension failed: %s' % repr(e))
sys.argv = [arg for arg in sys.argv
if arg != "--without-extension" and arg != "--with-extension"]

if extensions:
if use_ext:
print('Falling back to pure python mode.')
else:
print('Using pure python mode.')
def build_ext_patched(self):
try:
build_ext_original(self)
except (DistutilsError, CCompilerError) as exc:
print("error: " + str(exc))
print("Falling back to pure Python mode.")
del self.extensions[:]

del kwargs['ext_modules']
if fallback:
build_ext.run, build_ext_original = build_ext_patched, build_ext.run

func(*args, **kwargs)
# Project-specific part begins here:

tokenizer = Extension("mwparserfromhell.parser._tokenizer",
sources=glob("mwparserfromhell/parser/ctokenizer/*.c"),
depends=glob("mwparserfromhell/parser/ctokenizer/*.h"))

optional_compile_setup(
setup(
name = "mwparserfromhell",
packages = find_packages(exclude=("tests",)),
ext_modules = [tokenizer],
ext_modules = [tokenizer] if use_extension else [],
tests_require = ["unittest2"] if py26 else [],
test_suite = "tests.discover",
version = __version__,


+ 41
- 26
tests/_test_tokenizer.py View File

@@ -42,8 +42,8 @@ class TokenizerTestCase(object):
directory.
"""

@classmethod
def _build_test_method(cls, funcname, data):
@staticmethod
def _build_test_method(funcname, data):
"""Create and return a method to be treated as a test case method.

*data* is a dict containing multiple keys: the *input* text to be
@@ -58,13 +58,35 @@ class TokenizerTestCase(object):
expected = data["output"]
actual = self.tokenizer().tokenize(data["input"])
self.assertEqual(expected, actual)

if not py3k:
inner.__name__ = funcname.encode("utf8")
inner.__doc__ = data["label"]
return inner

@staticmethod
def _parse_test(test, data):
"""Parse an individual *test*, storing its info in *data*."""
for line in test.strip().splitlines():
if line.startswith("name:"):
data["name"] = line[len("name:"):].strip()
elif line.startswith("label:"):
data["label"] = line[len("label:"):].strip()
elif line.startswith("input:"):
raw = line[len("input:"):].strip()
if raw[0] == '"' and raw[-1] == '"':
raw = raw[1:-1]
raw = raw.encode("raw_unicode_escape")
data["input"] = raw.decode("unicode_escape")
elif line.startswith("output:"):
raw = line[len("output:"):].strip()
try:
data["output"] = eval(raw, vars(tokens))
except Exception as err:
raise _TestParseError(err)

@classmethod
def _load_tests(cls, filename, name, text):
def _load_tests(cls, filename, name, text, restrict=None):
"""Load all tests in *text* from the file *filename*."""
tests = text.split("\n---\n")
counter = 1
@@ -72,23 +94,7 @@ class TokenizerTestCase(object):
for test in tests:
data = {"name": None, "label": None, "input": None, "output": None}
try:
for line in test.strip().splitlines():
if line.startswith("name:"):
data["name"] = line[len("name:"):].strip()
elif line.startswith("label:"):
data["label"] = line[len("label:"):].strip()
elif line.startswith("input:"):
raw = line[len("input:"):].strip()
if raw[0] == '"' and raw[-1] == '"':
raw = raw[1:-1]
raw = raw.encode("raw_unicode_escape")
data["input"] = raw.decode("unicode_escape")
elif line.startswith("output:"):
raw = line[len("output:"):].strip()
try:
data["output"] = eval(raw, vars(tokens))
except Exception as err:
raise _TestParseError(err)
cls._parse_test(test, data)
except _TestParseError as err:
if data["name"]:
error = "Could not parse test '{0}' in '{1}':\n\t{2}"
@@ -97,6 +103,7 @@ class TokenizerTestCase(object):
error = "Could not parse a test in '{0}':\n\t{1}"
print(error.format(filename, err))
continue

if not data["name"]:
error = "A test in '{0}' was ignored because it lacked a name"
print(error.format(filename))
@@ -105,27 +112,35 @@ class TokenizerTestCase(object):
error = "Test '{0}' in '{1}' was ignored because it lacked an input or an output"
print(error.format(data["name"], filename))
continue

number = str(counter).zfill(digits)
counter += 1
if restrict and data["name"] != restrict:
continue

fname = "test_{0}{1}_{2}".format(name, number, data["name"])
meth = cls._build_test_method(fname, data)
setattr(cls, fname, meth)
counter += 1

@classmethod
def build(cls):
"""Load and install all tests from the 'tokenizer' directory."""
def load_file(filename):
def load_file(filename, restrict=None):
with codecs.open(filename, "rU", encoding="utf8") as fp:
text = fp.read()
name = path.split(filename)[1][:0-len(extension)]
cls._load_tests(filename, name, text)
name = path.split(filename)[1][:-len(extension)]
cls._load_tests(filename, name, text, restrict)

directory = path.join(path.dirname(__file__), "tokenizer")
extension = ".mwtest"
if len(sys.argv) > 2 and sys.argv[1] == "--use":
for name in sys.argv[2:]:
load_file(path.join(directory, name + extension))
sys.argv = [sys.argv[0]] # So unittest doesn't try to load these
if "." in name:
name, test = name.split(".", 1)
else:
test = None
load_file(path.join(directory, name + extension), test)
sys.argv = [sys.argv[0]] # So unittest doesn't try to parse this
cls.skip_others = True
else:
for filename in listdir(directory):


+ 20
- 12
tests/test_smart_list.py View File

@@ -52,6 +52,7 @@ class TestSmartList(unittest.TestCase):
self.assertEqual([0, 1, 2], list1[:3])
self.assertEqual([0, 1, 2, 3, "one", "two"], list1[:])
self.assertEqual([3, "one", "two"], list1[3:])
self.assertEqual([3, "one", "two"], list1[3:100])
self.assertEqual(["one", "two"], list1[-2:])
self.assertEqual([0, 1], list1[:-4])
self.assertEqual([], list1[6:])
@@ -389,28 +390,35 @@ class TestSmartList(unittest.TestCase):
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1)
self.assertEqual([4, 3, 2, 1.9, 1.8], child2)

child1.detach()
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1)
child3 = parent[9:]
self.assertEqual([8, 8.1, 8.2], child3)

del parent[8:]
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6], child1)
self.assertEqual([4, 3, 2, 1.9, 1.8], child2)
self.assertEqual([], child3)

del child1
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8], child2)
self.assertEqual([], child3)
self.assertEqual(2, len(parent._children))

del child3
self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8], child2)
self.assertEqual(1, len(parent._children))

parent.remove(1.9)
parent.remove(1.8)
self.assertEqual([1, 4, 3, 2, 5, 6, 7, 8, 8.1, 8.2], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1)
self.assertEqual([1, 4, 3, 2, 5, 6], parent)
self.assertEqual([4, 3, 2], child2)

parent.reverse()
self.assertEqual([8.2, 8.1, 8, 7, 6, 5, 2, 3, 4, 1], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1)
self.assertEqual([6, 5, 2, 3, 4, 1], parent)
self.assertEqual([4, 3, 2], child2)
self.assertEqual(0, len(parent._children))

child2.detach()
self.assertEqual([8.2, 8.1, 8, 7, 6, 5, 2, 3, 4, 1], parent)
self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6, 7, 8, 8.1, 8.2], child1)
self.assertEqual([4, 3, 2], child2)

if __name__ == "__main__":
unittest.main(verbosity=2)

+ 1
- 1
tests/tokenizer/text.mwtest View File

@@ -27,6 +27,6 @@ output: [Text(text="𐌲𐌿𐍄𐌰𐍂𐌰𐌶𐌳𐌰")]
---

name: large
label: a lot of text, requiring multiple textbuffer blocks in the C tokenizer
label: a lot of text, requiring proper storage in the C tokenizer
input: "ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBU
TJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN"
output: [Text(text="ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmx
gjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN")]

Loading…
Cancel
Save