From 74ab399ed012687dd1f86d17f40a1a369e396e02 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 8 Jun 2015 18:29:22 -0400 Subject: [PATCH 01/22] Try out appveyor config (#95) --- appveyor.yml | 56 ++++++++++++++++++++++++++++++++ scripts/README | 3 ++ scripts/release.sh | 7 ++-- scripts/win_build.py | 58 --------------------------------- scripts/win_install.ps1 | 85 +++++++++++++++++++++++++++++++++++++++++++++++++ scripts/win_wrapper.cmd | 47 +++++++++++++++++++++++++++ 6 files changed, 195 insertions(+), 61 deletions(-) create mode 100644 appveyor.yml create mode 100644 scripts/README delete mode 100644 scripts/win_build.py create mode 100644 scripts/win_install.ps1 create mode 100644 scripts/win_wrapper.cmd diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000..c312c89 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,56 @@ +# This config file is used by appveyor.com to build Windows release binaries + +version: 0.4.1.dev0-b{build} + +branches: + only: + - master + +skip_tags: true + +environment: + global: + # See: http://stackoverflow.com/a/13751649/163740 + WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd" + + matrix: + - PYTHON: "C:\\Python27" + PYTHON_VERSION: "2.7.10" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python27-x64" + PYTHON_VERSION: "2.7.10" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python33" + PYTHON_VERSION: "3.3.6" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python33-x64" + PYTHON_VERSION: "3.3.6" + PYTHON_ARCH: "64" + + - PYTHON: "C:\\Python34" + PYTHON_VERSION: "3.4.3" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python34-x64" + PYTHON_VERSION: "3.4.3" + PYTHON_ARCH: "64" + +install: + - "powershell scripts\\win_install.ps1" + +build_script: + - "%WRAPPER% %PYTHON%\\python setup.py build" + +test_script: + - "%WRAPPER% %PYTHON%\\python setup.py -q test" + +after_test: + - "%WRAPPER% %PYTHON%\\python setup.py bdist_wheel" + +artifacts: + - path: dist\* + +deploy: off diff --git a/scripts/README b/scripts/README new file mode 100644 
index 0000000..eea9627 --- /dev/null +++ b/scripts/README @@ -0,0 +1,3 @@ +This directory contains support files used for *developing* mwparserfromhell, +not running it. If you are looking for code examples, read the documentation +or explore the source code. diff --git a/scripts/release.sh b/scripts/release.sh index dcd871c..7d79e8e 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -31,6 +31,8 @@ update_version() { echo " done." } +# TODO: update appveyor version! + update_changelog() { filename="CHANGELOG" echo -n "Updating $filename..." @@ -67,10 +69,9 @@ do_git_stuff() { } upload_to_pypi() { - # TODO: check whether these commands give output echo -n "PyPI: uploading source tarball and docs..." - python setup.py register sdist upload -s - python setup.py upload_docs + python setup.py -q register sdist upload -s + python setup.py -q upload_docs echo " done." } diff --git a/scripts/win_build.py b/scripts/win_build.py deleted file mode 100644 index 2d51909..0000000 --- a/scripts/win_build.py +++ /dev/null @@ -1,58 +0,0 @@ -# Build requirements: -# -# Python 2.6-3.2: Visual C++ Express Edition 2008: -# http://go.microsoft.com/?linkid=7729279 -# -# Python 3.3+: Visual C++ Express Edition 2010: -# http://go.microsoft.com/?linkid=9709949 -# -# x64 builds: Microsoft Windows SDK for Windows 7 and .NET Framework 3.5 SP1: -# http://www.microsoft.com/en-us/download/details.aspx?id=3138 -# -# Python interpreter, 2.6, 2.7, 3.2-3.4: -# https://www.python.org/downloads/ -# -# Pip, setuptools, wheel: -# https://bootstrap.pypa.io/get-pip.py -# and run *for each* Python version: -# c:\pythonXX\python get-pip.py -# c:\pythonXX\scripts\pip install wheel -# -# Afterwards, run this script with any of the python interpreters (2.7 suggested) - -from __future__ import print_function -import os -from subprocess import call, STDOUT - -ENVIRONMENTS = ["26", "27", "32", "33", "34"] - -def run(pyver, cmds): - cmd = [r"C:\Python%s\Python.exe" % pyver, "setup.py"] + cmds - print(" 
".join(cmd), end=" ") - - with open("%s%s.log" % (cmds[0], pyver), "w") as logfile: - retval = call(cmd, stdout=logfile, stderr=STDOUT, cwd="..") - if not retval: - print("[OK]") - else: - print("[FAILED (%i)]" % retval) - return retval - -def main(): - path = os.path.split(__file__)[0] - if path: - os.chdir(path) - - print("Building Windows wheels for Python %s:" % ", ".join(ENVIRONMENTS)) - for pyver in ENVIRONMENTS: - print() - try: - os.unlink("mwparserfromhell/parser/_tokenizer.pyd") - except OSError: - pass - - if run(pyver, ["test"]) == 0: - run(pyver, ["bdist_wheel", "upload"]) # TODO: add "-s" to GPG sign - -if __name__ == "__main__": - main() diff --git a/scripts/win_install.ps1 b/scripts/win_install.ps1 new file mode 100644 index 0000000..bd76d11 --- /dev/null +++ b/scripts/win_install.ps1 @@ -0,0 +1,85 @@ +# Sample script to install Python and pip under Windows +# Authors: Olivier Grisel and Kyle Kastner +# License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ + +$BASE_URL = "https://www.python.org/ftp/python/" +$GET_PIP_URL = "https://bootstrap.pypa.io/get-pip.py" +$GET_PIP_PATH = "C:\get-pip.py" + + +function DownloadPython ($python_version, $platform_suffix) { + $webclient = New-Object System.Net.WebClient + $filename = "python-" + $python_version + $platform_suffix + ".msi" + $url = $BASE_URL + $python_version + "/" + $filename + + $basedir = $pwd.Path + "\" + $filepath = $basedir + $filename + if (Test-Path $filename) { + Write-Host "Reusing" $filepath + return $filepath + } + + # Download and retry up to 3 times in case of network transient errors. 
+ Write-Host "Downloading" $filename "from" $url + $retry_attempts = 3 + for($i=0; $i -lt $retry_attempts; $i++){ + try { + $webclient.DownloadFile($url, $filepath) + break + } + Catch [Exception]{ + Start-Sleep 1 + } + } + Write-Host "File saved at" $filepath + return $filepath +} + + +function InstallPython ($python_version, $architecture, $python_home) { + Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home + if (Test-Path $python_home) { + Write-Host $python_home "already exists, skipping." + return $false + } + if ($architecture -eq "32") { + $platform_suffix = "" + } else { + $platform_suffix = ".amd64" + } + $filepath = DownloadPython $python_version $platform_suffix + Write-Host "Installing" $filepath "to" $python_home + $args = "/qn /i $filepath TARGETDIR=$python_home" + Write-Host "msiexec.exe" $args + Start-Process -FilePath "msiexec.exe" -ArgumentList $args -Wait -Passthru + Write-Host "Python $python_version ($architecture) installation complete" + return $true +} + + +function InstallPip ($python_home) { + $pip_path = $python_home + "/Scripts/pip.exe" + $python_path = $python_home + "/python.exe" + if (-not(Test-Path $pip_path)) { + Write-Host "Installing pip..." + $webclient = New-Object System.Net.WebClient + $webclient.DownloadFile($GET_PIP_URL, $GET_PIP_PATH) + Write-Host "Executing:" $python_path $GET_PIP_PATH + Start-Process -FilePath "$python_path" -ArgumentList "$GET_PIP_PATH" -Wait -Passthru + } else { + Write-Host "pip already installed." 
+ } +} + +function InstallPackage ($python_home, $pkg) { + $pip_path = $python_home + "/Scripts/pip.exe" + & $pip_path install $pkg +} + +function main () { + InstallPython $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON + InstallPip $env:PYTHON + InstallPackage $env:PYTHON wheel +} + +main diff --git a/scripts/win_wrapper.cmd b/scripts/win_wrapper.cmd new file mode 100644 index 0000000..3a472bc --- /dev/null +++ b/scripts/win_wrapper.cmd @@ -0,0 +1,47 @@ +:: To build extensions for 64 bit Python 3, we need to configure environment +:: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: +:: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) +:: +:: To build extensions for 64 bit Python 2, we need to configure environment +:: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: +:: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) +:: +:: 32 bit builds do not require specific environment configurations. +:: +:: Note: this script needs to be run with the /E:ON and /V:ON flags for the +:: cmd interpreter, at least for (SDK v7.0) +:: +:: More details at: +:: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows +:: http://stackoverflow.com/a/13751649/163740 +:: +:: Author: Olivier Grisel +:: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ +@ECHO OFF + +SET COMMAND_TO_RUN=%* +SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows + +SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%" +IF %MAJOR_PYTHON_VERSION% == "2" ( + SET WINDOWS_SDK_VERSION="v7.0" +) ELSE IF %MAJOR_PYTHON_VERSION% == "3" ( + SET WINDOWS_SDK_VERSION="v7.1" +) ELSE ( + ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" + EXIT 1 +) + +IF "%PYTHON_ARCH%"=="64" ( + ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture + SET DISTUTILS_USE_SDK=1 + SET MSSdk=1 + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q 
-version:%WINDOWS_SDK_VERSION% + "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 +) ELSE ( + ECHO Using default MSVC build environment for 32 bit architecture + ECHO Executing: %COMMAND_TO_RUN% + call %COMMAND_TO_RUN% || EXIT 1 +) From c7497ddf63bd5160a47aa407cbad7cc5a565cd2d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 10 Jun 2015 16:25:53 -0400 Subject: [PATCH 02/22] Updates to installation/release code. * Remove unnecessary win_install.ps1. * Add appveyor version update to release script. * Make setup.py's pure Python fallback code less intrusive. --- appveyor.yml | 10 +++--- scripts/release.sh | 8 ++++- scripts/win_install.ps1 | 85 ---------------------------------------------- setup.py | 90 ++++++++++++++++++------------------------------- 4 files changed, 46 insertions(+), 147 deletions(-) delete mode 100644 scripts/win_install.ps1 diff --git a/appveyor.yml b/appveyor.yml index c312c89..34201f2 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -12,6 +12,8 @@ environment: global: # See: http://stackoverflow.com/a/13751649/163740 WRAPPER: "cmd /E:ON /V:ON /C .\\scripts\\win_wrapper.cmd" + PIP: "%WRAPPER% %PYTHON%\\Scripts\\pip.exe" + SETUPPY: "%WRAPPER% %PYTHON%\\python setup.py --with-extension" matrix: - PYTHON: "C:\\Python27" @@ -39,16 +41,16 @@ environment: PYTHON_ARCH: "64" install: - - "powershell scripts\\win_install.ps1" + - "%PIP% install wheel" build_script: - - "%WRAPPER% %PYTHON%\\python setup.py build" + - "%SETUPPY% build" test_script: - - "%WRAPPER% %PYTHON%\\python setup.py -q test" + - "%SETUPPY% -q test" after_test: - - "%WRAPPER% %PYTHON%\\python setup.py bdist_wheel" + - "%SETUPPY% bdist_wheel" artifacts: - path: dist\* diff --git a/scripts/release.sh b/scripts/release.sh index 7d79e8e..c256c7c 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -31,7 +31,12 @@ update_version() { echo " done." } -# TODO: update appveyor version! 
+update_appveyor() { + filename="appveyor.yml" + echo -n "Updating $filename..." + sed -e "s/version: .*/version: $VERSION-b{build}/" -i "" $filename + echo " done." +} update_changelog() { filename="CHANGELOG" @@ -154,6 +159,7 @@ cd "$SCRIPT_DIR/.." check_git update_version +update_appveyor update_changelog update_docs_changelog do_git_stuff diff --git a/scripts/win_install.ps1 b/scripts/win_install.ps1 deleted file mode 100644 index bd76d11..0000000 --- a/scripts/win_install.ps1 +++ /dev/null @@ -1,85 +0,0 @@ -# Sample script to install Python and pip under Windows -# Authors: Olivier Grisel and Kyle Kastner -# License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ - -$BASE_URL = "https://www.python.org/ftp/python/" -$GET_PIP_URL = "https://bootstrap.pypa.io/get-pip.py" -$GET_PIP_PATH = "C:\get-pip.py" - - -function DownloadPython ($python_version, $platform_suffix) { - $webclient = New-Object System.Net.WebClient - $filename = "python-" + $python_version + $platform_suffix + ".msi" - $url = $BASE_URL + $python_version + "/" + $filename - - $basedir = $pwd.Path + "\" - $filepath = $basedir + $filename - if (Test-Path $filename) { - Write-Host "Reusing" $filepath - return $filepath - } - - # Download and retry up to 3 times in case of network transient errors. - Write-Host "Downloading" $filename "from" $url - $retry_attempts = 3 - for($i=0; $i -lt $retry_attempts; $i++){ - try { - $webclient.DownloadFile($url, $filepath) - break - } - Catch [Exception]{ - Start-Sleep 1 - } - } - Write-Host "File saved at" $filepath - return $filepath -} - - -function InstallPython ($python_version, $architecture, $python_home) { - Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home - if (Test-Path $python_home) { - Write-Host $python_home "already exists, skipping." 
- return $false - } - if ($architecture -eq "32") { - $platform_suffix = "" - } else { - $platform_suffix = ".amd64" - } - $filepath = DownloadPython $python_version $platform_suffix - Write-Host "Installing" $filepath "to" $python_home - $args = "/qn /i $filepath TARGETDIR=$python_home" - Write-Host "msiexec.exe" $args - Start-Process -FilePath "msiexec.exe" -ArgumentList $args -Wait -Passthru - Write-Host "Python $python_version ($architecture) installation complete" - return $true -} - - -function InstallPip ($python_home) { - $pip_path = $python_home + "/Scripts/pip.exe" - $python_path = $python_home + "/python.exe" - if (-not(Test-Path $pip_path)) { - Write-Host "Installing pip..." - $webclient = New-Object System.Net.WebClient - $webclient.DownloadFile($GET_PIP_URL, $GET_PIP_PATH) - Write-Host "Executing:" $python_path $GET_PIP_PATH - Start-Process -FilePath "$python_path" -ArgumentList "$GET_PIP_PATH" -Wait -Passthru - } else { - Write-Host "pip already installed." - } -} - -function InstallPackage ($python_home, $pkg) { - $pip_path = $python_home + "/Scripts/pip.exe" - & $pip_path install $pkg -} - -function main () { - InstallPython $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON - InstallPip $env:PYTHON - InstallPackage $env:PYTHON wheel -} - -main diff --git a/setup.py b/setup.py index e2744ef..dcdd563 100644 --- a/setup.py +++ b/setup.py @@ -21,17 +21,17 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
-import os +from __future__ import print_function +from distutils.errors import DistutilsError, CCompilerError +from os import environ import sys -if (sys.version_info[0] == 2 and sys.version_info[1] < 6) or \ - (sys.version_info[1] == 3 and sys.version_info[1] < 2): - raise Exception("mwparserfromhell needs Python 2.6+ or 3.2+") - -if sys.version_info >= (3, 0): - basestring = (str, ) +if ((sys.version_info[0] == 2 and sys.version_info[1] < 6) or + (sys.version_info[1] == 3 and sys.version_info[1] < 2)): + raise RuntimeError("mwparserfromhell needs Python 2.6+ or 3.2+") from setuptools import setup, find_packages, Extension +from setuptools.command.build_ext import build_ext from mwparserfromhell import __version__ from mwparserfromhell.compat import py26, py3k @@ -44,65 +44,41 @@ tokenizer = Extension("mwparserfromhell.parser._tokenizer", depends=["mwparserfromhell/parser/tokenizer.h"]) use_extension = True +fallback = True -# Allow env var WITHOUT_EXTENSION and args --with[out]-extension -if '--without-extension' in sys.argv: - use_extension = False -elif '--with-extension' in sys.argv: - pass -elif os.environ.get('WITHOUT_EXTENSION', '0') == '1': - use_extension = False - -# Remove the command line argument as it isn't understood by -# setuptools/distutils -sys.argv = [arg for arg in sys.argv - if not arg.startswith('--with') - and not arg.endswith('-extension')] - - -def optional_compile_setup(func=setup, use_ext=use_extension, - *args, **kwargs): - """ - Wrap setup to allow optional compilation of extensions. - - Falls back to pure python mode (no extensions) - if compilation of extensions fails. 
- """ - extensions = kwargs.get('ext_modules', None) +# Allow env var WITHOUT_EXTENSION and args --with[out]-extension: - if use_ext and extensions: - try: - func(*args, **kwargs) - return - except SystemExit as e: - assert(e.args) - if e.args[0] is False: - raise - elif isinstance(e.args[0], basestring): - if e.args[0].startswith('usage: '): - raise - else: - # Fallback to pure python mode - print('setup with extension failed: %s' % repr(e)) - pass - except Exception as e: - print('setup with extension failed: %s' % repr(e)) +env_var = environ.get("WITHOUT_EXTENSION") +if "--without-extension" in sys.argv: + use_extension = False +elif "--with-extension" in sys.argv: + fallback = False +elif env_var is not None: + if env_var == "1": + use_extension = False + elif env_var == "0": + fallback = False - if extensions: - if use_ext: - print('Falling back to pure python mode.') - else: - print('Using pure python mode.') +# Remove the command line argument as it isn't understood by setuptools: - del kwargs['ext_modules'] +sys.argv = [arg for arg in sys.argv + if arg != "--without-extension" and arg != "--with-extension"] - func(*args, **kwargs) +def build_ext_patched(self): + try: + build_ext_original(self) + except (DistutilsError, CCompilerError) as exc: + print("error: " + str(exc)) + print("Falling back to pure Python mode.") + del self.extensions[:] +if fallback: + build_ext.run, build_ext_original = build_ext_patched, build_ext.run -optional_compile_setup( +setup( name = "mwparserfromhell", packages = find_packages(exclude=("tests",)), - ext_modules = [tokenizer], + ext_modules = [tokenizer] if use_extension else [], tests_require = ["unittest2"] if py26 else [], test_suite = "tests.discover", version = __version__, From dad042bc2c637c05728b29881eca644745f1fc6b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 10 Jun 2015 22:42:57 -0400 Subject: [PATCH 03/22] Fix C warnings in MSVC. 
--- mwparserfromhell/parser/tokenizer.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index ec0315f..dd11d16 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -40,7 +40,7 @@ static int is_marker(Py_UNICODE this) /* Given a context, return the heading level encoded within it. */ -static int heading_level_from_context(int n) +static int heading_level_from_context(uint64_t n) { int level; @@ -177,7 +177,8 @@ static TagData* TagData_new(void) ALLOC_BUFFER(self->pad_first) ALLOC_BUFFER(self->pad_before_eq) ALLOC_BUFFER(self->pad_after_eq) - self->quoter = self->reset = 0; + self->quoter = 0; + self->reset = 0; return self; } @@ -444,7 +445,7 @@ static int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse) { Textbuffer *original = buffer; - long i; + Py_ssize_t i; if (reverse) { do { @@ -939,7 +940,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; PyObject *scheme; Py_UNICODE chunk; - long i; + Py_ssize_t i; int slashes, j; if (!scheme_buffer) From a8c0ff3f290cf82da8e22cd0007d78adfed3c4b1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 11 Jun 2015 00:03:49 -0400 Subject: [PATCH 04/22] Remove stdint.h include for MSVC 2008. --- mwparserfromhell/parser/tokenizer.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 842e65d..102fecd 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -29,12 +29,15 @@ SOFTWARE. 
#include #include #include -#include #if PY_MAJOR_VERSION >= 3 #define IS_PY3K #endif +#ifndef uint64_t +#define uint64_t unsigned PY_LONG_LONG +#endif + #define malloc PyObject_Malloc #define free PyObject_Free From 3aa6bb891cdd59e7f0e4483bc3316a5612ab4989 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 11 Jun 2015 18:11:54 -0400 Subject: [PATCH 05/22] Point releases are unnecessary in appyveyor.yml. --- appveyor.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 34201f2..9d0d8c8 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -17,27 +17,27 @@ environment: matrix: - PYTHON: "C:\\Python27" - PYTHON_VERSION: "2.7.10" + PYTHON_VERSION: "2.7" PYTHON_ARCH: "32" - PYTHON: "C:\\Python27-x64" - PYTHON_VERSION: "2.7.10" + PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" - PYTHON: "C:\\Python33" - PYTHON_VERSION: "3.3.6" + PYTHON_VERSION: "3.3" PYTHON_ARCH: "32" - PYTHON: "C:\\Python33-x64" - PYTHON_VERSION: "3.3.6" + PYTHON_VERSION: "3.3" PYTHON_ARCH: "64" - PYTHON: "C:\\Python34" - PYTHON_VERSION: "3.4.3" + PYTHON_VERSION: "3.4" PYTHON_ARCH: "32" - PYTHON: "C:\\Python34-x64" - PYTHON_VERSION: "3.4.3" + PYTHON_VERSION: "3.4" PYTHON_ARCH: "64" install: From efc571c5c0e18782f2514b39b9bf351c19fafce4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 11 Jun 2015 21:45:34 -0400 Subject: [PATCH 06/22] Refactor _test_tokenizer; add syntax for running just one test. --- tests/_test_tokenizer.py | 67 +++++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 26 deletions(-) diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index 1cbbc3d..cacf166 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -42,8 +42,8 @@ class TokenizerTestCase(object): directory. """ - @classmethod - def _build_test_method(cls, funcname, data): + @staticmethod + def _build_test_method(funcname, data): """Create and return a method to be treated as a test case method. 
*data* is a dict containing multiple keys: the *input* text to be @@ -58,13 +58,35 @@ class TokenizerTestCase(object): expected = data["output"] actual = self.tokenizer().tokenize(data["input"]) self.assertEqual(expected, actual) + if not py3k: inner.__name__ = funcname.encode("utf8") inner.__doc__ = data["label"] return inner + @staticmethod + def _parse_test(test, data): + """Parse an individual *test*, storing its info in *data*.""" + for line in test.strip().splitlines(): + if line.startswith("name:"): + data["name"] = line[len("name:"):].strip() + elif line.startswith("label:"): + data["label"] = line[len("label:"):].strip() + elif line.startswith("input:"): + raw = line[len("input:"):].strip() + if raw[0] == '"' and raw[-1] == '"': + raw = raw[1:-1] + raw = raw.encode("raw_unicode_escape") + data["input"] = raw.decode("unicode_escape") + elif line.startswith("output:"): + raw = line[len("output:"):].strip() + try: + data["output"] = eval(raw, vars(tokens)) + except Exception as err: + raise _TestParseError(err) + @classmethod - def _load_tests(cls, filename, name, text): + def _load_tests(cls, filename, name, text, restrict=None): """Load all tests in *text* from the file *filename*.""" tests = text.split("\n---\n") counter = 1 @@ -72,23 +94,7 @@ class TokenizerTestCase(object): for test in tests: data = {"name": None, "label": None, "input": None, "output": None} try: - for line in test.strip().splitlines(): - if line.startswith("name:"): - data["name"] = line[len("name:"):].strip() - elif line.startswith("label:"): - data["label"] = line[len("label:"):].strip() - elif line.startswith("input:"): - raw = line[len("input:"):].strip() - if raw[0] == '"' and raw[-1] == '"': - raw = raw[1:-1] - raw = raw.encode("raw_unicode_escape") - data["input"] = raw.decode("unicode_escape") - elif line.startswith("output:"): - raw = line[len("output:"):].strip() - try: - data["output"] = eval(raw, vars(tokens)) - except Exception as err: - raise _TestParseError(err) + 
cls._parse_test(test, data) except _TestParseError as err: if data["name"]: error = "Could not parse test '{0}' in '{1}':\n\t{2}" @@ -97,6 +103,7 @@ class TokenizerTestCase(object): error = "Could not parse a test in '{0}':\n\t{1}" print(error.format(filename, err)) continue + if not data["name"]: error = "A test in '{0}' was ignored because it lacked a name" print(error.format(filename)) @@ -105,27 +112,35 @@ class TokenizerTestCase(object): error = "Test '{0}' in '{1}' was ignored because it lacked an input or an output" print(error.format(data["name"], filename)) continue + number = str(counter).zfill(digits) + counter += 1 + if restrict and data["name"] != restrict: + continue + fname = "test_{0}{1}_{2}".format(name, number, data["name"]) meth = cls._build_test_method(fname, data) setattr(cls, fname, meth) - counter += 1 @classmethod def build(cls): """Load and install all tests from the 'tokenizer' directory.""" - def load_file(filename): + def load_file(filename, restrict=None): with codecs.open(filename, "rU", encoding="utf8") as fp: text = fp.read() - name = path.split(filename)[1][:0-len(extension)] - cls._load_tests(filename, name, text) + name = path.split(filename)[1][:-len(extension)] + cls._load_tests(filename, name, text, restrict) directory = path.join(path.dirname(__file__), "tokenizer") extension = ".mwtest" if len(sys.argv) > 2 and sys.argv[1] == "--use": for name in sys.argv[2:]: - load_file(path.join(directory, name + extension)) - sys.argv = [sys.argv[0]] # So unittest doesn't try to load these + if "." in name: + name, test = name.split(".", 1) + else: + test = None + load_file(path.join(directory, name + extension), test) + sys.argv = [sys.argv[0]] # So unittest doesn't try to parse this cls.skip_others = True else: for filename in listdir(directory): From 0e547aa416f76970fc09092f110e3367bced99fd Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 14 Jun 2015 17:40:10 -0400 Subject: [PATCH 07/22] Begin splitting up C tokenizer. 
--- CHANGELOG | 1 + docs/changelog.rst | 1 + mwparserfromhell/parser/ctokenizer/common.h | 40 +++++++++ mwparserfromhell/parser/ctokenizer/textbuffer.c | 100 +++++++++++++++++++++ mwparserfromhell/parser/ctokenizer/textbuffer.h | 40 +++++++++ .../parser/{ => ctokenizer}/tokenizer.c | 70 --------------- .../parser/{ => ctokenizer}/tokenizer.h | 32 +------ setup.py | 11 ++- 8 files changed, 191 insertions(+), 104 deletions(-) create mode 100644 mwparserfromhell/parser/ctokenizer/common.h create mode 100644 mwparserfromhell/parser/ctokenizer/textbuffer.c create mode 100644 mwparserfromhell/parser/ctokenizer/textbuffer.h rename mwparserfromhell/parser/{ => ctokenizer}/tokenizer.c (98%) rename mwparserfromhell/parser/{ => ctokenizer}/tokenizer.h (95%) diff --git a/CHANGELOG b/CHANGELOG index c49aaf7..7ad2930 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,7 @@ v0.4.1 (unreleased): - Added support for Python 3.5. - '<' and '>' are now disallowed in wikilink titles and template names. This includes when denoting tags, but not comments. +- Heavy refactoring and fixes to the C tokenizer. - Fixed some bugs in the release scripts. v0.4 (released May 23, 2015): diff --git a/docs/changelog.rst b/docs/changelog.rst index 3217a35..2944992 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -13,6 +13,7 @@ Unreleased - Added support for Python 3.5. - ``<`` and ``>`` are now disallowed in wikilink titles and template names. This includes when denoting tags, but not comments. +- Heavy refactoring and fixes to the C tokenizer. - Fixed some bugs in the release scripts. 
v0.4 diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h new file mode 100644 index 0000000..2ed5a02 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -0,0 +1,40 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif + +#include +#include +#include + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +#ifndef uint64_t +#define uint64_t unsigned PY_LONG_LONG +#endif + +#define malloc PyObject_Malloc +#define free PyObject_Free diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.c b/mwparserfromhell/parser/ctokenizer/textbuffer.c new file mode 100644 index 0000000..63d45d6 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c @@ -0,0 +1,100 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "textbuffer.h" + +#define TEXTBUFFER_BLOCKSIZE 1024 + +/* + Create a new textbuffer object. 
+*/ +Textbuffer* Textbuffer_new(void) +{ + Textbuffer* buffer = malloc(sizeof(Textbuffer)); + + if (!buffer) { + PyErr_NoMemory(); + return NULL; + } + buffer->size = 0; + buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE); + if (!buffer->data) { + free(buffer); + PyErr_NoMemory(); + return NULL; + } + buffer->prev = buffer->next = NULL; + return buffer; +} + +/* + Deallocate the given textbuffer. +*/ +void Textbuffer_dealloc(Textbuffer* self) +{ + Textbuffer* next; + + while (self) { + free(self->data); + next = self->next; + free(self); + self = next; + } +} + +/* + Write a Unicode codepoint to the given textbuffer. +*/ +int Textbuffer_write(Textbuffer** this, Py_UNICODE code) +{ + Textbuffer* self = *this; + + if (self->size == TEXTBUFFER_BLOCKSIZE) { + Textbuffer* new = Textbuffer_new(); + if (!new) + return -1; + new->next = self; + self->prev = new; + *this = self = new; + } + self->data[self->size++] = code; + return 0; +} + +/* + Return the contents of the textbuffer as a Python Unicode object. 
+*/ +PyObject* Textbuffer_render(Textbuffer* self) +{ + PyObject *result = PyUnicode_FromUnicode(self->data, self->size); + PyObject *left, *concat; + + while (self->next) { + self = self->next; + left = PyUnicode_FromUnicode(self->data, self->size); + concat = PyUnicode_Concat(left, result); + Py_DECREF(left); + Py_DECREF(result); + result = concat; + } + return result; +} diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h new file mode 100644 index 0000000..36b2207 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h @@ -0,0 +1,40 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "common.h" + +/* Structs */ + +struct Textbuffer { + Py_ssize_t size; + Py_UNICODE* data; + struct Textbuffer* prev; + struct Textbuffer* next; +}; +typedef struct Textbuffer Textbuffer; + +/* Functions */ + +Textbuffer* Textbuffer_new(void); +void Textbuffer_dealloc(Textbuffer*); +int Textbuffer_write(Textbuffer**, Py_UNICODE); +PyObject* Textbuffer_render(Textbuffer*); diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tokenizer.c similarity index 98% rename from mwparserfromhell/parser/tokenizer.c rename to mwparserfromhell/parser/ctokenizer/tokenizer.c index dd11d16..2bce247 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/ctokenizer/tokenizer.c @@ -1,5 +1,4 @@ /* -Tokenizer for MWParserFromHell Copyright (C) 2012-2015 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -89,75 +88,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr) return lowered; } -static Textbuffer* Textbuffer_new(void) -{ - Textbuffer* buffer = malloc(sizeof(Textbuffer)); - - if (!buffer) { - PyErr_NoMemory(); - return NULL; - } - buffer->size = 0; - buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE); - if (!buffer->data) { - free(buffer); - PyErr_NoMemory(); - return NULL; - } - buffer->prev = buffer->next = NULL; - return buffer; -} - -static void Textbuffer_dealloc(Textbuffer* self) -{ - Textbuffer* next; - - while (self) { - free(self->data); - next = self->next; - free(self); - self = next; - } -} - -/* - Write a Unicode codepoint to the given textbuffer. 
-*/ -static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) -{ - Textbuffer* self = *this; - - if (self->size == TEXTBUFFER_BLOCKSIZE) { - Textbuffer* new = Textbuffer_new(); - if (!new) - return -1; - new->next = self; - self->prev = new; - *this = self = new; - } - self->data[self->size++] = code; - return 0; -} - -/* - Return the contents of the textbuffer as a Python Unicode object. -*/ -static PyObject* Textbuffer_render(Textbuffer* self) -{ - PyObject *result = PyUnicode_FromUnicode(self->data, self->size); - PyObject *left, *concat; - - while (self->next) { - self = self->next; - left = PyUnicode_FromUnicode(self->data, self->size); - concat = PyUnicode_Concat(left, result); - Py_DECREF(left); - Py_DECREF(result); - result = concat; - } - return result; -} - static TagData* TagData_new(void) { TagData *self = malloc(sizeof(TagData)); diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/ctokenizer/tokenizer.h similarity index 95% rename from mwparserfromhell/parser/tokenizer.h rename to mwparserfromhell/parser/ctokenizer/tokenizer.h index 102fecd..66f1e90 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/ctokenizer/tokenizer.h @@ -1,5 +1,4 @@ /* -Tokenizer Header File for MWParserFromHell Copyright (C) 2012-2015 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -21,25 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef PY_SSIZE_T_CLEAN -#define PY_SSIZE_T_CLEAN -#endif - -#include #include -#include -#include -#if PY_MAJOR_VERSION >= 3 -#define IS_PY3K -#endif - -#ifndef uint64_t -#define uint64_t unsigned PY_LONG_LONG -#endif - -#define malloc PyObject_Malloc -#define free PyObject_Free +#include "common.h" +#include "textbuffer.h" #define DIGITS "0123456789" #define HEXDIGITS "0123456789abcdefABCDEF" @@ -50,7 +34,6 @@ static const char MARKERS[] = { '-', '!', '\n', '\0'}; #define NUM_MARKERS 19 -#define TEXTBUFFER_BLOCKSIZE 1024 #define MAX_DEPTH 40 #define MAX_CYCLES 100000 #define MAX_BRACES 255 @@ -196,13 +179,6 @@ static PyObject* TagCloseClose; /* Miscellaneous structs: */ -struct Textbuffer { - Py_ssize_t size; - Py_UNICODE* data; - struct Textbuffer* prev; - struct Textbuffer* next; -}; - struct Stack { PyObject* stack; uint64_t context; @@ -224,7 +200,6 @@ typedef struct { Py_ssize_t reset; } TagData; -typedef struct Textbuffer Textbuffer; typedef struct Stack Stack; @@ -268,9 +243,6 @@ typedef struct { /* Function prototypes: */ -static Textbuffer* Textbuffer_new(void); -static void Textbuffer_dealloc(Textbuffer*); - static TagData* TagData_new(void); static void TagData_dealloc(TagData*); diff --git a/setup.py b/setup.py index dcdd563..1bca436 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ from __future__ import print_function from distutils.errors import DistutilsError, CCompilerError +from glob import glob from os import environ import sys @@ -39,10 +40,6 @@ from mwparserfromhell.compat import py26, py3k with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp: long_docs = fp.read() -tokenizer = Extension("mwparserfromhell.parser._tokenizer", - sources=["mwparserfromhell/parser/tokenizer.c"], - depends=["mwparserfromhell/parser/tokenizer.h"]) - use_extension = True fallback = True @@ -75,6 +72,12 @@ def build_ext_patched(self): if fallback: build_ext.run, build_ext_original = build_ext_patched, build_ext.run +# Project-specific 
part begins here: + +tokenizer = Extension("mwparserfromhell.parser._tokenizer", + sources=glob("mwparserfromhell/parser/ctokenizer/*.c"), + depends=glob("mwparserfromhell/parser/ctokenizer/*.h")) + setup( name = "mwparserfromhell", packages = find_packages(exclude=("tests",)), From 2005efd309ecb21e658a7fa7c0739efed54c27ec Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 15 Jun 2015 00:05:28 -0400 Subject: [PATCH 08/22] Split up C tokenizer into tag_data, tok_parse, tok_support, tokens. --- mwparserfromhell/parser/ctokenizer/common.h | 56 +- mwparserfromhell/parser/ctokenizer/contexts.h | 104 + mwparserfromhell/parser/ctokenizer/tag_data.c | 88 + mwparserfromhell/parser/ctokenizer/tag_data.h | 43 + mwparserfromhell/parser/ctokenizer/textbuffer.h | 12 +- mwparserfromhell/parser/ctokenizer/tok_parse.c | 2750 +++++++++++++++++++ mwparserfromhell/parser/ctokenizer/tok_parse.h | 29 + mwparserfromhell/parser/ctokenizer/tok_support.c | 362 +++ mwparserfromhell/parser/ctokenizer/tok_support.h | 66 + mwparserfromhell/parser/ctokenizer/tokenizer.c | 3162 +--------------------- mwparserfromhell/parser/ctokenizer/tokenizer.h | 238 +- mwparserfromhell/parser/ctokenizer/tokens.c | 111 + mwparserfromhell/parser/ctokenizer/tokens.h | 69 + 13 files changed, 3722 insertions(+), 3368 deletions(-) create mode 100644 mwparserfromhell/parser/ctokenizer/contexts.h create mode 100644 mwparserfromhell/parser/ctokenizer/tag_data.c create mode 100644 mwparserfromhell/parser/ctokenizer/tag_data.h create mode 100644 mwparserfromhell/parser/ctokenizer/tok_parse.c create mode 100644 mwparserfromhell/parser/ctokenizer/tok_parse.h create mode 100644 mwparserfromhell/parser/ctokenizer/tok_support.c create mode 100644 mwparserfromhell/parser/ctokenizer/tok_support.h create mode 100644 mwparserfromhell/parser/ctokenizer/tokens.c create mode 100644 mwparserfromhell/parser/ctokenizer/tokens.h diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h 
index 2ed5a02..58c9487 100644 --- a/mwparserfromhell/parser/ctokenizer/common.h +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -20,14 +20,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#pragma once + #ifndef PY_SSIZE_T_CLEAN -#define PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN // See: https://docs.python.org/2/c-api/arg.html #endif #include #include #include +/* Compatibility macros */ + #if PY_MAJOR_VERSION >= 3 #define IS_PY3K #endif @@ -36,5 +40,53 @@ SOFTWARE. #define uint64_t unsigned PY_LONG_LONG #endif -#define malloc PyObject_Malloc +#define malloc PyObject_Malloc // XXX: yuck #define free PyObject_Free + +/* Error handling globals/macros */ + +extern int route_state; // TODO: this is NOT thread-safe! +extern uint64_t route_context; + +#define BAD_ROUTE route_state +#define BAD_ROUTE_CONTEXT route_context +#define FAIL_ROUTE(context) { route_state = 1; route_context = context; } +#define RESET_ROUTE() route_state = 0 + +/* Shared globals */ + +extern char** entitydefs; + +extern PyObject* EMPTY; +extern PyObject* NOARGS; +extern PyObject* definitions; + +/* Structs */ + +struct Textbuffer { + Py_ssize_t size; + Py_UNICODE* data; + struct Textbuffer* prev; + struct Textbuffer* next; +}; +typedef struct Textbuffer Textbuffer; + +struct Stack { + PyObject* stack; + uint64_t context; + struct Textbuffer* textbuffer; + struct Stack* next; +}; +typedef struct Stack Stack; + +typedef struct { + PyObject_HEAD + PyObject* text; /* text to tokenize */ + Stack* topstack; /* topmost stack */ + Py_ssize_t head; /* current position in text */ + Py_ssize_t length; /* length of text */ + int global; /* global context */ + int depth; /* stack recursion depth */ + int cycles; /* total number of stack recursions */ + int skip_style_tags; /* temporary fix for the sometimes broken tag parser */ +} Tokenizer; diff --git a/mwparserfromhell/parser/ctokenizer/contexts.h b/mwparserfromhell/parser/ctokenizer/contexts.h new file 
mode 100644 index 0000000..8e24372 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/contexts.h @@ -0,0 +1,104 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#pragma once + +/* Local contexts */ + +#define LC_TEMPLATE 0x0000000000000007 +#define LC_TEMPLATE_NAME 0x0000000000000001 +#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002 +#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004 + +#define LC_ARGUMENT 0x0000000000000018 +#define LC_ARGUMENT_NAME 0x0000000000000008 +#define LC_ARGUMENT_DEFAULT 0x0000000000000010 + +#define LC_WIKILINK 0x0000000000000060 +#define LC_WIKILINK_TITLE 0x0000000000000020 +#define LC_WIKILINK_TEXT 0x0000000000000040 + +#define LC_EXT_LINK 0x0000000000000180 +#define LC_EXT_LINK_URI 0x0000000000000080 +#define LC_EXT_LINK_TITLE 0x0000000000000100 + +#define LC_HEADING 0x0000000000007E00 +#define LC_HEADING_LEVEL_1 0x0000000000000200 +#define LC_HEADING_LEVEL_2 0x0000000000000400 +#define LC_HEADING_LEVEL_3 0x0000000000000800 +#define LC_HEADING_LEVEL_4 0x0000000000001000 +#define LC_HEADING_LEVEL_5 0x0000000000002000 +#define LC_HEADING_LEVEL_6 0x0000000000004000 + +#define LC_TAG 0x0000000000078000 +#define LC_TAG_OPEN 0x0000000000008000 +#define LC_TAG_ATTR 0x0000000000010000 +#define LC_TAG_BODY 0x0000000000020000 +#define LC_TAG_CLOSE 0x0000000000040000 + +#define LC_STYLE 0x0000000000780000 +#define LC_STYLE_ITALICS 0x0000000000080000 +#define LC_STYLE_BOLD 0x0000000000100000 +#define LC_STYLE_PASS_AGAIN 0x0000000000200000 +#define LC_STYLE_SECOND_PASS 0x0000000000400000 + +#define LC_DLTERM 0x0000000000800000 + +#define LC_SAFETY_CHECK 0x000000003F000000 +#define LC_HAS_TEXT 0x0000000001000000 +#define LC_FAIL_ON_TEXT 0x0000000002000000 +#define LC_FAIL_NEXT 0x0000000004000000 +#define LC_FAIL_ON_LBRACE 0x0000000008000000 +#define LC_FAIL_ON_RBRACE 0x0000000010000000 +#define LC_FAIL_ON_EQUALS 0x0000000020000000 + +#define LC_TABLE 0x0000000FC0000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 +#define LC_TABLE_OPEN 0x0000000040000000 +#define LC_TABLE_CELL_OPEN 0x0000000080000000 +#define LC_TABLE_CELL_STYLE 0x0000000100000000 +#define LC_TABLE_ROW_OPEN 
0x0000000200000000 +#define LC_TABLE_TD_LINE 0x0000000400000000 +#define LC_TABLE_TH_LINE 0x0000000800000000 + +/* Global contexts */ + +#define GL_HEADING 0x1 + +/* Aggregate contexts */ + +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN) +#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) +#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) + +/* Tag contexts */ + +#define TAG_NAME 0x01 +#define TAG_ATTR_READY 0x02 +#define TAG_ATTR_NAME 0x04 +#define TAG_ATTR_VALUE 0x08 +#define TAG_QUOTED 0x10 +#define TAG_NOTE_SPACE 0x20 +#define TAG_NOTE_EQUALS 0x40 +#define TAG_NOTE_QUOTE 0x80 diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.c b/mwparserfromhell/parser/ctokenizer/tag_data.c new file mode 100644 index 0000000..968a760 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tag_data.c @@ -0,0 +1,88 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "tag_data.h" +#include "contexts.h" + +/* + Initialize a new TagData object. +*/ +TagData* TagData_new(void) +{ +#define ALLOC_BUFFER(name) \ + name = Textbuffer_new(); \ + if (!name) { \ + TagData_dealloc(self); \ + return NULL; \ + } + + TagData *self = malloc(sizeof(TagData)); + if (!self) { + PyErr_NoMemory(); + return NULL; + } + self->context = TAG_NAME; + ALLOC_BUFFER(self->pad_first) + ALLOC_BUFFER(self->pad_before_eq) + ALLOC_BUFFER(self->pad_after_eq) + self->quoter = 0; + self->reset = 0; + return self; + +#undef ALLOC_BUFFER +} + +/* + Deallocate the given TagData object. +*/ +void TagData_dealloc(TagData* self) +{ +#define DEALLOC_BUFFER(name) \ + if (name) \ + Textbuffer_dealloc(name); + + DEALLOC_BUFFER(self->pad_first); + DEALLOC_BUFFER(self->pad_before_eq); + DEALLOC_BUFFER(self->pad_after_eq); + free(self); + +#undef DEALLOC_BUFFER +} + +/* + Clear the internal buffers of the given TagData object. 
+*/ +int TagData_reset_buffers(TagData* self) +{ +#define RESET_BUFFER(name) \ + Textbuffer_dealloc(name); \ + name = Textbuffer_new(); \ + if (!name) \ + return -1; + + RESET_BUFFER(self->pad_first) + RESET_BUFFER(self->pad_before_eq) + RESET_BUFFER(self->pad_after_eq) + return 0; + +#undef RESET_BUFFER +} diff --git a/mwparserfromhell/parser/ctokenizer/tag_data.h b/mwparserfromhell/parser/ctokenizer/tag_data.h new file mode 100644 index 0000000..e2ae807 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tag_data.h @@ -0,0 +1,43 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#pragma once + +#include "common.h" +#include "textbuffer.h" + +/* Structs */ + +typedef struct { + uint64_t context; + Textbuffer* pad_first; + Textbuffer* pad_before_eq; + Textbuffer* pad_after_eq; + Py_UNICODE quoter; + Py_ssize_t reset; +} TagData; + +/* Functions */ + +TagData* TagData_new(void); +void TagData_dealloc(TagData*); +int TagData_reset_buffers(TagData*); diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h index 36b2207..389a9fe 100644 --- a/mwparserfromhell/parser/ctokenizer/textbuffer.h +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h @@ -20,17 +20,9 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "common.h" - -/* Structs */ +#pragma once -struct Textbuffer { - Py_ssize_t size; - Py_UNICODE* data; - struct Textbuffer* prev; - struct Textbuffer* next; -}; -typedef struct Textbuffer Textbuffer; +#include "common.h" /* Functions */ diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c new file mode 100644 index 0000000..1e6424d --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -0,0 +1,2750 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "tok_parse.h" +#include "contexts.h" +#include "tag_data.h" +#include "tok_support.h" +#include "tokens.h" + +#define DIGITS "0123456789" +#define HEXDIGITS "0123456789abcdefABCDEF" +#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + +static const char MARKERS[] = { + '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', + '-', '!', '\n', '\0'}; + +#define NUM_MARKERS 19 +#define MAX_BRACES 255 +#define MAX_ENTITY_SIZE 8 + +#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li") +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL)) +#define IS_SCHEME(scheme, slashes, reverse) \ + (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False)) + +#ifdef IS_PY3K + #define NEW_INT_FUNC PyLong_FromSsize_t +#else + #define NEW_INT_FUNC PyInt_FromSsize_t +#endif + +typedef struct { + PyObject* title; + int level; +} HeadingData; + +/* Forward declarations */ + +static int Tokenizer_parse_entity(Tokenizer*); +static int Tokenizer_parse_comment(Tokenizer*); +static int Tokenizer_handle_dl_term(Tokenizer*); +static int Tokenizer_parse_tag(Tokenizer*); + +/* + Determine whether the given Py_UNICODE is a marker. 
+*/ +static int is_marker(Py_UNICODE this) +{ + int i; + + for (i = 0; i < NUM_MARKERS; i++) { + if (MARKERS[i] == this) + return 1; + } + return 0; +} + +/* + Given a context, return the heading level encoded within it. +*/ +static int heading_level_from_context(uint64_t n) +{ + int level; + + n /= LC_HEADING_LEVEL_1; + for (level = 1; n > 1; n >>= 1) + level++; + return level; +} + +/* + Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as + parameters, and return its output as a bool. +*/ +static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, + PyObject* in3) +{ + PyObject* func = PyObject_GetAttrString(definitions, funcname); + PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL); + int ans = (result == Py_True) ? 1 : 0; + + Py_DECREF(func); + Py_DECREF(result); + return ans; +} + +/* + Sanitize the name of a tag so it can be compared with others for equality. +*/ +static PyObject* strip_tag_name(PyObject* token, int take_attr) +{ + PyObject *text, *rstripped, *lowered; + + if (take_attr) { + text = PyObject_GetAttrString(token, "text"); + if (!text) + return NULL; + rstripped = PyObject_CallMethod(text, "rstrip", NULL); + Py_DECREF(text); + } + else + rstripped = PyObject_CallMethod(token, "rstrip", NULL); + if (!rstripped) + return NULL; + lowered = PyObject_CallMethod(rstripped, "lower", NULL); + Py_DECREF(rstripped); + return lowered; +} + +/* + Parse a template at the head of the wikicode string. 
+*/ +static int Tokenizer_parse_template(Tokenizer* self) +{ + PyObject *template; + Py_ssize_t reset = self->head; + + template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!template) + return -1; + if (Tokenizer_emit_first(self, TemplateOpen)) { + Py_DECREF(template); + return -1; + } + if (Tokenizer_emit_all(self, template)) { + Py_DECREF(template); + return -1; + } + Py_DECREF(template); + if (Tokenizer_emit(self, TemplateClose)) + return -1; + return 0; +} + +/* + Parse an argument at the head of the wikicode string. +*/ +static int Tokenizer_parse_argument(Tokenizer* self) +{ + PyObject *argument; + Py_ssize_t reset = self->head; + + argument = Tokenizer_parse(self, LC_ARGUMENT_NAME, 1); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!argument) + return -1; + if (Tokenizer_emit_first(self, ArgumentOpen)) { + Py_DECREF(argument); + return -1; + } + if (Tokenizer_emit_all(self, argument)) { + Py_DECREF(argument); + return -1; + } + Py_DECREF(argument); + if (Tokenizer_emit(self, ArgumentClose)) + return -1; + return 0; +} + +/* + Parse a template or argument at the head of the wikicode string. 
+*/ +static int Tokenizer_parse_template_or_argument(Tokenizer* self) +{ + unsigned int braces = 2, i; + PyObject *tokenlist; + + self->head += 2; + while (Tokenizer_READ(self, 0) == '{' && braces < MAX_BRACES) { + self->head++; + braces++; + } + if (Tokenizer_push(self, 0)) + return -1; + while (braces) { + if (braces == 1) { + if (Tokenizer_emit_text_then_stack(self, "{")) + return -1; + return 0; + } + if (braces == 2) { + if (Tokenizer_parse_template(self)) + return -1; + if (BAD_ROUTE) { + RESET_ROUTE(); + if (Tokenizer_emit_text_then_stack(self, "{{")) + return -1; + return 0; + } + break; + } + if (Tokenizer_parse_argument(self)) + return -1; + if (BAD_ROUTE) { + RESET_ROUTE(); + if (Tokenizer_parse_template(self)) + return -1; + if (BAD_ROUTE) { + char text[MAX_BRACES + 1]; + RESET_ROUTE(); + for (i = 0; i < braces; i++) text[i] = '{'; + text[braces] = '\0'; + if (Tokenizer_emit_text_then_stack(self, text)) + return -1; + return 0; + } + else + braces -= 2; + } + else + braces -= 3; + if (braces) + self->head++; + } + tokenlist = Tokenizer_pop(self); + if (!tokenlist) + return -1; + if (Tokenizer_emit_all(self, tokenlist)) { + Py_DECREF(tokenlist); + return -1; + } + Py_DECREF(tokenlist); + if (self->topstack->context & LC_FAIL_NEXT) + self->topstack->context ^= LC_FAIL_NEXT; + return 0; +} + +/* + Handle a template parameter at the head of the string. 
+*/ +static int Tokenizer_handle_template_param(Tokenizer* self) +{ + PyObject *stack; + + if (self->topstack->context & LC_TEMPLATE_NAME) + self->topstack->context ^= LC_TEMPLATE_NAME; + else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) + self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE; + if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { + stack = Tokenizer_pop_keeping_context(self); + if (!stack) + return -1; + if (Tokenizer_emit_all(self, stack)) { + Py_DECREF(stack); + return -1; + } + Py_DECREF(stack); + } + else + self->topstack->context |= LC_TEMPLATE_PARAM_KEY; + if (Tokenizer_emit(self, TemplateParamSeparator)) + return -1; + if (Tokenizer_push(self, self->topstack->context)) + return -1; + return 0; +} + +/* + Handle a template parameter's value at the head of the string. +*/ +static int Tokenizer_handle_template_param_value(Tokenizer* self) +{ + PyObject *stack; + + stack = Tokenizer_pop_keeping_context(self); + if (!stack) + return -1; + if (Tokenizer_emit_all(self, stack)) { + Py_DECREF(stack); + return -1; + } + Py_DECREF(stack); + self->topstack->context ^= LC_TEMPLATE_PARAM_KEY; + self->topstack->context |= LC_TEMPLATE_PARAM_VALUE; + if (Tokenizer_emit(self, TemplateParamEquals)) + return -1; + return 0; +} + +/* + Handle the end of a template at the head of the string. +*/ +static PyObject* Tokenizer_handle_template_end(Tokenizer* self) +{ + PyObject* stack; + + if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { + stack = Tokenizer_pop_keeping_context(self); + if (!stack) + return NULL; + if (Tokenizer_emit_all(self, stack)) { + Py_DECREF(stack); + return NULL; + } + Py_DECREF(stack); + } + self->head++; + stack = Tokenizer_pop(self); + return stack; +} + +/* + Handle the separator between an argument's name and default. 
+*/ +static int Tokenizer_handle_argument_separator(Tokenizer* self) +{ + self->topstack->context ^= LC_ARGUMENT_NAME; + self->topstack->context |= LC_ARGUMENT_DEFAULT; + if (Tokenizer_emit(self, ArgumentSeparator)) + return -1; + return 0; +} + +/* + Handle the end of an argument at the head of the string. +*/ +static PyObject* Tokenizer_handle_argument_end(Tokenizer* self) +{ + PyObject* stack = Tokenizer_pop(self); + + self->head += 2; + return stack; +} + +/* + Parse an internal wikilink at the head of the wikicode string. +*/ +static int Tokenizer_parse_wikilink(Tokenizer* self) +{ + Py_ssize_t reset; + PyObject *wikilink; + + self->head += 2; + reset = self->head - 1; + wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + if (Tokenizer_emit_text(self, "[[")) + return -1; + return 0; + } + if (!wikilink) + return -1; + if (Tokenizer_emit(self, WikilinkOpen)) { + Py_DECREF(wikilink); + return -1; + } + if (Tokenizer_emit_all(self, wikilink)) { + Py_DECREF(wikilink); + return -1; + } + Py_DECREF(wikilink); + if (Tokenizer_emit(self, WikilinkClose)) + return -1; + return 0; +} + +/* + Handle the separator between a wikilink's title and its text. +*/ +static int Tokenizer_handle_wikilink_separator(Tokenizer* self) +{ + self->topstack->context ^= LC_WIKILINK_TITLE; + self->topstack->context |= LC_WIKILINK_TEXT; + if (Tokenizer_emit(self, WikilinkSeparator)) + return -1; + return 0; +} + +/* + Handle the end of a wikilink at the head of the string. +*/ +static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) +{ + PyObject* stack = Tokenizer_pop(self); + self->head += 1; + return stack; +} + +/* + Parse the URI scheme of a bracket-enclosed external link. 
+*/ +static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer* buffer; + PyObject* scheme; + Py_UNICODE this; + int slashes, i; + + if (Tokenizer_push(self, LC_EXT_LINK_URI)) + return -1; + if (Tokenizer_READ(self, 0) == '/' && Tokenizer_READ(self, 1) == '/') { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } + else { + buffer = Textbuffer_new(); + if (!buffer) + return -1; + while ((this = Tokenizer_READ(self, 0))) { + i = 0; + while (1) { + if (!valid[i]) + goto end_of_loop; + if (this == valid[i]) + break; + i++; + } + Textbuffer_write(&buffer, this); + if (Tokenizer_emit_char(self, this)) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + } + end_of_loop: + if (this != ':') { + Textbuffer_dealloc(buffer); + Tokenizer_fail_route(self); + return 0; + } + if (Tokenizer_emit_char(self, ':')) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + slashes = (Tokenizer_READ(self, 0) == '/' && + Tokenizer_READ(self, 1) == '/'); + if (slashes) { + if (Tokenizer_emit_text(self, "//")) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head += 2; + } + scheme = Textbuffer_render(buffer); + Textbuffer_dealloc(buffer); + if (!scheme) + return -1; + if (!IS_SCHEME(scheme, slashes, 0)) { + Py_DECREF(scheme); + Tokenizer_fail_route(self); + return 0; + } + Py_DECREF(scheme); + } + return 0; +} + +/* + Parse the URI scheme of a free (no brackets) external link. 
+*/ +static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; + PyObject *scheme; + Py_UNICODE chunk; + Py_ssize_t i; + int slashes, j; + + if (!scheme_buffer) + return -1; + // We have to backtrack through the textbuffer looking for our scheme since + // it was just parsed as text: + temp_buffer = self->topstack->textbuffer; + while (temp_buffer) { + for (i = temp_buffer->size - 1; i >= 0; i--) { + chunk = temp_buffer->data[i]; + if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + goto end_of_loop; + j = 0; + while (1) { + if (!valid[j]) { + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + if (chunk == valid[j]) + break; + j++; + } + Textbuffer_write(&scheme_buffer, chunk); + } + temp_buffer = temp_buffer->next; + } + end_of_loop: + scheme = Textbuffer_render(scheme_buffer); + if (!scheme) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + slashes = (Tokenizer_READ(self, 0) == '/' && + Tokenizer_READ(self, 1) == '/'); + if (!IS_SCHEME(scheme, slashes, 1)) { + Py_DECREF(scheme); + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + Py_DECREF(scheme); + if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1)) + return -1; + if (Tokenizer_emit_char(self, ':')) + return -1; + if (slashes) { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } + return 0; +} + +/* + Handle text in a free external link, including trailing punctuation. 
*/
static int
Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
                                Textbuffer** tail, Py_UNICODE this)
{
    // Flush any punctuation held in *tail back into the token stream, then
    // start a fresh tail buffer. NOTE: this macro contains early returns and
    // is also used by Tokenizer_really_parse_external_link below.
    #define PUSH_TAIL_BUFFER(tail, error) \
        if ((tail)->size || (tail)->next) { \
            if (Tokenizer_emit_textbuffer(self, tail, 0)) \
                return error; \
            tail = Textbuffer_new(); \
            if (!(tail)) \
                return error; \
        }

    if (this == '(' && !(*parens)) {
        // First opening paren: from here on, ')' is part of the link.
        *parens = 1;
        PUSH_TAIL_BUFFER(*tail, -1)
    }
    else if (this == ',' || this == ';' || this == '\\' || this == '.' ||
             this == ':' || this == '!' || this == '?' ||
             (!(*parens) && this == ')'))
        // Trailing punctuation is buffered in *tail — it only becomes part of
        // the link if more link text follows it.
        return Textbuffer_write(tail, this);
    else
        PUSH_TAIL_BUFFER(*tail, -1)
    return Tokenizer_emit_char(self, this);
}

/*
    Return whether the current head is the end of a free link.
*/
static int
Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
{
    // Built from Tokenizer_parse()'s end sentinels:
    Py_UNICODE after = Tokenizer_READ(self, 2);
    uint64_t ctx = self->topstack->context;

    // A free link ends at EOF, a newline, brackets/angle brackets, a ''
    // (wiki italic/bold) pair, or — depending on context — a template
    // pipe/equals or closing template/argument braces.
    return (!this || this == '\n' || this == '[' || this == ']' ||
            this == '<' || this == '>' || (this == '\'' && next == '\'') ||
            (this == '|' && ctx & LC_TEMPLATE) ||
            (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) ||
            (this == '}' && next == '}' &&
             (ctx & LC_TEMPLATE || (after == '}' && ctx & LC_ARGUMENT))));
}

/*
    Really parse an external link.

    Returns the popped token stack on success, NULL on error or failed route.
    *extra receives trailing text (e.g. punctuation) that belongs outside the
    link; the caller owns and must deallocate it.
*/
static PyObject*
Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
                                     Textbuffer** extra)
{
    Py_UNICODE this, next;
    int parens = 0;

    if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) :
                   Tokenizer_parse_free_uri_scheme(self))
        return NULL;
    if (BAD_ROUTE)
        return NULL;
    this = Tokenizer_READ(self, 0);
    // A link cannot start with EOF, newline, space, or a closing bracket.
    if (!this || this == '\n' || this == ' ' || this == ']')
        return Tokenizer_fail_route(self);
    if (!brackets && this == '[')
        return Tokenizer_fail_route(self);
    while (1) {
        this = Tokenizer_READ(self, 0);
        next = Tokenizer_READ(self, 1);
        if (this == '&') {
            PUSH_TAIL_BUFFER(*extra, NULL)
            if (Tokenizer_parse_entity(self))
                return NULL;
        }
        else if (this == '<' && next == '!'
                 && Tokenizer_READ(self, 2) == '-'
                 && Tokenizer_READ(self, 3) == '-') {
            // "<!--": an HTML comment embedded in the link.
            PUSH_TAIL_BUFFER(*extra, NULL)
            if (Tokenizer_parse_comment(self))
                return NULL;
        }
        else if (!brackets && Tokenizer_is_free_link(self, this, next)) {
            // End of a free link: back up so the sentinel is re-read by the
            // caller, and return what we've collected.
            self->head--;
            return Tokenizer_pop(self);
        }
        else if (!this || this == '\n')
            return Tokenizer_fail_route(self);
        else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
            PUSH_TAIL_BUFFER(*extra, NULL)
            if (Tokenizer_parse_template_or_argument(self))
                return NULL;
        }
        else if (this == ']')
            return Tokenizer_pop(self);
        else if (this == ' ') {
            if (brackets) {
                // Space inside a bracketed link separates URI from title.
                if (Tokenizer_emit(self, ExternalLinkSeparator))
                    return NULL;
                self->topstack->context ^= LC_EXT_LINK_URI;
                self->topstack->context |= LC_EXT_LINK_TITLE;
                self->head++;
                return Tokenizer_parse(self, 0, 0);
            }
            // Space ends a free link; the space itself goes to *extra.
            if (Textbuffer_write(extra, ' '))
                return NULL;
            return Tokenizer_pop(self);
        }
        else if (!brackets) {
            if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
                return NULL;
        }
        else {
            if (Tokenizer_emit_char(self, this))
                return NULL;
        }
        self->head++;
    }
}

/*
    Remove the URI scheme of a new external link from the textbuffer.
*/
static int
Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
{
    // link[0] is the first emitted token; its "text" attribute holds the URI.
    PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"),
             *split, *scheme;
    Py_ssize_t length;
    Textbuffer* temp;

    if (!text)
        return -1;
    // Equivalent of text.split(":", 1); the scheme is everything before the
    // first colon.
    split = PyObject_CallMethod(text, "split", "si", ":", 1);
    Py_DECREF(text);
    if (!split)
        return -1;
    scheme = PyList_GET_ITEM(split, 0);  // borrowed reference
    length = PyUnicode_GET_SIZE(scheme);
    // Strip `length` characters off the end of the textbuffer chain, freeing
    // nodes that are completely consumed.
    while (length) {
        temp = self->topstack->textbuffer;
        if (length <= temp->size) {
            temp->size -= length;
            break;
        }
        length -= temp->size;
        self->topstack->textbuffer = temp->next;
        free(temp->data);
        free(temp);
    }
    Py_DECREF(split);
    return 0;
}

/*
    Parse an external link at the head of the wikicode string.
*/
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
{
    #define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS
    // Fall back to emitting plain text (or a definition-list term) when the
    // head turns out not to start a link. Contains a return.
    #define NOT_A_LINK \
        if (!brackets && self->topstack->context & LC_DLTERM) \
            return Tokenizer_handle_dl_term(self); \
        return Tokenizer_emit_char(self, Tokenizer_READ(self, 0))

    Py_ssize_t reset = self->head;
    PyObject *link, *kwargs;
    Textbuffer *extra = 0;

    if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
        NOT_A_LINK;
    }
    extra = Textbuffer_new();
    if (!extra)
        return -1;
    self->head++;
    link = Tokenizer_really_parse_external_link(self, brackets, &extra);
    if (BAD_ROUTE) {
        // The parse failed non-fatally: rewind and treat as ordinary text.
        RESET_ROUTE();
        self->head = reset;
        Textbuffer_dealloc(extra);
        NOT_A_LINK;
    }
    if (!link) {
        Textbuffer_dealloc(extra);
        return -1;
    }
    if (!brackets) {
        // Free links: the scheme was already emitted as plain text before we
        // knew this was a link, so remove it from the textbuffer.
        if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) {
            Textbuffer_dealloc(extra);
            Py_DECREF(link);
            return -1;
        }
    }
    kwargs = PyDict_New();
    if (!kwargs) {
        Textbuffer_dealloc(extra);
        Py_DECREF(link);
        return -1;
    }
    PyDict_SetItemString(kwargs, "brackets", brackets ? Py_True : Py_False);
    if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) {
        Textbuffer_dealloc(extra);
        Py_DECREF(link);
        return -1;
    }
    if (Tokenizer_emit_all(self, link)) {
        Textbuffer_dealloc(extra);
        Py_DECREF(link);
        return -1;
    }
    Py_DECREF(link);
    if (Tokenizer_emit(self, ExternalLinkClose)) {
        Textbuffer_dealloc(extra);
        return -1;
    }
    // Emit any trailing punctuation collected after the link; this hands
    // ownership of `extra` to Tokenizer_emit_textbuffer.
    if (extra->size || extra->next)
        return Tokenizer_emit_textbuffer(self, extra, 0);
    Textbuffer_dealloc(extra);
    return 0;
}

/*
    Parse a section heading at the head of the wikicode string.
*/
static int Tokenizer_parse_heading(Tokenizer* self)
{
    Py_ssize_t reset = self->head;
    int best = 1, i, context, diff;
    HeadingData *heading;
    PyObject *level, *kwargs;

    self->global |= GL_HEADING;
    self->head += 1;
    // Count leading '='s; `best` is the number of equals signs seen.
    while (Tokenizer_READ(self, 0) == '=') {
        best++;
        self->head++;
    }
    // Clamp to level 6 (the shift caps at 5, i.e. LC_HEADING_LEVEL_6).
    context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1);
    heading = (HeadingData*) Tokenizer_parse(self, context, 1);
    if (BAD_ROUTE) {
        // Not a heading after all: rewind and emit the '='s as literal text.
        RESET_ROUTE();
        self->head = reset + best - 1;
        for (i = 0; i < best; i++) {
            if (Tokenizer_emit_char(self, '='))
                return -1;
        }
        self->global ^= GL_HEADING;
        return 0;
    }
    level = NEW_INT_FUNC(heading->level);
    if (!level) {
        Py_DECREF(heading->title);
        free(heading);
        return -1;
    }
    kwargs = PyDict_New();
    if (!kwargs) {
        Py_DECREF(level);
        Py_DECREF(heading->title);
        free(heading);
        return -1;
    }
    PyDict_SetItemString(kwargs, "level", level);
    Py_DECREF(level);
    if (Tokenizer_emit_kwargs(self, HeadingStart, kwargs)) {
        Py_DECREF(heading->title);
        free(heading);
        return -1;
    }
    // If the closing run of '='s was shorter than the opening one, the extra
    // opening '='s belong to the title text.
    if (heading->level < best) {
        diff = best - heading->level;
        for (i = 0; i < diff; i++) {
            if (Tokenizer_emit_char(self, '=')) {
                Py_DECREF(heading->title);
                free(heading);
                return -1;
            }
        }
    }
    if (Tokenizer_emit_all(self, heading->title)) {
        Py_DECREF(heading->title);
        free(heading);
        return -1;
    }
    Py_DECREF(heading->title);
    free(heading);
    if (Tokenizer_emit(self, HeadingEnd))
        return -1;
    self->global ^= GL_HEADING;
    return 0;
}

/*
    Handle the end of a section heading at the head of the string.

    Returns a malloc'd HeadingData (caller frees) or NULL on error.
*/
static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
{
    Py_ssize_t reset = self->head;
    int best, i, current, level, diff;
    HeadingData *after, *heading;
    PyObject *stack;

    self->head += 1;
    best = 1;
    while (Tokenizer_READ(self, 0) == '=') {
        best++;
        self->head++;
    }
    current = heading_level_from_context(self->topstack->context);
    // Effective level is min(current, best), each capped at 6.
    level = current > best ? (best > 6 ? 6 : best) :
                             (current > 6 ? 6 : current);
    // Try to parse past this run of '='s in case the heading continues.
    after = (HeadingData*) Tokenizer_parse(self, self->topstack->context, 1);
    if (BAD_ROUTE) {
        RESET_ROUTE();
        // Surplus '='s (beyond the effective level) become title text.
        if (level < best) {
            diff = best - level;
            for (i = 0; i < diff; i++) {
                if (Tokenizer_emit_char(self, '='))
                    return NULL;
            }
        }
        self->head = reset + best - 1;
    }
    else {
        // The heading continued: this '=' run was interior, so emit it
        // verbatim followed by the rest of the title.
        for (i = 0; i < best; i++) {
            if (Tokenizer_emit_char(self, '=')) {
                Py_DECREF(after->title);
                free(after);
                return NULL;
            }
        }
        if (Tokenizer_emit_all(self, after->title)) {
            Py_DECREF(after->title);
            free(after);
            return NULL;
        }
        Py_DECREF(after->title);
        level = after->level;
        free(after);
    }
    stack = Tokenizer_pop(self);
    if (!stack)
        return NULL;
    heading = malloc(sizeof(HeadingData));
    if (!heading) {
        PyErr_NoMemory();
        return NULL;
    }
    heading->title = stack;  // heading now owns the stack reference
    heading->level = level;
    return heading;
}

/*
    Actually parse an HTML entity and ensure that it is valid.
*/
static int Tokenizer_really_parse_entity(Tokenizer* self)
{
    PyObject *kwargs, *textobj;
    Py_UNICODE this;
    int numeric, hexadecimal, i, j, zeroes, test;
    char *valid, *text, *buffer, *def;

    // Fail the current route and release `text`. Contains a return; only
    // valid after `text` has been allocated.
    #define FAIL_ROUTE_AND_EXIT() { \
        Tokenizer_fail_route(self); \
        free(text); \
        return 0; \
    }

    if (Tokenizer_emit(self, HTMLEntityStart))
        return -1;
    self->head++;
    this = Tokenizer_READ(self, 0);
    if (!this) {
        Tokenizer_fail_route(self);
        return 0;
    }
    if (this == '#') {
        // Numeric entity: "&#...;" — possibly hexadecimal ("&#x...;").
        numeric = 1;
        if (Tokenizer_emit(self, HTMLEntityNumeric))
            return -1;
        self->head++;
        this = Tokenizer_READ(self, 0);
        if (!this) {
            Tokenizer_fail_route(self);
            return 0;
        }
        if (this == 'x' || this == 'X') {
            hexadecimal = 1;
            kwargs = PyDict_New();
            if (!kwargs)
                return -1;
            // Preserve the original 'x'/'X' character in the token.
            PyDict_SetItemString(kwargs, "char", Tokenizer_read(self, 0));
            if (Tokenizer_emit_kwargs(self, HTMLEntityHex, kwargs))
                return -1;
            self->head++;
        }
        else
            hexadecimal = 0;
    }
    else
        numeric = hexadecimal = 0;
    // Select the allowed character set for the entity body.
    if (hexadecimal)
        valid = HEXDIGITS;
    else if (numeric)
        valid = DIGITS;
    else
        valid = ALPHANUM;
    text = calloc(MAX_ENTITY_SIZE, sizeof(char));
    if (!text) {
        PyErr_NoMemory();
        return -1;
    }
    i = 0;
    zeroes = 0;
    while (1) {
        this = Tokenizer_READ(self, 0);
        if (this == ';') {
            if (i == 0)
                FAIL_ROUTE_AND_EXIT()
            break;
        }
        // Leading zeroes are counted separately and re-prepended later so
        // they don't consume entity-size budget or break the defs lookup.
        if (i == 0 && this == '0') {
            zeroes++;
            self->head++;
            continue;
        }
        if (i >= MAX_ENTITY_SIZE)
            FAIL_ROUTE_AND_EXIT()
        if (is_marker(this))
            FAIL_ROUTE_AND_EXIT()
        // Reject any character outside the selected valid set.
        j = 0;
        while (1) {
            if (!valid[j])
                FAIL_ROUTE_AND_EXIT()
            if (this == valid[j])
                break;
            j++;
        }
        text[i] = (char) this;
        self->head++;
        i++;
    }
    if (numeric) {
        // NOTE(review): if sscanf matches nothing, `test` is read
        // uninitialized below — the preceding valid-set check should make
        // that impossible, but confirm.
        sscanf(text, (hexadecimal ? "%x" : "%d"), &test);
        // Code point must be within the Unicode range.
        if (test < 1 || test > 0x10FFFF)
            FAIL_ROUTE_AND_EXIT()
    }
    else {
        // Named entity: must appear in the NULL-terminated entitydefs table.
        i = 0;
        while (1) {
            def = entitydefs[i];
            if (!def)  // We've reached the end of the defs without finding it
                FAIL_ROUTE_AND_EXIT()
            if (strcmp(text, def) == 0)
                break;
            i++;
        }
    }
    if (zeroes) {
        // Re-prepend the leading zeroes that were skipped above.
        buffer = calloc(strlen(text) + zeroes + 1, sizeof(char));
        if (!buffer) {
            free(text);
            PyErr_NoMemory();
            return -1;
        }
        for (i = 0; i < zeroes; i++)
            strcat(buffer, "0");
        strcat(buffer, text);
        free(text);
        text = buffer;
    }
    textobj = PyUnicode_FromString(text);
    if (!textobj) {
        free(text);
        return -1;
    }
    free(text);
    kwargs = PyDict_New();
    if (!kwargs) {
        Py_DECREF(textobj);
        return -1;
    }
    PyDict_SetItemString(kwargs, "text", textobj);
    Py_DECREF(textobj);
    if (Tokenizer_emit_kwargs(self, Text, kwargs))
        return -1;
    if (Tokenizer_emit(self, HTMLEntityEnd))
        return -1;
    return 0;
}

/*
    Parse an HTML entity at the head of the wikicode string.
*/
static int Tokenizer_parse_entity(Tokenizer* self)
{
    Py_ssize_t reset = self->head;
    PyObject *tokenlist;

    // Parse the entity on its own stack so a failed route can be discarded.
    if (Tokenizer_push(self, 0))
        return -1;
    if (Tokenizer_really_parse_entity(self))
        return -1;
    if (BAD_ROUTE) {
        // Invalid entity: rewind and emit the '&' as literal text.
        RESET_ROUTE();
        self->head = reset;
        if (Tokenizer_emit_char(self, '&'))
            return -1;
        return 0;
    }
    tokenlist = Tokenizer_pop(self);
    if (!tokenlist)
        return -1;
    if (Tokenizer_emit_all(self, tokenlist)) {
        Py_DECREF(tokenlist);
        return -1;
    }
    Py_DECREF(tokenlist);
    return 0;
}

/*
    Parse an HTML comment at the head of the wikicode string.
*/
static int Tokenizer_parse_comment(Tokenizer* self)
{
    // reset points just past "<!-" so a rewind re-reads the comment as text.
    Py_ssize_t reset = self->head + 3;
    PyObject *comment;
    Py_UNICODE this;

    self->head += 4;  // skip over "<!--"
    if (Tokenizer_push(self, 0))
        return -1;
    while (1) {
        this = Tokenizer_READ(self, 0);
        if (!this) {
            // Unterminated comment: discard the pushed stack and rewind.
            comment = Tokenizer_pop(self);
            Py_XDECREF(comment);
            self->head = reset;
            return Tokenizer_emit_text(self, "