From 0e547aa416f76970fc09092f110e3367bced99fd Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@gmail.com>
Date: Sun, 14 Jun 2015 17:40:10 -0400
Subject: [PATCH] Begin splitting up C tokenizer.

---
 CHANGELOG                                          |   1 +
 docs/changelog.rst                                 |   1 +
 mwparserfromhell/parser/ctokenizer/common.h        |  40 +++++++++
 mwparserfromhell/parser/ctokenizer/textbuffer.c    | 100 +++++++++++++++++++++
 mwparserfromhell/parser/ctokenizer/textbuffer.h    |  40 +++++++++
 .../parser/{ => ctokenizer}/tokenizer.c            |  70 ---------------
 .../parser/{ => ctokenizer}/tokenizer.h            |  32 +------
 setup.py                                           |  11 ++-
 8 files changed, 191 insertions(+), 104 deletions(-)
 create mode 100644 mwparserfromhell/parser/ctokenizer/common.h
 create mode 100644 mwparserfromhell/parser/ctokenizer/textbuffer.c
 create mode 100644 mwparserfromhell/parser/ctokenizer/textbuffer.h
 rename mwparserfromhell/parser/{ => ctokenizer}/tokenizer.c (98%)
 rename mwparserfromhell/parser/{ => ctokenizer}/tokenizer.h (95%)

diff --git a/CHANGELOG b/CHANGELOG
index c49aaf7..7ad2930 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,7 @@ v0.4.1 (unreleased):
 - Added support for Python 3.5.
 - '<' and '>' are now disallowed in wikilink titles and template names. This
   includes when denoting tags, but not comments.
+- Heavy refactoring and fixes to the C tokenizer.
 - Fixed some bugs in the release scripts.
 
 v0.4 (released May 23, 2015):
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 3217a35..2944992 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -13,6 +13,7 @@ Unreleased
 - Added support for Python 3.5.
 - ``<`` and ``>`` are now disallowed in wikilink titles and template names.
   This includes when denoting tags, but not comments.
+- Heavy refactoring and fixes to the C tokenizer.
 - Fixed some bugs in the release scripts.
 
 v0.4
diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h
new file mode 100644
index 0000000..2ed5a02
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/common.h
@@ -0,0 +1,40 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#ifndef PY_SSIZE_T_CLEAN
+#define PY_SSIZE_T_CLEAN
+#endif
+
+#include <Python.h>
+#include <structmember.h>
+#include <bytesobject.h>
+
+#if PY_MAJOR_VERSION >= 3
+#define IS_PY3K  /* defined only when building against Python 3 or later */
+#endif
+
+#ifndef uint64_t  /* NOTE(review): only detects a macro, not a typedef */
+#define uint64_t unsigned PY_LONG_LONG
+#endif
+
+#define malloc PyObject_Malloc  /* malloc() calls below go to PyObject_Malloc */
+#define free   PyObject_Free    /* free() calls below go to PyObject_Free */
diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.c b/mwparserfromhell/parser/ctokenizer/textbuffer.c
new file mode 100644
index 0000000..63d45d6
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c
@@ -0,0 +1,100 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "textbuffer.h"
+
+#define TEXTBUFFER_BLOCKSIZE 1024
+
+/*
+    Allocate an empty textbuffer block (NULL plus MemoryError on failure).
+*/
+Textbuffer* Textbuffer_new(void)
+{
+    Textbuffer* block = malloc(sizeof(Textbuffer));
+
+    if (block == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    block->data = malloc(TEXTBUFFER_BLOCKSIZE * sizeof(Py_UNICODE));
+    if (block->data == NULL) {
+        free(block);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    block->size = 0;
+    block->prev = block->next = NULL;
+    return block;
+}
+
+/*
+    Free the given textbuffer block and every block reachable via ->next.
+*/
+void Textbuffer_dealloc(Textbuffer* self)
+{
+    Textbuffer* following;
+
+    for (; self != NULL; self = following) {
+        /* Read the link first; the block is gone after the free calls. */
+        following = self->next;
+        free(self->data);
+        free(self);
+    }
+}
+
+/*
+    Append code to *this, prepending a new block if full (0 ok, -1 error).
+*/
+int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
+{
+    Textbuffer* head = *this;
+
+    if (head->size == TEXTBUFFER_BLOCKSIZE) {
+        Textbuffer* fresh = Textbuffer_new();
+        if (fresh == NULL)
+            return -1;
+        fresh->next = head;
+        head->prev = fresh;
+        *this = head = fresh;
+    }
+    head->data[head->size++] = code;
+    return 0;
+}
+
+/*
+    Return the textbuffer as a new Python Unicode object (NULL on failure).
+*/
+PyObject* Textbuffer_render(Textbuffer* self)
+{
+    PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
+    PyObject *left, *concat;
+
+    while (result && self->next) {  /* bail out once result is NULL */
+        self = self->next;
+        left = PyUnicode_FromUnicode(self->data, self->size);
+        concat = left ? PyUnicode_Concat(left, result) : NULL;
+        Py_XDECREF(left);
+        Py_DECREF(result);
+        result = concat;
+    }
+    return result;
+}
diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h
new file mode 100644
index 0000000..36b2207
--- /dev/null
+++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h
@@ -0,0 +1,40 @@
+/*
+Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+*/
+
+#include "common.h"
+
+/* Structs */
+
+struct Textbuffer {
+    Py_ssize_t size;          /* count of values written into data so far */
+    Py_UNICODE* data;         /* storage block allocated by Textbuffer_new() */
+    struct Textbuffer* prev;  /* block Textbuffer_write() added after this one */
+    struct Textbuffer* next;  /* block that was filled before this one, or NULL */
+};
+typedef struct Textbuffer Textbuffer;
+
+/* Functions */
+
+Textbuffer* Textbuffer_new(void);                /* NULL if allocation fails */
+void Textbuffer_dealloc(Textbuffer*);            /* frees block and ->next chain */
+int Textbuffer_write(Textbuffer**, Py_UNICODE);  /* 0 on success, -1 on error */
+PyObject* Textbuffer_render(Textbuffer*);        /* contents as a Unicode object */
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tokenizer.c
similarity index 98%
rename from mwparserfromhell/parser/tokenizer.c
rename to mwparserfromhell/parser/ctokenizer/tokenizer.c
index dd11d16..2bce247 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/ctokenizer/tokenizer.c
@@ -1,5 +1,4 @@
 /*
-Tokenizer for MWParserFromHell
 Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ -89,75 +88,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
     return lowered;
 }
 
-static Textbuffer* Textbuffer_new(void)
-{
-    Textbuffer* buffer = malloc(sizeof(Textbuffer));
-
-    if (!buffer) {
-        PyErr_NoMemory();
-        return NULL;
-    }
-    buffer->size = 0;
-    buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE);
-    if (!buffer->data) {
-        free(buffer);
-        PyErr_NoMemory();
-        return NULL;
-    }
-    buffer->prev = buffer->next = NULL;
-    return buffer;
-}
-
-static void Textbuffer_dealloc(Textbuffer* self)
-{
-    Textbuffer* next;
-
-    while (self) {
-        free(self->data);
-        next = self->next;
-        free(self);
-        self = next;
-    }
-}
-
-/*
-    Write a Unicode codepoint to the given textbuffer.
-*/
-static int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
-{
-    Textbuffer* self = *this;
-
-    if (self->size == TEXTBUFFER_BLOCKSIZE) {
-        Textbuffer* new = Textbuffer_new();
-        if (!new)
-            return -1;
-        new->next = self;
-        self->prev = new;
-        *this = self = new;
-    }
-    self->data[self->size++] = code;
-    return 0;
-}
-
-/*
-    Return the contents of the textbuffer as a Python Unicode object.
-*/
-static PyObject* Textbuffer_render(Textbuffer* self)
-{
-    PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
-    PyObject *left, *concat;
-
-    while (self->next) {
-        self = self->next;
-        left = PyUnicode_FromUnicode(self->data, self->size);
-        concat = PyUnicode_Concat(left, result);
-        Py_DECREF(left);
-        Py_DECREF(result);
-        result = concat;
-    }
-    return result;
-}
-
 static TagData* TagData_new(void)
 {
     TagData *self = malloc(sizeof(TagData));
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/ctokenizer/tokenizer.h
similarity index 95%
rename from mwparserfromhell/parser/tokenizer.h
rename to mwparserfromhell/parser/ctokenizer/tokenizer.h
index 102fecd..66f1e90 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/ctokenizer/tokenizer.h
@@ -1,5 +1,4 @@
 /*
-Tokenizer Header File for MWParserFromHell
 Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ -21,25 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */
 
-#ifndef PY_SSIZE_T_CLEAN
-#define PY_SSIZE_T_CLEAN
-#endif
-
-#include <Python.h>
 #include <math.h>
-#include <structmember.h>
-#include <bytesobject.h>
 
-#if PY_MAJOR_VERSION >= 3
-#define IS_PY3K
-#endif
-
-#ifndef uint64_t
-#define uint64_t unsigned PY_LONG_LONG
-#endif
-
-#define malloc PyObject_Malloc
-#define free   PyObject_Free
+#include "common.h"
+#include "textbuffer.h"
 
 #define DIGITS    "0123456789"
 #define HEXDIGITS "0123456789abcdefABCDEF"
@@ -50,7 +34,6 @@ static const char MARKERS[] = {
     '-', '!', '\n', '\0'};
 
 #define NUM_MARKERS 19
-#define TEXTBUFFER_BLOCKSIZE 1024
 #define MAX_DEPTH 40
 #define MAX_CYCLES 100000
 #define MAX_BRACES 255
@@ -196,13 +179,6 @@ static PyObject* TagCloseClose;
 
 /* Miscellaneous structs: */
 
-struct Textbuffer {
-    Py_ssize_t size;
-    Py_UNICODE* data;
-    struct Textbuffer* prev;
-    struct Textbuffer* next;
-};
-
 struct Stack {
     PyObject* stack;
     uint64_t context;
@@ -224,7 +200,6 @@ typedef struct {
     Py_ssize_t reset;
 } TagData;
 
-typedef struct Textbuffer Textbuffer;
 typedef struct Stack Stack;
 
 
@@ -268,9 +243,6 @@ typedef struct {
 
 /* Function prototypes: */
 
-static Textbuffer* Textbuffer_new(void);
-static void Textbuffer_dealloc(Textbuffer*);
-
 static TagData* TagData_new(void);
 static void TagData_dealloc(TagData*);
 
diff --git a/setup.py b/setup.py
index dcdd563..1bca436 100644
--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,7 @@
 
 from __future__ import print_function
 from distutils.errors import DistutilsError, CCompilerError
+from glob import glob
 from os import environ
 import sys
 
@@ -39,10 +40,6 @@ from mwparserfromhell.compat import py26, py3k
 with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp:
     long_docs = fp.read()
 
-tokenizer = Extension("mwparserfromhell.parser._tokenizer",
-                      sources=["mwparserfromhell/parser/tokenizer.c"],
-                      depends=["mwparserfromhell/parser/tokenizer.h"])
-
 use_extension = True
 fallback = True
 
@@ -75,6 +72,12 @@ def build_ext_patched(self):
 if fallback:
     build_ext.run, build_ext_original = build_ext_patched, build_ext.run
 
+# Project-specific part begins here:
+# (glob() order is arbitrary, so sort for a reproducible compile order)
+tokenizer = Extension("mwparserfromhell.parser._tokenizer",
+    sources=sorted(glob("mwparserfromhell/parser/ctokenizer/*.c")),
+    depends=sorted(glob("mwparserfromhell/parser/ctokenizer/*.h")))
+
 setup(
     name = "mwparserfromhell",
     packages = find_packages(exclude=("tests",)),