Begin splitting up C tokenizer.

9 anni fa · 0e547aa416
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,7 @@ v0.4.1 (unreleased):
 - Added support for Python 3.5.
 - '<' and '>' are now disallowed in wikilink titles and template names. This
  includes when denoting tags, but not comments.
 - Heavy refactoring and fixes to the C tokenizer.
 - Fixed some bugs in the release scripts.

 v0.4 (released May 23, 2015):
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -13,6 +13,7 @@ Unreleased
 - Added support for Python 3.5.
 - ``<`` and ``>`` are now disallowed in wikilink titles and template names.
  This includes when denoting tags, but not comments.
 - Heavy refactoring and fixes to the C tokenizer.
 - Fixed some bugs in the release scripts.

 v0.4
--- a/mwparserfromhell/parser/ctokenizer/common.h
+++ b/mwparserfromhell/parser/ctokenizer/common.h
@@ -0,0 +1,40 @@
 /*
 Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
 the Software without restriction, including without limitation the rights to
 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 of the Software, and to permit persons to whom the Software is furnished to do
 so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */

 #ifndef MWPARSERFROMHELL_CTOKENIZER_COMMON_H
 #define MWPARSERFROMHELL_CTOKENIZER_COMMON_H

 /*
    Shared preamble for the C tokenizer sources. The include guard above keeps
    the contents from being processed more than once when this header is pulled
    in both directly and through another header.
 */

 /* Defined before Python.h is included; see the Python C API documentation
    for the effect of PY_SSIZE_T_CLEAN. */
 #ifndef PY_SSIZE_T_CLEAN
 #define PY_SSIZE_T_CLEAN
 #endif

 #include <Python.h>
 #include <structmember.h>
 #include <bytesobject.h>

 /* Convenience flag for code that must differ between Python 2 and 3. */
 #if PY_MAJOR_VERSION >= 3
 #define IS_PY3K
 #endif

 /* Fallback 64-bit unsigned type.
    NOTE(review): #ifndef only detects a *macro* named uint64_t, not a typedef
    provided by <stdint.h> -- confirm this is the intended check. */
 #ifndef uint64_t
 #define uint64_t unsigned PY_LONG_LONG
 #endif

 /* Route malloc()/free() calls written after this point through Python's
    object allocator. */
 #define malloc PyObject_Malloc
 #define free   PyObject_Free

 #endif  /* MWPARSERFROMHELL_CTOKENIZER_COMMON_H */
--- a/mwparserfromhell/parser/ctokenizer/textbuffer.c
+++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c
@@ -0,0 +1,100 @@
 /*
 Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
 the Software without restriction, including without limitation the rights to
 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 of the Software, and to permit persons to whom the Software is furnished to do
 so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */

 #include "textbuffer.h"

 #define TEXTBUFFER_BLOCKSIZE 1024

 /*
    Allocate and initialise an empty textbuffer block.

    The block gets a data array with room for TEXTBUFFER_BLOCKSIZE entries, a
    size of zero and no neighbours. Returns the new block, or NULL after
    calling PyErr_NoMemory() if either allocation fails.
 */
 Textbuffer* Textbuffer_new(void)
 {
    Textbuffer* block = malloc(sizeof(*block));

    if (block == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    block->data = malloc(sizeof(*block->data) * TEXTBUFFER_BLOCKSIZE);
    if (block->data == NULL) {
        /* Do not leak the struct when only the data array failed. */
        free(block);
        PyErr_NoMemory();
        return NULL;
    }

    block->size = 0;
    block->prev = NULL;
    block->next = NULL;
    return block;
 }

 /*
    Free the given textbuffer block together with every block reachable from
    it through the `next` links. Passing NULL does nothing.
 */
 void Textbuffer_dealloc(Textbuffer* self)
 {
    while (self != NULL) {
        /* Remember the successor before this block's memory is released. */
        Textbuffer* following = self->next;

        free(self->data);
        free(self);
        self = following;
    }
 }

 /*
    Append one Py_UNICODE value to the textbuffer whose head block is *this.

    When the head block already holds TEXTBUFFER_BLOCKSIZE entries, a new block
    is allocated, linked in front of it, and *this is updated to point at the
    new head. Returns 0 on success, or -1 if a new block could not be created
    (in which case *this is left untouched).
 */
 int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
 {
    Textbuffer* head = *this;
    Textbuffer* fresh;

    if (head->size == TEXTBUFFER_BLOCKSIZE) {
        fresh = Textbuffer_new();
        if (fresh == NULL)
            return -1;
        /* The full block becomes the successor of the fresh head block. */
        fresh->next = head;
        head->prev = fresh;
        *this = fresh;
        head = fresh;
    }

    head->data[head->size] = code;
    head->size++;
    return 0;
 }

 /*
    Return the contents of the textbuffer as a new Python Unicode object.

    `self` is the head block; the blocks reached through `next` are prepended
    one after another, so the result lists the chain's contents from the last
    `next` block through to the head. Returns a new reference, or NULL with a
    Python exception set if creating or concatenating any piece fails.
 */
 PyObject* Textbuffer_render(Textbuffer* self)
 {
    PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
    PyObject *left, *concat;

    if (!result)
        return NULL;

    while (self->next) {
        self = self->next;
        left = PyUnicode_FromUnicode(self->data, self->size);
        if (!left) {
            /* Drop what has been built so far; the exception is already set. */
            Py_DECREF(result);
            return NULL;
        }
        concat = PyUnicode_Concat(left, result);
        /* Both operands are no longer needed whether or not concat worked. */
        Py_DECREF(left);
        Py_DECREF(result);
        if (!concat)
            return NULL;
        result = concat;
    }
    return result;
 }
--- a/mwparserfromhell/parser/ctokenizer/textbuffer.h
+++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h
@@ -0,0 +1,40 @@
 /*
 Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
 the Software without restriction, including without limitation the rights to
 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 of the Software, and to permit persons to whom the Software is furnished to do
 so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */

 #ifndef MWPARSERFROMHELL_CTOKENIZER_TEXTBUFFER_H
 #define MWPARSERFROMHELL_CTOKENIZER_TEXTBUFFER_H

 #include "common.h"

 /* Structs */

 /*
    One block of a textbuffer. Blocks are chained into a doubly linked list;
    Textbuffer_write() puts a newly allocated block in front of a full one.
 */
 struct Textbuffer {
    /* Number of entries of `data` currently in use. */
    Py_ssize_t size;
    /* Storage for this block's entries. */
    Py_UNICODE* data;
    /* Block that was added in front of this one, or NULL for the head. */
    struct Textbuffer* prev;
    /* Block that was filled before this one, or NULL if there is none. */
    struct Textbuffer* next;
 };
 typedef struct Textbuffer Textbuffer;

 /* Functions */

 /* Create an empty block; returns NULL on allocation failure. */
 Textbuffer* Textbuffer_new(void);
 /* Free `self` and every block reachable through `next`. */
 void Textbuffer_dealloc(Textbuffer* self);
 /* Append `code`; may replace *this with a new head block. 0 on success, -1 on error. */
 int Textbuffer_write(Textbuffer** this, Py_UNICODE code);
 /* Build a Python Unicode object from the whole chain starting at `self`. */
 PyObject* Textbuffer_render(Textbuffer* self);

 #endif  /* MWPARSERFROMHELL_CTOKENIZER_TEXTBUFFER_H */
--- a/mwparserfromhell/parser/ctokenizer/tokenizer.c
+++ b/mwparserfromhell/parser/ctokenizer/tokenizer.c
@@ -1,5 +1,4 @@
 /*
 Tokenizer for MWParserFromHell
 Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ -89,75 +88,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
    return lowered;
 }

 /*
    Allocate and initialise an empty textbuffer block, or return NULL after
    calling PyErr_NoMemory() if either allocation fails.
 */
 static Textbuffer* Textbuffer_new(void)
 {
    Textbuffer* block = malloc(sizeof(*block));

    if (block == NULL) {
        PyErr_NoMemory();
        return NULL;
    }

    block->data = malloc(sizeof(*block->data) * TEXTBUFFER_BLOCKSIZE);
    if (block->data == NULL) {
        /* Do not leak the struct when only the data array failed. */
        free(block);
        PyErr_NoMemory();
        return NULL;
    }

    block->size = 0;
    block->prev = NULL;
    block->next = NULL;
    return block;
 }

 /*
    Free the given textbuffer block and every block reachable from it through
    the `next` links. Passing NULL does nothing.
 */
 static void Textbuffer_dealloc(Textbuffer* self)
 {
    while (self != NULL) {
        /* Remember the successor before this block's memory is released. */
        Textbuffer* following = self->next;

        free(self->data);
        free(self);
        self = following;
    }
 }

 /*
    Append one Py_UNICODE value to the textbuffer whose head block is *this.

    A full head block (TEXTBUFFER_BLOCKSIZE entries) causes a new block to be
    allocated and linked in front; *this then points at the new head. Returns
    0 on success, or -1 if the new block could not be created.
 */
 static int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
 {
    Textbuffer* head = *this;
    Textbuffer* fresh;

    if (head->size == TEXTBUFFER_BLOCKSIZE) {
        fresh = Textbuffer_new();
        if (fresh == NULL)
            return -1;
        /* The full block becomes the successor of the fresh head block. */
        fresh->next = head;
        head->prev = fresh;
        *this = fresh;
        head = fresh;
    }

    head->data[head->size] = code;
    head->size++;
    return 0;
 }

 /*
    Return the contents of the textbuffer as a new Python Unicode object.

    Blocks reached through `next` are prepended to the text built so far.
    Returns a new reference, or NULL with a Python exception set if creating
    or concatenating any piece fails.
 */
 static PyObject* Textbuffer_render(Textbuffer* self)
 {
    PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
    PyObject *left, *concat;

    if (!result)
        return NULL;

    while (self->next) {
        self = self->next;
        left = PyUnicode_FromUnicode(self->data, self->size);
        if (!left) {
            /* Drop what has been built so far; the exception is already set. */
            Py_DECREF(result);
            return NULL;
        }
        concat = PyUnicode_Concat(left, result);
        /* Both operands are no longer needed whether or not concat worked. */
        Py_DECREF(left);
        Py_DECREF(result);
        if (!concat)
            return NULL;
        result = concat;
    }
    return result;
 }

 static TagData* TagData_new(void)
 {
    TagData *self = malloc(sizeof(TagData));
--- a/mwparserfromhell/parser/ctokenizer/tokenizer.h
+++ b/mwparserfromhell/parser/ctokenizer/tokenizer.h
@@ -1,5 +1,4 @@
 /*
 Tokenizer Header File for MWParserFromHell
 Copyright (C) 2012-2015 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
@@ -21,25 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 */

 #ifndef PY_SSIZE_T_CLEAN
 #define PY_SSIZE_T_CLEAN
 #endif

 #include <Python.h>
 #include <math.h>
 #include <structmember.h>
 #include <bytesobject.h>

 #if PY_MAJOR_VERSION >= 3
 #define IS_PY3K
 #endif

 #ifndef uint64_t
 #define uint64_t unsigned PY_LONG_LONG
 #endif

 #define malloc PyObject_Malloc
 #define free   PyObject_Free
 #include "common.h"
 #include "textbuffer.h"

 #define DIGITS    "0123456789"
 #define HEXDIGITS "0123456789abcdefABCDEF"
@@ -50,7 +34,6 @@ static const char MARKERS[] = {
    '-', '!', '\n', '\0'};

 #define NUM_MARKERS 19
 #define TEXTBUFFER_BLOCKSIZE 1024
 #define MAX_DEPTH 40
 #define MAX_CYCLES 100000
 #define MAX_BRACES 255
@@ -196,13 +179,6 @@ static PyObject* TagCloseClose;

 /* Miscellaneous structs: */

 /* One block of a textbuffer; blocks are chained through prev/next. */
 struct Textbuffer {
    /* Number of entries of `data` in use (Textbuffer_write stores at
       data[size] and then increments it). */
    Py_ssize_t size;
    /* Array of TEXTBUFFER_BLOCKSIZE entries allocated by Textbuffer_new. */
    Py_UNICODE* data;
    /* Block linked in front of this one by Textbuffer_write, else NULL. */
    struct Textbuffer* prev;
    /* Block that was the head before this one was created, else NULL. */
    struct Textbuffer* next;
 };

 struct Stack {
    PyObject* stack;
    uint64_t context;
@@ -224,7 +200,6 @@ typedef struct {
    Py_ssize_t reset;
 } TagData;

 typedef struct Textbuffer Textbuffer;
 typedef struct Stack Stack;


@@ -268,9 +243,6 @@ typedef struct {

 /* Function prototypes: */

 static Textbuffer* Textbuffer_new(void);
 static void Textbuffer_dealloc(Textbuffer*);

 static TagData* TagData_new(void);
 static void TagData_dealloc(TagData*);

--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,7 @@

 from __future__ import print_function
 from distutils.errors import DistutilsError, CCompilerError
 from glob import glob
 from os import environ
 import sys

@@ -39,10 +40,6 @@ from mwparserfromhell.compat import py26, py3k
 with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp:
    long_docs = fp.read()

 # C tokenizer extension module, built from a single source file; the header
 # is passed via `depends` alongside it.
 tokenizer = Extension("mwparserfromhell.parser._tokenizer",
                      sources=["mwparserfromhell/parser/tokenizer.c"],
                      depends=["mwparserfromhell/parser/tokenizer.h"])

 use_extension = True
 fallback = True

@@ -75,6 +72,12 @@ def build_ext_patched(self):
 if fallback:
    build_ext.run, build_ext_original = build_ext_patched, build_ext.run

 # Project-specific part begins here:

 # C tokenizer extension module, built from every .c file in the ctokenizer
 # directory. glob() returns paths in arbitrary order, so the lists are sorted
 # to keep the order of sources identical from one build to the next.
 tokenizer = Extension("mwparserfromhell.parser._tokenizer",
                      sources=sorted(glob("mwparserfromhell/parser/ctokenizer/*.c")),
                      depends=sorted(glob("mwparserfromhell/parser/ctokenizer/*.h")))

 setup(
    name = "mwparserfromhell",
    packages = find_packages(exclude=("tests",)),