From 0e547aa416f76970fc09092f110e3367bced99fd Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 14 Jun 2015 17:40:10 -0400 Subject: [PATCH] Begin splitting up C tokenizer. --- CHANGELOG | 1 + docs/changelog.rst | 1 + mwparserfromhell/parser/ctokenizer/common.h | 40 +++++++++ mwparserfromhell/parser/ctokenizer/textbuffer.c | 100 +++++++++++++++++++++ mwparserfromhell/parser/ctokenizer/textbuffer.h | 40 +++++++++ .../parser/{ => ctokenizer}/tokenizer.c | 70 --------------- .../parser/{ => ctokenizer}/tokenizer.h | 32 +------ setup.py | 11 ++- 8 files changed, 191 insertions(+), 104 deletions(-) create mode 100644 mwparserfromhell/parser/ctokenizer/common.h create mode 100644 mwparserfromhell/parser/ctokenizer/textbuffer.c create mode 100644 mwparserfromhell/parser/ctokenizer/textbuffer.h rename mwparserfromhell/parser/{ => ctokenizer}/tokenizer.c (98%) rename mwparserfromhell/parser/{ => ctokenizer}/tokenizer.h (95%) diff --git a/CHANGELOG b/CHANGELOG index c49aaf7..7ad2930 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -6,6 +6,7 @@ v0.4.1 (unreleased): - Added support for Python 3.5. - '<' and '>' are now disallowed in wikilink titles and template names. This includes when denoting tags, but not comments. +- Heavy refactoring and fixes to the C tokenizer. - Fixed some bugs in the release scripts. v0.4 (released May 23, 2015): diff --git a/docs/changelog.rst b/docs/changelog.rst index 3217a35..2944992 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -13,6 +13,7 @@ Unreleased - Added support for Python 3.5. - ``<`` and ``>`` are now disallowed in wikilink titles and template names. This includes when denoting tags, but not comments. +- Heavy refactoring and fixes to the C tokenizer. - Fixed some bugs in the release scripts. 
v0.4 diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h new file mode 100644 index 0000000..2ed5a02 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -0,0 +1,40 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif + +#include <Python.h> +#include <structmember.h> +#include <bytesobject.h> + +#if PY_MAJOR_VERSION >= 3 +#define IS_PY3K +#endif + +#ifndef uint64_t +#define uint64_t unsigned PY_LONG_LONG +#endif + +#define malloc PyObject_Malloc +#define free PyObject_Free diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.c b/mwparserfromhell/parser/ctokenizer/textbuffer.c new file mode 100644 index 0000000..63d45d6 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c @@ -0,0 +1,100 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include "textbuffer.h" + +#define TEXTBUFFER_BLOCKSIZE 1024 + +/* + Create a new textbuffer object. 
+*/ +Textbuffer* Textbuffer_new(void) +{ + Textbuffer* buffer = malloc(sizeof(Textbuffer)); + + if (!buffer) { + PyErr_NoMemory(); + return NULL; + } + buffer->size = 0; + buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE); + if (!buffer->data) { + free(buffer); + PyErr_NoMemory(); + return NULL; + } + buffer->prev = buffer->next = NULL; + return buffer; +} + +/* + Deallocate the given textbuffer. +*/ +void Textbuffer_dealloc(Textbuffer* self) +{ + Textbuffer* next; + + while (self) { + free(self->data); + next = self->next; + free(self); + self = next; + } +} + +/* + Write a Unicode codepoint to the given textbuffer. +*/ +int Textbuffer_write(Textbuffer** this, Py_UNICODE code) +{ + Textbuffer* self = *this; + + if (self->size == TEXTBUFFER_BLOCKSIZE) { + Textbuffer* new = Textbuffer_new(); + if (!new) + return -1; + new->next = self; + self->prev = new; + *this = self = new; + } + self->data[self->size++] = code; + return 0; +} + +/* + Return the contents of the textbuffer as a Python Unicode object. 
+*/ +PyObject* Textbuffer_render(Textbuffer* self) +{ + PyObject *result = PyUnicode_FromUnicode(self->data, self->size); + PyObject *left, *concat; + + while (self->next) { + self = self->next; + left = PyUnicode_FromUnicode(self->data, self->size); + concat = PyUnicode_Concat(left, result); + Py_DECREF(left); + Py_DECREF(result); + result = concat; + } + return result; +} diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.h b/mwparserfromhell/parser/ctokenizer/textbuffer.h new file mode 100644 index 0000000..36b2207 --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/textbuffer.h @@ -0,0 +1,40 @@ +/* +Copyright (C) 2012-2015 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+*/ + +#include "common.h" + +/* Structs */ + +struct Textbuffer { + Py_ssize_t size; + Py_UNICODE* data; + struct Textbuffer* prev; + struct Textbuffer* next; +}; +typedef struct Textbuffer Textbuffer; + +/* Functions */ + +Textbuffer* Textbuffer_new(void); +void Textbuffer_dealloc(Textbuffer*); +int Textbuffer_write(Textbuffer**, Py_UNICODE); +PyObject* Textbuffer_render(Textbuffer*); diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tokenizer.c similarity index 98% rename from mwparserfromhell/parser/tokenizer.c rename to mwparserfromhell/parser/ctokenizer/tokenizer.c index dd11d16..2bce247 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/ctokenizer/tokenizer.c @@ -1,5 +1,4 @@ /* -Tokenizer for MWParserFromHell Copyright (C) 2012-2015 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -89,75 +88,6 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr) return lowered; } -static Textbuffer* Textbuffer_new(void) -{ - Textbuffer* buffer = malloc(sizeof(Textbuffer)); - - if (!buffer) { - PyErr_NoMemory(); - return NULL; - } - buffer->size = 0; - buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE); - if (!buffer->data) { - free(buffer); - PyErr_NoMemory(); - return NULL; - } - buffer->prev = buffer->next = NULL; - return buffer; -} - -static void Textbuffer_dealloc(Textbuffer* self) -{ - Textbuffer* next; - - while (self) { - free(self->data); - next = self->next; - free(self); - self = next; - } -} - -/* - Write a Unicode codepoint to the given textbuffer. 
-*/ -static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) -{ - Textbuffer* self = *this; - - if (self->size == TEXTBUFFER_BLOCKSIZE) { - Textbuffer* new = Textbuffer_new(); - if (!new) - return -1; - new->next = self; - self->prev = new; - *this = self = new; - } - self->data[self->size++] = code; - return 0; -} - -/* - Return the contents of the textbuffer as a Python Unicode object. -*/ -static PyObject* Textbuffer_render(Textbuffer* self) -{ - PyObject *result = PyUnicode_FromUnicode(self->data, self->size); - PyObject *left, *concat; - - while (self->next) { - self = self->next; - left = PyUnicode_FromUnicode(self->data, self->size); - concat = PyUnicode_Concat(left, result); - Py_DECREF(left); - Py_DECREF(result); - result = concat; - } - return result; -} - static TagData* TagData_new(void) { TagData *self = malloc(sizeof(TagData)); diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/ctokenizer/tokenizer.h similarity index 95% rename from mwparserfromhell/parser/tokenizer.h rename to mwparserfromhell/parser/ctokenizer/tokenizer.h index 102fecd..66f1e90 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/ctokenizer/tokenizer.h @@ -1,5 +1,4 @@ /* -Tokenizer Header File for MWParserFromHell Copyright (C) 2012-2015 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -21,25 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef PY_SSIZE_T_CLEAN -#define PY_SSIZE_T_CLEAN -#endif - -#include <Python.h> #include <math.h> -#include <structmember.h> -#include <bytesobject.h> -#if PY_MAJOR_VERSION >= 3 -#define IS_PY3K -#endif - -#ifndef uint64_t -#define uint64_t unsigned PY_LONG_LONG -#endif - -#define malloc PyObject_Malloc -#define free PyObject_Free +#include "common.h" +#include "textbuffer.h" #define DIGITS "0123456789" #define HEXDIGITS "0123456789abcdefABCDEF" @@ -50,7 +34,6 @@ static const char MARKERS[] = { '-', '!', '\n', '\0'}; #define NUM_MARKERS 19 -#define TEXTBUFFER_BLOCKSIZE 1024 #define MAX_DEPTH 40 #define MAX_CYCLES 100000 #define MAX_BRACES 255 @@ -196,13 +179,6 @@ static PyObject* TagCloseClose; /* Miscellaneous structs: */ -struct Textbuffer { - Py_ssize_t size; - Py_UNICODE* data; - struct Textbuffer* prev; - struct Textbuffer* next; -}; - struct Stack { PyObject* stack; uint64_t context; @@ -224,7 +200,6 @@ typedef struct { Py_ssize_t reset; } TagData; -typedef struct Textbuffer Textbuffer; typedef struct Stack Stack; @@ -268,9 +243,6 @@ typedef struct { /* Function prototypes: */ -static Textbuffer* Textbuffer_new(void); -static void Textbuffer_dealloc(Textbuffer*); - static TagData* TagData_new(void); static void TagData_dealloc(TagData*); diff --git a/setup.py b/setup.py index dcdd563..1bca436 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ from __future__ import print_function from distutils.errors import DistutilsError, CCompilerError +from glob import glob from os import environ import sys @@ -39,10 +40,6 @@ from mwparserfromhell.compat import py26, py3k with open("README.rst", **({'encoding':'utf-8'} if py3k else {})) as fp: long_docs = fp.read() -tokenizer = Extension("mwparserfromhell.parser._tokenizer", - sources=["mwparserfromhell/parser/tokenizer.c"], - depends=["mwparserfromhell/parser/tokenizer.h"]) - use_extension = True fallback = True @@ -75,6 +72,12 @@ def build_ext_patched(self): if fallback: build_ext.run, build_ext_original = build_ext_patched, build_ext.run +# Project-specific 
part begins here: + +tokenizer = Extension("mwparserfromhell.parser._tokenizer", + sources=glob("mwparserfromhell/parser/ctokenizer/*.c"), + depends=glob("mwparserfromhell/parser/ctokenizer/*.h")) + setup( name = "mwparserfromhell", packages = find_packages(exclude=("tests",)),