From e0660f8bc31a00c3119d13d2d37bcf18042b3102 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 22 Sep 2012 22:47:05 -0400 Subject: [PATCH 01/47] Committing this C work for now. --- docs/conf.py | 5 +- mwparserfromhell/parser/builder.c | 24 +++ mwparserfromhell/parser/tokenizer.c | 322 ++++++++++++++++++++++++++++++++++++ setup.py | 9 +- 4 files changed, 357 insertions(+), 3 deletions(-) create mode 100644 mwparserfromhell/parser/builder.c create mode 100644 mwparserfromhell/parser/tokenizer.c diff --git a/docs/conf.py b/docs/conf.py index 6cc3664..cff089b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -17,6 +17,7 @@ import sys, os # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. sys.path.insert(0, os.path.abspath('..')) +import mwparserfromhell # -- General configuration ----------------------------------------------------- @@ -48,9 +49,9 @@ copyright = u'2012 Ben Kurtovic' # built documents. # # The short X.Y version. -version = '0.2' +version = ".".join(mwparserfromhell.__version__.split(".", 2)[:2]) # The full version, including alpha/beta/rc tags. -release = '0.2.dev' +release = mwparserfromhell.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/mwparserfromhell/parser/builder.c b/mwparserfromhell/parser/builder.c new file mode 100644 index 0000000..7cbe236 --- /dev/null +++ b/mwparserfromhell/parser/builder.c @@ -0,0 +1,24 @@ +/* +Builder for MWParserFromHell +Copyright (C) 2012 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#include diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c new file mode 100644 index 0000000..3fdc370 --- /dev/null +++ b/mwparserfromhell/parser/tokenizer.c @@ -0,0 +1,322 @@ +/* +Tokenizer for MWParserFromHell +Copyright (C) 2012 Ben Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif + +#include +#include "structmember.h" + +static const Py_UNICODE* OUT_OF_BOUNDS = ""; +static const Py_UNICODE* MARKERS[] = {"{", "}", "[", "]", "<", ">", "|", "=", + "&", "#", "*", ";", ":", "/", "-", "!", + "\n", OUT_OF_BOUNDS}; + +static PyMethodDef +module_methods[] = { + {NULL} +}; + +typedef struct { + PyObject_HEAD + PyObject* text; /* text to tokenize */ + PyObject* stacks; /* token stacks */ + PyObject* topstack; /* topmost stack */ + Py_ssize_t head; /* current position in text */ + Py_ssize_t length; /* length of text */ + Py_ssize_t global; /* global context */ +} Tokenizer; + +static PyObject* +Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds) +{ + Tokenizer *self; + + self = (Tokenizer*) type->tp_alloc(type, 0); + if (self != NULL) { + + self->text = Py_None; + Py_INCREF(Py_None); + + self->stacks = PyList_New(0); + if (self->stacks == NULL) { + Py_DECREF(self); + return NULL; + } + + self->head = 0; + self->length = 0; + self->global = 0; + } + + return (PyObject*) self; +} + +static void +Tokenizer_dealloc(Tokenizer* self) +{ + Py_XDECREF(self->text); + Py_XDECREF(self->stacks); + Py_XDECREF(self->topstack); + self->ob_type->tp_free((PyObject*) self); +} + +static int +Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) +{ + static char* kwlist[] = {NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist)) + return -1; + return 0; +} + +#define Tokenizer_STACK(self) PyList_GET_ITEM(self->topstack, 0) +#define Tokenizer_CONTEXT(self) PyList_GET_ITEM(self->topstack, 1) +#define Tokenizer_TEXTBUFFER(self) PyList_GET_ITEM(self->topstack, 2) + +static int +Tokenizer_set_context(Tokenizer* self, Py_ssize_t value) +{ + if (PyList_SetItem(self->topstack, 1, PyInt_FromSsize_t(value))) + return -1; + return 0; +} + +static int +Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value) +{ + if (PyList_SetItem(self->topstack, 2, value)) + return -1; + return 0; +} + +/* + Add a new token stack, context, and textbuffer to the list. +*/ +static int +Tokenizer_push(Tokenizer* self, int context) +{ + PyObject* top = PyList_New(3); + PyList_SET_ITEM(top, 0, PyList_New(0)); + PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(0)); + PyList_SET_ITEM(top, 2, PyList_New(0)); + + Py_XDECREF(self->topstack); + self->topstack = top; + + if (PyList_Append(self->stacks, top)) + return -1; + return 0; +} + +/* + Push the textbuffer onto the stack as a Text node and clear it. +*/ +static int +Tokenizer_push_textbuffer(Tokenizer* self) +{ + if (PyList_GET_SIZE(Tokenizer_TEXTBUFFER(self)) > 0) { + + PyObject* text; + // tokens.Text(text="".join(self._textbuffer)) + + if (PyList_Append(Tokenizer_STACK(self), text) + return -1; + + if (Tokenizer_set_textbuffer(self, PyList_New(0))) + return -1; + + return 0; + } +} + +/* + Pop the current stack/context/textbuffer, returing the stack. +*/ +static PyObject* +Tokenizer_pop(Tokenizer* self) +{ + if (Tokenizer_push_textbuffer(self)) + return NULL; + + self->stacks // POP!? 
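/*
    A rough sketch of what the placeholder above still needs to do, mirroring
    the pure-Python tokenizer's _pop(): hand back the current stack, then
    drop the topmost stack/context/textbuffer entry and re-point topstack.

        PyObject* stack = Tokenizer_STACK(self);
        Py_INCREF(stack);
        if (PySequence_DelItem(self->stacks, -1))
            return NULL;
        // re-point self->topstack at the new last element (or NULL if empty)
        return stack;

    Patch 02 below fills this in, adding a Tokenizer_delete_top_of_stack()
    helper for the bookkeeping.
*/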
+} + +/* + Pop the current stack/context/textbuffer, returing the stack. We will also + replace the underlying stack's context with the current stack's. +*/ +static PyObject* +Tokenizer_pop_keeping_context(Tokenizer* self) +{ + if (Tokenizer_push_textbuffer(self)) + return NULL; +} + +/* + Read the value at a relative point in the wikicode. +*/ +static Py_UNICODE* +Tokenizer_read(Tokenizer* self, Py_ssize_t delta) +{ + Py_ssize_t index = self->head + delta; + + if (index >= self->length) { + return OUT_OF_BOUNDS; + } + + PyObject* item = PySequence_Fast_GET_ITEM(self->text, index); + return PyUnicode_AS_UNICODE(item); +} + +/* + Parse the wikicode string, using *context* for when to stop. +*/ +static PyObject* +Tokenizer_parse(Tokenizer* self, int context) +{ + Py_UNICODE* this; + + Tokenizer_push(self, context); + + while (1) { + this = Tokenizer_read(self, 0); + if (this not in MARKERS) { + WRITE TEXT + } + if (this == OUT_OF_BOUNDS) { + return Tokenizer_push(self); + } + printf("%p %i %c\n", this, *this, *this); + self->head++; + } +} + +/* + Build a list of tokens from a string of wikicode and return it. +*/ +static PyObject* +Tokenizer_tokenize(Tokenizer* self, PyObject *args) +{ + PyObject* text; + + if (!PyArg_ParseTuple(args, "U", &text)) { + /* Failed to parse a Unicode object; try a string instead. */ + PyErr_Clear(); + const char* encoded; + Py_ssize_t size; + + if (!PyArg_ParseTuple(args, "s#", &encoded, &size)) { + return NULL; + } + + PyObject* temp; + temp = PyUnicode_FromStringAndSize(encoded, size); + if (text == NULL) + return NULL; + + Py_XDECREF(self->text); + text = PySequence_Fast(temp, "expected a sequence"); + Py_XDECREF(temp); + self->text = text; + } + else { + Py_XDECREF(self->text); + self->text = PySequence_Fast(text, "expected a sequence"); + } + + self->length = PySequence_Length(self->text); + + return Tokenizer_parse(self, 0); +} + +static PyMethodDef +Tokenizer_methods[] = { + {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS, + "Build a list of tokens from a string of wikicode and return it."}, + {NULL} +}; + +static PyMemberDef +Tokenizer_members[] = { + {NULL} +}; + +static PyTypeObject +TokenizerType = { + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ + "_tokenizer.CTokenizer", /* tp_name */ + sizeof(Tokenizer), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor) Tokenizer_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + "Creates a list of tokens from a string of wikicode.", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Tokenizer_methods, /* tp_methods */ + Tokenizer_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc) Tokenizer_init, /* tp_init */ + 0, /* tp_alloc */ + Tokenizer_new, /* tp_new */ +}; + +PyMODINIT_FUNC +init_tokenizer(void) +{ + PyObject* module; + + TokenizerType.tp_new = PyType_GenericNew; + if (PyType_Ready(&TokenizerType) < 0) + return; + + module = Py_InitModule("_tokenizer", module_methods); + + Py_INCREF(&TokenizerType); + PyModule_AddObject(module, "CTokenizer", (PyObject*) 
&TokenizerType); +} diff --git a/setup.py b/setup.py index 9faa56c..3664626 100644 --- a/setup.py +++ b/setup.py @@ -21,16 +21,23 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from setuptools import setup, find_packages +from setuptools import setup, find_packages, Extension from mwparserfromhell import __version__ with open("README.rst") as fp: long_docs = fp.read() +builder = Extension("mwparserfromhell.parser._builder", + sources = ["mwparserfromhell/parser/builder.c"]) + +tokenizer = Extension("mwparserfromhell.parser._tokenizer", + sources = ["mwparserfromhell/parser/tokenizer.c"]) + setup( name = "mwparserfromhell", packages = find_packages(exclude=("tests",)), + ext_modules = [builder, tokenizer], test_suite = "tests", version = __version__, author = "Ben Kurtovic", From 4cc4791d4871b833454ade8d9f52ee35e8bca742 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Sep 2012 01:29:27 -0400 Subject: [PATCH 02/47] Adding a bunch more, and implementing Tokenizer_push_textbuffer. --- mwparserfromhell/parser/tokenizer.c | 116 ++++++++++++++++++++++++++++++------ setup.py | 2 +- 2 files changed, 98 insertions(+), 20 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 3fdc370..aec7b1d 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -28,10 +28,15 @@ SOFTWARE. #include #include "structmember.h" -static const Py_UNICODE* OUT_OF_BOUNDS = ""; -static const Py_UNICODE* MARKERS[] = {"{", "}", "[", "]", "<", ">", "|", "=", - "&", "#", "*", ";", ":", "/", "-", "!", - "\n", OUT_OF_BOUNDS}; +#define PU (Py_UNICODE*) +static const Py_UNICODE* OUT_OF_BOUNDS = PU""; +static const Py_UNICODE* MARKERS[] = {PU"{", PU"}", PU"[", PU"]", PU"<", PU">", + PU"|", PU"=", PU"&", PU"#", PU"*", PU";", + PU":", PU"/", PU"-", PU"!", PU"\n", PU""}; +#undef PU + +static PyObject* contexts; +static PyObject* tokens; static PyMethodDef module_methods[] = { @@ -60,7 +65,7 @@ Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds) Py_INCREF(Py_None); self->stacks = PyList_New(0); - if (self->stacks == NULL) { + if (!self->stacks) { Py_DECREF(self); return NULL; } @@ -91,9 +96,9 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) return 0; } -#define Tokenizer_STACK(self) PyList_GET_ITEM(self->topstack, 0) -#define Tokenizer_CONTEXT(self) PyList_GET_ITEM(self->topstack, 1) -#define Tokenizer_TEXTBUFFER(self) PyList_GET_ITEM(self->topstack, 2) +#define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0) +#define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1) +#define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2) static int Tokenizer_set_context(Tokenizer* self, Py_ssize_t value) @@ -136,19 +141,65 @@ Tokenizer_push(Tokenizer* self, int context) static int Tokenizer_push_textbuffer(Tokenizer* self) { - if (PyList_GET_SIZE(Tokenizer_TEXTBUFFER(self)) > 0) { + if (PySequence_Fast_GET_SIZE(Tokenizer_TEXTBUFFER(self)) > 0) { + PyObject* sep = PyUnicode_FromString(""); + if (!sep) return -1; + PyObject* text = PyUnicode_Join(sep, Tokenizer_TEXTBUFFER(self)); + Py_DECREF(sep); + if (!text) return -1; + + PyObject* klass = PyObject_GetAttrString(tokens, "Text"); + if (!klass) return -1; + PyObject* args = PyTuple_New(0); + if (!args) return -1; + PyObject* kwargs = PyDict_New(); + if (!kwargs) return -1; + PyDict_SetItemString(kwargs, "text", text); + Py_DECREF(text); + + PyObject* token = 
PyInstance_New(klass, args, kwargs); + if (!token) { + Py_DECREF(klass); + Py_DECREF(args); + Py_DECREF(kwargs); + return -1; + } - PyObject* text; - // tokens.Text(text="".join(self._textbuffer)) + Py_DECREF(klass); + Py_DECREF(args); + Py_DECREF(kwargs); - if (PyList_Append(Tokenizer_STACK(self), text) + if (PyList_Append(Tokenizer_STACK(self), token)) { + Py_XDECREF(token); return -1; + } + + Py_XDECREF(token); if (Tokenizer_set_textbuffer(self, PyList_New(0))) return -1; + } + return 0; +} - return 0; +static int +Tokenizer_delete_top_of_stack(Tokenizer* self) +{ + if (PySequence_DelItem(self->stacks, -1)) + return -1; + Py_DECREF(self->topstack); + + Py_ssize_t size = PySequence_Fast_GET_SIZE(self->stacks); + if (size > 0) { + PyObject* top = PySequence_Fast_GET_ITEM(self->stacks, size - 1); + self->topstack = top; + Py_INCREF(top); + } + else { + self->topstack = NULL; } + + return 0; } /* @@ -160,7 +211,13 @@ Tokenizer_pop(Tokenizer* self) if (Tokenizer_push_textbuffer(self)) return NULL; - self->stacks // POP!? + PyObject* stack = Tokenizer_STACK(self); + Py_INCREF(stack); + + if (Tokenizer_delete_top_of_stack(self)) + return NULL; + + return stack; } /* @@ -172,6 +229,19 @@ Tokenizer_pop_keeping_context(Tokenizer* self) { if (Tokenizer_push_textbuffer(self)) return NULL; + + PyObject* stack = Tokenizer_STACK(self); + PyObject* context = Tokenizer_CONTEXT(self); + Py_INCREF(stack); + Py_INCREF(context); + + if (Tokenizer_delete_top_of_stack(self)) + return NULL; + + if (PyList_SetItem(self->topstack, 1, context)) + return NULL; + + return stack; } /* @@ -183,7 +253,7 @@ Tokenizer_read(Tokenizer* self, Py_ssize_t delta) Py_ssize_t index = self->head + delta; if (index >= self->length) { - return OUT_OF_BOUNDS; + return (Py_UNICODE*) OUT_OF_BOUNDS; } PyObject* item = PySequence_Fast_GET_ITEM(self->text, index); @@ -202,11 +272,11 @@ Tokenizer_parse(Tokenizer* self, int context) while (1) { this = Tokenizer_read(self, 0); - if (this not in MARKERS) { + /* if (this not in MARKERS) { WRITE TEXT - } + } */ if (this == OUT_OF_BOUNDS) { - return Tokenizer_push(self); + return Tokenizer_pop(self); } printf("%p %i %c\n", this, *this, *this); self->head++; @@ -233,7 +303,7 @@ Tokenizer_tokenize(Tokenizer* self, PyObject *args) PyObject* temp; temp = PyUnicode_FromStringAndSize(encoded, size); - if (text == NULL) + if (!text) return NULL; Py_XDECREF(self->text); @@ -319,4 +389,12 @@ init_tokenizer(void) Py_INCREF(&TokenizerType); PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType); + + PyObject* globals = PyEval_GetGlobals(); + PyObject* locals = PyEval_GetLocals(); + PyObject* fromlist = PyList_New(0); + + contexts = PyImport_ImportModuleLevel("contexts", globals, locals, fromlist, 1); + tokens = PyImport_ImportModuleLevel("tokens", globals, locals, fromlist, 1); + Py_DECREF(fromlist); } diff --git a/setup.py b/setup.py index 3664626..e348ce5 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ builder = Extension("mwparserfromhell.parser._builder", sources = ["mwparserfromhell/parser/builder.c"]) tokenizer = Extension("mwparserfromhell.parser._tokenizer", - sources = ["mwparserfromhell/parser/tokenizer.c"]) + sources = ["mwparserfromhell/parser/tokenizer.c"]) setup( name = "mwparserfromhell", From 9c4aba13912c9d5b274a61a5f7c6d9945f72c0b6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Sep 2012 03:40:19 -0400 Subject: [PATCH 03/47] Adding a few more functions. 
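The diff below also wires in the BAD_ROUTE failure mechanism built on
setjmp()/longjmp(): a parsing routine records a jump point in exception_env,
and Tokenizer_fail_route() pops the half-built stack and jumps back, so the
caller's setjmp() returns BAD_ROUTE and it can reset and try another
interpretation of the text. A minimal, self-contained illustration of that C
pattern (fail_route, parse_route, and main are illustrative names here, not
code from the patch):

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf exception_env;
    static const int BAD_ROUTE = 1;

    static void fail_route(void)
    {
        /* discard any partial results, then unwind to the setjmp() caller */
        longjmp(exception_env, BAD_ROUTE);
    }

    static void parse_route(int ok)
    {
        if (!ok)
            fail_route();               /* never returns */
        printf("route succeeded\n");
    }

    int main(void)
    {
        if (setjmp(exception_env) == BAD_ROUTE)
            printf("route failed; falling back\n");  /* reached via longjmp() */
        else
            parse_route(0);
        return 0;
    }

In the real tokenizer each routine that can fail re-arms the jump target
itself; patch 08's Tokenizer_parse_template_or_argument() shows the pattern
with nested setjmp() calls.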
--- mwparserfromhell/parser/tokenizer.c | 114 ++++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 13 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index aec7b1d..99c9bfc 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -26,15 +26,20 @@ SOFTWARE. #endif #include +#include "setjmp.h" #include "structmember.h" +static PyObject* EMPTY; + #define PU (Py_UNICODE*) -static const Py_UNICODE* OUT_OF_BOUNDS = PU""; static const Py_UNICODE* MARKERS[] = {PU"{", PU"}", PU"[", PU"]", PU"<", PU">", PU"|", PU"=", PU"&", PU"#", PU"*", PU";", PU":", PU"/", PU"-", PU"!", PU"\n", PU""}; #undef PU +static jmp_buf exception_env; +static const int BAD_ROUTE = 1; + static PyObject* contexts; static PyObject* tokens; @@ -142,10 +147,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self) { if (PySequence_Fast_GET_SIZE(Tokenizer_TEXTBUFFER(self)) > 0) { - PyObject* sep = PyUnicode_FromString(""); - if (!sep) return -1; - PyObject* text = PyUnicode_Join(sep, Tokenizer_TEXTBUFFER(self)); - Py_DECREF(sep); + PyObject* text = PyUnicode_Join(EMPTY, Tokenizer_TEXTBUFFER(self)); if (!text) return -1; PyObject* klass = PyObject_GetAttrString(tokens, "Text"); @@ -174,7 +176,7 @@ Tokenizer_push_textbuffer(Tokenizer* self) return -1; } - Py_XDECREF(token); + Py_DECREF(token); if (Tokenizer_set_textbuffer(self, PyList_New(0))) return -1; @@ -245,19 +247,104 @@ Tokenizer_pop_keeping_context(Tokenizer* self) } /* + Fail the current tokenization route. + + Discards the current stack/context/textbuffer and "raises a BAD_ROUTE + exception", which is implemented using longjmp(). +*/ +static void +Tokenizer_fail_route(Tokenizer* self) +{ + Tokenizer_pop(self); + longjmp(exception_env, BAD_ROUTE); +} + +/* + Write a token to the end of the current token stack. +*/ +static int +Tokenizer_write(Tokenizer* self, PyObject* token) +{ + if (Tokenizer_push_textbuffer(self)) + return -1; + + if (PyList_Append(Tokenizer_STACK(self), token)) { + Py_XDECREF(token); + return -1; + } + + Py_XDECREF(token); + return 0; +} + +/* + Write a token to the beginning of the current token stack. +*/ +static int +Tokenizer_write_first(Tokenizer* self, PyObject* token) +{ + if (Tokenizer_push_textbuffer(self)) + return -1; + + if (PyList_Insert(Tokenizer_STACK(self), 0, token)) { + Py_XDECREF(token); + return -1; + } + + Py_XDECREF(token); + return 0; +} + +/* + Write text to the current textbuffer. +*/ +static int +Tokenizer_write_text(Tokenizer* self, PyObject* text) +{ + if (PyList_Append(Tokenizer_TEXTBUFFER(self), text)) { + Py_XDECREF(text); + return -1; + } + + Py_XDECREF(text); + return 0; +} + +/* + Write a series of tokens to the current stack at once. +*/ +static int +Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist) +{ + if (Tokenizer_push_textbuffer(self)) + Py_XDECREF(tokenlist); + return -1; + + PyObject* stack = Tokenizer_STACK(self); + Py_ssize_t size = PySequence_Fast_GET_SIZE(stack); + + if (PyList_SetSlice(stack, size, size, tokenlist)) { + Py_XDECREF(tokenlist); + return -1; + } + + Py_XDECREF(tokenlist); + return 0; +} + +/* Read the value at a relative point in the wikicode. 
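 (After this patch the function hands back a PyObject* -- a one-character
  unicode string from self->text -- rather than a raw Py_UNICODE*, and the
  module-level EMPTY string doubles as the out-of-bounds sentinel, so callers
  compare by identity:

      PyObject* this = Tokenizer_read(self, 0);
      if (this == EMPTY)
          return Tokenizer_pop(self);

  as Tokenizer_parse() does further down in this same diff.)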
*/ -static Py_UNICODE* +static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta) { Py_ssize_t index = self->head + delta; if (index >= self->length) { - return (Py_UNICODE*) OUT_OF_BOUNDS; + return EMPTY; } - PyObject* item = PySequence_Fast_GET_ITEM(self->text, index); - return PyUnicode_AS_UNICODE(item); + return PySequence_Fast_GET_ITEM(self->text, index); } /* @@ -266,7 +353,7 @@ Tokenizer_read(Tokenizer* self, Py_ssize_t delta) static PyObject* Tokenizer_parse(Tokenizer* self, int context) { - Py_UNICODE* this; + PyObject* this; Tokenizer_push(self, context); @@ -275,10 +362,9 @@ Tokenizer_parse(Tokenizer* self, int context) /* if (this not in MARKERS) { WRITE TEXT } */ - if (this == OUT_OF_BOUNDS) { + if (this == EMPTY) { return Tokenizer_pop(self); } - printf("%p %i %c\n", this, *this, *this); self->head++; } } @@ -390,6 +476,8 @@ init_tokenizer(void) Py_INCREF(&TokenizerType); PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType); + EMPTY = PyUnicode_FromString(""); + PyObject* globals = PyEval_GetGlobals(); PyObject* locals = PyEval_GetLocals(); PyObject* fromlist = PyList_New(0); From 5267c30cf60b9c03cdf908112f8bffc390a87ac1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Sep 2012 03:57:04 -0400 Subject: [PATCH 04/47] Fix refcount handling; implement Tokenizer_write_text_then_stack. --- mwparserfromhell/parser/tokenizer.c | 52 +++++++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 99c9bfc..3f7e84e 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -247,10 +247,9 @@ Tokenizer_pop_keeping_context(Tokenizer* self) } /* - Fail the current tokenization route. - - Discards the current stack/context/textbuffer and "raises a BAD_ROUTE - exception", which is implemented using longjmp(). + Fail the current tokenization route. Discards the current + stack/context/textbuffer and "raises a BAD_ROUTE exception", which is + implemented using longjmp(). */ static void Tokenizer_fail_route(Tokenizer* self) @@ -268,12 +267,9 @@ Tokenizer_write(Tokenizer* self, PyObject* token) if (Tokenizer_push_textbuffer(self)) return -1; - if (PyList_Append(Tokenizer_STACK(self), token)) { - Py_XDECREF(token); + if (PyList_Append(Tokenizer_STACK(self), token)) return -1; - } - Py_XDECREF(token); return 0; } @@ -286,12 +282,9 @@ Tokenizer_write_first(Tokenizer* self, PyObject* token) if (Tokenizer_push_textbuffer(self)) return -1; - if (PyList_Insert(Tokenizer_STACK(self), 0, token)) { - Py_XDECREF(token); + if (PyList_Insert(Tokenizer_STACK(self), 0, token)) return -1; - } - Py_XDECREF(token); return 0; } @@ -301,12 +294,9 @@ Tokenizer_write_first(Tokenizer* self, PyObject* token) static int Tokenizer_write_text(Tokenizer* self, PyObject* text) { - if (PyList_Append(Tokenizer_TEXTBUFFER(self), text)) { - Py_XDECREF(text); + if (PyList_Append(Tokenizer_TEXTBUFFER(self), text)) return -1; - } - Py_XDECREF(text); return 0; } @@ -317,18 +307,40 @@ static int Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist) { if (Tokenizer_push_textbuffer(self)) - Py_XDECREF(tokenlist); return -1; PyObject* stack = Tokenizer_STACK(self); Py_ssize_t size = PySequence_Fast_GET_SIZE(stack); - if (PyList_SetSlice(stack, size, size, tokenlist)) { - Py_XDECREF(tokenlist); + if (PyList_SetSlice(stack, size, size, tokenlist)) + return -1; + + return 0; +} + +/* + Pop the current stack, write text, and then write the stack. 
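 (Patch 08's Tokenizer_parse_template_or_argument() below uses this to bail
  out of a route that turned out to be plain text, writing a leftover brace
  back as text before restoring the popped tokens:

      PyObject* text = PyUnicode_FromString("{");
      if (Tokenizer_write_text_then_stack(self, text)) {
          Py_XDECREF(text);
          return -1;
      }
      Py_XDECREF(text);
 )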
+*/ +static int +Tokenizer_write_text_then_stack(Tokenizer* self, PyObject* text) +{ + PyObject* stack = Tokenizer_pop(self); + if (Tokenizer_write_text(self, text)) { + Py_XDECREF(stack); return -1; } - Py_XDECREF(tokenlist); + if (stack) { + if (PySequence_Fast_GET_SIZE(stack) > 0) { + if (Tokenizer_write_all(self, stack)) { + Py_DECREF(stack); + return -1; + } + } + Py_DECREF(stack); + } + + self->head--; return 0; } From 8729d20f078df40c50a70ee7cbd392b534173a88 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Sep 2012 17:40:46 -0400 Subject: [PATCH 05/47] Fill out Tokenizer_parse(); build a bunch of empty function definitions. --- mwparserfromhell/parser/tokenizer.c | 252 ++++++++++++++++++++++++++++++++++-- 1 file changed, 240 insertions(+), 12 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 3f7e84e..0d18473 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -26,8 +26,8 @@ SOFTWARE. #endif #include -#include "setjmp.h" -#include "structmember.h" +#include +#include static PyObject* EMPTY; @@ -35,7 +35,10 @@ static PyObject* EMPTY; static const Py_UNICODE* MARKERS[] = {PU"{", PU"}", PU"[", PU"]", PU"<", PU">", PU"|", PU"=", PU"&", PU"#", PU"*", PU";", PU":", PU"/", PU"-", PU"!", PU"\n", PU""}; -#undef PU +static const int NUM_MARKERS = 17; + +#define CONTEXT(name) PyInt_AsSsize_t((PyIntObject*) \ + PyObject_GetAttrString(contexts, name)) static jmp_buf exception_env; static const int BAD_ROUTE = 1; @@ -103,6 +106,7 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) #define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0) #define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1) +#define Tokenizer_CONTEXT_VAL(self) PyInt_AsSsize_t((PyIntObject*) Tokenizer_CONTEXT(self)) #define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2) static int @@ -125,11 +129,11 @@ Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value) Add a new token stack, context, and textbuffer to the list. */ static int -Tokenizer_push(Tokenizer* self, int context) +Tokenizer_push(Tokenizer* self, Py_ssize_t context) { PyObject* top = PyList_New(3); PyList_SET_ITEM(top, 0, PyList_New(0)); - PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(0)); + PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(context)); PyList_SET_ITEM(top, 2, PyList_New(0)); Py_XDECREF(self->topstack); @@ -345,7 +349,7 @@ Tokenizer_write_text_then_stack(Tokenizer* self, PyObject* text) } /* - Read the value at a relative point in the wikicode. + Read the value at a relative point in the wikicode, forwards. */ static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta) @@ -360,23 +364,247 @@ Tokenizer_read(Tokenizer* self, Py_ssize_t delta) } /* - Parse the wikicode string, using *context* for when to stop. + Read the value at a relative point in the wikicode, backwards. 
*/ static PyObject* -Tokenizer_parse(Tokenizer* self, int context) +Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) +{ + if (delta > self->head) { + return EMPTY; + } + + Py_ssize_t index = self->head - delta; + return PySequence_Fast_GET_ITEM(self->text, index); +} + +static int +Tokenizer_parse_template_or_argument(Tokenizer* self) +{ + +} + +static int +Tokenizer_parse_template(Tokenizer* self) +{ + +} + +static int +Tokenizer_parse_argument(Tokenizer* self) +{ + +} + +static int +Tokenizer_verify_safe(Tokenizer* self) +{ + +} + +static int +Tokenizer_handle_template_param(Tokenizer* self) +{ + +} + +static int +Tokenizer_handle_template_param_value(Tokenizer* self) { - PyObject* this; + +} + +static PyObject* +Tokenizer_handle_template_end(Tokenizer* self) +{ + +} + +static int +Tokenizer_handle_argument_separator(Tokenizer* self) +{ + +} + +static PyObject* +Tokenizer_handle_argument_end(Tokenizer* self) +{ + +} + +static int +Tokenizer_parse_wikilink(Tokenizer* self) +{ + +} + +static int +Tokenizer_handle_wikilink_separator(Tokenizer* self) +{ + +} + +static PyObject* +Tokenizer_handle_wikilink_end(Tokenizer* self) +{ + +} + +static int +Tokenizer_parse_heading(Tokenizer* self) +{ + +} + +static PyObject* +Tokenizer_handle_heading_end(Tokenizer* self) +{ + +} + +static int +Tokenizer_really_parse_entity(Tokenizer* self) +{ + +} + +static int +Tokenizer_parse_entity(Tokenizer* self) +{ + +} + +static int +Tokenizer_parse_comment(Tokenizer* self) +{ + +} + + +/* + Parse the wikicode string, using context for when to stop. +*/ +static PyObject* +Tokenizer_parse(Tokenizer* self, Py_ssize_t context) +{ + Py_ssize_t fail_contexts = ( + CONTEXT("TEMPLATE") | CONTEXT("ARGUMENT") | CONTEXT("HEADING") | + CONTEXT("COMMENT")); + + PyObject *this, *next; + Py_UNICODE *this_data, *next_data, *next_next_data, *last_data; + Py_ssize_t this_context; + int is_marker, i; Tokenizer_push(self, context); while (1) { this = Tokenizer_read(self, 0); - /* if (this not in MARKERS) { - WRITE TEXT - } */ + this_data = PyUnicode_AS_UNICODE(this); + + is_marker = 0; + for (i = 0; i < NUM_MARKERS; i++) { + if (MARKERS[i] == this_data) { + is_marker = 1; + break; + } + } + + if (!is_marker) { + Tokenizer_write_text(self, this); + self->head++; + continue; + } + + this_context = Tokenizer_CONTEXT_VAL(self); + if (this == EMPTY) { + if (this_context & fail_contexts) { + Tokenizer_fail_route(self); + } return Tokenizer_pop(self); } + + next = Tokenizer_read(self, 1); + next_data = PyUnicode_AS_UNICODE(next); + + if (this_context & CONTEXT("COMMENT")) { + if (this_data == next_data && next_data == PU "-") { + if (PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == PU ">") { + return Tokenizer_pop(self); + } + } + Tokenizer_write_text(self, this); + } + else if (this_data == next_data && next_data == PU "{") { + Tokenizer_parse_template_or_argument(self); + } + else if (this_data == PU "|" && this_context & CONTEXT("TEMPLATE")) { + Tokenizer_handle_template_param(self); + } + else if (this_data == PU "=" && this_context & CONTEXT("TEMPLATE_PARAM_KEY")) { + Tokenizer_handle_template_param_value(self); + } + else if (this_data == next_data && next_data == PU "}" && + this_context & CONTEXT("TEMPLATE")) { + Tokenizer_handle_template_end(self); + } + else if (this_data == PU "|" && this_context & CONTEXT("ARGUMENT_NAME")) { + Tokenizer_handle_argument_separator(self); + } + else if (this_data == next_data && next_data == PU "}" && + this_context & CONTEXT("ARGUMENT")) { + if 
(PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == PU "}") { + return Tokenizer_handle_argument_end(self); + } + Tokenizer_write_text(self, this); + } + else if (this_data == next_data && next_data == PU "[") { + if (!(this_context & CONTEXT("WIKILINK_TITLE"))) { + Tokenizer_parse_wikilink(self); + } + else { + Tokenizer_write_text(self, this); + } + } + else if (this_data == PU "|" && this_context & CONTEXT("WIKILINK_TITLE")) { + Tokenizer_handle_wikilink_separator(self); + } + else if (this_data == next_data && next_data == PU "]" && + this_context & CONTEXT("WIKILINK")) { + return Tokenizer_handle_wikilink_end(self); + } + else if (this_data == PU "=" && !(self->global & CONTEXT("GL_HEADING"))) { + last_data = PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, 1)); + if (last_data == PU "\n" || last_data == PU "") { + Tokenizer_parse_heading(self); + } + else { + Tokenizer_write_text(self, this); + } + } + else if (this_data == PU "=" && this_context & CONTEXT("HEADING")) { + return Tokenizer_handle_heading_end(self); + } + else if (this_data == PU "\n" && this_context & CONTEXT("HEADING")) { + Tokenizer_fail_route(self); + } + else if (this_data == PU "&") { + Tokenizer_parse_entity(self); + } + else if (this_data == PU "<" && next_data == PU "!") { + next_next_data = PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)); + if (next_next_data == PyUnicode_AS_UNICODE(Tokenizer_read(self, 3)) && + next_next_data == PU "-") { + Tokenizer_parse_comment(self); + } + else { + Tokenizer_write_text(self, this); + } + } + else { + Tokenizer_write_text(self, this); + } + self->head++; } } From 1ecb0e0d4485e71f9d49555d114df56ac9f0acff Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Sep 2012 17:48:57 -0400 Subject: [PATCH 06/47] Fix Tokenizer_verify_safe()'s prototype; add documentation. --- mwparserfromhell/parser/tokenizer.c | 57 +++++++++++++++++++++++++++++++++++-- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 0d18473..ad013cb 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -377,109 +377,162 @@ Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) return PySequence_Fast_GET_ITEM(self->text, index); } +/* + Parse a template or argument at the head of the wikicode string. +*/ static int Tokenizer_parse_template_or_argument(Tokenizer* self) { } +/* + Parse a template at the head of the wikicode string. +*/ static int Tokenizer_parse_template(Tokenizer* self) { } +/* + Parse an argument at the head of the wikicode string. +*/ static int Tokenizer_parse_argument(Tokenizer* self) { } +/* + Verify that there are no unsafe characters in the current stack. The route + will be failed if the name contains any element of unsafes in it (not + merely at the beginning or end). This is used when parsing a template name + or parameter key, which cannot contain newlines. +*/ static int -Tokenizer_verify_safe(Tokenizer* self) +Tokenizer_verify_safe(Tokenizer* self, Py_UNICODE* unsafes[]) { } +/* + Handle a template parameter at the head of the string. +*/ static int Tokenizer_handle_template_param(Tokenizer* self) { } +/* + Handle a template parameter's value at the head of the string. +*/ static int Tokenizer_handle_template_param_value(Tokenizer* self) { } +/* + Handle the end of a template at the head of the string. +*/ static PyObject* Tokenizer_handle_template_end(Tokenizer* self) { } +/* + Handle the separator between an argument's name and default. 
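 ("Argument" here means a MediaWiki triple-brace construct such as
  {{{foo|default text}}}; the separator being handled is the "|" between the
  argument's name and its default value, matching the LC_ARGUMENT_NAME and
  LC_ARGUMENT_DEFAULT contexts introduced in patch 07.)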
+*/ static int Tokenizer_handle_argument_separator(Tokenizer* self) { } +/* + Handle the end of an argument at the head of the string. +*/ static PyObject* Tokenizer_handle_argument_end(Tokenizer* self) { } +/* + Parse an internal wikilink at the head of the wikicode string. +*/ static int Tokenizer_parse_wikilink(Tokenizer* self) { } +/* + Handle the separator between a wikilink's title and its text. +*/ static int Tokenizer_handle_wikilink_separator(Tokenizer* self) { } +/* + Handle the end of a wikilink at the head of the string. +*/ static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) { } +/* + Parse a section heading at the head of the wikicode string. +*/ static int Tokenizer_parse_heading(Tokenizer* self) { } +/* + Handle the end of a section heading at the head of the string. +*/ static PyObject* Tokenizer_handle_heading_end(Tokenizer* self) { } +/* + Actually parse an HTML entity and ensure that it is valid. +*/ static int Tokenizer_really_parse_entity(Tokenizer* self) { } +/* + Parse an HTML entity at the head of the wikicode string. +*/ static int Tokenizer_parse_entity(Tokenizer* self) { } +/* + Parse an HTML comment at the head of the wikicode string. +*/ static int Tokenizer_parse_comment(Tokenizer* self) { } - /* Parse the wikicode string, using context for when to stop. */ From 7fc45783b78772b5b689f3b724481997e23cd4ca Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Sep 2012 18:30:04 -0400 Subject: [PATCH 07/47] Add a header file; improve context handling. --- mwparserfromhell/parser/tokenizer.c | 131 +++--------------------- mwparserfromhell/parser/tokenizer.h | 199 ++++++++++++++++++++++++++++++++++++ 2 files changed, 213 insertions(+), 117 deletions(-) create mode 100644 mwparserfromhell/parser/tokenizer.h diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index ad013cb..41713e2 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -21,45 +21,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifndef PY_SSIZE_T_CLEAN -#define PY_SSIZE_T_CLEAN -#endif - -#include -#include -#include - -static PyObject* EMPTY; - -#define PU (Py_UNICODE*) -static const Py_UNICODE* MARKERS[] = {PU"{", PU"}", PU"[", PU"]", PU"<", PU">", - PU"|", PU"=", PU"&", PU"#", PU"*", PU";", - PU":", PU"/", PU"-", PU"!", PU"\n", PU""}; -static const int NUM_MARKERS = 17; - -#define CONTEXT(name) PyInt_AsSsize_t((PyIntObject*) \ - PyObject_GetAttrString(contexts, name)) - -static jmp_buf exception_env; -static const int BAD_ROUTE = 1; - -static PyObject* contexts; -static PyObject* tokens; - -static PyMethodDef -module_methods[] = { - {NULL} -}; - -typedef struct { - PyObject_HEAD - PyObject* text; /* text to tokenize */ - PyObject* stacks; /* token stacks */ - PyObject* topstack; /* topmost stack */ - Py_ssize_t head; /* current position in text */ - Py_ssize_t length; /* length of text */ - Py_ssize_t global; /* global context */ -} Tokenizer; +#include "tokenizer.h" static PyObject* Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds) @@ -104,11 +66,6 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) return 0; } -#define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0) -#define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1) -#define Tokenizer_CONTEXT_VAL(self) PyInt_AsSsize_t((PyIntObject*) Tokenizer_CONTEXT(self)) -#define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2) - static int Tokenizer_set_context(Tokenizer* self, Py_ssize_t value) { @@ -539,9 +496,7 @@ Tokenizer_parse_comment(Tokenizer* self) static PyObject* Tokenizer_parse(Tokenizer* self, Py_ssize_t context) { - Py_ssize_t fail_contexts = ( - CONTEXT("TEMPLATE") | CONTEXT("ARGUMENT") | CONTEXT("HEADING") | - CONTEXT("COMMENT")); + Py_ssize_t fail_contexts = LC_TEMPLATE | LC_ARGUMENT | LC_HEADING | LC_COMMENT; PyObject *this, *next; Py_UNICODE *this_data, *next_data, *next_next_data, *last_data; @@ -580,7 +535,7 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context) next = Tokenizer_read(self, 1); next_data = PyUnicode_AS_UNICODE(next); - if (this_context & CONTEXT("COMMENT")) { + if (this_context & LC_COMMENT) { if (this_data == next_data && next_data == PU "-") { if (PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == PU ">") { return Tokenizer_pop(self); @@ -591,42 +546,40 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context) else if (this_data == next_data && next_data == PU "{") { Tokenizer_parse_template_or_argument(self); } - else if (this_data == PU "|" && this_context & CONTEXT("TEMPLATE")) { + else if (this_data == PU "|" && this_context & LC_TEMPLATE) { Tokenizer_handle_template_param(self); } - else if (this_data == PU "=" && this_context & CONTEXT("TEMPLATE_PARAM_KEY")) { + else if (this_data == PU "=" && this_context & LC_TEMPLATE_PARAM_KEY) { Tokenizer_handle_template_param_value(self); } - else if (this_data == next_data && next_data == PU "}" && - this_context & CONTEXT("TEMPLATE")) { + else if (this_data == next_data && next_data == PU "}" && this_context & LC_TEMPLATE) { Tokenizer_handle_template_end(self); } - else if (this_data == PU "|" && this_context & CONTEXT("ARGUMENT_NAME")) { + else if (this_data == PU "|" && this_context & LC_ARGUMENT_NAME) { Tokenizer_handle_argument_separator(self); } - else if (this_data == next_data && next_data == PU "}" && - this_context & CONTEXT("ARGUMENT")) { + else if (this_data == next_data && next_data == PU "}" && this_context & LC_ARGUMENT) { if (PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == 
PU "}") { return Tokenizer_handle_argument_end(self); } Tokenizer_write_text(self, this); } else if (this_data == next_data && next_data == PU "[") { - if (!(this_context & CONTEXT("WIKILINK_TITLE"))) { + if (!(this_context & LC_WIKILINK_TITLE)) { Tokenizer_parse_wikilink(self); } else { Tokenizer_write_text(self, this); } } - else if (this_data == PU "|" && this_context & CONTEXT("WIKILINK_TITLE")) { + else if (this_data == PU "|" && this_context & LC_WIKILINK_TITLE) { Tokenizer_handle_wikilink_separator(self); } else if (this_data == next_data && next_data == PU "]" && - this_context & CONTEXT("WIKILINK")) { + this_context & LC_WIKILINK) { return Tokenizer_handle_wikilink_end(self); } - else if (this_data == PU "=" && !(self->global & CONTEXT("GL_HEADING"))) { + else if (this_data == PU "=" && !(self->global & GL_HEADING)) { last_data = PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, 1)); if (last_data == PU "\n" || last_data == PU "") { Tokenizer_parse_heading(self); @@ -635,10 +588,10 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context) Tokenizer_write_text(self, this); } } - else if (this_data == PU "=" && this_context & CONTEXT("HEADING")) { + else if (this_data == PU "=" && this_context & LC_HEADING) { return Tokenizer_handle_heading_end(self); } - else if (this_data == PU "\n" && this_context & CONTEXT("HEADING")) { + else if (this_data == PU "\n" && this_context & LC_HEADING) { Tokenizer_fail_route(self); } else if (this_data == PU "&") { @@ -700,61 +653,6 @@ Tokenizer_tokenize(Tokenizer* self, PyObject *args) return Tokenizer_parse(self, 0); } -static PyMethodDef -Tokenizer_methods[] = { - {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS, - "Build a list of tokens from a string of wikicode and return it."}, - {NULL} -}; - -static PyMemberDef -Tokenizer_members[] = { - {NULL} -}; - -static PyTypeObject -TokenizerType = { - PyObject_HEAD_INIT(NULL) - 0, /* ob_size */ - "_tokenizer.CTokenizer", /* tp_name */ - sizeof(Tokenizer), /* tp_basicsize */ - 0, /* tp_itemsize */ - (destructor) Tokenizer_dealloc, /* tp_dealloc */ - 0, /* tp_print */ - 0, /* tp_getattr */ - 0, /* tp_setattr */ - 0, /* tp_compare */ - 0, /* tp_repr */ - 0, /* tp_as_number */ - 0, /* tp_as_sequence */ - 0, /* tp_as_mapping */ - 0, /* tp_hash */ - 0, /* tp_call */ - 0, /* tp_str */ - 0, /* tp_getattro */ - 0, /* tp_setattro */ - 0, /* tp_as_buffer */ - Py_TPFLAGS_DEFAULT, /* tp_flags */ - "Creates a list of tokens from a string of wikicode.", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - Tokenizer_methods, /* tp_methods */ - Tokenizer_members, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - (initproc) Tokenizer_init, /* tp_init */ - 0, /* tp_alloc */ - Tokenizer_new, /* tp_new */ -}; - PyMODINIT_FUNC init_tokenizer(void) { @@ -775,7 +673,6 @@ init_tokenizer(void) PyObject* locals = PyEval_GetLocals(); PyObject* fromlist = PyList_New(0); - contexts = PyImport_ImportModuleLevel("contexts", globals, locals, fromlist, 1); tokens = PyImport_ImportModuleLevel("tokens", globals, locals, fromlist, 1); Py_DECREF(fromlist); } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h new file mode 100644 index 0000000..c504dd8 --- /dev/null +++ b/mwparserfromhell/parser/tokenizer.h @@ -0,0 +1,199 @@ +/* +Tokenizer Header File for MWParserFromHell +Copyright (C) 2012 Ben 
Kurtovic + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + +#ifndef PY_SSIZE_T_CLEAN +#define PY_SSIZE_T_CLEAN +#endif + +#include +#include +#include + +#define PU (Py_UNICODE*) + +static const Py_UNICODE* MARKERS[] = { + PU "{", PU "}", PU "[", PU "]", PU "<", PU ">", PU "|", PU "=", PU "&", + PU "#", PU "*", PU ";", PU ":", PU "/", PU "-", PU "!", PU "\n", PU ""}; +static const int NUM_MARKERS = 17; + +static jmp_buf exception_env; +static const int BAD_ROUTE = 1; + +static PyObject* EMPTY; +static PyObject* tokens; + + +/* Local contexts: */ + +static const Py_ssize_t LC_TEMPLATE = 0x0007; +static const Py_ssize_t LC_TEMPLATE_NAME = 0x0001; +static const Py_ssize_t LC_TEMPLATE_PARAM_KEY = 0x0002; +static const Py_ssize_t LC_TEMPLATE_PARAM_VALUE = 0x0004; + +static const Py_ssize_t LC_ARGUMENT = 0x0018; +static const Py_ssize_t LC_ARGUMENT_NAME = 0x0008; +static const Py_ssize_t LC_ARGUMENT_DEFAULT = 0x0010; + +static const Py_ssize_t LC_WIKILINK = 0x0060; +static const Py_ssize_t LC_WIKILINK_TITLE = 0x0020; +static const Py_ssize_t LC_WIKILINK_TEXT = 0x0040; + +static const Py_ssize_t LC_HEADING = 0x1f80; +static const Py_ssize_t LC_HEADING_LEVEL_1 = 0x0080; +static const Py_ssize_t LC_HEADING_LEVEL_2 = 0x0100; +static const Py_ssize_t LC_HEADING_LEVEL_3 = 0x0200; +static const Py_ssize_t LC_HEADING_LEVEL_4 = 0x0400; +static const Py_ssize_t LC_HEADING_LEVEL_5 = 0x0800; +static const Py_ssize_t LC_HEADING_LEVEL_6 = 0x1000; + +static const Py_ssize_t LC_COMMENT = 0x2000; + + +/* Global contexts: */ + +static const Py_ssize_t GL_HEADING = 0x1; + + +/* Tokenizer object definition: */ + +typedef struct { + PyObject_HEAD + PyObject* text; /* text to tokenize */ + PyObject* stacks; /* token stacks */ + PyObject* topstack; /* topmost stack */ + Py_ssize_t head; /* current position in text */ + Py_ssize_t length; /* length of text */ + Py_ssize_t global; /* global context */ +} Tokenizer; + + +/* Some macros for accessing Tokenizer data: */ + +#define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0) +#define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1) +#define Tokenizer_CONTEXT_VAL(self) PyInt_AsSsize_t(Tokenizer_CONTEXT(self)) +#define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2) + + +/* Tokenizer function prototypes: */ + +static PyObject* Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds); +static void Tokenizer_dealloc(Tokenizer* self); +static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds); +static int 
Tokenizer_set_context(Tokenizer* self, Py_ssize_t value); +static int Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value); +static int Tokenizer_push(Tokenizer* self, Py_ssize_t context); +static int Tokenizer_push_textbuffer(Tokenizer* self); +static int Tokenizer_delete_top_of_stack(Tokenizer* self); +static PyObject* Tokenizer_pop(Tokenizer* self); +static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self); +static void Tokenizer_fail_route(Tokenizer* self); +static int Tokenizer_write(Tokenizer* self, PyObject* token); +static int Tokenizer_write_first(Tokenizer* self, PyObject* token); +static int Tokenizer_write_text(Tokenizer* self, PyObject* text); +static int Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist); +static int Tokenizer_write_text_then_stack(Tokenizer* self, PyObject* text); +static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta); +static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta); +static int Tokenizer_parse_template_or_argument(Tokenizer* self); +static int Tokenizer_parse_template(Tokenizer* self); +static int Tokenizer_parse_argument(Tokenizer* self); +static int Tokenizer_verify_safe(Tokenizer* self, Py_UNICODE* unsafes[]); +static int Tokenizer_handle_template_param(Tokenizer* self); +static int Tokenizer_handle_template_param_value(Tokenizer* self); +static PyObject* Tokenizer_handle_template_end(Tokenizer* self); +static int Tokenizer_handle_argument_separator(Tokenizer* self); +static PyObject* Tokenizer_handle_argument_end(Tokenizer* self); +static int Tokenizer_parse_wikilink(Tokenizer* self); +static int Tokenizer_handle_wikilink_separator(Tokenizer* self); +static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self); +static int Tokenizer_parse_heading(Tokenizer* self); +static PyObject* Tokenizer_handle_heading_end(Tokenizer* self); +static int Tokenizer_really_parse_entity(Tokenizer* self); +static int Tokenizer_parse_entity(Tokenizer* self); +static int Tokenizer_parse_comment(Tokenizer* self); +static PyObject* Tokenizer_parse(Tokenizer* self, Py_ssize_t context); +static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject *args); + + +/* More structs for creating the Tokenizer type: */ + +static PyMethodDef +Tokenizer_methods[] = { + {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS, + "Build a list of tokens from a string of wikicode and return it."}, + {NULL} +}; + +static PyMemberDef +Tokenizer_members[] = { + {NULL} +}; + +static PyMethodDef +module_methods[] = { + {NULL} +}; + +static PyTypeObject +TokenizerType = { + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ + "_tokenizer.CTokenizer", /* tp_name */ + sizeof(Tokenizer), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor) Tokenizer_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT, /* tp_flags */ + "Creates a list of tokens from a string of wikicode.", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Tokenizer_methods, /* tp_methods */ + Tokenizer_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc) 
Tokenizer_init, /* tp_init */ + 0, /* tp_alloc */ + Tokenizer_new, /* tp_new */ +}; From 6edc24037eff257e82cfe3d86d3d2b253d2b5fa5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Sep 2012 19:14:23 -0400 Subject: [PATCH 08/47] Implement Tokenizer_parse_template_or_argument(). --- mwparserfromhell/parser/tokenizer.c | 118 ++++++++++++++++++++++++++++------- mwparserfromhell/parser/tokenizer.h | 3 +- mwparserfromhell/parser/tokenizer.py | 2 +- 3 files changed, 98 insertions(+), 25 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 41713e2..4877773 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -313,9 +313,8 @@ Tokenizer_read(Tokenizer* self, Py_ssize_t delta) { Py_ssize_t index = self->head + delta; - if (index >= self->length) { + if (index >= self->length) return EMPTY; - } return PySequence_Fast_GET_ITEM(self->text, index); } @@ -326,9 +325,8 @@ Tokenizer_read(Tokenizer* self, Py_ssize_t delta) static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) { - if (delta > self->head) { + if (delta > self->head) return EMPTY; - } Py_ssize_t index = self->head - delta; return PySequence_Fast_GET_ITEM(self->text, index); @@ -340,7 +338,84 @@ Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) static int Tokenizer_parse_template_or_argument(Tokenizer* self) { + self->head += 2; + unsigned int braces = 2, i; + + while (Tokenizer_READ(self, 0) == PU "{") { + self->head++; + braces++; + } + Tokenizer_push(self, 0); + + while (braces) { + if (braces == 1) { + PyObject* text = PyUnicode_FromString("{"); + + if (Tokenizer_write_text_then_stack(self, text)) { + Py_XDECREF(text); + return -1; + } + + Py_XDECREF(text); + return 0; + } + + if (braces == 2) { + if (setjmp(exception_env) == BAD_ROUTE) { + PyObject* text = PyUnicode_FromString("{{"); + + if (Tokenizer_write_text_then_stack(self, text)) { + Py_XDECREF(text); + return -1; + } + + Py_XDECREF(text); + return 0; + } else { + Tokenizer_parse_template(self); + } + break; + } + + if (setjmp(exception_env) == BAD_ROUTE) { + if (setjmp(exception_env) == BAD_ROUTE) { + char bracestr[braces]; + for (i = 0; i < braces; i++) { + bracestr[i] = *"{"; + } + PyObject* text = PyUnicode_FromString(bracestr); + + if (Tokenizer_write_text_then_stack(self, text)) { + Py_XDECREF(text); + return -1; + } + + Py_XDECREF(text); + return 0; + } + else { + Tokenizer_parse_template(self); + braces -= 2; + } + } + else { + Tokenizer_parse_argument(self); + braces -= 3; + } + + if (braces) { + self->head++; + } + } + PyObject* tokenlist = Tokenizer_pop(self); + if (Tokenizer_write_all(self, tokenlist)) { + Py_DECREF(tokenlist); + return -1; + } + + Py_DECREF(tokenlist); + return 0; } /* @@ -498,8 +573,8 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context) { Py_ssize_t fail_contexts = LC_TEMPLATE | LC_ARGUMENT | LC_HEADING | LC_COMMENT; - PyObject *this, *next; - Py_UNICODE *this_data, *next_data, *next_next_data, *last_data; + PyObject *this; + Py_UNICODE *this_data, *next, *next_next, *last; Py_ssize_t this_context; int is_marker, i; @@ -532,18 +607,17 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context) return Tokenizer_pop(self); } - next = Tokenizer_read(self, 1); - next_data = PyUnicode_AS_UNICODE(next); + next = Tokenizer_READ(self, 1); if (this_context & LC_COMMENT) { - if (this_data == next_data && next_data == PU "-") { - if (PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == PU ">") { + if (this_data == next && next == PU "-") 
{ + if (Tokenizer_READ(self, 2) == PU ">") { return Tokenizer_pop(self); } } Tokenizer_write_text(self, this); } - else if (this_data == next_data && next_data == PU "{") { + else if (this_data == next && next == PU "{") { Tokenizer_parse_template_or_argument(self); } else if (this_data == PU "|" && this_context & LC_TEMPLATE) { @@ -552,19 +626,19 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context) else if (this_data == PU "=" && this_context & LC_TEMPLATE_PARAM_KEY) { Tokenizer_handle_template_param_value(self); } - else if (this_data == next_data && next_data == PU "}" && this_context & LC_TEMPLATE) { + else if (this_data == next && next == PU "}" && this_context & LC_TEMPLATE) { Tokenizer_handle_template_end(self); } else if (this_data == PU "|" && this_context & LC_ARGUMENT_NAME) { Tokenizer_handle_argument_separator(self); } - else if (this_data == next_data && next_data == PU "}" && this_context & LC_ARGUMENT) { - if (PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == PU "}") { + else if (this_data == next && next == PU "}" && this_context & LC_ARGUMENT) { + if (Tokenizer_READ(self, 2) == PU "}") { return Tokenizer_handle_argument_end(self); } Tokenizer_write_text(self, this); } - else if (this_data == next_data && next_data == PU "[") { + else if (this_data == next && next == PU "[") { if (!(this_context & LC_WIKILINK_TITLE)) { Tokenizer_parse_wikilink(self); } @@ -575,13 +649,12 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context) else if (this_data == PU "|" && this_context & LC_WIKILINK_TITLE) { Tokenizer_handle_wikilink_separator(self); } - else if (this_data == next_data && next_data == PU "]" && - this_context & LC_WIKILINK) { + else if (this_data == next && next == PU "]" && this_context & LC_WIKILINK) { return Tokenizer_handle_wikilink_end(self); } else if (this_data == PU "=" && !(self->global & GL_HEADING)) { - last_data = PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, 1)); - if (last_data == PU "\n" || last_data == PU "") { + last = PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, 1)); + if (last == PU "\n" || last == PU "") { Tokenizer_parse_heading(self); } else { @@ -597,10 +670,9 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context) else if (this_data == PU "&") { Tokenizer_parse_entity(self); } - else if (this_data == PU "<" && next_data == PU "!") { - next_next_data = PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)); - if (next_next_data == PyUnicode_AS_UNICODE(Tokenizer_read(self, 3)) && - next_next_data == PU "-") { + else if (this_data == PU "<" && next == PU "!") { + next_next = Tokenizer_READ(self, 2); + if (next_next == Tokenizer_READ(self, 3) && next_next == PU "-") { Tokenizer_parse_comment(self); } else { diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index c504dd8..3f7dfdf 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -87,12 +87,13 @@ typedef struct { } Tokenizer; -/* Some macros for accessing Tokenizer data: */ +/* Macros for accessing Tokenizer data: */ #define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0) #define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1) #define Tokenizer_CONTEXT_VAL(self) PyInt_AsSsize_t(Tokenizer_CONTEXT(self)) #define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2) +#define Tokenizer_READ(self, num) PyUnicode_AS_UNICODE(Tokenizer_read(self, num)) /* Tokenizer function prototypes: */ diff --git a/mwparserfromhell/parser/tokenizer.py 
b/mwparserfromhell/parser/tokenizer.py index ca645b0..364455d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -162,8 +162,8 @@ class Tokenizer(object): self._head += 2 braces = 2 while self._read() == "{": - braces += 1 self._head += 1 + braces += 1 self._push() while braces: From 0d720a7ef13e7e377dd0d47c88d1e68c717e8b2c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Sep 2012 20:35:49 -0400 Subject: [PATCH 09/47] Implement Tokenizer_parse_template(); NOARGS and NOKWARGS. --- mwparserfromhell/parser/tokenizer.c | 50 +++++++++++++++++++++++++++---------- mwparserfromhell/parser/tokenizer.h | 2 ++ 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 4877773..d9b953b 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -111,29 +111,22 @@ Tokenizer_push_textbuffer(Tokenizer* self) PyObject* text = PyUnicode_Join(EMPTY, Tokenizer_TEXTBUFFER(self)); if (!text) return -1; - PyObject* klass = PyObject_GetAttrString(tokens, "Text"); - if (!klass) return -1; - PyObject* args = PyTuple_New(0); - if (!args) return -1; + PyObject* class = PyObject_GetAttrString(tokens, "Text"); + if (!class) return -1; PyObject* kwargs = PyDict_New(); if (!kwargs) return -1; PyDict_SetItemString(kwargs, "text", text); Py_DECREF(text); - PyObject* token = PyInstance_New(klass, args, kwargs); + PyObject* token = PyInstance_New(class, NOARGS, kwargs); + Py_DECREF(class); + Py_DECREF(kwargs); if (!token) { - Py_DECREF(klass); - Py_DECREF(args); - Py_DECREF(kwargs); return -1; } - Py_DECREF(klass); - Py_DECREF(args); - Py_DECREF(kwargs); - if (PyList_Append(Tokenizer_STACK(self), token)) { - Py_XDECREF(token); + Py_DECREF(token); return -1; } @@ -424,7 +417,36 @@ Tokenizer_parse_template_or_argument(Tokenizer* self) static int Tokenizer_parse_template(Tokenizer* self) { + Py_ssize_t reset = self->head; + if (setjmp(exception_env) == BAD_ROUTE) { + self->head = reset; + longjmp(exception_env, BAD_ROUTE); + } + else { + PyObject* template = Tokenizer_parse(self, LC_TEMPLATE_NAME); + if (!template) return -1; + + PyObject* class = PyObject_GetAttrString(tokens, "TemplateOpen"); + if (!class) return -1; + PyObject* token = PyInstance_New(class, NOARGS, NOKWARGS); + Py_DECREF(class); + if (!token) return -1; + Tokenizer_write_first(self, token); + Py_DECREF(token); + + Tokenizer_write_all(self, template); + Py_DECREF(template); + + class = PyObject_GetAttrString(tokens, "TemplateClose"); + if (!class) return -1; + token = PyInstance_New(class, NOARGS, NOKWARGS); + Py_DECREF(class); + if (!token) return -1; + + Tokenizer_write(self, token); + Py_DECREF(token); + } } /* @@ -740,6 +762,8 @@ init_tokenizer(void) PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType); EMPTY = PyUnicode_FromString(""); + NOARGS = PyTuple_New(0); + NOKWARGS = PyDict_New(); PyObject* globals = PyEval_GetGlobals(); PyObject* locals = PyEval_GetLocals(); diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 3f7dfdf..d6c97c8 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -40,6 +40,8 @@ static jmp_buf exception_env; static const int BAD_ROUTE = 1; static PyObject* EMPTY; +static PyObject* NOARGS; +static PyObject* NOKWARGS; static PyObject* tokens; From 849016f73488eb4eee51fb8c0b16f49231e2dc3b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Sep 2012 21:27:08 
-0400 Subject: [PATCH 10/47] Implement Tokenizer_verify_safe() and some others. --- mwparserfromhell/parser/tokenizer.c | 164 +++++++++++++++++++++++++++++++++--- mwparserfromhell/parser/tokenizer.h | 2 +- 2 files changed, 155 insertions(+), 11 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index d9b953b..3d3b95f 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -121,9 +121,7 @@ Tokenizer_push_textbuffer(Tokenizer* self) PyObject* token = PyInstance_New(class, NOARGS, kwargs); Py_DECREF(class); Py_DECREF(kwargs); - if (!token) { - return -1; - } + if (!token) return -1; if (PyList_Append(Tokenizer_STACK(self), token)) { Py_DECREF(token); @@ -417,25 +415,34 @@ Tokenizer_parse_template_or_argument(Tokenizer* self) static int Tokenizer_parse_template(Tokenizer* self) { + PyObject *template, *class, *token; Py_ssize_t reset = self->head; + if (setjmp(exception_env) == BAD_ROUTE) { self->head = reset; longjmp(exception_env, BAD_ROUTE); } + else { - PyObject* template = Tokenizer_parse(self, LC_TEMPLATE_NAME); + template = Tokenizer_parse(self, LC_TEMPLATE_NAME); if (!template) return -1; - PyObject* class = PyObject_GetAttrString(tokens, "TemplateOpen"); + class = PyObject_GetAttrString(tokens, "TemplateOpen"); if (!class) return -1; - PyObject* token = PyInstance_New(class, NOARGS, NOKWARGS); + token = PyInstance_New(class, NOARGS, NOKWARGS); Py_DECREF(class); if (!token) return -1; - Tokenizer_write_first(self, token); + if (Tokenizer_write_first(self, token)) { + Py_DECREF(token); + return -1; + } Py_DECREF(token); - Tokenizer_write_all(self, template); + if (Tokenizer_write_all(self, template)) { + Py_DECREF(template); + return -1; + } Py_DECREF(template); class = PyObject_GetAttrString(tokens, "TemplateClose"); @@ -444,9 +451,14 @@ Tokenizer_parse_template(Tokenizer* self) Py_DECREF(class); if (!token) return -1; - Tokenizer_write(self, token); + if (Tokenizer_write(self, token)) { + Py_DECREF(token); + return -1; + } Py_DECREF(token); } + + return 0; } /* @@ -455,7 +467,50 @@ Tokenizer_parse_template(Tokenizer* self) static int Tokenizer_parse_argument(Tokenizer* self) { + PyObject *argument, *class, *token; + Py_ssize_t reset = self->head; + + if (setjmp(exception_env) == BAD_ROUTE) { + self->head = reset; + longjmp(exception_env, BAD_ROUTE); + } + + else { + argument = Tokenizer_parse(self, LC_ARGUMENT_NAME); + if (!argument) return -1; + + class = PyObject_GetAttrString(tokens, "ArgumentOpen"); + if (!class) return -1; + token = PyInstance_New(class, NOARGS, NOKWARGS); + Py_DECREF(class); + if (!token) return -1; + + if (Tokenizer_write_first(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + + if (Tokenizer_write_all(self, argument)) { + Py_DECREF(argument); + return -1; + } + Py_DECREF(argument); + + class = PyObject_GetAttrString(tokens, "ArgumentClose"); + if (!class) return -1; + token = PyInstance_New(class, NOARGS, NOKWARGS); + Py_DECREF(class); + if (!token) return -1; + if (Tokenizer_write(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + } + + return 0; } /* @@ -465,9 +520,98 @@ Tokenizer_parse_argument(Tokenizer* self) or parameter key, which cannot contain newlines. 
*/ static int -Tokenizer_verify_safe(Tokenizer* self, Py_UNICODE* unsafes[]) +Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[]) { + if (Tokenizer_push_textbuffer(self)) + return -1; + PyObject* stack = Tokenizer_STACK(self); + if (stack) { + PyObject* textlist = PyList_New(0); + if (!textlist) return -1; + + PyObject* class = PyObject_GetAttrString(tokens, "Text"); + if (!class) { + Py_DECREF(textlist); + return -1; + } + + int i; + Py_ssize_t length = PySequence_Fast_GET_SIZE(stack); + PyObject *token, *textdata; + + for (i = 0; i < length; i++) { + token = PySequence_Fast_GET_ITEM(stack, i); + switch (PyObject_IsInstance(token, class)) { + case -1: + Py_DECREF(textlist); + Py_DECREF(class); + return -1; + case 0: + break; + case 1: + textdata = PyObject_GetAttrString(token, "text"); + if (!textdata) { + Py_DECREF(textlist); + Py_DECREF(class); + return -1; + } + if (PyList_Append(textlist, textdata)) { + Py_DECREF(textlist); + Py_DECREF(class); + Py_DECREF(textdata); + return -1; + } + Py_DECREF(textdata); + } + } + Py_DECREF(class); + + PyObject* text = PyUnicode_Join(EMPTY, textlist); + if (!text) { + Py_DECREF(textlist); + return -1; + } + Py_DECREF(textlist); + + PyObject* stripped = PyObject_CallMethod(text, "strip", NULL); + if (!stripped) { + Py_DECREF(text); + return -1; + } + Py_DECREF(text); + + const char* unsafe_char; + PyObject* unsafe; + i = 0; + while (1) { + unsafe_char = unsafes[i]; + if (!unsafe_char) break; + + unsafe = PyUnicode_FromString(unsafe_char); + + if (!unsafe) { + Py_DECREF(stripped); + return -1; + } + + switch (PyUnicode_Contains(stripped, unsafe)) { + case -1: + Py_DECREF(stripped); + Py_DECREF(unsafe); + return -1; + case 0: + break; + case 1: + Py_DECREF(stripped); + Py_DECREF(unsafe); + Tokenizer_fail_route(self); + } + i++; + } + } + + return 0; } /* diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index d6c97c8..951e238 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -121,7 +121,7 @@ static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta); static int Tokenizer_parse_template_or_argument(Tokenizer* self); static int Tokenizer_parse_template(Tokenizer* self); static int Tokenizer_parse_argument(Tokenizer* self); -static int Tokenizer_verify_safe(Tokenizer* self, Py_UNICODE* unsafes[]); +static int Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[]); static int Tokenizer_handle_template_param(Tokenizer* self); static int Tokenizer_handle_template_param_value(Tokenizer* self); static PyObject* Tokenizer_handle_template_end(Tokenizer* self); From 17af353fb652e01eb61584c0f5c6248edd17e9be Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 24 Sep 2012 21:18:44 -0400 Subject: [PATCH 11/47] Implement Tokenizer_handle_template_param(). 
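Hitting a pipe inside a template means leaving the name (or the previous value) context, starting a new parameter key, writing a TemplateParamSeparator token, and pushing a fresh stack for the key. Because the context is a plain bitfield, those transitions are simple XOR/OR flag operations. A minimal pure-Python sketch of the flag math (the constants and function name below are stand-ins for illustration, not the real contexts module):

    # Illustrative context-flag transitions; the constants are stand-ins for
    # the real LC_* / contexts.* values, not taken from the source.
    TEMPLATE_NAME        = 1 << 0
    TEMPLATE_PARAM_KEY   = 1 << 1
    TEMPLATE_PARAM_VALUE = 1 << 2

    def handle_template_param(context):
        """Return the new context after a '|' is read inside a template."""
        if context & TEMPLATE_NAME:
            context ^= TEMPLATE_NAME          # done with the template name
        elif context & TEMPLATE_PARAM_VALUE:
            context ^= TEMPLATE_PARAM_VALUE   # done with the previous value
        if not context & TEMPLATE_PARAM_KEY:
            context |= TEMPLATE_PARAM_KEY     # a new parameter key begins
        return context

    assert handle_template_param(TEMPLATE_NAME) == TEMPLATE_PARAM_KEY
    assert handle_template_param(TEMPLATE_PARAM_VALUE) == TEMPLATE_PARAM_KEY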
--- mwparserfromhell/parser/tokenizer.c | 62 +++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 3d3b95f..3ab2437 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -206,7 +206,8 @@ Tokenizer_pop_keeping_context(Tokenizer* self) static void Tokenizer_fail_route(Tokenizer* self) { - Tokenizer_pop(self); + PyObject* stack = Tokenizer_pop(self); + Py_XDECREF(stack); longjmp(exception_env, BAD_ROUTE); } @@ -400,6 +401,7 @@ Tokenizer_parse_template_or_argument(Tokenizer* self) } PyObject* tokenlist = Tokenizer_pop(self); + if (!tokenlist) return -1; if (Tokenizer_write_all(self, tokenlist)) { Py_DECREF(tokenlist); return -1; @@ -543,10 +545,6 @@ Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[]) for (i = 0; i < length; i++) { token = PySequence_Fast_GET_ITEM(stack, i); switch (PyObject_IsInstance(token, class)) { - case -1: - Py_DECREF(textlist); - Py_DECREF(class); - return -1; case 0: break; case 1: @@ -563,6 +561,11 @@ Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[]) return -1; } Py_DECREF(textdata); + break; + case -1: + Py_DECREF(textlist); + Py_DECREF(class); + return -1; } } Py_DECREF(class); @@ -596,16 +599,17 @@ Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[]) } switch (PyUnicode_Contains(stripped, unsafe)) { - case -1: - Py_DECREF(stripped); - Py_DECREF(unsafe); - return -1; case 0: break; case 1: Py_DECREF(stripped); Py_DECREF(unsafe); Tokenizer_fail_route(self); + break; + case -1: + Py_DECREF(stripped); + Py_DECREF(unsafe); + return -1; } i++; } @@ -620,7 +624,47 @@ Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[]) static int Tokenizer_handle_template_param(Tokenizer* self) { + Py_ssize_t context = Tokenizer_CONTEXT_VAL(self); + if (context & LC_TEMPLATE_NAME) { + if (Tokenizer_verify_safe(self, {"\n", "{", "}", "[", "]"})) + return -1; + if (Tokenizer_set_context(self, context ^ LC_TEMPLATE_NAME)) + return -1; + } + else if (context & LC_TEMPLATE_PARAM_VALUE) { + if (Tokenizer_set_context(self, context ^ LC_TEMPLATE_PARAM_VALUE)) + return -1; + } + + if (context & LC_TEMPLATE_PARAM_KEY) { + PyObject* stack = Tokenizer_pop_keeping_context(self); + if (!stack) return -1; + if (Tokenizer_write_all(stack)) { + Py_DECREF(stack); + return -1; + } + Py_DECREF(stack); + } + else { + if (Tokenizer_set_context(self, context | LC_TEMPLATE_PARAM_KEY)) + return -1; + } + + class = PyObject_GetAttrString(tokens, "TemplateParamSeparator"); + if (!class) return -1; + token = PyInstance_New(class, NOARGS, NOKWARGS); + Py_DECREF(class); + if (!token) return -1; + + if (Tokenizer_write(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + + Tokenizer_push(self, Tokenizer_CONTEXT_VAL(self)); + return 0; } /* From 41535992a1a3488724435f4482642c6aa40bca45 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 25 Sep 2012 17:09:27 -0400 Subject: [PATCH 12/47] Implement Tokenizer_handle_template_param_value(). 
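The equals sign only becomes a TemplateParamEquals token once the accumulated parameter key passes the safety check: the Text tokens on the current stack are joined, stripped, and the route fails if any unsafe substring appears. Roughly this, as a hedged pure-Python sketch (the name and the boolean return convention are assumptions, not the real _verify_safe() signature):

    # Rough pure-Python equivalent of the safety check run before the value
    # is accepted; illustrative only.
    def verify_safe(text_chunks, unsafes=("\n", "{{", "}}")):
        """True if the joined, stripped text contains none of the unsafe strings."""
        text = "".join(text_chunks).strip()
        return not any(unsafe in text for unsafe in unsafes)

    assert verify_safe(["foo", "bar"])
    assert not verify_safe(["foo", "{{bar"])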
--- mwparserfromhell/parser/tokenizer.c | 45 ++++++++++++++++++++++++++++++++---- mwparserfromhell/parser/tokenizer.py | 17 ++++++-------- 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 3ab2437..e7699fd 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -627,7 +627,8 @@ Tokenizer_handle_template_param(Tokenizer* self) Py_ssize_t context = Tokenizer_CONTEXT_VAL(self); if (context & LC_TEMPLATE_NAME) { - if (Tokenizer_verify_safe(self, {"\n", "{", "}", "[", "]"})) + const char* unsafes[] = {"\n", "{", "}", "[", "]"}; + if (Tokenizer_verify_safe(self, unsafes)) return -1; if (Tokenizer_set_context(self, context ^ LC_TEMPLATE_NAME)) return -1; @@ -640,7 +641,7 @@ Tokenizer_handle_template_param(Tokenizer* self) if (context & LC_TEMPLATE_PARAM_KEY) { PyObject* stack = Tokenizer_pop_keeping_context(self); if (!stack) return -1; - if (Tokenizer_write_all(stack)) { + if (Tokenizer_write_all(self, stack)) { Py_DECREF(stack); return -1; } @@ -651,9 +652,9 @@ Tokenizer_handle_template_param(Tokenizer* self) return -1; } - class = PyObject_GetAttrString(tokens, "TemplateParamSeparator"); + PyObject* class = PyObject_GetAttrString(tokens, "TemplateParamSeparator"); if (!class) return -1; - token = PyInstance_New(class, NOARGS, NOKWARGS); + PyObject* token = PyInstance_New(class, NOARGS, NOKWARGS); Py_DECREF(class); if (!token) return -1; @@ -673,7 +674,43 @@ Tokenizer_handle_template_param(Tokenizer* self) static int Tokenizer_handle_template_param_value(Tokenizer* self) { + if (setjmp(exception_env) == BAD_ROUTE) { + PyObject* stack = Tokenizer_pop(self); + Py_XDECREF(stack); + longjmp(exception_env, BAD_ROUTE); + } + + else { + const char* unsafes[] = {"\n", "{{", "}}"}; + if (Tokenizer_verify_safe(self, unsafes)) + return -1; + } + + PyObject* stack = Tokenizer_pop_keeping_context(self); + if (!stack) return -1; + if (Tokenizer_write_all(self, stack)) { + Py_DECREF(stack); + return -1; + } + Py_DECREF(stack); + Py_ssize_t context = Tokenizer_CONTEXT_VAL(self); + context ^= LC_TEMPLATE_PARAM_KEY; + context |= LC_TEMPLATE_PARAM_VALUE; + if (Tokenizer_set_context(self, context)) + return -1; + + PyObject* class = PyObject_GetAttrString(tokens, "TemplateParamEquals"); + if (!class) return -1; + PyObject* token = PyInstance_New(class, NOARGS, NOKWARGS); + Py_DECREF(class); + if (!token) return -1; + + if (Tokenizer_write(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); } /* diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 364455d..508344e 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -197,10 +197,9 @@ class Tokenizer(object): except BadRoute: self._head = reset raise - else: - self._write_first(tokens.TemplateOpen()) - self._write_all(template) - self._write(tokens.TemplateClose()) + self._write_first(tokens.TemplateOpen()) + self._write_all(template) + self._write(tokens.TemplateClose()) def _parse_argument(self): """Parse an argument at the head of the wikicode string.""" @@ -210,10 +209,9 @@ class Tokenizer(object): except BadRoute: self._head = reset raise - else: - self._write_first(tokens.ArgumentOpen()) - self._write_all(argument) - self._write(tokens.ArgumentClose()) + self._write_first(tokens.ArgumentOpen()) + self._write_all(argument) + self._write(tokens.ArgumentClose()) def _verify_safe(self, unsafes): """Verify that there are no unsafe 
characters in the current stack. @@ -249,8 +247,7 @@ class Tokenizer(object): except BadRoute: self._pop() raise - else: - self._write_all(self._pop(keep_context=True)) + self._write_all(self._pop(keep_context=True)) self._context ^= contexts.TEMPLATE_PARAM_KEY self._context |= contexts.TEMPLATE_PARAM_VALUE self._write(tokens.TemplateParamEquals()) From f401ede179b469118ac936a8646e5f5a3be128d4 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 25 Sep 2012 17:32:43 -0400 Subject: [PATCH 13/47] Implementing more stuff. --- mwparserfromhell/parser/tokenizer.c | 84 +++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index e7699fd..b895f6c 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -259,6 +259,39 @@ Tokenizer_write_text(Tokenizer* self, PyObject* text) static int Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist) { + if (PySequence_Fast_GET_SIZE(tokenlist) > 0) { + PyObject* token = PySequence_Fast_GET_ITEM(tokenlist, 0); + PyObject* class = PyObject_GetAttrString(tokens, "Text"); + if (!class) return -1; + + switch (PyObject_IsInstance(token, class)) { + case 0: + break; + case 1: + PyObject* text = PyObject_GetAttrString(token, "text"); + if (!text) { + Py_DECREF(class); + return -1; + } + if (PySequence_DelItem(tokenlist, 0)) { + Py_DECREF(text); + Py_DECREF(class); + return -1; + } + if (Tokenizer_write_text(self, text)) { + Py_DECREF(text); + Py_DECREF(class); + return -1; + } + Py_DECREF(text); + break + case -1: + Py_DECREF(class); + return -1; + } + Py_DECREF(class); + } + if (Tokenizer_push_textbuffer(self)) return -1; @@ -711,6 +744,7 @@ Tokenizer_handle_template_param_value(Tokenizer* self) return -1; } Py_DECREF(token); + return 0; } /* @@ -719,7 +753,27 @@ Tokenizer_handle_template_param_value(Tokenizer* self) static PyObject* Tokenizer_handle_template_end(Tokenizer* self) { + PyObject* stack; + Py_ssize_t context = Tokenizer_CONTEXT_VAL(self); + if (context & LC_TEMPLATE_NAME) { + const char* unsafes[] = {"\n", "{", "}", "[", "]"}; + if (Tokenizer_verify_safe(self, unsafes)) + return NULL; + } + else if (context & LC_TEMPLATE_PARAM_KEY) { + stack = Tokenizer_pop_keeping_context(self); + if (!stack) return NULL; + if (Tokenizer_write_all(self, stack)) { + Py_DECREF(stack); + return NULL; + } + Py_DECREF(stack); + } + + self->head++; + stack = Tokenizer_pop(self); + return stack; } /* @@ -728,7 +782,28 @@ Tokenizer_handle_template_end(Tokenizer* self) static int Tokenizer_handle_argument_separator(Tokenizer* self) { + const char* unsafes[] = {"\n", "{{", "}}"}; + if (Tokenizer_verify_safe(self, unsafes)) + return -1; + + Py_ssize_t context = Tokenizer_CONTEXT_VAL(self); + context ^= LC_ARGUMENT_NAME; + context |= LC_ARGUMENT_DEFAULT; + if (Tokenizer_set_context(self, context)) + return -1; + + PyObject* class = PyObject_GetAttrString(tokens, "ArgumentSeparator"); + if (!class) return -1; + PyObject* token = PyInstance_New(class, NOARGS, NOKWARGS); + Py_DECREF(class); + if (!token) return -1; + if (Tokenizer_write(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + return 0; } /* @@ -737,7 +812,16 @@ Tokenizer_handle_argument_separator(Tokenizer* self) static PyObject* Tokenizer_handle_argument_end(Tokenizer* self) { + Py_ssize_t context = Tokenizer_CONTEXT_VAL(self); + if (context & LC_ARGUMENT_NAME) { + const char* unsafes[] = {"\n", "{{", "}}"}; + if (Tokenizer_verify_safe(self, unsafes)) + 
return NULL; + } + self->head += 2; + PyObject* stack = Tokenizer_pop(self); + return stack; } /* From 707ecc383740165096d74c471e5f1b739f752f71 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 25 Sep 2012 17:51:23 -0400 Subject: [PATCH 14/47] Implement Tokenizer_parse_wikilink() and more. --- mwparserfromhell/parser/tokenizer.c | 83 ++++++++++++++++++++++++++++++++++--- 1 file changed, 77 insertions(+), 6 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index b895f6c..9068d94 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -264,11 +264,12 @@ Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist) PyObject* class = PyObject_GetAttrString(tokens, "Text"); if (!class) return -1; + PyObject* text; switch (PyObject_IsInstance(token, class)) { case 0: break; case 1: - PyObject* text = PyObject_GetAttrString(token, "text"); + text = PyObject_GetAttrString(token, "text"); if (!text) { Py_DECREF(class); return -1; @@ -284,7 +285,7 @@ Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist) return -1; } Py_DECREF(text); - break + break; case -1: Py_DECREF(class); return -1; @@ -463,13 +464,20 @@ Tokenizer_parse_template(Tokenizer* self) if (!template) return -1; class = PyObject_GetAttrString(tokens, "TemplateOpen"); - if (!class) return -1; + if (!class) { + Py_DECREF(template); + return -1; + } token = PyInstance_New(class, NOARGS, NOKWARGS); Py_DECREF(class); - if (!token) return -1; + if (!token) { + Py_DECREF(template); + return -1; + } if (Tokenizer_write_first(self, token)) { Py_DECREF(token); + Py_DECREF(template); return -1; } Py_DECREF(token); @@ -515,13 +523,20 @@ Tokenizer_parse_argument(Tokenizer* self) if (!argument) return -1; class = PyObject_GetAttrString(tokens, "ArgumentOpen"); - if (!class) return -1; + if (!class) { + Py_DECREF(argument); + return -1; + } token = PyInstance_New(class, NOARGS, NOKWARGS); Py_DECREF(class); - if (!token) return -1; + if (!token) { + Py_DECREF(argument); + return -1; + } if (Tokenizer_write_first(self, token)) { Py_DECREF(token); + Py_DECREF(argument); return -1; } Py_DECREF(token); @@ -830,7 +845,63 @@ Tokenizer_handle_argument_end(Tokenizer* self) static int Tokenizer_parse_wikilink(Tokenizer* self) { + self->head += 2; + Py_ssize_t reset = self->head - 1; + + if (setjmp(exception_env) == BAD_ROUTE) { + self->head = reset; + PyObject* text = PyUnicode_FromString("[["); + if (!text) return -1; + if (Tokenizer_write_text(self, text)) { + Py_XDECREF(text); + return -1; + } + } + + else { + PyObject *class, *token; + PyObject *wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE); + if (!wikilink) return -1; + + class = PyObject_GetAttrString(tokens, "WikilinkOpen"); + if (!class) { + Py_DECREF(wikilink); + return -1; + } + token = PyInstance_New(class, NOARGS, NOKWARGS); + Py_DECREF(class); + if (!token) { + Py_DECREF(wikilink); + return -1; + } + + if (Tokenizer_write(self, token)) { + Py_DECREF(token); + Py_DECREF(wikilink); + return -1; + } + Py_DECREF(token); + if (Tokenizer_write_all(self, wikilink)) { + Py_DECREF(wikilink); + return -1; + } + Py_DECREF(wikilink); + + class = PyObject_GetAttrString(tokens, "WikilinkClose"); + if (!class) return -1; + token = PyInstance_New(class, NOARGS, NOKWARGS); + Py_DECREF(class); + if (!token) return -1; + + if (Tokenizer_write(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + } + + return 0; } /* From 7c29a2a65e253ad5a9473fe7fc65786666889d1a Mon Sep 17 00:00:00 2001 From: Ben 
Kurtovic Date: Tue, 25 Sep 2012 17:54:38 -0400 Subject: [PATCH 15/47] Implement Tokenizer_handle_wikilink_separator()/_end(). --- mwparserfromhell/parser/tokenizer.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 9068d94..907c55e 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -827,8 +827,7 @@ Tokenizer_handle_argument_separator(Tokenizer* self) static PyObject* Tokenizer_handle_argument_end(Tokenizer* self) { - Py_ssize_t context = Tokenizer_CONTEXT_VAL(self); - if (context & LC_ARGUMENT_NAME) { + if (Tokenizer_CONTEXT_VAL(self) & LC_ARGUMENT_NAME) { const char* unsafes[] = {"\n", "{{", "}}"}; if (Tokenizer_verify_safe(self, unsafes)) return NULL; @@ -910,7 +909,28 @@ Tokenizer_parse_wikilink(Tokenizer* self) static int Tokenizer_handle_wikilink_separator(Tokenizer* self) { + const char* unsafes[] = {"\n", "{", "}", "[", "]"}; + if (Tokenizer_verify_safe(self, unsafes)) + return -1; + + Py_ssize_t context = Tokenizer_CONTEXT_VAL(self); + context ^= LC_WIKILINK_TITLE; + context |= LC_WIKILINK_TEXT; + if (Tokenizer_set_context(self, context)) + return -1; + + PyObject* class = PyObject_GetAttrString(tokens, "WikilinkSeparator"); + if (!class) return -1; + PyObject* token = PyInstance_New(class, NOARGS, NOKWARGS); + Py_DECREF(class); + if (!token) return -1; + if (Tokenizer_write(self, token)) { + Py_DECREF(token); + return -1; + } + Py_DECREF(token); + return 0; } /* @@ -919,7 +939,15 @@ Tokenizer_handle_wikilink_separator(Tokenizer* self) static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) { + if (Tokenizer_CONTEXT_VAL(self) & LC_WIKILINK_TITLE) { + const char* unsafes[] = {"\n", "{", "}", "[", "]"}; + if (Tokenizer_verify_safe(self, unsafes)) + return NULL; + } + self->head += 1; + PyObject* stack = Tokenizer_pop(self); + return stack; } /* From 150f3311290a8569eb960084e070eb23f6e70c3c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 25 Sep 2012 18:11:29 -0400 Subject: [PATCH 16/47] Implement Tokenizer_parse_entity(), Tokenizer_parse_comment(). 
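Entities and comments are both parsed speculatively: remember the head position, push a new stack, and attempt the strict parse; if the route dead-ends (BAD_ROUTE), rewind the head and write the trigger character back out as ordinary text. A self-contained toy of that save-and-rollback pattern, which the C code emulates with setjmp()/longjmp() and a saved head (illustrative only; none of these names come from the real tokenizer):

    # Toy save-and-rollback parse; not the mwparserfromhell tokenizer API.
    class BadRoute(Exception):
        pass

    class Toy:
        def __init__(self, text):
            self.text = text
            self.head = 0
            self.output = []

        def parse_entity(self):
            reset = self.head
            try:
                # The strict route: an entity must start with "&" and reach ";".
                if not self.text[self.head:].startswith("&"):
                    raise BadRoute()
                end = self.text.index(";", self.head)
                self.output.append(("entity", self.text[self.head:end + 1]))
                self.head = end + 1
            except (BadRoute, ValueError):
                # The route failed: rewind and emit the character as plain text.
                self.head = reset
                self.output.append(("text", self.text[self.head]))
                self.head += 1

    toy = Toy("&amp;")
    toy.parse_entity()
    assert toy.output == [("entity", "&amp;")]

    toy = Toy("&oops")
    toy.parse_entity()
    assert toy.output == [("text", "&")]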
--- mwparserfromhell/parser/tokenizer.c | 79 ++++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 907c55e..d302ea2 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -856,7 +856,6 @@ Tokenizer_parse_wikilink(Tokenizer* self) return -1; } } - else { PyObject *class, *token; PyObject *wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE); @@ -899,7 +898,6 @@ Tokenizer_parse_wikilink(Tokenizer* self) } Py_DECREF(token); } - return 0; } @@ -983,7 +981,29 @@ Tokenizer_really_parse_entity(Tokenizer* self) static int Tokenizer_parse_entity(Tokenizer* self) { + Py_ssize_t reset = self->head; + if (Tokenizer_push(self, 0)) + return -1; + if (setjmp(exception_env) == BAD_ROUTE) { + self->head = reset; + if (Tokenizer_write_text(self, Tokenizer_read(self, 0))) + return -1; + } + else { + if (Tokenizer_really_parse_entity(self)) + return -1; + + PyObject* tokenlist = Tokenizer_pop(self); + if (!tokenlist) return -1; + if (Tokenizer_write_all(self, tokenlist)) { + Py_DECREF(tokenlist); + return -1; + } + + Py_DECREF(tokenlist); + } + return 0; } /* @@ -992,7 +1012,62 @@ Tokenizer_parse_entity(Tokenizer* self) static int Tokenizer_parse_comment(Tokenizer* self) { + self->head += 4; + Py_ssize_t reset = self->head - 1; + if (setjmp(exception_env) == BAD_ROUTE) { + self->head = reset; + PyObject* text = PyUnicode_FromString("