diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 31bebe8..7ba7472 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -26,25 +26,7 @@ SOFTWARE.
 static PyObject*
 Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
 {
-    Tokenizer *self;
-
-    self = (Tokenizer*) type->tp_alloc(type, 0);
-    if (self != NULL) {
-
-        self->text = Py_None;
-        Py_INCREF(Py_None);
-
-        self->stacks = PyList_New(0);
-        if (!self->stacks) {
-            Py_DECREF(self);
-            return NULL;
-        }
-
-        self->head = 0;
-        self->length = 0;
-        self->global = 0;
-    }
-
+    Tokenizer* self = (Tokenizer*) type->tp_alloc(type, 0);
     return (PyObject*) self;
 }

@@ -63,6 +45,22 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
     static char* kwlist[] = {NULL};
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
         return -1;
+
+    self->text = Py_None;
+    self->topstack = Py_None;
+    Py_INCREF(Py_None);
+    Py_INCREF(Py_None);
+
+    self->stacks = PyList_New(0);
+    if (!self->stacks) {
+        Py_DECREF(self);
+        return -1;
+    }
+
+    self->head = 0;
+    self->length = 0;
+    self->global = 0;
+
     return 0;
 }

@@ -89,6 +87,7 @@ static int
 Tokenizer_push(Tokenizer* self, Py_ssize_t context)
 {
     PyObject* top = PyList_New(3);
+    if (!top) return -1;
     PyList_SET_ITEM(top, 0, PyList_New(0));
     PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(context));
     PyList_SET_ITEM(top, 2, PyList_New(0));
@@ -1094,7 +1093,7 @@ Tokenizer_handle_heading_end(Tokenizer* self)
         self->head++;
     }

-    Py_ssize_t current = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1); // FIXME
+    Py_ssize_t current = log2(Tokenizer_CONTEXT_VAL(self) / LC_HEADING_LEVEL_1) + 1;
     Py_ssize_t level = current > best ? (best > 6 ? 6 : best) : (current > 6 ? 6 : current);

     if (setjmp(exception_env) == BAD_ROUTE) {
@@ -1387,7 +1386,7 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
 Build a list of tokens from a string of wikicode and return it.
 */
 static PyObject*
-Tokenizer_tokenize(Tokenizer* self, PyObject *args)
+Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
     PyObject* text;

@@ -1439,10 +1438,24 @@ init_tokenizer(void)
     NOARGS = PyTuple_New(0);
     NOKWARGS = PyDict_New();

+    char* name = "mwparserfromhell.parser";
     PyObject* globals = PyEval_GetGlobals();
     PyObject* locals = PyEval_GetLocals();
-    PyObject* fromlist = PyList_New(0);
+    PyObject* fromlist = PyList_New(1);
+    if (!fromlist) return;
+    PyObject* submodname = PyBytes_FromString("tokens");
+    if (!submodname) {
+        Py_DECREF(fromlist);
+        return;
+    }
+    PyList_SET_ITEM(fromlist, 0, submodname);

-    tokens = PyImport_ImportModuleLevel("tokens", globals, locals, fromlist, 1);
+    PyObject* tokmodule = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0);
     Py_DECREF(fromlist);
+    if (!tokmodule) {
+        return;
+    }
+
+    tokens = PyObject_GetAttrString(tokmodule, "tokens");
+    Py_DECREF(tokmodule);
 }
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 3a87a37..7ba9c40 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -26,6 +26,7 @@ SOFTWARE.
 #endif

 #include <Python.h>
+#include <math.h>
 #include <setjmp.h>
 #include <structmember.h>

@@ -108,43 +109,43 @@ typedef struct {

 /* Tokenizer function prototypes: */

-static PyObject* Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds);
-static void Tokenizer_dealloc(Tokenizer* self);
-static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds);
-static int Tokenizer_set_context(Tokenizer* self, Py_ssize_t value);
-static int Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value);
-static int Tokenizer_push(Tokenizer* self, Py_ssize_t context);
-static int Tokenizer_push_textbuffer(Tokenizer* self);
-static int Tokenizer_delete_top_of_stack(Tokenizer* self);
-static PyObject* Tokenizer_pop(Tokenizer* self);
-static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self);
-static void Tokenizer_fail_route(Tokenizer* self);
-static int Tokenizer_write(Tokenizer* self, PyObject* token);
-static int Tokenizer_write_first(Tokenizer* self, PyObject* token);
-static int Tokenizer_write_text(Tokenizer* self, PyObject* text);
-static int Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist);
-static int Tokenizer_write_text_then_stack(Tokenizer* self, PyObject* text);
-static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta);
-static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta);
-static int Tokenizer_parse_template_or_argument(Tokenizer* self);
-static int Tokenizer_parse_template(Tokenizer* self);
-static int Tokenizer_parse_argument(Tokenizer* self);
-static int Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[]);
-static int Tokenizer_handle_template_param(Tokenizer* self);
-static int Tokenizer_handle_template_param_value(Tokenizer* self);
-static PyObject* Tokenizer_handle_template_end(Tokenizer* self);
-static int Tokenizer_handle_argument_separator(Tokenizer* self);
-static PyObject* Tokenizer_handle_argument_end(Tokenizer* self);
-static int Tokenizer_parse_wikilink(Tokenizer* self);
-static int Tokenizer_handle_wikilink_separator(Tokenizer* self);
-static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self);
-static int Tokenizer_parse_heading(Tokenizer* self);
-static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self);
-static int Tokenizer_really_parse_entity(Tokenizer* self);
-static int Tokenizer_parse_entity(Tokenizer* self);
-static int Tokenizer_parse_comment(Tokenizer* self);
-static PyObject* Tokenizer_parse(Tokenizer* self, Py_ssize_t context);
-static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject *args);
+static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
+static void Tokenizer_dealloc(Tokenizer*);
+static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
+static int Tokenizer_set_context(Tokenizer*, Py_ssize_t);
+static int Tokenizer_set_textbuffer(Tokenizer*, PyObject*);
+static int Tokenizer_push(Tokenizer*, Py_ssize_t);
+static int Tokenizer_push_textbuffer(Tokenizer*);
+static int Tokenizer_delete_top_of_stack(Tokenizer*);
+static PyObject* Tokenizer_pop(Tokenizer*);
+static PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
+static void Tokenizer_fail_route(Tokenizer*);
+static int Tokenizer_write(Tokenizer*, PyObject*);
+static int Tokenizer_write_first(Tokenizer*, PyObject*);
+static int Tokenizer_write_text(Tokenizer*, PyObject*);
+static int Tokenizer_write_all(Tokenizer*, PyObject*);
+static int Tokenizer_write_text_then_stack(Tokenizer*, PyObject*);
+static PyObject* Tokenizer_read(Tokenizer*, Py_ssize_t);
+static PyObject* Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);
+static int Tokenizer_parse_template_or_argument(Tokenizer*);
+static int Tokenizer_parse_template(Tokenizer*);
+static int Tokenizer_parse_argument(Tokenizer*);
+static int Tokenizer_verify_safe(Tokenizer*, const char* []);
+static int Tokenizer_handle_template_param(Tokenizer*);
+static int Tokenizer_handle_template_param_value(Tokenizer*);
+static PyObject* Tokenizer_handle_template_end(Tokenizer*);
+static int Tokenizer_handle_argument_separator(Tokenizer*);
+static PyObject* Tokenizer_handle_argument_end(Tokenizer*);
+static int Tokenizer_parse_wikilink(Tokenizer*);
+static int Tokenizer_handle_wikilink_separator(Tokenizer*);
+static PyObject* Tokenizer_handle_wikilink_end(Tokenizer*);
+static int Tokenizer_parse_heading(Tokenizer*);
+static HeadingData* Tokenizer_handle_heading_end(Tokenizer*);
+static int Tokenizer_really_parse_entity(Tokenizer*);
+static int Tokenizer_parse_entity(Tokenizer*);
+static int Tokenizer_parse_comment(Tokenizer*);
+static PyObject* Tokenizer_parse(Tokenizer*, Py_ssize_t);
+static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

 /* More structs for creating the Tokenizer type: */