diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 6550aad..d7c206d 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -34,8 +34,14 @@ static void
 Tokenizer_dealloc(Tokenizer* self)
 {
     Py_XDECREF(self->text);
-    Py_XDECREF(self->stacks);
-    Py_XDECREF(self->topstack);
+    struct Stack *this = self->topstack, *next;
+    while (this) {
+        Py_DECREF(this->stack);
+        Py_DECREF(this->textbuffer);
+        next = this->next;
+        free(this);
+        this = next;
+    }
     self->ob_type->tp_free((PyObject*) self);
 }
 
@@ -47,57 +53,26 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
         return -1;
     self->text = Py_None;
-    self->topstack = Py_None;
-    Py_INCREF(Py_None);
     Py_INCREF(Py_None);
-
-    self->stacks = PyList_New(0);
-    if (!self->stacks) {
-        Py_DECREF(self);
-        return -1;
-    }
-
+    self->topstack = NULL;
     self->head = 0;
     self->length = 0;
     self->global = 0;
-
-    return 0;
-}
-
-static int
-Tokenizer_set_context(Tokenizer* self, Py_ssize_t value)
-{
-    if (PyList_SetItem(self->topstack, 1, PyInt_FromSsize_t(value)))
-        return -1;
-    return 0;
-}
-
-static int
-Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value)
-{
-    if (PyList_SetItem(self->topstack, 2, value))
-        return -1;
     return 0;
 }
 
 /*
     Add a new token stack, context, and textbuffer to the list.
 */
-static int
-Tokenizer_push(Tokenizer* self, Py_ssize_t context)
+static void
+Tokenizer_push(Tokenizer* self, int context)
 {
-    PyObject* top = PyList_New(3);
-    if (!top) return -1;
-    PyList_SET_ITEM(top, 0, PyList_New(0));
-    PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(context));
-    PyList_SET_ITEM(top, 2, PyList_New(0));
-
-    Py_XDECREF(self->topstack);
+    struct Stack* top = malloc(sizeof(struct Stack));
+    top->stack = PyList_New(0);
+    top->context = context;
+    top->textbuffer = PyList_New(0);
+    top->next = self->topstack;
     self->topstack = top;
-
-    if (PyList_Append(self->stacks, top))
-        return -1;
-    return 0;
 }
 
 /*
@@ -106,8 +81,8 @@ Tokenizer_push(Tokenizer* self, Py_ssize_t context)
 static int
 Tokenizer_push_textbuffer(Tokenizer* self)
 {
-    if (PyList_GET_SIZE(Tokenizer_TEXTBUFFER(self)) > 0) {
-        PyObject* text = PyUnicode_Join(EMPTY, Tokenizer_TEXTBUFFER(self));
+    if (PyList_GET_SIZE(self->topstack->textbuffer) > 0) {
+        PyObject* text = PyUnicode_Join(EMPTY, self->topstack->textbuffer);
         if (!text) return -1;
 
         PyObject* class = PyObject_GetAttrString(tokens, "Text");
@@ -129,37 +104,28 @@ Tokenizer_push_textbuffer(Tokenizer* self)
         Py_DECREF(kwargs);
         if (!token) return -1;
 
-        if (PyList_Append(Tokenizer_STACK(self), token)) {
+        if (PyList_Append(self->topstack->stack, token)) {
             Py_DECREF(token);
             return -1;
         }
 
         Py_DECREF(token);
 
-        if (Tokenizer_set_textbuffer(self, PyList_New(0)))
+        self->topstack->textbuffer = PyList_New(0);
+        if (!self->topstack->textbuffer)
             return -1;
     }
     return 0;
 }
 
-static int
+static void
 Tokenizer_delete_top_of_stack(Tokenizer* self)
 {
-    if (PySequence_DelItem(self->stacks, -1))
-        return -1;
-    Py_DECREF(self->topstack);
-
-    Py_ssize_t size = PyList_GET_SIZE(self->stacks);
-    if (size > 0) {
-        PyObject* top = PyList_GET_ITEM(self->stacks, size - 1);
-        self->topstack = top;
-        Py_INCREF(top);
-    }
-    else {
-        self->topstack = NULL;
-    }
-
-    return 0;
+    struct Stack* top = self->topstack;
+    Py_DECREF(top->stack);
+    Py_DECREF(top->textbuffer);
+    self->topstack = top->next;
+    free(top);
 }
 
 /*
@@ -171,12 +137,10 @@ Tokenizer_pop(Tokenizer* self)
     if (Tokenizer_push_textbuffer(self))
         return NULL;
 
-    PyObject* stack = Tokenizer_STACK(self);
+    PyObject* stack = self->topstack->stack;
     Py_INCREF(stack);
 
-    if (Tokenizer_delete_top_of_stack(self))
-        return NULL;
-
+    Tokenizer_delete_top_of_stack(self);
     return stack;
 }
 
@@ -190,17 +154,12 @@ Tokenizer_pop_keeping_context(Tokenizer* self)
     if (Tokenizer_push_textbuffer(self))
         return NULL;
 
-    PyObject* stack = Tokenizer_STACK(self);
-    PyObject* context = Tokenizer_CONTEXT(self);
+    PyObject* stack = self->topstack->stack;
     Py_INCREF(stack);
-    Py_INCREF(context);
-
-    if (Tokenizer_delete_top_of_stack(self))
-        return NULL;
-
-    if (PyList_SetItem(self->topstack, 1, context))
-        return NULL;
+    int context = self->topstack->context;
+    Tokenizer_delete_top_of_stack(self);
+    self->topstack->context = context;
     return stack;
 }
 
@@ -226,7 +185,7 @@ Tokenizer_write(Tokenizer* self, PyObject* token)
     if (Tokenizer_push_textbuffer(self))
         return -1;
 
-    if (PyList_Append(Tokenizer_STACK(self), token))
+    if (PyList_Append(self->topstack->stack, token))
         return -1;
 
     return 0;
@@ -241,7 +200,7 @@ Tokenizer_write_first(Tokenizer* self, PyObject* token)
     if (Tokenizer_push_textbuffer(self))
         return -1;
 
-    if (PyList_Insert(Tokenizer_STACK(self), 0, token))
+    if (PyList_Insert(self->topstack->stack, 0, token))
         return -1;
 
     return 0;
@@ -253,7 +212,7 @@ Tokenizer_write_first(Tokenizer* self, PyObject* token)
 static int
 Tokenizer_write_text(Tokenizer* self, PyObject* text)
 {
-    if (PyList_Append(Tokenizer_TEXTBUFFER(self), text))
+    if (PyList_Append(self->topstack->textbuffer, text))
         return -1;
 
     return 0;
@@ -302,7 +261,7 @@ Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist)
         if (Tokenizer_push_textbuffer(self))
             return -1;
 
-        PyObject* stack = Tokenizer_STACK(self);
+        PyObject* stack = self->topstack->stack;
         Py_ssize_t size = PyList_GET_SIZE(stack);
 
         if (PyList_SetSlice(stack, size, size, tokenlist))
@@ -579,7 +538,7 @@ Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[])
     if (Tokenizer_push_textbuffer(self))
         return -1;
 
-    PyObject* stack = Tokenizer_STACK(self);
+    PyObject* stack = self->topstack->stack;
     if (stack) {
         PyObject* textlist = PyList_New(0);
         if (!textlist) return -1;
@@ -673,24 +632,18 @@ Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[])
 static int
 Tokenizer_handle_template_param(Tokenizer* self)
 {
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-
-    if (context & LC_TEMPLATE_NAME) {
+    if (self->topstack->context & LC_TEMPLATE_NAME) {
        const char* unsafes[] = {"\n", "{", "}", "[", "]", NULL};
         if (Tokenizer_verify_safe(self, unsafes))
             return -1;
         if (BAD_ROUTE) return -1;
-        context ^= LC_TEMPLATE_NAME;
-        if (Tokenizer_set_context(self, context))
-            return -1;
+        self->topstack->context ^= LC_TEMPLATE_NAME;
     }
-    else if (context & LC_TEMPLATE_PARAM_VALUE) {
-        context ^= LC_TEMPLATE_PARAM_VALUE;
-        if (Tokenizer_set_context(self, context))
-            return -1;
+    else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) {
+        self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE;
     }
 
-    if (context & LC_TEMPLATE_PARAM_KEY) {
+    if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
         PyObject* stack = Tokenizer_pop_keeping_context(self);
         if (!stack) return -1;
         if (Tokenizer_write_all(self, stack)) {
@@ -700,9 +653,7 @@ Tokenizer_handle_template_param(Tokenizer* self)
         Py_DECREF(stack);
     }
     else {
-        context |= LC_TEMPLATE_PARAM_KEY;
-        if (Tokenizer_set_context(self, context))
-            return -1;
+        self->topstack->context |= LC_TEMPLATE_PARAM_KEY;
     }
 
     PyObject* class = PyObject_GetAttrString(tokens, "TemplateParamSeparator");
@@ -717,7 +668,7 @@ Tokenizer_handle_template_param(Tokenizer* self)
     }
     Py_DECREF(token);
 
-    Tokenizer_push(self, context);
+    Tokenizer_push(self, self->topstack->context);
     return 0;
 }
 
@@ -744,11 +695,8 @@ Tokenizer_handle_template_param_value(Tokenizer* self)
     }
     Py_DECREF(stack);
 
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-    context ^= LC_TEMPLATE_PARAM_KEY;
-    context |= LC_TEMPLATE_PARAM_VALUE;
-    if (Tokenizer_set_context(self, context))
-        return -1;
+    self->topstack->context ^= LC_TEMPLATE_PARAM_KEY;
+    self->topstack->context |= LC_TEMPLATE_PARAM_VALUE;
 
     PyObject* class = PyObject_GetAttrString(tokens, "TemplateParamEquals");
     if (!class) return -1;
@@ -771,14 +719,12 @@ static PyObject*
 Tokenizer_handle_template_end(Tokenizer* self)
 {
     PyObject* stack;
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-
-    if (context & LC_TEMPLATE_NAME) {
+    if (self->topstack->context & LC_TEMPLATE_NAME) {
         const char* unsafes[] = {"\n", "{", "}", "[", "]", NULL};
         if (Tokenizer_verify_safe(self, unsafes))
             return NULL;
     }
-    else if (context & LC_TEMPLATE_PARAM_KEY) {
+    else if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
         stack = Tokenizer_pop_keeping_context(self);
         if (!stack) return NULL;
         if (Tokenizer_write_all(self, stack)) {
@@ -803,11 +749,8 @@ Tokenizer_handle_argument_separator(Tokenizer* self)
     if (Tokenizer_verify_safe(self, unsafes))
         return -1;
 
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-    context ^= LC_ARGUMENT_NAME;
-    context |= LC_ARGUMENT_DEFAULT;
-    if (Tokenizer_set_context(self, context))
-        return -1;
+    self->topstack->context ^= LC_ARGUMENT_NAME;
+    self->topstack->context |= LC_ARGUMENT_DEFAULT;
 
     PyObject* class = PyObject_GetAttrString(tokens, "ArgumentSeparator");
     if (!class) return -1;
@@ -829,7 +772,7 @@ Tokenizer_handle_argument_separator(Tokenizer* self)
 static PyObject*
 Tokenizer_handle_argument_end(Tokenizer* self)
 {
-    if (Tokenizer_CONTEXT_VAL(self) & LC_ARGUMENT_NAME) {
+    if (self->topstack->context & LC_ARGUMENT_NAME) {
         const char* unsafes[] = {"\n", "{{", "}}", NULL};
         if (Tokenizer_verify_safe(self, unsafes))
             return NULL;
@@ -914,11 +857,8 @@ Tokenizer_handle_wikilink_separator(Tokenizer* self)
     if (Tokenizer_verify_safe(self, unsafes))
         return -1;
 
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-    context ^= LC_WIKILINK_TITLE;
-    context |= LC_WIKILINK_TEXT;
-    if (Tokenizer_set_context(self, context))
-        return -1;
+    self->topstack->context ^= LC_WIKILINK_TITLE;
+    self->topstack->context |= LC_WIKILINK_TEXT;
 
     PyObject* class = PyObject_GetAttrString(tokens, "WikilinkSeparator");
     if (!class) return -1;
@@ -940,7 +880,7 @@ Tokenizer_handle_wikilink_separator(Tokenizer* self)
 static PyObject*
 Tokenizer_handle_wikilink_end(Tokenizer* self)
 {
-    if (Tokenizer_CONTEXT_VAL(self) & LC_WIKILINK_TITLE) {
+    if (self->topstack->context & LC_WIKILINK_TITLE) {
         const char* unsafes[] = {"\n", "{", "}", "[", "]", NULL};
         if (Tokenizer_verify_safe(self, unsafes))
             return NULL;
@@ -960,7 +900,7 @@ Tokenizer_parse_heading(Tokenizer* self)
     self->global |= GL_HEADING;
     Py_ssize_t reset = self->head;
     self->head += 1;
-    Py_ssize_t best = 1;
+    int best = 1;
     PyObject* text;
     int i;
@@ -969,7 +909,7 @@ Tokenizer_parse_heading(Tokenizer* self)
         self->head++;
     }
 
-    Py_ssize_t context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1);
+    int context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1);
     HeadingData* heading = (HeadingData*) Tokenizer_parse(self, context);
 
     if (BAD_ROUTE) {
@@ -1032,7 +972,7 @@ Tokenizer_parse_heading(Tokenizer* self)
     Py_DECREF(token);
 
     if (heading->level < best) {
-        Py_ssize_t diff = best - heading->level;
+        int diff = best - heading->level;
         char diffblocks[diff];
         for (i = 0; i < diff; i++) diffblocks[i] = *"=";
         PyObject* text = PyUnicode_FromStringAndSize(diffblocks, diff);
@@ -1092,16 +1032,14 @@ Tokenizer_handle_heading_end(Tokenizer* self)
         self->head++;
     }
 
-    Py_ssize_t current = log2(Tokenizer_CONTEXT_VAL(self) / LC_HEADING_LEVEL_1) + 1;
-    Py_ssize_t level = current > best ? (best > 6 ? 6 : best) : (current > 6 ? 6 : current);
-
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-    HeadingData* after = (HeadingData*) Tokenizer_parse(self, context);
+    int current = log2(self->topstack->context / LC_HEADING_LEVEL_1) + 1;
+    int level = current > best ? (best > 6 ? 6 : best) : (current > 6 ? 6 : current);
+    HeadingData* after = (HeadingData*) Tokenizer_parse(self, self->topstack->context);
 
     if (BAD_ROUTE) {
         RESET_ROUTE();
         if (level < best) {
-            Py_ssize_t diff = best - level;
+            int diff = best - level;
             char diffblocks[diff];
             for (i = 0; i < diff; i++) diffblocks[i] = *"=";
             text = PyUnicode_FromStringAndSize(diffblocks, diff);
@@ -1174,8 +1112,7 @@ static int
 Tokenizer_parse_entity(Tokenizer* self)
 {
     Py_ssize_t reset = self->head;
-    if (Tokenizer_push(self, 0))
-        return -1;
+    Tokenizer_push(self, 0);
 
     if (Tokenizer_really_parse_entity(self))
         return -1;
@@ -1268,13 +1205,12 @@ Tokenizer_parse_comment(Tokenizer* self)
     Parse the wikicode string, using context for when to stop.
 */
 static PyObject*
-Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
+Tokenizer_parse(Tokenizer* self, int context)
 {
     PyObject *this;
     Py_UNICODE this_data, next, next_next, last;
-    Py_ssize_t this_context;
-    Py_ssize_t fail_contexts = (
-        LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_COMMENT);
+    int this_context;
+    int fail_contexts = LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_COMMENT;
     int is_marker, i;
 
     Tokenizer_push(self, context);
@@ -1297,7 +1233,7 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
             continue;
         }
 
-        this_context = Tokenizer_CONTEXT_VAL(self);
+        this_context = self->topstack->context;
 
         if (this_data == *"") {
             if (this_context & LC_TEMPLATE_PARAM_KEY) {
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 91da10e..1b9b76a 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -46,40 +46,47 @@ static PyObject* tokens;
 
 /* Local contexts: */
 
-static const Py_ssize_t LC_TEMPLATE = 0x0007;
-static const Py_ssize_t LC_TEMPLATE_NAME = 0x0001;
-static const Py_ssize_t LC_TEMPLATE_PARAM_KEY = 0x0002;
-static const Py_ssize_t LC_TEMPLATE_PARAM_VALUE = 0x0004;
+static const int LC_TEMPLATE = 0x0007;
+static const int LC_TEMPLATE_NAME = 0x0001;
+static const int LC_TEMPLATE_PARAM_KEY = 0x0002;
+static const int LC_TEMPLATE_PARAM_VALUE = 0x0004;
 
-static const Py_ssize_t LC_ARGUMENT = 0x0018;
-static const Py_ssize_t LC_ARGUMENT_NAME = 0x0008;
-static const Py_ssize_t LC_ARGUMENT_DEFAULT = 0x0010;
+static const int LC_ARGUMENT = 0x0018;
+static const int LC_ARGUMENT_NAME = 0x0008;
+static const int LC_ARGUMENT_DEFAULT = 0x0010;
 
-static const Py_ssize_t LC_WIKILINK = 0x0060;
-static const Py_ssize_t LC_WIKILINK_TITLE = 0x0020;
-static const Py_ssize_t LC_WIKILINK_TEXT = 0x0040;
+static const int LC_WIKILINK = 0x0060;
+static const int LC_WIKILINK_TITLE = 0x0020;
+static const int LC_WIKILINK_TEXT = 0x0040;
 
-static const Py_ssize_t LC_HEADING = 0x1f80;
-static const Py_ssize_t LC_HEADING_LEVEL_1 = 0x0080;
-static const Py_ssize_t LC_HEADING_LEVEL_2 = 0x0100;
-static const Py_ssize_t LC_HEADING_LEVEL_3 = 0x0200;
-static const Py_ssize_t LC_HEADING_LEVEL_4 = 0x0400;
-static const Py_ssize_t LC_HEADING_LEVEL_5 = 0x0800;
-static const Py_ssize_t LC_HEADING_LEVEL_6 = 0x1000;
+static const int LC_HEADING = 0x1f80;
+static const int LC_HEADING_LEVEL_1 = 0x0080;
+static const int LC_HEADING_LEVEL_2 = 0x0100;
+static const int LC_HEADING_LEVEL_3 = 0x0200;
+static const int LC_HEADING_LEVEL_4 = 0x0400;
+static const int LC_HEADING_LEVEL_5 = 0x0800;
+static const int LC_HEADING_LEVEL_6 = 0x1000;
 
-static const Py_ssize_t LC_COMMENT = 0x2000;
+static const int LC_COMMENT = 0x2000;
 
 /* Global contexts: */
 
-static const Py_ssize_t GL_HEADING = 0x1;
+static const int GL_HEADING = 0x1;
 
 /* Miscellaneous structs: */
 
+struct Stack {
+    PyObject* stack;
+    int context;
+    PyObject* textbuffer;
+    struct Stack* next;
+};
+
 typedef struct {
     PyObject* title;
-    Py_ssize_t level;
+    int level;
 } HeadingData;
 
@@ -87,22 +94,17 @@ typedef struct {
 typedef struct {
     PyObject_HEAD
-    PyObject* text;        /* text to tokenize */
-    PyObject* stacks;      /* token stacks */
-    PyObject* topstack;    /* topmost stack */
-    Py_ssize_t head;       /* current position in text */
-    Py_ssize_t length;     /* length of text */
-    Py_ssize_t global;     /* global context */
+    PyObject* text;           /* text to tokenize */
+    struct Stack* topstack;   /* topmost stack */
+    Py_ssize_t head;          /* current position in text */
+    Py_ssize_t length;        /* length of text */
+    int global;               /* global context */
 } Tokenizer;
 
 
 /* Macros for accessing Tokenizer data: */
 
-#define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0)
-#define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1)
-#define Tokenizer_CONTEXT_VAL(self) PyInt_AsSsize_t(Tokenizer_CONTEXT(self))
-#define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2)
-#define Tokenizer_READ(self, num) PyUnicode_AS_UNICODE(Tokenizer_read(self, num))
+#define Tokenizer_READ(self, delta) PyUnicode_AS_UNICODE(Tokenizer_read(self, delta))
 
 
 /* Tokenizer function prototypes: */
@@ -110,11 +112,9 @@ typedef struct {
 static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
 static void Tokenizer_dealloc(Tokenizer*);
 static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
-static int Tokenizer_set_context(Tokenizer*, Py_ssize_t);
-static int Tokenizer_set_textbuffer(Tokenizer*, PyObject*);
-static int Tokenizer_push(Tokenizer*, Py_ssize_t);
+static void Tokenizer_push(Tokenizer*, int);
 static int Tokenizer_push_textbuffer(Tokenizer*);
-static int Tokenizer_delete_top_of_stack(Tokenizer*);
+static void Tokenizer_delete_top_of_stack(Tokenizer*);
 static PyObject* Tokenizer_pop(Tokenizer*);
 static PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
 static void* Tokenizer_fail_route(Tokenizer*);
@@ -142,7 +142,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer*);
 static int Tokenizer_really_parse_entity(Tokenizer*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
-static PyObject* Tokenizer_parse(Tokenizer*, Py_ssize_t);
+static PyObject* Tokenizer_parse(Tokenizer*, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
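
Review note on error handling in the new Tokenizer_push(): both malloc() and PyList_New() can fail, and the rewritten function checks neither, whereas the PyList-based version it replaces returned -1 on failure. A minimal sketch of what a checked variant could look like; this is hypothetical and not part of the patch (the name Tokenizer_push_checked is invented here), and it assumes Python.h, stdlib.h, and the struct Stack / Tokenizer definitions from tokenizer.h above:

    /* Hypothetical fault-tolerant variant of Tokenizer_push; not part of
       this commit. Same linked-list discipline, but allocation failures
       are reported instead of ignored. */
    static int
    Tokenizer_push_checked(Tokenizer* self, int context)
    {
        struct Stack* top = malloc(sizeof(struct Stack));
        if (!top) {
            PyErr_NoMemory();            /* malloc failure -> MemoryError */
            return -1;
        }
        top->stack = PyList_New(0);
        top->textbuffer = PyList_New(0);
        if (!top->stack || !top->textbuffer) {
            Py_XDECREF(top->stack);      /* either pointer may be NULL here */
            Py_XDECREF(top->textbuffer);
            free(top);
            return -1;                   /* PyList_New already set the error */
        }
        top->context = context;
        top->next = self->topstack;      /* link in front: the list is the stack */
        self->topstack = top;
        return 0;
    }

Callers would then keep the old `if (Tokenizer_push(...)) return -1;` pattern instead of dropping it, as Tokenizer_parse_entity does in this patch.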
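
Review note on the context transitions: the LC_* constants in tokenizer.h are single-bit flags, so writes like the one in Tokenizer_handle_template_param_value are plain bit toggles — ^= clears a bit the enclosing branch guarantees is set, and |= sets the successor state. A self-contained illustration, with the flag values copied from tokenizer.h above:

    #include <assert.h>

    /* Flag values copied from tokenizer.h above. */
    enum {
        LC_TEMPLATE_NAME        = 0x0001,
        LC_TEMPLATE_PARAM_KEY   = 0x0002,
        LC_TEMPLATE_PARAM_VALUE = 0x0004
    };

    int main(void)
    {
        int context = LC_TEMPLATE_PARAM_KEY;

        /* The transition performed when '=' ends a parameter key:
           clear the KEY bit, set the VALUE bit. */
        context ^= LC_TEMPLATE_PARAM_KEY;
        context |= LC_TEMPLATE_PARAM_VALUE;

        assert(!(context & LC_TEMPLATE_PARAM_KEY));
        assert(context & LC_TEMPLATE_PARAM_VALUE);
        return 0;
    }

Mutating self->topstack->context in place is also what lets the patch delete Tokenizer_set_context() entirely: the old code had to box the value into a PyInt and store it back into the stack list, which could fail, while a plain struct field assignment cannot.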
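
Review note on the heading hunks: the heading level is encoded in which LC_HEADING_LEVEL_* bit is set. Tokenizer_parse_heading computes LC_HEADING_LEVEL_1 << (level - 1), capped at six levels, and Tokenizer_handle_heading_end inverts it with log2(context / LC_HEADING_LEVEL_1) + 1. A quick check of that round trip (LC_HEADING_LEVEL_1 copied from tokenizer.h above; link with -lm):

    #include <assert.h>
    #include <math.h>

    /* Value copied from tokenizer.h above. */
    static const int LC_HEADING_LEVEL_1 = 0x0080;

    int main(void)
    {
        int level;
        for (level = 1; level <= 6; level++) {
            /* Encoding used by Tokenizer_parse_heading. */
            int context = LC_HEADING_LEVEL_1 << (level > 5 ? 5 : level - 1);
            /* Decoding used by Tokenizer_handle_heading_end;
               log2 is exact for powers of two, so the cast is safe. */
            int recovered = (int) log2(context / LC_HEADING_LEVEL_1) + 1;
            assert(recovered == level);
        }
        return 0;
    }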