diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 3f7e84e..0d18473 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -26,8 +26,8 @@ SOFTWARE.
 #endif

 #include <Python.h>
-#include "setjmp.h"
-#include "structmember.h"
+#include <setjmp.h>
+#include <structmember.h>

 static PyObject* EMPTY;
@@ -35,7 +35,10 @@ static PyObject* EMPTY;
 static const Py_UNICODE* MARKERS[] = {PU"{", PU"}", PU"[", PU"]", PU"<", PU">",
                                       PU"|", PU"=", PU"&", PU"#", PU"*", PU";",
                                       PU":", PU"/", PU"-", PU"!", PU"\n", PU""};
-#undef PU
+static const int NUM_MARKERS = 18;
+
+#define CONTEXT(name) PyInt_AsSsize_t( \
+        PyObject_GetAttrString(contexts, name))

 static jmp_buf exception_env;
 static const int BAD_ROUTE = 1;
@@ -103,6 +106,7 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
 #define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0)
 #define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1)
+#define Tokenizer_CONTEXT_VAL(self) PyInt_AsSsize_t(Tokenizer_CONTEXT(self))
 #define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2)

 static int
@@ -125,11 +129,11 @@
     Add a new token stack, context, and textbuffer to the list.
 */
 static int
-Tokenizer_push(Tokenizer* self, int context)
+Tokenizer_push(Tokenizer* self, Py_ssize_t context)
 {
     PyObject* top = PyList_New(3);
     PyList_SET_ITEM(top, 0, PyList_New(0));
-    PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(0));
+    PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(context));
     PyList_SET_ITEM(top, 2, PyList_New(0));

     Py_XDECREF(self->topstack);
@@ -345,7 +349,7 @@ Tokenizer_write_text_then_stack(Tokenizer* self, PyObject* text)
 }

 /*
-    Read the value at a relative point in the wikicode.
+    Read the value at a relative point in the wikicode, forwards.
 */
 static PyObject*
 Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
@@ -360,23 +364,247 @@ Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
 }

 /*
-    Parse the wikicode string, using *context* for when to stop.
+    Read the value at a relative point in the wikicode, backwards.
 */
 static PyObject*
-Tokenizer_parse(Tokenizer* self, int context)
+Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
+{
+    Py_ssize_t index;
+
+    if (delta > self->head) {
+        return EMPTY;
+    }
+    index = self->head - delta;
+    return PySequence_Fast_GET_ITEM(self->text, index);
+}
+
+static int
+Tokenizer_parse_template_or_argument(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_template(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_argument(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_verify_safe(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_handle_template_param(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_handle_template_param_value(Tokenizer* self)
 {
-    PyObject* this;
+
+}
+
+static PyObject*
+Tokenizer_handle_template_end(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_handle_argument_separator(Tokenizer* self)
+{
+
+}
+
+static PyObject*
+Tokenizer_handle_argument_end(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_wikilink(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_handle_wikilink_separator(Tokenizer* self)
+{
+
+}
+
+static PyObject*
+Tokenizer_handle_wikilink_end(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_heading(Tokenizer* self)
+{
+
+}
+
+static PyObject*
+Tokenizer_handle_heading_end(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_really_parse_entity(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_entity(Tokenizer* self)
+{
+
+}
+
+static int
+Tokenizer_parse_comment(Tokenizer* self)
+{
+
+}
+
+
+/*
+    Parse the wikicode string, using context for when to stop.
+*/
+static PyObject*
+Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
+{
+    Py_ssize_t fail_contexts = (
+        CONTEXT("TEMPLATE") | CONTEXT("ARGUMENT") | CONTEXT("HEADING") |
+        CONTEXT("COMMENT"));
+
+    PyObject *this, *next;
+    Py_UNICODE this_data, next_data, next_next_data, last_data;
+    Py_ssize_t this_context;
+    int is_marker, i;

     Tokenizer_push(self, context);

     while (1) {
         this = Tokenizer_read(self, 0);
-        /* if (this not in MARKERS) {
-            WRITE TEXT
-        } */
+        this_data = *PyUnicode_AS_UNICODE(this);
+
+        is_marker = 0;
+        for (i = 0; i < NUM_MARKERS; i++) {
+            if (*MARKERS[i] == this_data) {
+                is_marker = 1;
+                break;
+            }
+        }
+
+        if (!is_marker) {
+            Tokenizer_write_text(self, this);
+            self->head++;
+            continue;
+        }
+
+        this_context = Tokenizer_CONTEXT_VAL(self);
+
         if (this == EMPTY) {
+            if (this_context & fail_contexts) {
+                Tokenizer_fail_route(self);
+            }
             return Tokenizer_pop(self);
         }
+
+        next = Tokenizer_read(self, 1);
+        next_data = *PyUnicode_AS_UNICODE(next);
+
+        if (this_context & CONTEXT("COMMENT")) {
+            if (this_data == next_data && next_data == '-') {
+                if (*PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == '>') {
+                    return Tokenizer_pop(self);
+                }
+            }
+            Tokenizer_write_text(self, this);
+        }
+        else if (this_data == next_data && next_data == '{') {
+            Tokenizer_parse_template_or_argument(self);
+        }
+        else if (this_data == '|' && this_context & CONTEXT("TEMPLATE")) {
+            Tokenizer_handle_template_param(self);
+        }
+        else if (this_data == '=' && this_context & CONTEXT("TEMPLATE_PARAM_KEY")) {
+            Tokenizer_handle_template_param_value(self);
+        }
+        else if (this_data == next_data && next_data == '}' &&
+                 this_context & CONTEXT("TEMPLATE")) {
+            return Tokenizer_handle_template_end(self);
+        }
+        else if (this_data == '|' && this_context & CONTEXT("ARGUMENT_NAME")) {
+            Tokenizer_handle_argument_separator(self);
+        }
+        else if (this_data == next_data && next_data == '}' &&
+                 this_context & CONTEXT("ARGUMENT")) {
+            if (*PyUnicode_AS_UNICODE(Tokenizer_read(self, 2)) == '}') {
+                return Tokenizer_handle_argument_end(self);
+            }
+            Tokenizer_write_text(self, this);
+        }
+        else if (this_data == next_data && next_data == '[') {
+            if (!(this_context & CONTEXT("WIKILINK_TITLE"))) {
+                Tokenizer_parse_wikilink(self);
+            }
+            else {
+                Tokenizer_write_text(self, this);
+            }
+        }
+        else if (this_data == '|' && this_context & CONTEXT("WIKILINK_TITLE")) {
+            Tokenizer_handle_wikilink_separator(self);
+        }
+        else if (this_data == next_data && next_data == ']' &&
+                 this_context & CONTEXT("WIKILINK")) {
+            return Tokenizer_handle_wikilink_end(self);
+        }
+        else if (this_data == '=' && !(self->global & CONTEXT("GL_HEADING"))) {
+            last_data = *PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, 1));
+            if (last_data == '\n' || last_data == 0) {  /* start of text */
+                Tokenizer_parse_heading(self);
+            }
+            else {
+                Tokenizer_write_text(self, this);
+            }
+        }
+        else if (this_data == '=' && this_context & CONTEXT("HEADING")) {
+            return Tokenizer_handle_heading_end(self);
+        }
+        else if (this_data == '\n' && this_context & CONTEXT("HEADING")) {
+            Tokenizer_fail_route(self);
+        }
+        else if (this_data == '&') {
+            Tokenizer_parse_entity(self);
+        }
+        else if (this_data == '<' && next_data == '!') {
+            next_next_data = *PyUnicode_AS_UNICODE(Tokenizer_read(self, 2));
+            if (next_next_data == *PyUnicode_AS_UNICODE(Tokenizer_read(self, 3)) &&
+                next_next_data == '-') {
+                Tokenizer_parse_comment(self);
+            }
+            else {
+                Tokenizer_write_text(self, this);
+            }
+        }
+        else {
+            Tokenizer_write_text(self, this);
+        }
         self->head++;
     }
 }
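
A note on the marker scan above: PU casts a char* literal to Py_UNICODE*, so an
expression like *MARKERS[i] reinterprets the bytes of a narrow string literal
as one wide character. That yields the intended value only on narrow,
little-endian builds, and for PU"" it reads past the one-byte literal entirely.
A portable sketch (the MARKER_CHARS name is hypothetical, not part of this
patch) would store the marker characters directly as Py_UNICODE values:

/* Marker characters as wide-character values; 0 stands in for the
   end-of-text marker that PU"" represents (dereferencing EMPTY's buffer
   yields the null terminator of a zero-length string). */
static const Py_UNICODE MARKER_CHARS[] = {
    '{', '}', '[', ']', '<', '>', '|', '=', '&', '#', '*', ';', ':',
    '/', '-', '!', '\n', 0
};
static const int NUM_MARKER_CHARS = 18;

The scan in Tokenizer_parse would then compare MARKER_CHARS[i] == this_data,
with no pointer casts involved.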
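A second note, on CONTEXT(name): every evaluation performs a Python attribute
lookup on the module-level contexts object, and PyObject_GetAttrString returns
a new reference that the macro never releases, so each use inside the parse
loop leaks a reference and pays a dictionary lookup per character. A minimal
sketch of caching the flags once at module initialization, assuming the same
contexts object; load_context, cache_contexts, and the ctx_* globals are
hypothetical names, not part of this patch:

static Py_ssize_t ctx_template, ctx_argument, ctx_heading, ctx_comment;

/* Fetch one context flag from Python, releasing the temporary reference
   that the CONTEXT() macro currently leaks. Returns -1 on failure. */
static Py_ssize_t
load_context(PyObject* contexts, const char* name)
{
    PyObject* value = PyObject_GetAttrString(contexts, name);
    Py_ssize_t result;

    if (!value)
        return -1;
    result = PyInt_AsSsize_t(value);
    Py_DECREF(value);
    return result;
}

/* Called once, e.g. from the module's init function. */
static int
cache_contexts(PyObject* contexts)
{
    ctx_template = load_context(contexts, "TEMPLATE");
    ctx_argument = load_context(contexts, "ARGUMENT");
    ctx_heading = load_context(contexts, "HEADING");
    ctx_comment = load_context(contexts, "COMMENT");
    if (ctx_template < 0 || ctx_argument < 0 ||
            ctx_heading < 0 || ctx_comment < 0)
        return -1;
    return 0;
}

With the flags cached, fail_contexts in Tokenizer_parse reduces to
ctx_template | ctx_argument | ctx_heading | ctx_comment, leaving no Python API
calls in the loop's context tests.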