From 7de34ed5a6151e7d3d787dbce5bbc83fdba44e64 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 1 Nov 2012 15:15:56 -0400
Subject: [PATCH] Rewrite verify_safe completely.

---
 mwparserfromhell/parser/tokenizer.c | 210 +++++++++++++-----------------------
 mwparserfromhell/parser/tokenizer.h |   9 +-
 2 files changed, 79 insertions(+), 140 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 01acd50..82cffaf 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -576,107 +576,12 @@ Tokenizer_parse_argument(Tokenizer* self)
 }
 
 /*
-    Verify that there are no unsafe characters in the current stack. The route
-    will be failed if the name contains any element of unsafes in it (not
-    merely at the beginning or end). This is used when parsing a template name
-    or parameter key, which cannot contain newlines.
-*/
-static int
-Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[])
-{
-    if (Tokenizer_push_textbuffer(self))
-        return -1;
-
-    PyObject* stack = self->topstack->stack;
-    if (stack) {
-        PyObject* textlist = PyList_New(0);
-        if (!textlist) return -1;
-
-        int i;
-        Py_ssize_t length = PyList_GET_SIZE(stack);
-        PyObject *token, *textdata;
-
-        for (i = 0; i < length; i++) {
-            token = PyList_GET_ITEM(stack, i);
-            switch (PyObject_IsInstance(token, Text)) {
-                case 0:
-                    break;
-                case 1:
-                    textdata = PyObject_GetAttrString(token, "text");
-                    if (!textdata) {
-                        Py_DECREF(textlist);
-                        return -1;
-                    }
-                    Py_DECREF(textdata);
-                    if (PyList_Append(textlist, textdata)) {
-                        Py_DECREF(textlist);
-                        Py_DECREF(textdata);
-                        return -1;
-                    }
-                    Py_DECREF(textdata);
-                    break;
-                case -1:
-                    Py_DECREF(textlist);
-                    return -1;
-            }
-        }
-
-        PyObject* text = PyUnicode_Join(EMPTY, textlist);
-        if (!text) {
-            Py_DECREF(textlist);
-            return -1;
-        }
-        Py_DECREF(textlist);
-
-        PyObject* stripped = PyObject_CallMethod(text, "strip", NULL);
-        if (!stripped) {
-            Py_DECREF(text);
-            return -1;
-        }
-        Py_DECREF(text);
-
-        const char* unsafe_char;
-        PyObject* unsafe;
-        i = 0;
-        while (1) {
-            unsafe_char = unsafes[i];
-            if (!unsafe_char) break;
-
-            unsafe = PyUnicode_FromString(unsafe_char);
-
-            if (!unsafe) {
-                Py_DECREF(stripped);
-                return -1;
-            }
-
-            switch (PyUnicode_Contains(stripped, unsafe)) {
-                case 0:
-                    break;
-                case 1:
-                    Tokenizer_fail_route(self);
-                case -1:
-                    Py_DECREF(stripped);
-                    Py_DECREF(unsafe);
-                    return -1;
-            }
-            i++;
-        }
-    }
-
-    return 0;
-}
-
-/*
     Handle a template parameter at the head of the string.
 */
 static int
 Tokenizer_handle_template_param(Tokenizer* self)
 {
     if (self->topstack->context & LC_TEMPLATE_NAME) {
-        const char* unsafes[] = {"\n", "{", "}", "[", "]", NULL};
-        if (Tokenizer_verify_safe(self, unsafes))
-            return -1;
-        if (BAD_ROUTE) return -1;
         self->topstack->context ^= LC_TEMPLATE_NAME;
     }
     else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) {
@@ -716,15 +621,6 @@ Tokenizer_handle_template_param(Tokenizer* self)
 static int
 Tokenizer_handle_template_param_value(Tokenizer* self)
 {
-    const char* unsafes[] = {"\n", "{{", "}}", NULL};
-    if (Tokenizer_verify_safe(self, unsafes)) {
-        if (BAD_ROUTE) {
-            PyObject* stack = Tokenizer_pop(self);
-            Py_XDECREF(stack);
-        }
-        return -1;
-    }
-
     PyObject* stack = Tokenizer_pop_keeping_context(self);
     if (!stack) return -1;
     if (Tokenizer_write_all(self, stack)) {
@@ -754,12 +650,7 @@ static PyObject*
 Tokenizer_handle_template_end(Tokenizer* self)
 {
     PyObject* stack;
-    if (self->topstack->context & LC_TEMPLATE_NAME) {
-        const char* unsafes[] = {"\n", "{", "}", "[", "]", NULL};
-        if (Tokenizer_verify_safe(self, unsafes))
-            return NULL;
-    }
-    else if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
+    if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
         stack = Tokenizer_pop_keeping_context(self);
         if (!stack) return NULL;
         if (Tokenizer_write_all(self, stack)) {
@@ -780,10 +671,6 @@ Tokenizer_handle_template_end(Tokenizer* self)
 static int
 Tokenizer_handle_argument_separator(Tokenizer* self)
 {
-    const char* unsafes[] = {"\n", "{{", "}}", NULL};
-    if (Tokenizer_verify_safe(self, unsafes))
-        return -1;
-
     self->topstack->context ^= LC_ARGUMENT_NAME;
     self->topstack->context |= LC_ARGUMENT_DEFAULT;
 
@@ -804,12 +691,6 @@ Tokenizer_handle_argument_separator(Tokenizer* self)
 static PyObject*
 Tokenizer_handle_argument_end(Tokenizer* self)
 {
-    if (self->topstack->context & LC_ARGUMENT_NAME) {
-        const char* unsafes[] = {"\n", "{{", "}}", NULL};
-        if (Tokenizer_verify_safe(self, unsafes))
-            return NULL;
-    }
-
     self->head += 2;
     PyObject* stack = Tokenizer_pop(self);
     return stack;
@@ -826,7 +707,6 @@ Tokenizer_parse_wikilink(Tokenizer* self)
 
     PyObject *token;
     PyObject *wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE);
-    if (!wikilink) return -1;
 
     if (BAD_ROUTE) {
         RESET_ROUTE();
@@ -838,6 +718,7 @@ Tokenizer_parse_wikilink(Tokenizer* self)
         }
         return 0;
     }
+    if (!wikilink) return -1;
 
     token = PyObject_CallObject(WikilinkOpen, NULL);
     if (!token) {
@@ -875,10 +756,6 @@ Tokenizer_parse_wikilink(Tokenizer* self)
 static int
 Tokenizer_handle_wikilink_separator(Tokenizer* self)
 {
-    const char* unsafes[] = {"\n", "{", "}", "[", "]", NULL};
-    if (Tokenizer_verify_safe(self, unsafes))
-        return -1;
-
     self->topstack->context ^= LC_WIKILINK_TITLE;
     self->topstack->context |= LC_WIKILINK_TEXT;
 
@@ -899,12 +776,6 @@ Tokenizer_handle_wikilink_separator(Tokenizer* self)
 static PyObject*
 Tokenizer_handle_wikilink_end(Tokenizer* self)
 {
-    if (self->topstack->context & LC_WIKILINK_TITLE) {
-        const char* unsafes[] = {"\n", "{", "}", "[", "]", NULL};
-        if (Tokenizer_verify_safe(self, unsafes))
-            return NULL;
-    }
-
     self->head += 1;
     PyObject* stack = Tokenizer_pop(self);
     return stack;
@@ -1124,7 +995,6 @@ Tokenizer_parse_comment(Tokenizer* self)
 
     PyObject *token;
     PyObject *comment = Tokenizer_parse(self, LC_WIKILINK_TITLE);
-    if (!comment) return -1;
 
     if (BAD_ROUTE) {
         RESET_ROUTE();
@@ -1139,7 +1009,9 @@ Tokenizer_parse_comment(Tokenizer* self)
             }
             i++;
         }
+        return 0;
     }
+    if (!comment) return -1;
 
     token = PyObject_CallObject(CommentStart, NULL);
     if (!token) {
@@ -1173,16 +1045,74 @@ Tokenizer_parse_comment(Tokenizer* self)
 }
 
 /*
+    Make sure we are not trying to write an invalid character.
+*/
+static void
+Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
+{
+    if (context & LC_FAIL_NEXT) {
+        Tokenizer_fail_route(self);
+        return;
+    }
+
+    if (context & (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE)) {
+        if (data == *"{" || data == *"}" || data == *"[" || data == *"]") {
+            self->topstack->context |= LC_FAIL_NEXT;
+            return;
+        }
+    }
+    else if (context & (LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)) {
+        if (context & LC_FAIL_ON_LBRACE) {
+            if (data == *"{") {
+                self->topstack->context |= LC_FAIL_NEXT;
+                return;
+            }
+            self->topstack->context ^= LC_FAIL_ON_LBRACE;
+        }
+        else if (context & LC_FAIL_ON_RBRACE) {
+            if (data == *"}") {
+                self->topstack->context |= LC_FAIL_NEXT;
+                return;
+            }
+            self->topstack->context ^= LC_FAIL_ON_RBRACE;
+        }
+        else if (data == *"{") {
+            self->topstack->context |= LC_FAIL_ON_LBRACE;
+        }
+        else if (data == *"}") {
+            self->topstack->context |= LC_FAIL_ON_RBRACE;
+        }
+    }
+
+    if (context & LC_HAS_TEXT) {
+        if (context & LC_FAIL_ON_TEXT) {
+            if (!Py_UNICODE_ISSPACE(data)) {
+                Tokenizer_fail_route(self);
+                return;
+            }
+        }
+        else {
+            if (data == *"\n") {
+                self->topstack->context |= LC_FAIL_ON_TEXT;
+            }
+        }
+    }
+    else if (!Py_UNICODE_ISSPACE(data)) {
+        self->topstack->context |= LC_HAS_TEXT;
+    }
+}
+
+/*
     Parse the wikicode string, using context for when to stop.
 */
 static PyObject*
 Tokenizer_parse(Tokenizer* self, int context)
 {
-    PyObject *this;
+    static int fail_contexts = LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_COMMENT;
+    static int unsafe_contexts = LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME;
+    int this_context, is_marker, i;
     Py_UNICODE this_data, next, next_next, last;
-    int this_context;
-    int fail_contexts = LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_COMMENT;
-    int is_marker, i;
+    PyObject *this;
 
     if (Tokenizer_push(self, context))
         return NULL;
@@ -1190,6 +1120,12 @@ Tokenizer_parse(Tokenizer* self, int context)
     while (1) {
         this = Tokenizer_read(self, 0);
         this_data = *PyUnicode_AS_UNICODE(this);
+        this_context = self->topstack->context;
+
+        if (this_context & unsafe_contexts) {
+            Tokenizer_verify_safe(self, this_context, this_data);
+            if (BAD_ROUTE) return NULL;
+        }
 
         is_marker = 0;
         for (i = 0; i < NUM_MARKERS; i++) {
@@ -1205,8 +1141,6 @@ Tokenizer_parse(Tokenizer* self, int context)
             continue;
         }
 
-        this_context = self->topstack->context;
-
         if (this_data == *"") {
             if (this_context & LC_TEMPLATE_PARAM_KEY) {
                 PyObject* trash = Tokenizer_pop(self);
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 468700c..2888f6b 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -98,7 +98,7 @@ static PyObject* TagCloseClose;
 #define LC_WIKILINK_TITLE       0x0020
 #define LC_WIKILINK_TEXT        0x0040
 
-#define LC_HEADING              0x1f80
+#define LC_HEADING              0x1F80
 #define LC_HEADING_LEVEL_1      0x0080
 #define LC_HEADING_LEVEL_2      0x0100
 #define LC_HEADING_LEVEL_3      0x0200
@@ -108,6 +108,11 @@ static PyObject* TagCloseClose;
 
 #define LC_COMMENT              0x2000
 
+#define LC_HAS_TEXT             0x010000
+#define LC_FAIL_ON_TEXT         0x020000
+#define LC_FAIL_NEXT            0x040000
+#define LC_FAIL_ON_LBRACE       0x080000
+#define LC_FAIL_ON_RBRACE       0x100000
 
 /*
     Global contexts:
@@ -176,7 +181,6 @@ static PyObject* Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);
 static int Tokenizer_parse_template_or_argument(Tokenizer*);
 static int Tokenizer_parse_template(Tokenizer*);
 static int Tokenizer_parse_argument(Tokenizer*);
-static int Tokenizer_verify_safe(Tokenizer*, const char* []);
 static int Tokenizer_handle_template_param(Tokenizer*);
 static int Tokenizer_handle_template_param_value(Tokenizer*);
 static PyObject* Tokenizer_handle_template_end(Tokenizer*);
@@ -190,6 +194,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer*);
 static int Tokenizer_really_parse_entity(Tokenizer*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
+static void Tokenizer_verify_safe(Tokenizer*, int, Py_UNICODE);
 static PyObject* Tokenizer_parse(Tokenizer*, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
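
Editor's note on the new approach (not part of the commit): instead of scanning the finished name for unsafe substrings, Tokenizer_parse() now calls Tokenizer_verify_safe() once per character whenever a template name, wikilink title, parameter key, or argument name is open, and the function records progress in the new LC_HAS_TEXT, LC_FAIL_ON_TEXT, LC_FAIL_NEXT, LC_FAIL_ON_LBRACE, and LC_FAIL_ON_RBRACE context flags, failing the route one character after an unsafe sequence is seen. The standalone program below is a minimal sketch of that state machine for the template-name/wikilink-title rules, run over a plain char string instead of Py_UNICODE and without the tokenizer's route machinery; check_name_char() is a hypothetical helper invented for illustration, and only the flag names come from tokenizer.h.

/*
    verify_safe_sketch.c -- illustrative only, NOT code from the library.
    Mirrors the per-character checks the new Tokenizer_verify_safe() applies
    to template names and wikilink titles: braces/brackets doom the route on
    the following character, and non-space text after a newline fails it.
*/
#include <ctype.h>
#include <stdio.h>

#define LC_HAS_TEXT     0x01
#define LC_FAIL_ON_TEXT 0x02
#define LC_FAIL_NEXT    0x04

/* Return 0 while the name is still acceptable, -1 once the route must fail. */
static int
check_name_char(int* context, char data)
{
    if (*context & LC_FAIL_NEXT)
        return -1;                        /* an earlier character doomed us */
    if (data == '{' || data == '}' || data == '[' || data == ']') {
        *context |= LC_FAIL_NEXT;         /* brace/bracket: fail on the next char */
        return 0;
    }
    if (*context & LC_HAS_TEXT) {
        if (*context & LC_FAIL_ON_TEXT) {
            if (!isspace((unsigned char) data))
                return -1;                /* real text after a newline */
        }
        else if (data == '\n')
            *context |= LC_FAIL_ON_TEXT;  /* newline seen; further text is unsafe */
    }
    else if (!isspace((unsigned char) data))
        *context |= LC_HAS_TEXT;          /* first non-space character of the name */
    return 0;
}

int
main(void)
{
    const char* labels[] = {"\"foo bar\"", "\"foo\\nbar\"", "\"fo{o\""};
    const char* names[]  = {"foo bar",     "foo\nbar",      "fo{o"};
    int i, j, context, failed;

    for (i = 0; i < 3; i++) {
        context = 0;
        failed = 0;
        for (j = 0; names[i][j]; j++) {
            if (check_name_char(&context, names[i][j])) {
                failed = 1;
                break;
            }
        }
        printf("%-12s -> %s\n", labels[i], failed ? "fail route" : "safe");
    }
    return 0;
}

Built with any C compiler, it prints "safe" only for "foo bar"; the other two names fail because a character follows the newline or the brace, which is roughly when the rewritten verify_safe abandons the current route.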