From d6f2723a06c45d92e478cffeedf3ce2c4be21a43 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 24 Jan 2013 03:07:36 -0500
Subject: [PATCH] Fix safety checks on template params in some odd cases
 (closes #24).

Also, fix parsing of wikilinks in both tokenizers such that newlines in any
location within the title are an automatic failure.
---
 mwparserfromhell/parser/tokenizer.c  | 57 ++++++++++++++++++++++++++++----------
 mwparserfromhell/parser/tokenizer.h  |  1 +
 mwparserfromhell/parser/tokenizer.py | 16 ++++++----
 3 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 40ec723..09649a7 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -1144,17 +1144,24 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
         Tokenizer_fail_route(self);
         return;
     }
-    if (context & (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE)) {
-        if (data == *"{" || data == *"}" || data == *"[" || data == *"]") {
+    if (context & LC_WIKILINK_TITLE) {
+        if (data == *"]" || data == *"{")
+            self->topstack->context |= LC_FAIL_NEXT;
+        else if (data == *"\n" || data == *"[" || data == *"}")
+            Tokenizer_fail_route(self);
+        return;
+    }
+    if (context & LC_TEMPLATE_NAME) {
+        if (data == *"{" || data == *"}" || data == *"[") {
             self->topstack->context |= LC_FAIL_NEXT;
             return;
         }
-        if (data == *"|") {
-            if (context & LC_FAIL_ON_TEXT) {
-                self->topstack->context ^= LC_FAIL_ON_TEXT;
-                return;
-            }
+        if (data == *"]") {
+            Tokenizer_fail_route(self);
+            return;
         }
+        if (data == *"|")
+            return;
     }
     else if (context & (LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)) {
         if (context & LC_FAIL_ON_EQUALS) {
@@ -1210,6 +1217,28 @@
 }

 /*
+    Unset any safety-checking contexts set by Tokenizer_verify_safe(). Used
+    when we preserve a context but previous data becomes invalid, like when
+    moving between template parameters.
+*/
+static void
+Tokenizer_reset_safety_checks(Tokenizer* self)
+{
+    static int checks[] = {
+        LC_HAS_TEXT, LC_FAIL_ON_TEXT, LC_FAIL_NEXT, LC_FAIL_ON_LBRACE,
+        LC_FAIL_ON_RBRACE, LC_FAIL_ON_EQUALS, 0};
+    int context = self->topstack->context, i = 0, this;
+    while (1) {
+        this = checks[i];
+        if (!this)
+            return;
+        if (context & this)
+            self->topstack->context ^= this;
+        i++;
+    }
+}
+
+/*
     Parse the wikicode string, using context for when to stop.
 */
 static PyObject*
@@ -1274,6 +1303,7 @@ Tokenizer_parse(Tokenizer* self, int context)
                 self->topstack->context ^= LC_FAIL_NEXT;
         }
         else if (this == *"|" && this_context & LC_TEMPLATE) {
+            Tokenizer_reset_safety_checks(self);
             if (Tokenizer_handle_template_param(self))
                 return NULL;
         }
@@ -1294,15 +1324,10 @@ Tokenizer_parse(Tokenizer* self, int context)
             Tokenizer_write_text(self, this);
         }
         else if (this == next && next == *"[") {
-            if (!(this_context & LC_WIKILINK_TITLE)) {
-                if (Tokenizer_parse_wikilink(self))
-                    return NULL;
-                if (self->topstack->context & LC_FAIL_NEXT)
-                    self->topstack->context ^= LC_FAIL_NEXT;
-            }
-            else {
-                Tokenizer_write_text(self, this);
-            }
+            if (Tokenizer_parse_wikilink(self))
+                return NULL;
+            if (self->topstack->context & LC_FAIL_NEXT)
+                self->topstack->context ^= LC_FAIL_NEXT;
         }
         else if (this == *"|" && this_context & LC_WIKILINK_TITLE) {
             if (Tokenizer_handle_wikilink_separator(self))
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index dffa0fb..3293a8f 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -206,6 +206,7 @@
 static int Tokenizer_really_parse_entity(Tokenizer*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
 static void Tokenizer_verify_safe(Tokenizer*, int, Py_UNICODE);
+static void Tokenizer_reset_safety_checks(Tokenizer*);
 static PyObject* Tokenizer_parse(Tokenizer*, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index a2b405c..eead131 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -213,17 +213,21 @@ class Tokenizer(object):
         self._write_all(argument)
         self._write(tokens.ArgumentClose())

-    def _verify_safe(self, unsafes):
+    def _verify_safe(self, unsafes, strip=True):
         """Verify that there are no unsafe characters in the current stack.

         The route will be failed if the name contains any element of *unsafes*
-        in it (not merely at the beginning or end). This is used when parsing a
-        template name or parameter key, which cannot contain newlines.
+        in it. This is used when parsing template names, parameter keys, and so
+        on, which cannot contain newlines and some other characters. If *strip*
+        is ``True``, the text will be stripped of whitespace, since this is
+        allowed at the ends of certain elements but not between text.
         """
         self._push_textbuffer()
         if self._stack:
             text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
-            text = "".join([token.text for token in text]).strip()
+            text = "".join([token.text for token in text])
+            if strip:
+                text = text.strip()
             if text and any([unsafe in text for unsafe in unsafes]):
                 self._fail_route()

@@ -291,7 +295,7 @@ class Tokenizer(object):

     def _handle_wikilink_separator(self):
         """Handle the separator between a wikilink's title and its text."""
-        self._verify_safe(["\n", "{", "}", "[", "]"])
+        self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._context ^= contexts.WIKILINK_TITLE
         self._context |= contexts.WIKILINK_TEXT
         self._write(tokens.WikilinkSeparator())
@@ -299,7 +303,7 @@ class Tokenizer(object):
     def _handle_wikilink_end(self):
         """Handle the end of a wikilink at the head of the string."""
         if self._context & contexts.WIKILINK_TITLE:
-            self._verify_safe(["\n", "{", "}", "[", "]"])
+            self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._head += 1
         return self._pop()
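
A quick sanity check of the wikilink rule this patch enforces (not part of
the patch itself): the snippet below is a rough sketch that assumes a build
of mwparserfromhell with these changes applied, and the sample wikitext
strings are illustrative only. It uses the library's public parse() entry
point and the Wikicode.filter_wikilinks() accessor.

    # Rough sketch: newlines anywhere in a wikilink title now fail the parse.
    # Assumes mwparserfromhell is installed with this patch applied.
    import mwparserfromhell

    # A well-formed title still parses into a wikilink node.
    good = mwparserfromhell.parse("[[Foo bar|baz]]")
    print(good.filter_wikilinks())  # expected: one wikilink, [[Foo bar|baz]]

    # A newline anywhere in the title is an automatic failure, so the
    # brackets and title fall back to plain text.
    bad = mwparserfromhell.parse("[[Foo\nbar]]")
    print(bad.filter_wikilinks())   # expected: an empty list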