From f16c7e25cac66bb3430cbf223a44844493cda1f3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 24 Jul 2015 02:29:59 -0400 Subject: [PATCH] Fully fix parsing templates with blank names, I hope (#111) --- CHANGELOG | 3 +++ docs/changelog.rst | 5 ++++ mwparserfromhell/parser/contexts.py | 18 +++++++------ mwparserfromhell/parser/tokenizer.c | 52 ++++++++++++++++++++++-------------- mwparserfromhell/parser/tokenizer.h | 21 ++++++++------- mwparserfromhell/parser/tokenizer.py | 35 +++++++++++++++--------- tests/tokenizer/templates.mwtest | 4 +-- 7 files changed, 85 insertions(+), 53 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index c696b98..5b5d794 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,9 @@ v0.4.1 (unreleased): includes when denoting tags, but not comments. - Fixed the behavior of preserve_spacing in Template.add() and keep_field in Template.remove() on parameters with hidden keys. +- Fixed parser bugs involving: + - templates with completely blank names; + - templates with newlines and comments. - Fixed some bugs in the release scripts. v0.4 (released May 23, 2015): diff --git a/docs/changelog.rst b/docs/changelog.rst index 54f8af8..4e64a8b 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -15,6 +15,11 @@ Unreleased This includes when denoting tags, but not comments. - Fixed the behavior of *preserve_spacing* in :func:`~.Template.add` and *keep_field* in :func:`~.Template.remove` on parameters with hidden keys. +- Fixed parser bugs involving: + + - templates with completely blank names; + - templates with newlines and comments. + - Fixed some bugs in the release scripts. v0.4 diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index e98d8f7..b676e86 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -89,6 +89,7 @@ Local (stack-specific) contexts: * :const:`FAIL_ON_LBRACE` * :const:`FAIL_ON_RBRACE` * :const:`FAIL_ON_EQUALS` + * :const:`HAS_TEMPLATE` * :const:`TABLE` @@ -161,15 +162,16 @@ FAIL_NEXT = 1 << 26 FAIL_ON_LBRACE = 1 << 27 FAIL_ON_RBRACE = 1 << 28 FAIL_ON_EQUALS = 1 << 29 +HAS_TEMPLATE = 1 << 30 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + - FAIL_ON_RBRACE + FAIL_ON_EQUALS) - -TABLE_OPEN = 1 << 30 -TABLE_CELL_OPEN = 1 << 31 -TABLE_CELL_STYLE = 1 << 32 -TABLE_ROW_OPEN = 1 << 33 -TABLE_TD_LINE = 1 << 34 -TABLE_TH_LINE = 1 << 35 + FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE) + +TABLE_OPEN = 1 << 31 +TABLE_CELL_OPEN = 1 << 32 +TABLE_CELL_STYLE = 1 << 33 +TABLE_ROW_OPEN = 1 << 34 +TABLE_TD_LINE = 1 << 35 +TABLE_TH_LINE = 1 << 36 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + TABLE_TD_LINE + TABLE_TH_LINE) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index ec0315f..f4e801b 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -582,12 +582,16 @@ static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) /* Parse a template at the head of the wikicode string. 
*/ -static int Tokenizer_parse_template(Tokenizer* self) +static int Tokenizer_parse_template(Tokenizer* self, int has_content) { PyObject *template; Py_ssize_t reset = self->head; + uint64_t context = LC_TEMPLATE_NAME; - template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1); + if (has_content) + context |= LC_HAS_TEMPLATE; + + template = Tokenizer_parse(self, context, 1); if (BAD_ROUTE) { self->head = reset; return 0; @@ -643,6 +647,7 @@ static int Tokenizer_parse_argument(Tokenizer* self) static int Tokenizer_parse_template_or_argument(Tokenizer* self) { unsigned int braces = 2, i; + int has_content = 0; PyObject *tokenlist; self->head += 2; @@ -659,7 +664,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) return 0; } if (braces == 2) { - if (Tokenizer_parse_template(self)) + if (Tokenizer_parse_template(self, has_content)) return -1; if (BAD_ROUTE) { RESET_ROUTE(); @@ -673,7 +678,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) return -1; if (BAD_ROUTE) { RESET_ROUTE(); - if (Tokenizer_parse_template(self)) + if (Tokenizer_parse_template(self, has_content)) return -1; if (BAD_ROUTE) { char text[MAX_BRACES + 1]; @@ -689,8 +694,10 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) } else braces -= 3; - if (braces) + if (braces) { + has_content = 1; self->head++; + } } tokenlist = Tokenizer_pop(self); if (!tokenlist) @@ -712,8 +719,13 @@ static int Tokenizer_handle_template_param(Tokenizer* self) { PyObject *stack; - if (self->topstack->context & LC_TEMPLATE_NAME) + if (self->topstack->context & LC_TEMPLATE_NAME) { + if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) { + Tokenizer_fail_route(self); + return -1; + } self->topstack->context ^= LC_TEMPLATE_NAME; + } else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE; if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { @@ -764,7 +776,11 @@ static PyObject* Tokenizer_handle_template_end(Tokenizer* self) { PyObject* stack; - if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { + if (self->topstack->context & LC_TEMPLATE_NAME) { + if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) + return Tokenizer_fail_route(self); + } + else if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { stack = Tokenizer_pop_keeping_context(self); if (!stack) return NULL; @@ -2885,30 +2901,26 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) if (context & LC_TAG_CLOSE) return (data == '<') ? 
-1 : 0; if (context & LC_TEMPLATE_NAME) { - if (data == '{' || data == '}' || data == '[') { + if (data == '{') { + self->topstack->context |= LC_HAS_TEMPLATE | LC_FAIL_NEXT; + return 0; + } + if (data == '}' || (data == '<' && Tokenizer_READ(self, 1) == '!')) { self->topstack->context |= LC_FAIL_NEXT; return 0; } - if (data == ']' || data == '>' || (data == '<' && - Tokenizer_READ(self, 1) != '!')) { + if (data == '[' || data == ']' || data == '<' || data == '>') { return -1; } if (data == '|') return 0; if (context & LC_HAS_TEXT) { if (context & LC_FAIL_ON_TEXT) { - if (!Py_UNICODE_ISSPACE(data)) { - if (data == '<' && Tokenizer_READ(self, 1) == '!') { - self->topstack->context |= LC_FAIL_NEXT; - return 0; - } + if (!Py_UNICODE_ISSPACE(data)) return -1; - } - } - else { - if (data == '\n') - self->topstack->context |= LC_FAIL_ON_TEXT; } + else if (data == '\n') + self->topstack->context |= LC_FAIL_ON_TEXT; } else if (!Py_UNICODE_ISSPACE(data)) self->topstack->context |= LC_HAS_TEXT; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 842e65d..d477acb 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -150,22 +150,23 @@ static PyObject* TagCloseClose; #define LC_DLTERM 0x0000000000800000 -#define LC_SAFETY_CHECK 0x000000003F000000 +#define LC_SAFETY_CHECK 0x000000007F000000 #define LC_HAS_TEXT 0x0000000001000000 #define LC_FAIL_ON_TEXT 0x0000000002000000 #define LC_FAIL_NEXT 0x0000000004000000 #define LC_FAIL_ON_LBRACE 0x0000000008000000 #define LC_FAIL_ON_RBRACE 0x0000000010000000 #define LC_FAIL_ON_EQUALS 0x0000000020000000 - -#define LC_TABLE 0x0000000FC0000000 -#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 -#define LC_TABLE_OPEN 0x0000000040000000 -#define LC_TABLE_CELL_OPEN 0x0000000080000000 -#define LC_TABLE_CELL_STYLE 0x0000000100000000 -#define LC_TABLE_ROW_OPEN 0x0000000200000000 -#define LC_TABLE_TD_LINE 0x0000000400000000 -#define LC_TABLE_TH_LINE 0x0000000800000000 +#define LC_HAS_TEMPLATE 0x0000000040000000 + +#define LC_TABLE 0x0000001F80000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000 +#define LC_TABLE_OPEN 0x0000000080000000 +#define LC_TABLE_CELL_OPEN 0x0000000100000000 +#define LC_TABLE_CELL_STYLE 0x0000000200000000 +#define LC_TABLE_ROW_OPEN 0x0000000400000000 +#define LC_TABLE_TD_LINE 0x0000000800000000 +#define LC_TABLE_TH_LINE 0x0000001000000000 /* Global contexts: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 4d7d885..5c89455 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -192,11 +192,14 @@ class Tokenizer(object): self._fail_route() return self.END - def _parse_template(self): + def _parse_template(self, has_content): """Parse a template at the head of the wikicode string.""" reset = self._head + context = contexts.TEMPLATE_NAME + if has_content: + context |= contexts.HAS_TEMPLATE try: - template = self._parse(contexts.TEMPLATE_NAME) + template = self._parse(context) except BadRoute: self._head = reset raise @@ -223,6 +226,7 @@ class Tokenizer(object): while self._read() == "{": self._head += 1 braces += 1 + has_content = False self._push() while braces: @@ -230,7 +234,7 @@ class Tokenizer(object): return self._emit_text_then_stack("{") if braces == 2: try: - self._parse_template() + self._parse_template(has_content) except BadRoute: return self._emit_text_then_stack("{{") break @@ -239,11 +243,12 @@ class Tokenizer(object): braces -= 3 except BadRoute: try: - 
self._parse_template() + self._parse_template(has_content) braces -= 2 except BadRoute: return self._emit_text_then_stack("{" * braces) if braces: + has_content = True self._head += 1 self._emit_all(self._pop()) @@ -253,6 +258,8 @@ class Tokenizer(object): def _handle_template_param(self): """Handle a template parameter at the head of the string.""" if self._context & contexts.TEMPLATE_NAME: + if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE): + self._fail_route() self._context ^= contexts.TEMPLATE_NAME elif self._context & contexts.TEMPLATE_PARAM_VALUE: self._context ^= contexts.TEMPLATE_PARAM_VALUE @@ -271,7 +278,10 @@ class Tokenizer(object): def _handle_template_end(self): """Handle the end of a template at the head of the string.""" - if self._context & contexts.TEMPLATE_PARAM_KEY: + if self._context & contexts.TEMPLATE_NAME: + if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE): + self._fail_route() + elif self._context & contexts.TEMPLATE_PARAM_KEY: self._emit_all(self._pop(keep_context=True)) self._head += 1 return self._pop() @@ -1183,23 +1193,22 @@ class Tokenizer(object): elif context & contexts.EXT_LINK_TITLE: return this != "\n" elif context & contexts.TEMPLATE_NAME: - if this == "{" or this == "}" or this == "[": + if this == "{": + self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT + return True + if this == "}" or (this == "<" and self._read(1) == "!"): self._context |= contexts.FAIL_NEXT return True - if this == "]" or this == ">" or (this == "<" and self._read(1) != "!"): + if this == "[" or this == "]" or this == "<" or this == ">": return False if this == "|": return True if context & contexts.HAS_TEXT: if context & contexts.FAIL_ON_TEXT: if this is self.END or not this.isspace(): - if this == "<" and self._read(1) == "!": - self._context |= contexts.FAIL_NEXT - return True return False - else: - if this == "\n": - self._context |= contexts.FAIL_ON_TEXT + elif this == "\n": + self._context |= contexts.FAIL_ON_TEXT elif this is self.END or not this.isspace(): self._context |= contexts.HAS_TEXT return True diff --git a/tests/tokenizer/templates.mwtest b/tests/tokenizer/templates.mwtest index 4756ac6..1913f5d 100644 --- a/tests/tokenizer/templates.mwtest +++ b/tests/tokenizer/templates.mwtest @@ -686,5 +686,5 @@ output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ name: recursion_opens_and_closes label: test potentially dangerous recursion: template openings and closings -input: "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}" -output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")] +input: "{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}" +output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), TemplateOpen(), Text(text="x"), TemplateParamSeparator(), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x"), TemplateParamSeparator(), Text(text="{{x"), TemplateClose(), Text(text="{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}")]
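
As an end-to-end check of the new behavior (a sketch only, assuming a build
that includes this patch; parse() and filter_templates() are existing public
API, and the inputs are merely illustrative):

    import mwparserfromhell

    # A completely blank name no longer yields a template; the braces fall
    # back to plain text.
    assert mwparserfromhell.parse("{{}}").filter_templates() == []
    assert mwparserfromhell.parse("{{|foo}}").filter_templates() == []

    # A name consisting only of another template is still legal -- this is
    # the case the new HAS_TEMPLATE context bit preserves.
    nested = mwparserfromhell.parse("{{ {{foo}} }}")
    assert len(nested.filter_templates()) == 2  # outer and inner

    # Ordinary templates are unaffected.
    assert len(mwparserfromhell.parse("{{foo|bar}}").filter_templates()) == 1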
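
The predicate itself is tiny: when the tokenizer reaches "|" or "}}" while
still in the TEMPLATE_NAME context, the route now fails unless the name has
accumulated HAS_TEXT (some non-whitespace character) or the new HAS_TEMPLATE
bit (a template was parsed inside the name). A minimal standalone model of
that check, reusing the bit values from contexts.py but not the library's
actual tokenizer:

    # Bit values match those defined in mwparserfromhell/parser/contexts.py.
    HAS_TEXT = 1 << 24
    HAS_TEMPLATE = 1 << 30

    def blank_name(context):
        """True when '|' or '}}' should fail the TEMPLATE_NAME route."""
        return not context & (HAS_TEXT | HAS_TEMPLATE)

    assert blank_name(0)                 # "{{}}"          -> falls back to text
    assert not blank_name(HAS_TEXT)      # "{{foo}}"       -> parses
    assert not blank_name(HAS_TEMPLATE)  # "{{ {{foo}} }}" -> parses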