From 0803417901d09d7df830e65300355507715e67cb Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 23 Feb 2013 13:12:16 -0500
Subject: [PATCH] Port CTokenizer's verify_safe method to Python to solve a failing test.

---
 mwparserfromhell/parser/contexts.py  |  62 +++++++++++-------
 mwparserfromhell/parser/tokenizer.c  |  12 ++--
 mwparserfromhell/parser/tokenizer.h  |   1 +
 mwparserfromhell/parser/tokenizer.py | 122 +++++++++++++++++++++++++----------
 4 files changed, 137 insertions(+), 60 deletions(-)

diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py
index b65946c..896d137 100644
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -62,6 +62,15 @@ Local (stack-specific) contexts:
 
 * :py:const:`COMMENT`
 
+* :py:const:`SAFETY_CHECK`
+
+    * :py:const:`HAS_TEXT`
+    * :py:const:`FAIL_ON_TEXT`
+    * :py:const:`FAIL_NEXT`
+    * :py:const:`FAIL_ON_LBRACE`
+    * :py:const:`FAIL_ON_RBRACE`
+    * :py:const:`FAIL_ON_EQUALS`
+
 Global contexts:
 
 * :py:const:`GL_HEADING`
@@ -69,29 +78,36 @@
 
 # Local contexts:
 
-TEMPLATE = 0b00000000000111
-TEMPLATE_NAME = 0b00000000000001
-TEMPLATE_PARAM_KEY = 0b00000000000010
-TEMPLATE_PARAM_VALUE = 0b00000000000100
-
-ARGUMENT = 0b00000000011000
-ARGUMENT_NAME = 0b00000000001000
-ARGUMENT_DEFAULT = 0b00000000010000
-
-WIKILINK = 0b00000001100000
-WIKILINK_TITLE = 0b00000000100000
-WIKILINK_TEXT = 0b00000001000000
-
-HEADING = 0b01111110000000
-HEADING_LEVEL_1 = 0b00000010000000
-HEADING_LEVEL_2 = 0b00000100000000
-HEADING_LEVEL_3 = 0b00001000000000
-HEADING_LEVEL_4 = 0b00010000000000
-HEADING_LEVEL_5 = 0b00100000000000
-HEADING_LEVEL_6 = 0b01000000000000
-
-COMMENT = 0b10000000000000
-
+TEMPLATE = 0b00000000000000000111
+TEMPLATE_NAME = 0b00000000000000000001
+TEMPLATE_PARAM_KEY = 0b00000000000000000010
+TEMPLATE_PARAM_VALUE = 0b00000000000000000100
+
+ARGUMENT = 0b00000000000000011000
+ARGUMENT_NAME = 0b00000000000000001000
+ARGUMENT_DEFAULT = 0b00000000000000010000
+
+WIKILINK = 0b00000000000001100000
+WIKILINK_TITLE = 0b00000000000000100000
+WIKILINK_TEXT = 0b00000000000001000000
+
+HEADING = 0b00000001111110000000
+HEADING_LEVEL_1 = 0b00000000000010000000
+HEADING_LEVEL_2 = 0b00000000000100000000
+HEADING_LEVEL_3 = 0b00000000001000000000
+HEADING_LEVEL_4 = 0b00000000010000000000
+HEADING_LEVEL_5 = 0b00000000100000000000
+HEADING_LEVEL_6 = 0b00000001000000000000
+
+COMMENT = 0b00000010000000000000
+
+SAFETY_CHECK = 0b11111100000000000000
+HAS_TEXT = 0b00000100000000000000
+FAIL_ON_TEXT = 0b00001000000000000000
+FAIL_NEXT = 0b00010000000000000000
+FAIL_ON_LBRACE = 0b00100000000000000000
+FAIL_ON_RBRACE = 0b01000000000000000000
+FAIL_ON_EQUALS = 0b10000000000000000000
 
 # Global contexts:
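
Each context above is a single bit, and SAFETY_CHECK is the bitwise OR of its
six new sub-contexts, so one mask test tells the tokenizer whether any safety
flag is live. A minimal sketch of the flag arithmetic (illustrative only, not
part of the patch):

    HAS_TEXT       = 0b00000100000000000000
    FAIL_ON_TEXT   = 0b00001000000000000000
    FAIL_NEXT      = 0b00010000000000000000
    FAIL_ON_LBRACE = 0b00100000000000000000
    FAIL_ON_RBRACE = 0b01000000000000000000
    FAIL_ON_EQUALS = 0b10000000000000000000
    SAFETY_CHECK   = 0b11111100000000000000

    # SAFETY_CHECK covers exactly the six flags:
    assert SAFETY_CHECK == (HAS_TEXT | FAIL_ON_TEXT | FAIL_NEXT |
                            FAIL_ON_LBRACE | FAIL_ON_RBRACE | FAIL_ON_EQUALS)

    context = 0                       # stands in for self._context
    context |= FAIL_ON_LBRACE         # set a flag with |=
    assert context & SAFETY_CHECK     # test membership with &
    context ^= FAIL_ON_LBRACE         # clear a known-set flag with ^=
    assert not context & SAFETY_CHECK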
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 09649a7..d82b080 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -1324,10 +1324,14 @@ Tokenizer_parse(Tokenizer* self, int context)
                 Tokenizer_write_text(self, this);
         }
         else if (this == next && next == *"[") {
-            if (Tokenizer_parse_wikilink(self))
-                return NULL;
-            if (self->topstack->context & LC_FAIL_NEXT)
-                self->topstack->context ^= LC_FAIL_NEXT;
+            if (!(this_context & LC_WIKILINK_TITLE)) {
+                if (Tokenizer_parse_wikilink(self))
+                    return NULL;
+                if (self->topstack->context & LC_FAIL_NEXT)
+                    self->topstack->context ^= LC_FAIL_NEXT;
+            }
+            else
+                Tokenizer_write_text(self, this);
         }
         else if (this == *"|" && this_context & LC_WIKILINK_TITLE) {
             if (Tokenizer_handle_wikilink_separator(self))
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 3293a8f..af86321 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -118,6 +118,7 @@ static PyObject* TagCloseClose;
 
 #define LC_COMMENT 0x02000
 
+#define LC_SAFETY_CHECK 0xFC000
 #define LC_HAS_TEXT 0x04000
 #define LC_FAIL_ON_TEXT 0x08000
 #define LC_FAIL_NEXT 0x10000
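
On the C side the six flags already existed; the patch only adds the
LC_SAFETY_CHECK umbrella mask. Its hex value lines up bit for bit with the
binary constants in contexts.py. A quick cross-check, assuming the neighboring
defines not shown in the hunk keep the same bit order as the Python constants
(LC_FAIL_ON_LBRACE 0x20000, LC_FAIL_ON_RBRACE 0x40000, LC_FAIL_ON_EQUALS
0x80000):

    # The flags occupy bits 14 through 19, so their union is 0xFC000,
    # the value given to LC_SAFETY_CHECK above.
    assert 0x04000 == 0b00000100000000000000  # LC_HAS_TEXT     == HAS_TEXT
    assert 0x08000 == 0b00001000000000000000  # LC_FAIL_ON_TEXT == FAIL_ON_TEXT
    assert 0x10000 == 0b00010000000000000000  # LC_FAIL_NEXT    == FAIL_NEXT
    assert 0xFC000 == (0x04000 | 0x08000 | 0x10000 |
                       0x20000 | 0x40000 | 0x80000)  # == SAFETY_CHECK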
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index eead131..a365db8 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -213,28 +213,9 @@ class Tokenizer(object):
         self._write_all(argument)
         self._write(tokens.ArgumentClose())
 
-    def _verify_safe(self, unsafes, strip=True):
-        """Verify that there are no unsafe characters in the current stack.
-
-        The route will be failed if the name contains any element of *unsafes*
-        in it. This is used when parsing template names, parameter keys, and so
-        on, which cannot contain newlines and some other characters. If *strip*
-        is ``True``, the text will be stripped of whitespace, since this is
-        allowed at the ends of certain elements but not between text.
-        """
-        self._push_textbuffer()
-        if self._stack:
-            text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
-            text = "".join([token.text for token in text])
-            if strip:
-                text = text.strip()
-            if text and any([unsafe in text for unsafe in unsafes]):
-                self._fail_route()
-
     def _handle_template_param(self):
         """Handle a template parameter at the head of the string."""
         if self._context & contexts.TEMPLATE_NAME:
-            self._verify_safe(["\n", "{", "}", "[", "]"])
             self._context ^= contexts.TEMPLATE_NAME
         elif self._context & contexts.TEMPLATE_PARAM_VALUE:
             self._context ^= contexts.TEMPLATE_PARAM_VALUE
@@ -246,11 +227,6 @@ class Tokenizer(object):
 
     def _handle_template_param_value(self):
         """Handle a template parameter's value at the head of the string."""
-        try:
-            self._verify_safe(["\n", "{{", "}}"])
-        except BadRoute:
-            self._pop()
-            raise
         self._write_all(self._pop(keep_context=True))
         self._context ^= contexts.TEMPLATE_PARAM_KEY
         self._context |= contexts.TEMPLATE_PARAM_VALUE
@@ -258,24 +234,19 @@ class Tokenizer(object):
 
     def _handle_template_end(self):
         """Handle the end of a template at the head of the string."""
-        if self._context & contexts.TEMPLATE_NAME:
-            self._verify_safe(["\n", "{", "}", "[", "]"])
-        elif self._context & contexts.TEMPLATE_PARAM_KEY:
+        if self._context & contexts.TEMPLATE_PARAM_KEY:
             self._write_all(self._pop(keep_context=True))
         self._head += 1
         return self._pop()
 
     def _handle_argument_separator(self):
         """Handle the separator between an argument's name and default."""
-        self._verify_safe(["\n", "{{", "}}"])
         self._context ^= contexts.ARGUMENT_NAME
         self._context |= contexts.ARGUMENT_DEFAULT
         self._write(tokens.ArgumentSeparator())
 
     def _handle_argument_end(self):
         """Handle the end of an argument at the head of the string."""
-        if self._context & contexts.ARGUMENT_NAME:
-            self._verify_safe(["\n", "{{", "}}"])
         self._head += 2
         return self._pop()
 
@@ -295,15 +266,12 @@
 
     def _handle_wikilink_separator(self):
         """Handle the separator between a wikilink's title and its text."""
-        self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._context ^= contexts.WIKILINK_TITLE
         self._context |= contexts.WIKILINK_TEXT
         self._write(tokens.WikilinkSeparator())
 
     def _handle_wikilink_end(self):
         """Handle the end of a wikilink at the head of the string."""
-        if self._context & contexts.WIKILINK_TITLE:
-            self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._head += 1
         return self._pop()
 
@@ -424,11 +392,94 @@
         self._write(tokens.CommentEnd())
         self._head += 2
 
+    def _verify_safe(self, this):
+        """Make sure we are not trying to write an invalid character."""
+        context = self._context
+        if context & contexts.FAIL_NEXT:
+            self._fail_route()
+        if context & contexts.WIKILINK_TITLE:
+            if this == "]" or this == "{":
+                self._context |= contexts.FAIL_NEXT
+            elif this == "\n" or this == "[" or this == "}":
+                self._fail_route()
+            return
+        if context & contexts.TEMPLATE_NAME:
+            if this == "{" or this == "}" or this == "[":
+                self._context |= contexts.FAIL_NEXT
+                return
+            if this == "]":
+                self._fail_route()
+                return
+            if this == "|":
+                return
+        elif context & (contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME):
+            if context & contexts.FAIL_ON_EQUALS:
+                if this == "=":
+                    self._fail_route()
+                    return
+            elif context & contexts.FAIL_ON_LBRACE:
+                if this == "{":
+                    if context & contexts.TEMPLATE:
+                        self._context |= contexts.FAIL_ON_EQUALS
+                    else:
+                        self._context |= contexts.FAIL_NEXT
+                    return
+                self._context ^= contexts.FAIL_ON_LBRACE
+            elif context & contexts.FAIL_ON_RBRACE:
+                if this == "}":
+                    if context & contexts.TEMPLATE:
+                        self._context |= contexts.FAIL_ON_EQUALS
+                    else:
+                        self._context |= contexts.FAIL_NEXT
+                    return
+                self._context ^= contexts.FAIL_ON_RBRACE
+            elif this == "{":
+                self._context |= contexts.FAIL_ON_LBRACE
+            elif this == "}":
+                self._context |= contexts.FAIL_ON_RBRACE
+        if context & contexts.HAS_TEXT:
+            if context & contexts.FAIL_ON_TEXT:
+                if this is self.END or not this.isspace():
+                    if context & contexts.TEMPLATE_PARAM_KEY:
+                        self._context ^= contexts.FAIL_ON_TEXT
+                        self._context |= contexts.FAIL_ON_EQUALS
+                    else:
+                        self._fail_route()
+                    return
+            else:
+                if this == "\n":
+                    self._context |= contexts.FAIL_ON_TEXT
+        elif this is self.END or not this.isspace():
+            self._context |= contexts.HAS_TEXT
+
+    def _reset_safety_checks(self):
+        """Unset any safety-checking contexts set by Tokenizer_verify_safe().
+
+        Used when we preserve a context but previous data becomes invalid, like
+        when moving between template parameters.
+        """
+        context = self._context
+        checks = (contexts.HAS_TEXT, contexts.FAIL_ON_TEXT, contexts.FAIL_NEXT,
+                  contexts.FAIL_ON_LBRACE, contexts.FAIL_ON_RBRACE,
+                  contexts.FAIL_ON_EQUALS)
+        for check in checks:
+            if context & check:
+                self._context ^= check
+
     def _parse(self, context=0):
         """Parse the wikicode string, using *context* for when to stop."""
         self._push(context)
         while True:
             this = self._read()
+            unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
+                      contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME)
+            if self._context & unsafe:
+                try:
+                    self._verify_safe(this)
+                except BadRoute:
+                    if self._context & contexts.TEMPLATE_PARAM_KEY:
+                        self._pop()
+                    raise
             if this not in self.MARKERS:
                 self._write_text(this)
                 self._head += 1
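
Unlike the removed _verify_safe, which rescanned the accumulated stack text
after the fact, the ported version inspects one character at a time and parks
partial verdicts in the context flags: HAS_TEXT followed by FAIL_ON_TEXT is
how a newline plus more text fails a template name. A minimal behavioral
check of what this preserves, assuming this revision of mwparserfromhell is
importable:

    import mwparserfromhell
    from mwparserfromhell.nodes import Template, Text

    # A space inside a template name is fine, so this parses as a template...
    code = mwparserfromhell.parse("{{foo bar}}")
    assert any(isinstance(node, Template) for node in code.nodes)

    # ...but a newline followed by more text sets FAIL_ON_TEXT and fails the
    # route, so the braces are kept as plain text:
    code = mwparserfromhell.parse("{{foo\nbar}}")
    assert all(isinstance(node, Text) for node in code.nodes)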
+ """ + context = self._context + checks = (contexts.HAS_TEXT, contexts.FAIL_ON_TEXT, contexts.FAIL_NEXT, + contexts.FAIL_ON_LBRACE, contexts.FAIL_ON_RBRACE, + contexts.FAIL_ON_EQUALS) + for check in checks: + if context & check: + self._context ^= check; + def _parse(self, context=0): """Parse the wikicode string, using *context* for when to stop.""" self._push(context) while True: this = self._read() + unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | + contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME) + if self._context & unsafe: + try: + self._verify_safe(this) + except BadRoute: + if self._context & contexts.TEMPLATE_PARAM_KEY: + self._pop() + raise if this not in self.MARKERS: self._write_text(this) self._head += 1 @@ -450,7 +501,10 @@ class Tokenizer(object): self._write_text(this) elif this == next == "{": self._parse_template_or_argument() + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT elif this == "|" and self._context & contexts.TEMPLATE: + self._reset_safety_checks() self._handle_template_param() elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY: self._handle_template_param_value() @@ -466,6 +520,8 @@ class Tokenizer(object): elif this == next == "[": if not self._context & contexts.WIKILINK_TITLE: self._parse_wikilink() + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT else: self._write_text("[") elif this == "|" and self._context & contexts.WIKILINK_TITLE: