
Port CTokenizer's verify_safe method to Python to solve a failing test.

tags/v0.2
Ben Kurtovic, 12 years ago
commit 0803417901
4 changed files with 137 additions and 60 deletions
  1. mwparserfromhell/parser/contexts.py (+39, -23)
  2. mwparserfromhell/parser/tokenizer.c (+8, -4)
  3. mwparserfromhell/parser/tokenizer.h (+1, -0)
  4. mwparserfromhell/parser/tokenizer.py (+89, -33)
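
For orientation: the safety check decides, character by character, whether the
template, argument, or wikilink currently being parsed is still valid, and
fails the route (BadRoute) so the source text can be re-emitted literally. A
minimal sketch of the user-visible effect, assuming this era's pure-Python
entry point (Tokenizer().tokenize()); the inputs are illustrative and are not
taken from the failing test:

    # Sketch only: assumes the pure-Python Tokenizer of this era; the
    # inputs are illustrative, not the actual failing test case.
    from mwparserfromhell.parser.tokenizer import Tokenizer

    # A well-formed template tokenizes into Template* tokens.
    print(Tokenizer().tokenize("{{foo|bar=baz}}"))

    # A newline splitting a template name is unsafe: the template route
    # fails internally and the braces come back as plain text tokens.
    print(Tokenizer().tokenize("{{foo\nbar}}"))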

mwparserfromhell/parser/contexts.py (+39, -23)

@@ -62,6 +62,15 @@ Local (stack-specific) contexts:
 
 * :py:const:`COMMENT`
 
+* :py:const:`SAFETY_CHECK`
+
+    * :py:const:`HAS_TEXT`
+    * :py:const:`FAIL_ON_TEXT`
+    * :py:const:`FAIL_NEXT`
+    * :py:const:`FAIL_ON_LBRACE`
+    * :py:const:`FAIL_ON_RBRACE`
+    * :py:const:`FAIL_ON_EQUALS`
+
 Global contexts:
 
 * :py:const:`GL_HEADING`
@@ -69,29 +78,36 @@ Global contexts:

 # Local contexts:
 
-TEMPLATE = 0b00000000000111
-TEMPLATE_NAME = 0b00000000000001
-TEMPLATE_PARAM_KEY = 0b00000000000010
-TEMPLATE_PARAM_VALUE = 0b00000000000100
-
-ARGUMENT = 0b00000000011000
-ARGUMENT_NAME = 0b00000000001000
-ARGUMENT_DEFAULT = 0b00000000010000
-
-WIKILINK = 0b00000001100000
-WIKILINK_TITLE = 0b00000000100000
-WIKILINK_TEXT = 0b00000001000000
-
-HEADING = 0b01111110000000
-HEADING_LEVEL_1 = 0b00000010000000
-HEADING_LEVEL_2 = 0b00000100000000
-HEADING_LEVEL_3 = 0b00001000000000
-HEADING_LEVEL_4 = 0b00010000000000
-HEADING_LEVEL_5 = 0b00100000000000
-HEADING_LEVEL_6 = 0b01000000000000
-
-COMMENT = 0b10000000000000
-
+TEMPLATE = 0b00000000000000000111
+TEMPLATE_NAME = 0b00000000000000000001
+TEMPLATE_PARAM_KEY = 0b00000000000000000010
+TEMPLATE_PARAM_VALUE = 0b00000000000000000100
+
+ARGUMENT = 0b00000000000000011000
+ARGUMENT_NAME = 0b00000000000000001000
+ARGUMENT_DEFAULT = 0b00000000000000010000
+
+WIKILINK = 0b00000000000001100000
+WIKILINK_TITLE = 0b00000000000000100000
+WIKILINK_TEXT = 0b00000000000001000000
+
+HEADING = 0b00000001111110000000
+HEADING_LEVEL_1 = 0b00000000000010000000
+HEADING_LEVEL_2 = 0b00000000000100000000
+HEADING_LEVEL_3 = 0b00000000001000000000
+HEADING_LEVEL_4 = 0b00000000010000000000
+HEADING_LEVEL_5 = 0b00000000100000000000
+HEADING_LEVEL_6 = 0b00000001000000000000
+
+COMMENT = 0b00000010000000000000
+
+SAFETY_CHECK = 0b11111100000000000000
+HAS_TEXT = 0b00000100000000000000
+FAIL_ON_TEXT = 0b00001000000000000000
+FAIL_NEXT = 0b00010000000000000000
+FAIL_ON_LBRACE = 0b00100000000000000000
+FAIL_ON_RBRACE = 0b01000000000000000000
+FAIL_ON_EQUALS = 0b10000000000000000000
 
 # Global contexts:
 
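Every local-context constant widens from 14 bits to 20 to make room for the
six safety-check flags, and each aggregate mask (TEMPLATE, ARGUMENT, WIKILINK,
HEADING, and now SAFETY_CHECK) is simply the bitwise OR of its members. A
standalone sanity check of the new layout, with values copied from the diff
above:

    # Values copied verbatim from the new contexts.py above.
    HAS_TEXT       = 0b00000100000000000000
    FAIL_ON_TEXT   = 0b00001000000000000000
    FAIL_NEXT      = 0b00010000000000000000
    FAIL_ON_LBRACE = 0b00100000000000000000
    FAIL_ON_RBRACE = 0b01000000000000000000
    FAIL_ON_EQUALS = 0b10000000000000000000
    SAFETY_CHECK   = 0b11111100000000000000

    members = (HAS_TEXT, FAIL_ON_TEXT, FAIL_NEXT,
               FAIL_ON_LBRACE, FAIL_ON_RBRACE, FAIL_ON_EQUALS)

    # Each member is a single distinct bit, so flags can be tested, set,
    # and cleared independently with &, |=, and ^=.
    assert all(m & (m - 1) == 0 for m in members)
    assert len(set(members)) == len(members)

    # The aggregate is exactly the union of its members.
    assert SAFETY_CHECK == 0xFC000 == sum(members)
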
mwparserfromhell/parser/tokenizer.c (+8, -4)

@@ -1324,10 +1324,14 @@ Tokenizer_parse(Tokenizer* self, int context)
             Tokenizer_write_text(self, this);
         }
         else if (this == next && next == *"[") {
-            if (Tokenizer_parse_wikilink(self))
-                return NULL;
-            if (self->topstack->context & LC_FAIL_NEXT)
-                self->topstack->context ^= LC_FAIL_NEXT;
+            if (!(this_context & LC_WIKILINK_TITLE)) {
+                if (Tokenizer_parse_wikilink(self))
+                    return NULL;
+                if (self->topstack->context & LC_FAIL_NEXT)
+                    self->topstack->context ^= LC_FAIL_NEXT;
+            }
+            else
+                Tokenizer_write_text(self, this);
         }
         else if (this == *"|" && this_context & LC_WIKILINK_TITLE) {
             if (Tokenizer_handle_wikilink_separator(self))


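The behavioral change on the C side is small: a second "[[" is only parsed as
a wikilink when the current context is not already a wikilink title (wikilinks
cannot nest there); otherwise the bracket is written through as literal text.
The FAIL_NEXT clearing simply moved inside that guard: the caller consumes the
flag once the inner parse returns, so it cannot leak into later text. A toy
model of that handshake (a sketch only; the names mirror the diff, nothing
here is library code):

    # Toy model of the FAIL_NEXT handshake; sketch, not library code.
    LC_FAIL_NEXT = 0x10000
    context = 0x00000

    def parse_wikilink():
        # Stand-in for Tokenizer_parse_wikilink(): the safety check may
        # leave FAIL_NEXT set on the enclosing context.
        global context
        context |= LC_FAIL_NEXT

    parse_wikilink()
    if context & LC_FAIL_NEXT:  # the caller consumes the one-shot flag
        context ^= LC_FAIL_NEXT
    assert context == 0
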
mwparserfromhell/parser/tokenizer.h (+1, -0)

@@ -118,6 +118,7 @@ static PyObject* TagCloseClose;

 #define LC_COMMENT 0x02000
 
+#define LC_SAFETY_CHECK 0xFC000
 #define LC_HAS_TEXT 0x04000
 #define LC_FAIL_ON_TEXT 0x08000
 #define LC_FAIL_NEXT 0x10000


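The single added define keeps the C header in step with contexts.py: 0xFC000
covers bits 14 through 19, exactly the six safety-check flags. A quick
cross-check of the hex masks (header) against the binary ones (Python):

    # Cross-check: C header masks (hex) vs. Python contexts (binary).
    assert 0x04000 == 0b00000100000000000000  # LC_HAS_TEXT     / HAS_TEXT
    assert 0x08000 == 0b00001000000000000000  # LC_FAIL_ON_TEXT / FAIL_ON_TEXT
    assert 0x10000 == 0b00010000000000000000  # LC_FAIL_NEXT    / FAIL_NEXT
    assert 0xFC000 == 0b11111100000000000000  # LC_SAFETY_CHECK / SAFETY_CHECK
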
mwparserfromhell/parser/tokenizer.py (+89, -33)

@@ -213,28 +213,9 @@ class Tokenizer(object):
         self._write_all(argument)
         self._write(tokens.ArgumentClose())
 
-    def _verify_safe(self, unsafes, strip=True):
-        """Verify that there are no unsafe characters in the current stack.
-
-        The route will be failed if the name contains any element of *unsafes*
-        in it. This is used when parsing template names, parameter keys, and so
-        on, which cannot contain newlines and some other characters. If *strip*
-        is ``True``, the text will be stripped of whitespace, since this is
-        allowed at the ends of certain elements but not between text.
-        """
-        self._push_textbuffer()
-        if self._stack:
-            text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
-            text = "".join([token.text for token in text])
-            if strip:
-                text = text.strip()
-            if text and any([unsafe in text for unsafe in unsafes]):
-                self._fail_route()
-
     def _handle_template_param(self):
         """Handle a template parameter at the head of the string."""
         if self._context & contexts.TEMPLATE_NAME:
-            self._verify_safe(["\n", "{", "}", "[", "]"])
             self._context ^= contexts.TEMPLATE_NAME
         elif self._context & contexts.TEMPLATE_PARAM_VALUE:
             self._context ^= contexts.TEMPLATE_PARAM_VALUE
@@ -246,11 +227,6 @@ class Tokenizer(object):

     def _handle_template_param_value(self):
         """Handle a template parameter's value at the head of the string."""
-        try:
-            self._verify_safe(["\n", "{{", "}}"])
-        except BadRoute:
-            self._pop()
-            raise
         self._write_all(self._pop(keep_context=True))
         self._context ^= contexts.TEMPLATE_PARAM_KEY
         self._context |= contexts.TEMPLATE_PARAM_VALUE
@@ -258,24 +234,19 @@ class Tokenizer(object):

     def _handle_template_end(self):
         """Handle the end of a template at the head of the string."""
-        if self._context & contexts.TEMPLATE_NAME:
-            self._verify_safe(["\n", "{", "}", "[", "]"])
-        elif self._context & contexts.TEMPLATE_PARAM_KEY:
+        if self._context & contexts.TEMPLATE_PARAM_KEY:
             self._write_all(self._pop(keep_context=True))
         self._head += 1
         return self._pop()
 
     def _handle_argument_separator(self):
         """Handle the separator between an argument's name and default."""
-        self._verify_safe(["\n", "{{", "}}"])
         self._context ^= contexts.ARGUMENT_NAME
         self._context |= contexts.ARGUMENT_DEFAULT
         self._write(tokens.ArgumentSeparator())
 
     def _handle_argument_end(self):
         """Handle the end of an argument at the head of the string."""
-        if self._context & contexts.ARGUMENT_NAME:
-            self._verify_safe(["\n", "{{", "}}"])
         self._head += 2
         return self._pop()

@@ -295,15 +266,12 @@ class Tokenizer(object):

     def _handle_wikilink_separator(self):
         """Handle the separator between a wikilink's title and its text."""
-        self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._context ^= contexts.WIKILINK_TITLE
         self._context |= contexts.WIKILINK_TEXT
         self._write(tokens.WikilinkSeparator())
 
     def _handle_wikilink_end(self):
         """Handle the end of a wikilink at the head of the string."""
-        if self._context & contexts.WIKILINK_TITLE:
-            self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._head += 1
         return self._pop()

@@ -424,11 +392,94 @@ class Tokenizer(object):
         self._write(tokens.CommentEnd())
         self._head += 2
 
+    def _verify_safe(self, this):
+        """Make sure we are not trying to write an invalid character."""
+        context = self._context
+        if context & contexts.FAIL_NEXT:
+            self._fail_route()
+        if context & contexts.WIKILINK_TITLE:
+            if this == "]" or this == "{":
+                self._context |= contexts.FAIL_NEXT
+            elif this == "\n" or this == "[" or this == "}":
+                self._fail_route()
+            return
+        if context & contexts.TEMPLATE_NAME:
+            if this == "{" or this == "}" or this == "[":
+                self._context |= contexts.FAIL_NEXT
+                return
+            if this == "]":
+                self._fail_route()
+                return
+            if this == "|":
+                return
+        elif context & (contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME):
+            if context & contexts.FAIL_ON_EQUALS:
+                if this == "=":
+                    self._fail_route()
+                    return
+            elif context & contexts.FAIL_ON_LBRACE:
+                if this == "{":
+                    if context & contexts.TEMPLATE:
+                        self._context |= contexts.FAIL_ON_EQUALS
+                    else:
+                        self._context |= contexts.FAIL_NEXT
+                    return
+                self._context ^= contexts.FAIL_ON_LBRACE
+            elif context & contexts.FAIL_ON_RBRACE:
+                if this == "}":
+                    if context & contexts.TEMPLATE:
+                        self._context |= contexts.FAIL_ON_EQUALS
+                    else:
+                        self._context |= contexts.FAIL_NEXT
+                    return
+                self._context ^= contexts.FAIL_ON_RBRACE
+            elif this == "{":
+                self._context |= contexts.FAIL_ON_LBRACE
+            elif this == "}":
+                self._context |= contexts.FAIL_ON_RBRACE
+        if context & contexts.HAS_TEXT:
+            if context & contexts.FAIL_ON_TEXT:
+                if this is self.END or not this.isspace():
+                    if context & contexts.TEMPLATE_PARAM_KEY:
+                        self._context ^= contexts.FAIL_ON_TEXT
+                        self._context |= contexts.FAIL_ON_EQUALS
+                    else:
+                        self._fail_route()
+                        return
+            else:
+                if this == "\n":
+                    self._context |= contexts.FAIL_ON_TEXT
+        elif this is self.END or not this.isspace():
+            self._context |= contexts.HAS_TEXT
+
+    def _reset_safety_checks(self):
+        """Unset any safety-checking contexts set by Tokenizer_verify_safe().
+
+        Used when we preserve a context but previous data becomes invalid, like
+        when moving between template parameters.
+        """
+        context = self._context
+        checks = (contexts.HAS_TEXT, contexts.FAIL_ON_TEXT, contexts.FAIL_NEXT,
+                  contexts.FAIL_ON_LBRACE, contexts.FAIL_ON_RBRACE,
+                  contexts.FAIL_ON_EQUALS)
+        for check in checks:
+            if context & check:
+                self._context ^= check
+
     def _parse(self, context=0):
         """Parse the wikicode string, using *context* for when to stop."""
         self._push(context)
         while True:
             this = self._read()
+            unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
+                      contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME)
+            if self._context & unsafe:
+                try:
+                    self._verify_safe(this)
+                except BadRoute:
+                    if self._context & contexts.TEMPLATE_PARAM_KEY:
+                        self._pop()
+                    raise
             if this not in self.MARKERS:
                 self._write_text(this)
             self._head += 1
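
A note on the new except clause: when the failing context is
TEMPLATE_PARAM_KEY, the parameter key being built sits on its own stack
(pushed when the parameter starts and normally popped with keep_context=True),
so _parse() pops that inner stack before re-raising; otherwise it would be
left behind as BadRoute unwinds to whoever opened the template. The other
unsafe contexts write onto a stack the exception is about to discard anyway.
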
@@ -450,7 +501,10 @@ class Tokenizer(object):
                 self._write_text(this)
             elif this == next == "{":
                 self._parse_template_or_argument()
+                if self._context & contexts.FAIL_NEXT:
+                    self._context ^= contexts.FAIL_NEXT
             elif this == "|" and self._context & contexts.TEMPLATE:
+                self._reset_safety_checks()
                 self._handle_template_param()
             elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
                 self._handle_template_param_value()
@@ -466,6 +520,8 @@ class Tokenizer(object):
             elif this == next == "[":
                 if not self._context & contexts.WIKILINK_TITLE:
                     self._parse_wikilink()
+                    if self._context & contexts.FAIL_NEXT:
+                        self._context ^= contexts.FAIL_NEXT
                 else:
                     self._write_text("[")
             elif this == "|" and self._context & contexts.WIKILINK_TITLE:


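Taken together, the port replaces the old "collect the text, then scan it"
check with a per-character state machine. The HAS_TEXT/FAIL_ON_TEXT pair
reproduces the old strip=True semantics: whitespace at the ends of a name is
tolerated, but real text on both sides of a newline kills the route. A
stand-alone distillation of just that corner of the new _verify_safe() (toy
flag values; BadRoute stands in for the tokenizer's real exception):

    # Distilled text-tracking logic from _verify_safe(); toy flag values.
    HAS_TEXT, FAIL_ON_TEXT = 0b01, 0b10

    class BadRoute(Exception):
        pass

    def feed(name):
        # Scan a would-be template name char by char, as _parse() would.
        context = 0
        for this in name:
            if context & HAS_TEXT:
                if context & FAIL_ON_TEXT:
                    if not this.isspace():
                        raise BadRoute()  # text resumed after a newline
                elif this == "\n":
                    context |= FAIL_ON_TEXT
            elif not this.isspace():
                context |= HAS_TEXT

    feed("foo bar")       # fine: internal spaces are allowed
    feed("foo\n")         # fine: trailing whitespace is only provisional
    try:
        feed("foo\nbar")  # fails: text on both sides of a newline
    except BadRoute:
        print("BadRoute: the {{...}} route fails; text is written literally")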