From 0803417901d09d7df830e65300355507715e67cb Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 23 Feb 2013 13:12:16 -0500
Subject: [PATCH] Port CTokenizer's verify_safe method to Python to solve a failing test.

---
 mwparserfromhell/parser/contexts.py  |  62 +++++++++++-------
 mwparserfromhell/parser/tokenizer.c  |  12 ++--
 mwparserfromhell/parser/tokenizer.h  |   1 +
 mwparserfromhell/parser/tokenizer.py | 122 +++++++++++++++++++++++++----------
 4 files changed, 137 insertions(+), 60 deletions(-)

diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py
index b65946c..896d137 100644
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -62,6 +62,15 @@ Local (stack-specific) contexts:
 
 * :py:const:`COMMENT`
 
+* :py:const:`SAFETY_CHECK`
+
+    * :py:const:`HAS_TEXT`
+    * :py:const:`FAIL_ON_TEXT`
+    * :py:const:`FAIL_NEXT`
+    * :py:const:`FAIL_ON_LBRACE`
+    * :py:const:`FAIL_ON_RBRACE`
+    * :py:const:`FAIL_ON_EQUALS`
+
 Global contexts:
 
 * :py:const:`GL_HEADING`
@@ -69,29 +78,36 @@
 
 # Local contexts:
 
-TEMPLATE = 0b00000000000111
-TEMPLATE_NAME = 0b00000000000001
-TEMPLATE_PARAM_KEY = 0b00000000000010
-TEMPLATE_PARAM_VALUE = 0b00000000000100
-
-ARGUMENT = 0b00000000011000
-ARGUMENT_NAME = 0b00000000001000
-ARGUMENT_DEFAULT = 0b00000000010000
-
-WIKILINK = 0b00000001100000
-WIKILINK_TITLE = 0b00000000100000
-WIKILINK_TEXT = 0b00000001000000
-
-HEADING = 0b01111110000000
-HEADING_LEVEL_1 = 0b00000010000000
-HEADING_LEVEL_2 = 0b00000100000000
-HEADING_LEVEL_3 = 0b00001000000000
-HEADING_LEVEL_4 = 0b00010000000000
-HEADING_LEVEL_5 = 0b00100000000000
-HEADING_LEVEL_6 = 0b01000000000000
-
-COMMENT = 0b10000000000000
-
+TEMPLATE = 0b00000000000000000111
+TEMPLATE_NAME = 0b00000000000000000001
+TEMPLATE_PARAM_KEY = 0b00000000000000000010
+TEMPLATE_PARAM_VALUE = 0b00000000000000000100
+
+ARGUMENT = 0b00000000000000011000
+ARGUMENT_NAME = 0b00000000000000001000
+ARGUMENT_DEFAULT = 0b00000000000000010000
+
+WIKILINK = 0b00000000000001100000
+WIKILINK_TITLE = 0b00000000000000100000
+WIKILINK_TEXT = 0b00000000000001000000
+
+HEADING = 0b00000001111110000000
+HEADING_LEVEL_1 = 0b00000000000010000000
+HEADING_LEVEL_2 = 0b00000000000100000000
+HEADING_LEVEL_3 = 0b00000000001000000000
+HEADING_LEVEL_4 = 0b00000000010000000000
+HEADING_LEVEL_5 = 0b00000000100000000000
+HEADING_LEVEL_6 = 0b00000001000000000000
+
+COMMENT = 0b00000010000000000000
+
+SAFETY_CHECK = 0b11111100000000000000
+HAS_TEXT = 0b00000100000000000000
+FAIL_ON_TEXT = 0b00001000000000000000
+FAIL_NEXT = 0b00010000000000000000
+FAIL_ON_LBRACE = 0b00100000000000000000
+FAIL_ON_RBRACE = 0b01000000000000000000
+FAIL_ON_EQUALS = 0b10000000000000000000
 
 # Global contexts:
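
Each context above is a single bit, and SAFETY_CHECK is the bitwise OR of its
six new sub-contexts, so one mask test tells the tokenizer whether any safety
flag is live. A minimal sketch of the flag arithmetic (illustrative only, not
part of the patch):

    HAS_TEXT       = 0b00000100000000000000
    FAIL_ON_TEXT   = 0b00001000000000000000
    FAIL_NEXT      = 0b00010000000000000000
    FAIL_ON_LBRACE = 0b00100000000000000000
    FAIL_ON_RBRACE = 0b01000000000000000000
    FAIL_ON_EQUALS = 0b10000000000000000000
    SAFETY_CHECK   = 0b11111100000000000000

    # SAFETY_CHECK covers exactly the six flags:
    assert SAFETY_CHECK == (HAS_TEXT | FAIL_ON_TEXT | FAIL_NEXT |
                            FAIL_ON_LBRACE | FAIL_ON_RBRACE | FAIL_ON_EQUALS)

    context = 0                       # stands in for self._context
    context |= FAIL_ON_LBRACE         # set a flag with |=
    assert context & SAFETY_CHECK     # test membership with &
    context ^= FAIL_ON_LBRACE         # clear a known-set flag with ^=
    assert not context & SAFETY_CHECK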
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 09649a7..d82b080 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -1324,10 +1324,14 @@ Tokenizer_parse(Tokenizer* self, int context)
                 Tokenizer_write_text(self, this);
         }
         else if (this == next && next == *"[") {
-            if (Tokenizer_parse_wikilink(self))
-                return NULL;
-            if (self->topstack->context & LC_FAIL_NEXT)
-                self->topstack->context ^= LC_FAIL_NEXT;
+            if (!(this_context & LC_WIKILINK_TITLE)) {
+                if (Tokenizer_parse_wikilink(self))
+                    return NULL;
+                if (self->topstack->context & LC_FAIL_NEXT)
+                    self->topstack->context ^= LC_FAIL_NEXT;
+            }
+            else
+                Tokenizer_write_text(self, this);
         }
         else if (this == *"|" && this_context & LC_WIKILINK_TITLE) {
             if (Tokenizer_handle_wikilink_separator(self))
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 3293a8f..af86321 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -118,6 +118,7 @@ static PyObject* TagCloseClose;
 
 #define LC_COMMENT 0x02000
 
+#define LC_SAFETY_CHECK 0xFC000
 #define LC_HAS_TEXT 0x04000
 #define LC_FAIL_ON_TEXT 0x08000
 #define LC_FAIL_NEXT 0x10000
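
On the C side the six flags already existed; the patch only adds the
LC_SAFETY_CHECK umbrella mask. Its hex value lines up bit for bit with the
binary constants in contexts.py. A quick cross-check, assuming the neighboring
defines not shown in the hunk keep the same bit order as the Python constants
(LC_FAIL_ON_LBRACE 0x20000, LC_FAIL_ON_RBRACE 0x40000, LC_FAIL_ON_EQUALS
0x80000):

    # The flags occupy bits 14 through 19, so their union is 0xFC000,
    # the value given to LC_SAFETY_CHECK above.
    assert 0x04000 == 0b00000100000000000000  # LC_HAS_TEXT     == HAS_TEXT
    assert 0x08000 == 0b00001000000000000000  # LC_FAIL_ON_TEXT == FAIL_ON_TEXT
    assert 0x10000 == 0b00010000000000000000  # LC_FAIL_NEXT    == FAIL_NEXT
    assert 0xFC000 == (0x04000 | 0x08000 | 0x10000 |
                       0x20000 | 0x40000 | 0x80000)  # == SAFETY_CHECK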
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index eead131..a365db8 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -213,28 +213,9 @@ class Tokenizer(object):
         self._write_all(argument)
         self._write(tokens.ArgumentClose())
 
-    def _verify_safe(self, unsafes, strip=True):
-        """Verify that there are no unsafe characters in the current stack.
-
-        The route will be failed if the name contains any element of *unsafes*
-        in it. This is used when parsing template names, parameter keys, and so
-        on, which cannot contain newlines and some other characters. If *strip*
-        is ``True``, the text will be stripped of whitespace, since this is
-        allowed at the ends of certain elements but not between text.
-        """
-        self._push_textbuffer()
-        if self._stack:
-            text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
-            text = "".join([token.text for token in text])
-            if strip:
-                text = text.strip()
-            if text and any([unsafe in text for unsafe in unsafes]):
-                self._fail_route()
-
     def _handle_template_param(self):
         """Handle a template parameter at the head of the string."""
         if self._context & contexts.TEMPLATE_NAME:
-            self._verify_safe(["\n", "{", "}", "[", "]"])
             self._context ^= contexts.TEMPLATE_NAME
         elif self._context & contexts.TEMPLATE_PARAM_VALUE:
             self._context ^= contexts.TEMPLATE_PARAM_VALUE
@@ -246,11 +227,6 @@ class Tokenizer(object):
 
     def _handle_template_param_value(self):
         """Handle a template parameter's value at the head of the string."""
-        try:
-            self._verify_safe(["\n", "{{", "}}"])
-        except BadRoute:
-            self._pop()
-            raise
         self._write_all(self._pop(keep_context=True))
         self._context ^= contexts.TEMPLATE_PARAM_KEY
         self._context |= contexts.TEMPLATE_PARAM_VALUE
@@ -258,24 +234,19 @@ class Tokenizer(object):
 
     def _handle_template_end(self):
         """Handle the end of a template at the head of the string."""
-        if self._context & contexts.TEMPLATE_NAME:
-            self._verify_safe(["\n", "{", "}", "[", "]"])
-        elif self._context & contexts.TEMPLATE_PARAM_KEY:
+        if self._context & contexts.TEMPLATE_PARAM_KEY:
             self._write_all(self._pop(keep_context=True))
         self._head += 1
         return self._pop()
 
     def _handle_argument_separator(self):
         """Handle the separator between an argument's name and default."""
-        self._verify_safe(["\n", "{{", "}}"])
         self._context ^= contexts.ARGUMENT_NAME
         self._context |= contexts.ARGUMENT_DEFAULT
         self._write(tokens.ArgumentSeparator())
 
     def _handle_argument_end(self):
         """Handle the end of an argument at the head of the string."""
-        if self._context & contexts.ARGUMENT_NAME:
-            self._verify_safe(["\n", "{{", "}}"])
         self._head += 2
         return self._pop()
 
@@ -295,15 +266,12 @@
 
     def _handle_wikilink_separator(self):
         """Handle the separator between a wikilink's title and its text."""
-        self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._context ^= contexts.WIKILINK_TITLE
         self._context |= contexts.WIKILINK_TEXT
         self._write(tokens.WikilinkSeparator())
 
     def _handle_wikilink_end(self):
         """Handle the end of a wikilink at the head of the string."""
-        if self._context & contexts.WIKILINK_TITLE:
-            self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._head += 1
         return self._pop()
 
@@ -424,11 +392,94 @@
         self._write(tokens.CommentEnd())
         self._head += 2
 
+    def _verify_safe(self, this):
+        """Make sure we are not trying to write an invalid character."""
+        context = self._context
+        if context & contexts.FAIL_NEXT:
+            self._fail_route()
+        if context & contexts.WIKILINK_TITLE:
+            if this == "]" or this == "{":
+                self._context |= contexts.FAIL_NEXT
+            elif this == "\n" or this == "[" or this == "}":
+                self._fail_route()
+            return
+        if context & contexts.TEMPLATE_NAME:
+            if this == "{" or this == "}" or this == "[":
+                self._context |= contexts.FAIL_NEXT
+                return
+            if this == "]":
+                self._fail_route()
+                return
+            if this == "|":
+                return
+        elif context & (contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME):
+            if context & contexts.FAIL_ON_EQUALS:
+                if this == "=":
+                    self._fail_route()
+                    return
+            elif context & contexts.FAIL_ON_LBRACE:
+                if this == "{":
+                    if context & contexts.TEMPLATE:
+                        self._context |= contexts.FAIL_ON_EQUALS
+                    else:
+                        self._context |= contexts.FAIL_NEXT
+                    return
+                self._context ^= contexts.FAIL_ON_LBRACE
+            elif context & contexts.FAIL_ON_RBRACE:
+                if this == "}":
+                    if context & contexts.TEMPLATE:
+                        self._context |= contexts.FAIL_ON_EQUALS
+                    else:
+                        self._context |= contexts.FAIL_NEXT
+                    return
+                self._context ^= contexts.FAIL_ON_RBRACE
+            elif this == "{":
+                self._context |= contexts.FAIL_ON_LBRACE
+            elif this == "}":
+                self._context |= contexts.FAIL_ON_RBRACE
+        if context & contexts.HAS_TEXT:
+            if context & contexts.FAIL_ON_TEXT:
+                if this is self.END or not this.isspace():
+                    if context & contexts.TEMPLATE_PARAM_KEY:
+                        self._context ^= contexts.FAIL_ON_TEXT
+                        self._context |= contexts.FAIL_ON_EQUALS
+                    else:
+                        self._fail_route()
+                    return
+            else:
+                if this == "\n":
+                    self._context |= contexts.FAIL_ON_TEXT
+        elif this is self.END or not this.isspace():
+            self._context |= contexts.HAS_TEXT
+
+    def _reset_safety_checks(self):
+        """Unset any safety-checking contexts set by Tokenizer_verify_safe().
+
+        Used when we preserve a context but previous data becomes invalid, like
+        when moving between template parameters.
+        """
+        context = self._context
+        checks = (contexts.HAS_TEXT, contexts.FAIL_ON_TEXT, contexts.FAIL_NEXT,
+                  contexts.FAIL_ON_LBRACE, contexts.FAIL_ON_RBRACE,
+                  contexts.FAIL_ON_EQUALS)
+        for check in checks:
+            if context & check:
+                self._context ^= check
+
     def _parse(self, context=0):
         """Parse the wikicode string, using *context* for when to stop."""
         self._push(context)
         while True:
             this = self._read()
+            unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
+                      contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME)
+            if self._context & unsafe:
+                try:
+                    self._verify_safe(this)
+                except BadRoute:
+                    if self._context & contexts.TEMPLATE_PARAM_KEY:
+                        self._pop()
+                    raise
             if this not in self.MARKERS:
                 self._write_text(this)
                 self._head += 1
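
Unlike the removed _verify_safe, which rescanned the accumulated stack text
after the fact, the ported version inspects one character at a time and parks
partial verdicts in the context flags: HAS_TEXT followed by FAIL_ON_TEXT is
how a newline plus more text fails a template name. A minimal behavioral
check of what this preserves, assuming this revision of mwparserfromhell is
importable:

    import mwparserfromhell
    from mwparserfromhell.nodes import Template, Text

    # A space inside a template name is fine, so this parses as a template...
    code = mwparserfromhell.parse("{{foo bar}}")
    assert any(isinstance(node, Template) for node in code.nodes)

    # ...but a newline followed by more text sets FAIL_ON_TEXT and fails the
    # route, so the braces are kept as plain text:
    code = mwparserfromhell.parse("{{foo\nbar}}")
    assert all(isinstance(node, Text) for node in code.nodes)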
+ """ + context = self._context + checks = (contexts.HAS_TEXT, contexts.FAIL_ON_TEXT, contexts.FAIL_NEXT, + contexts.FAIL_ON_LBRACE, contexts.FAIL_ON_RBRACE, + contexts.FAIL_ON_EQUALS) + for check in checks: + if context & check: + self._context ^= check; + def _parse(self, context=0): """Parse the wikicode string, using *context* for when to stop.""" self._push(context) while True: this = self._read() + unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | + contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME) + if self._context & unsafe: + try: + self._verify_safe(this) + except BadRoute: + if self._context & contexts.TEMPLATE_PARAM_KEY: + self._pop() + raise if this not in self.MARKERS: self._write_text(this) self._head += 1 @@ -450,7 +501,10 @@ class Tokenizer(object): self._write_text(this) elif this == next == "{": self._parse_template_or_argument() + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT elif this == "|" and self._context & contexts.TEMPLATE: + self._reset_safety_checks() self._handle_template_param() elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY: self._handle_template_param_value() @@ -466,6 +520,8 @@ class Tokenizer(object): elif this == next == "[": if not self._context & contexts.WIKILINK_TITLE: self._parse_wikilink() + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT else: self._write_text("[") elif this == "|" and self._context & contexts.WIKILINK_TITLE: