From d6f2723a06c45d92e478cffeedf3ce2c4be21a43 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 24 Jan 2013 03:07:36 -0500
Subject: [PATCH] Fix safety checks on template params in some odd cases
 (closes #24).

Also, fix parsing of wikilinks in both tokenizers such that newlines in any
location within the title are an automatic failure.
---
 mwparserfromhell/parser/tokenizer.c  | 57 ++++++++++++++++++++++++++++----------
 mwparserfromhell/parser/tokenizer.h  |  1 +
 mwparserfromhell/parser/tokenizer.py | 16 ++++++----
 3 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 40ec723..09649a7 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -1144,17 +1144,24 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
         Tokenizer_fail_route(self);
         return;
     }
-    if (context & (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE)) {
-        if (data == *"{" || data == *"}" || data == *"[" || data == *"]") {
+    if (context & LC_WIKILINK_TITLE) {
+        if (data == *"]" || data == *"{")
+            self->topstack->context |= LC_FAIL_NEXT;
+        else if (data == *"\n" || data == *"[" || data == *"}")
+            Tokenizer_fail_route(self);
+        return;
+    }
+    if (context & LC_TEMPLATE_NAME) {
+        if (data == *"{" || data == *"}" || data == *"[") {
             self->topstack->context |= LC_FAIL_NEXT;
             return;
         }
-        if (data == *"|") {
-            if (context & LC_FAIL_ON_TEXT) {
-                self->topstack->context ^= LC_FAIL_ON_TEXT;
-                return;
-            }
+        if (data == *"]") {
+            Tokenizer_fail_route(self);
+            return;
         }
+        if (data == *"|")
+            return;
     }
     else if (context & (LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)) {
         if (context & LC_FAIL_ON_EQUALS) {
@@ -1210,6 +1217,28 @@
 }

 /*
+    Unset any safety-checking contexts set by Tokenizer_verify_safe(). Used
+    when we preserve a context but previous data becomes invalid, like when
+    moving between template parameters.
+*/
+static void
+Tokenizer_reset_safety_checks(Tokenizer* self)
+{
+    static int checks[] = {
+        LC_HAS_TEXT, LC_FAIL_ON_TEXT, LC_FAIL_NEXT, LC_FAIL_ON_LBRACE,
+        LC_FAIL_ON_RBRACE, LC_FAIL_ON_EQUALS, 0};
+    int context = self->topstack->context, i = 0, this;
+    while (1) {
+        this = checks[i];
+        if (!this)
+            return;
+        if (context & this)
+            self->topstack->context ^= this;
+        i++;
+    }
+}
+
+/*
     Parse the wikicode string, using context for when to stop.
 */
 static PyObject*
@@ -1274,6 +1303,7 @@ Tokenizer_parse(Tokenizer* self, int context)
                 self->topstack->context ^= LC_FAIL_NEXT;
         }
         else if (this == *"|" && this_context & LC_TEMPLATE) {
+            Tokenizer_reset_safety_checks(self);
             if (Tokenizer_handle_template_param(self))
                 return NULL;
         }
@@ -1294,15 +1324,10 @@ Tokenizer_parse(Tokenizer* self, int context)
             Tokenizer_write_text(self, this);
         }
         else if (this == next && next == *"[") {
-            if (!(this_context & LC_WIKILINK_TITLE)) {
-                if (Tokenizer_parse_wikilink(self))
-                    return NULL;
-                if (self->topstack->context & LC_FAIL_NEXT)
-                    self->topstack->context ^= LC_FAIL_NEXT;
-            }
-            else {
-                Tokenizer_write_text(self, this);
-            }
+            if (Tokenizer_parse_wikilink(self))
+                return NULL;
+            if (self->topstack->context & LC_FAIL_NEXT)
+                self->topstack->context ^= LC_FAIL_NEXT;
         }
         else if (this == *"|" && this_context & LC_WIKILINK_TITLE) {
             if (Tokenizer_handle_wikilink_separator(self))
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index dffa0fb..3293a8f 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -206,6 +206,7 @@
 static int Tokenizer_really_parse_entity(Tokenizer*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
 static void Tokenizer_verify_safe(Tokenizer*, int, Py_UNICODE);
+static void Tokenizer_reset_safety_checks(Tokenizer*);
 static PyObject* Tokenizer_parse(Tokenizer*, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index a2b405c..eead131 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -213,17 +213,21 @@ class Tokenizer(object):
         self._write_all(argument)
         self._write(tokens.ArgumentClose())

-    def _verify_safe(self, unsafes):
+    def _verify_safe(self, unsafes, strip=True):
         """Verify that there are no unsafe characters in the current stack.

         The route will be failed if the name contains any element of *unsafes*
-        in it (not merely at the beginning or end). This is used when parsing a
-        template name or parameter key, which cannot contain newlines.
+        in it. This is used when parsing template names, parameter keys, and so
+        on, which cannot contain newlines and some other characters. If *strip*
+        is ``True``, the text will be stripped of whitespace, since this is
+        allowed at the ends of certain elements but not between text.
         """
         self._push_textbuffer()
         if self._stack:
             text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
-            text = "".join([token.text for token in text]).strip()
+            text = "".join([token.text for token in text])
+            if strip:
+                text = text.strip()
             if text and any([unsafe in text for unsafe in unsafes]):
                 self._fail_route()

@@ -291,7 +295,7 @@ class Tokenizer(object):

     def _handle_wikilink_separator(self):
         """Handle the separator between a wikilink's title and its text."""
-        self._verify_safe(["\n", "{", "}", "[", "]"])
+        self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._context ^= contexts.WIKILINK_TITLE
         self._context |= contexts.WIKILINK_TEXT
         self._write(tokens.WikilinkSeparator())
@@ -299,7 +303,7 @@ class Tokenizer(object):
     def _handle_wikilink_end(self):
         """Handle the end of a wikilink at the head of the string."""
         if self._context & contexts.WIKILINK_TITLE:
-            self._verify_safe(["\n", "{", "}", "[", "]"])
+            self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._head += 1
         return self._pop()
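
A quick sanity check of the wikilink rule this patch enforces (not part of
the patch itself): the snippet below is a rough sketch that assumes a build
of mwparserfromhell with these changes applied, and the sample wikitext
strings are illustrative only. It uses the library's public parse() entry
point and the Wikicode.filter_wikilinks() accessor.

    # Rough sketch: newlines anywhere in a wikilink title now fail the parse.
    # Assumes mwparserfromhell is installed with this patch applied.
    import mwparserfromhell

    # A well-formed title still parses into a wikilink node.
    good = mwparserfromhell.parse("[[Foo bar|baz]]")
    print(good.filter_wikilinks())  # expected: one wikilink, [[Foo bar|baz]]

    # A newline anywhere in the title is an automatic failure, so the
    # brackets and title fall back to plain text.
    bad = mwparserfromhell.parse("[[Foo\nbar]]")
    print(bad.filter_wikilinks())   # expected: an empty list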