From f16c7e25cac66bb3430cbf223a44844493cda1f3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 24 Jul 2015 02:29:59 -0400 Subject: [PATCH] Fully fix parsing templates with blank names, I hope (#111) --- CHANGELOG | 3 +++ docs/changelog.rst | 5 ++++ mwparserfromhell/parser/contexts.py | 18 +++++++------ mwparserfromhell/parser/tokenizer.c | 52 ++++++++++++++++++++++-------------- mwparserfromhell/parser/tokenizer.h | 21 ++++++++------- mwparserfromhell/parser/tokenizer.py | 35 +++++++++++++++--------- tests/tokenizer/templates.mwtest | 4 +-- 7 files changed, 85 insertions(+), 53 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index c696b98..5b5d794 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -8,6 +8,9 @@ v0.4.1 (unreleased): includes when denoting tags, but not comments. - Fixed the behavior of preserve_spacing in Template.add() and keep_field in Template.remove() on parameters with hidden keys. +- Fixed parser bugs involving: + - templates with completely blank names; + - templates with newlines and comments. - Fixed some bugs in the release scripts. v0.4 (released May 23, 2015): diff --git a/docs/changelog.rst b/docs/changelog.rst index 54f8af8..4e64a8b 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -15,6 +15,11 @@ Unreleased This includes when denoting tags, but not comments. - Fixed the behavior of *preserve_spacing* in :func:`~.Template.add` and *keep_field* in :func:`~.Template.remove` on parameters with hidden keys. +- Fixed parser bugs involving: + + - templates with completely blank names; + - templates with newlines and comments. + - Fixed some bugs in the release scripts. v0.4 diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index e98d8f7..b676e86 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -89,6 +89,7 @@ Local (stack-specific) contexts: * :const:`FAIL_ON_LBRACE` * :const:`FAIL_ON_RBRACE` * :const:`FAIL_ON_EQUALS` + * :const:`HAS_TEMPLATE` * :const:`TABLE` @@ -161,15 +162,16 @@ FAIL_NEXT = 1 << 26 FAIL_ON_LBRACE = 1 << 27 FAIL_ON_RBRACE = 1 << 28 FAIL_ON_EQUALS = 1 << 29 +HAS_TEMPLATE = 1 << 30 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + - FAIL_ON_RBRACE + FAIL_ON_EQUALS) - -TABLE_OPEN = 1 << 30 -TABLE_CELL_OPEN = 1 << 31 -TABLE_CELL_STYLE = 1 << 32 -TABLE_ROW_OPEN = 1 << 33 -TABLE_TD_LINE = 1 << 34 -TABLE_TH_LINE = 1 << 35 + FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE) + +TABLE_OPEN = 1 << 31 +TABLE_CELL_OPEN = 1 << 32 +TABLE_CELL_STYLE = 1 << 33 +TABLE_ROW_OPEN = 1 << 34 +TABLE_TD_LINE = 1 << 35 +TABLE_TH_LINE = 1 << 36 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + TABLE_TD_LINE + TABLE_TH_LINE) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index ec0315f..f4e801b 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -582,12 +582,16 @@ static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta) /* Parse a template at the head of the wikicode string. 
*/ -static int Tokenizer_parse_template(Tokenizer* self) +static int Tokenizer_parse_template(Tokenizer* self, int has_content) { PyObject *template; Py_ssize_t reset = self->head; + uint64_t context = LC_TEMPLATE_NAME; - template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1); + if (has_content) + context |= LC_HAS_TEMPLATE; + + template = Tokenizer_parse(self, context, 1); if (BAD_ROUTE) { self->head = reset; return 0; @@ -643,6 +647,7 @@ static int Tokenizer_parse_argument(Tokenizer* self) static int Tokenizer_parse_template_or_argument(Tokenizer* self) { unsigned int braces = 2, i; + int has_content = 0; PyObject *tokenlist; self->head += 2; @@ -659,7 +664,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) return 0; } if (braces == 2) { - if (Tokenizer_parse_template(self)) + if (Tokenizer_parse_template(self, has_content)) return -1; if (BAD_ROUTE) { RESET_ROUTE(); @@ -673,7 +678,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) return -1; if (BAD_ROUTE) { RESET_ROUTE(); - if (Tokenizer_parse_template(self)) + if (Tokenizer_parse_template(self, has_content)) return -1; if (BAD_ROUTE) { char text[MAX_BRACES + 1]; @@ -689,8 +694,10 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) } else braces -= 3; - if (braces) + if (braces) { + has_content = 1; self->head++; + } } tokenlist = Tokenizer_pop(self); if (!tokenlist) @@ -712,8 +719,13 @@ static int Tokenizer_handle_template_param(Tokenizer* self) { PyObject *stack; - if (self->topstack->context & LC_TEMPLATE_NAME) + if (self->topstack->context & LC_TEMPLATE_NAME) { + if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) { + Tokenizer_fail_route(self); + return -1; + } self->topstack->context ^= LC_TEMPLATE_NAME; + } else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE; if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { @@ -764,7 +776,11 @@ static PyObject* Tokenizer_handle_template_end(Tokenizer* self) { PyObject* stack; - if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { + if (self->topstack->context & LC_TEMPLATE_NAME) { + if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) + return Tokenizer_fail_route(self); + } + else if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) { stack = Tokenizer_pop_keeping_context(self); if (!stack) return NULL; @@ -2885,30 +2901,26 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) if (context & LC_TAG_CLOSE) return (data == '<') ? 
-1 : 0; if (context & LC_TEMPLATE_NAME) { - if (data == '{' || data == '}' || data == '[') { + if (data == '{') { + self->topstack->context |= LC_HAS_TEMPLATE | LC_FAIL_NEXT; + return 0; + } + if (data == '}' || (data == '<' && Tokenizer_READ(self, 1) == '!')) { self->topstack->context |= LC_FAIL_NEXT; return 0; } - if (data == ']' || data == '>' || (data == '<' && - Tokenizer_READ(self, 1) != '!')) { + if (data == '[' || data == ']' || data == '<' || data == '>') { return -1; } if (data == '|') return 0; if (context & LC_HAS_TEXT) { if (context & LC_FAIL_ON_TEXT) { - if (!Py_UNICODE_ISSPACE(data)) { - if (data == '<' && Tokenizer_READ(self, 1) == '!') { - self->topstack->context |= LC_FAIL_NEXT; - return 0; - } + if (!Py_UNICODE_ISSPACE(data)) return -1; - } - } - else { - if (data == '\n') - self->topstack->context |= LC_FAIL_ON_TEXT; } + else if (data == '\n') + self->topstack->context |= LC_FAIL_ON_TEXT; } else if (!Py_UNICODE_ISSPACE(data)) self->topstack->context |= LC_HAS_TEXT; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 842e65d..d477acb 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -150,22 +150,23 @@ static PyObject* TagCloseClose; #define LC_DLTERM 0x0000000000800000 -#define LC_SAFETY_CHECK 0x000000003F000000 +#define LC_SAFETY_CHECK 0x000000007F000000 #define LC_HAS_TEXT 0x0000000001000000 #define LC_FAIL_ON_TEXT 0x0000000002000000 #define LC_FAIL_NEXT 0x0000000004000000 #define LC_FAIL_ON_LBRACE 0x0000000008000000 #define LC_FAIL_ON_RBRACE 0x0000000010000000 #define LC_FAIL_ON_EQUALS 0x0000000020000000 - -#define LC_TABLE 0x0000000FC0000000 -#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 -#define LC_TABLE_OPEN 0x0000000040000000 -#define LC_TABLE_CELL_OPEN 0x0000000080000000 -#define LC_TABLE_CELL_STYLE 0x0000000100000000 -#define LC_TABLE_ROW_OPEN 0x0000000200000000 -#define LC_TABLE_TD_LINE 0x0000000400000000 -#define LC_TABLE_TH_LINE 0x0000000800000000 +#define LC_HAS_TEMPLATE 0x0000000040000000 + +#define LC_TABLE 0x0000001F80000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000 +#define LC_TABLE_OPEN 0x0000000080000000 +#define LC_TABLE_CELL_OPEN 0x0000000100000000 +#define LC_TABLE_CELL_STYLE 0x0000000200000000 +#define LC_TABLE_ROW_OPEN 0x0000000400000000 +#define LC_TABLE_TD_LINE 0x0000000800000000 +#define LC_TABLE_TH_LINE 0x0000001000000000 /* Global contexts: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 4d7d885..5c89455 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -192,11 +192,14 @@ class Tokenizer(object): self._fail_route() return self.END - def _parse_template(self): + def _parse_template(self, has_content): """Parse a template at the head of the wikicode string.""" reset = self._head + context = contexts.TEMPLATE_NAME + if has_content: + context |= contexts.HAS_TEMPLATE try: - template = self._parse(contexts.TEMPLATE_NAME) + template = self._parse(context) except BadRoute: self._head = reset raise @@ -223,6 +226,7 @@ class Tokenizer(object): while self._read() == "{": self._head += 1 braces += 1 + has_content = False self._push() while braces: @@ -230,7 +234,7 @@ class Tokenizer(object): return self._emit_text_then_stack("{") if braces == 2: try: - self._parse_template() + self._parse_template(has_content) except BadRoute: return self._emit_text_then_stack("{{") break @@ -239,11 +243,12 @@ class Tokenizer(object): braces -= 3 except BadRoute: try: - 
self._parse_template() + self._parse_template(has_content) braces -= 2 except BadRoute: return self._emit_text_then_stack("{" * braces) if braces: + has_content = True self._head += 1 self._emit_all(self._pop()) @@ -253,6 +258,8 @@ class Tokenizer(object): def _handle_template_param(self): """Handle a template parameter at the head of the string.""" if self._context & contexts.TEMPLATE_NAME: + if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE): + self._fail_route() self._context ^= contexts.TEMPLATE_NAME elif self._context & contexts.TEMPLATE_PARAM_VALUE: self._context ^= contexts.TEMPLATE_PARAM_VALUE @@ -271,7 +278,10 @@ class Tokenizer(object): def _handle_template_end(self): """Handle the end of a template at the head of the string.""" - if self._context & contexts.TEMPLATE_PARAM_KEY: + if self._context & contexts.TEMPLATE_NAME: + if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE): + self._fail_route() + elif self._context & contexts.TEMPLATE_PARAM_KEY: self._emit_all(self._pop(keep_context=True)) self._head += 1 return self._pop() @@ -1183,23 +1193,22 @@ class Tokenizer(object): elif context & contexts.EXT_LINK_TITLE: return this != "\n" elif context & contexts.TEMPLATE_NAME: - if this == "{" or this == "}" or this == "[": + if this == "{": + self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT + return True + if this == "}" or (this == "<" and self._read(1) == "!"): self._context |= contexts.FAIL_NEXT return True - if this == "]" or this == ">" or (this == "<" and self._read(1) != "!"): + if this == "[" or this == "]" or this == "<" or this == ">": return False if this == "|": return True if context & contexts.HAS_TEXT: if context & contexts.FAIL_ON_TEXT: if this is self.END or not this.isspace(): - if this == "<" and self._read(1) == "!": - self._context |= contexts.FAIL_NEXT - return True return False - else: - if this == "\n": - self._context |= contexts.FAIL_ON_TEXT + elif this == "\n": + self._context |= contexts.FAIL_ON_TEXT elif this is self.END or not this.isspace(): self._context |= contexts.HAS_TEXT return True diff --git a/tests/tokenizer/templates.mwtest b/tests/tokenizer/templates.mwtest index 4756ac6..1913f5d 100644 --- a/tests/tokenizer/templates.mwtest +++ b/tests/tokenizer/templates.mwtest @@ -686,5 +686,5 @@ output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ name: recursion_opens_and_closes label: test potentially dangerous recursion: template openings and closings -input: "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}" -output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")] +input: "{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}" +output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), TemplateOpen(), Text(text="x"), TemplateParamSeparator(), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x"), TemplateParamSeparator(), Text(text="{{x"), TemplateClose(), Text(text="{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}")]
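
As an end-to-end check of the new behavior (a sketch only, assuming a build
that includes this patch; parse() and filter_templates() are existing public
API, and the inputs are merely illustrative):

    import mwparserfromhell

    # A completely blank name no longer yields a template; the braces fall
    # back to plain text.
    assert mwparserfromhell.parse("{{}}").filter_templates() == []
    assert mwparserfromhell.parse("{{|foo}}").filter_templates() == []

    # A name consisting only of another template is still legal -- this is
    # the case the new HAS_TEMPLATE context bit preserves.
    nested = mwparserfromhell.parse("{{ {{foo}} }}")
    assert len(nested.filter_templates()) == 2  # outer and inner

    # Ordinary templates are unaffected.
    assert len(mwparserfromhell.parse("{{foo|bar}}").filter_templates()) == 1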
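
The predicate itself is tiny: when the tokenizer reaches "|" or "}}" while
still in the TEMPLATE_NAME context, the route now fails unless the name has
accumulated HAS_TEXT (some non-whitespace character) or the new HAS_TEMPLATE
bit (a template was parsed inside the name). A minimal standalone model of
that check, reusing the bit values from contexts.py but not the library's
actual tokenizer:

    # Bit values match those defined in mwparserfromhell/parser/contexts.py.
    HAS_TEXT = 1 << 24
    HAS_TEMPLATE = 1 << 30

    def blank_name(context):
        """True when '|' or '}}' should fail the TEMPLATE_NAME route."""
        return not context & (HAS_TEXT | HAS_TEMPLATE)

    assert blank_name(0)                 # "{{}}"          -> falls back to text
    assert not blank_name(HAS_TEXT)      # "{{foo}}"       -> parses
    assert not blank_name(HAS_TEMPLATE)  # "{{ {{foo}} }}" -> parses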