From 9fc4b909e150cd786e97caf7daeb479733e5330e Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 24 Oct 2014 03:40:37 -0500
Subject: [PATCH] Refactor a lot of table error recovery code.

---
 mwparserfromhell/parser/contexts.py  |   4 +-
 mwparserfromhell/parser/tokenizer.c  | 100 +++++++++++++++--------------------
 mwparserfromhell/parser/tokenizer.h  |   2 +-
 mwparserfromhell/parser/tokenizer.py |  82 ++++++++++++----------------
 tests/tokenizer/tables.mwtest        |   7 +++
 5 files changed, 87 insertions(+), 108 deletions(-)

diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py
index ef44ce2..17912cb 100644
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -171,7 +171,7 @@ TABLE_ROW_OPEN = 1 << 33
 TABLE_TD_LINE = 1 << 34
 TABLE_TH_LINE = 1 << 35
 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
-TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + + TABLE_ROW_OPEN +
+TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
         TABLE_TD_LINE + TABLE_TH_LINE)
 
 # Global contexts:
@@ -184,6 +184,6 @@ FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG +
         STYLE + TABLE)
 UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
           TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
-DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
+DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN
 NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
 NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 1b68b46..301ecfc 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -2510,10 +2510,9 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
 }
 
 /*
-    Parse until ``end_token`` as style attributes for a table.
+    Handle style attributes for a table until an ending token.
 */
-static PyObject*
-Tokenizer_parse_as_table_style(Tokenizer* self, char end_token)
+static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
 {
     TagData *data = TagData_new();
     PyObject *padding, *trash;
@@ -2569,9 +2568,9 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token)
 }
 
 /*
-    Handle the start of a table.
+    Parse a wikicode table by starting with the first line.
 */
-static int Tokenizer_handle_table_start(Tokenizer* self)
+static int Tokenizer_parse_table(Tokenizer* self)
 {
     Py_ssize_t reset = self->head + 1;
     PyObject *style, *padding;
@@ -2580,7 +2579,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self)
 
     if(Tokenizer_push(self, LC_TABLE_OPEN))
         return -1;
-    padding = Tokenizer_parse_as_table_style(self, '\n');
+    padding = Tokenizer_handle_table_style(self, '\n');
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
@@ -2622,20 +2621,10 @@ static int Tokenizer_handle_table_start(Tokenizer* self)
 }
 
 /*
-    Return the stack in order to handle the table end.
-*/
-static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
-{
-    self->head += 2;
-    return Tokenizer_pop(self);
-}
-
-/*
     Parse as style until end of the line, then continue.
 */
 static int Tokenizer_handle_table_row(Tokenizer* self)
 {
-    Py_ssize_t reset = self->head;
     PyObject *padding, *style, *row, *trash;
 
     self->head += 2;
@@ -2648,11 +2637,10 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
 
     if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN))
         return -1;
-    padding = Tokenizer_parse_as_table_style(self, '\n');
+    padding = Tokenizer_handle_table_style(self, '\n');
     if (BAD_ROUTE) {
         trash = Tokenizer_pop(self);
         Py_XDECREF(trash);
-        self->head = reset;
         return 0;
     }
     if (!padding)
@@ -2666,14 +2654,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
     // Don't parse the style separator
     self->head++;
     row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1);
-    if (BAD_ROUTE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
-        Py_DECREF(padding);
-        Py_DECREF(style);
-        self->head = reset;
-        return 0;
-    }
     if (!row) {
         Py_DECREF(padding);
         Py_DECREF(style);
@@ -2688,14 +2668,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
 }
 
 /*
-    Return the stack in order to handle the table row end.
-*/
-static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
-{
-    return Tokenizer_pop(self);
-}
-
-/*
     Parse as normal syntax unless we hit a style marker, then parse style
     as HTML attributes and the remainder as normal syntax.
 */
@@ -2705,11 +2677,10 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 {
     uint64_t old_context = self->topstack->context;
     uint64_t cell_context;
-    Py_ssize_t reset = self->head;
-    PyObject *padding, *cell, *trash;
-    PyObject *style = NULL;
+    PyObject *padding, *cell, *style = NULL;
     const char *close_open_markup = NULL;
     self->head += strlen(markup);
+    Py_ssize_t reset = self->head;
 
     if (!Tokenizer_CAN_RECURSE(self)) {
         if (Tokenizer_emit_text(self, markup))
@@ -2720,12 +2691,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 
     cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                            LC_TABLE_CELL_STYLE | line_context, 1);
-    if (BAD_ROUTE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
-        self->head = reset;
-        return 0;
-    }
     if (!cell)
         return -1;
     cell_context = self->topstack->context;
@@ -2733,11 +2698,11 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 
     if (cell_context & LC_TABLE_CELL_STYLE) {
         Py_DECREF(cell);
-        self->head = reset + strlen(markup);
+        self->head = reset;
         if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                           line_context))
             return -1;
-        padding = Tokenizer_parse_as_table_style(self, '|');
+        padding = Tokenizer_handle_table_style(self, '|');
         if (!padding)
             return -1;
         style = Tokenizer_pop(self);
@@ -2749,14 +2714,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
         self->head++;
         cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                                line_context, 1);
-        if (BAD_ROUTE) {
-            Py_DECREF(padding);
-            Py_DECREF(style);
-            trash = Tokenizer_pop(self);
-            Py_XDECREF(trash);
-            self->head = reset;
-            return 0;
-        }
         if (!cell) {
             Py_DECREF(padding);
             Py_DECREF(style);
@@ -2801,6 +2758,23 @@ Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style)
 }
 
 /*
+    Return the stack in order to handle the table row end.
+*/
+static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
+{
+    return Tokenizer_pop(self);
+}
+
+/*
+    Return the stack in order to handle the table end.
+*/
+static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
+{
+    self->head += 2;
+    return Tokenizer_pop(self);
+}
+
+/*
     Handle the end of the stream of wikitext.
 */
 static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
@@ -2819,9 +2793,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
             if (single)
                 return Tokenizer_handle_single_tag_end(self);
         }
-        else if (context & AGG_DOUBLE) {
-            trash = Tokenizer_pop(self);
-            Py_XDECREF(trash);
+        else {
+            if (context & LC_TABLE_CELL_OPEN) {
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+                context = self->topstack->context;
+            }
+            if (context & AGG_DOUBLE) {
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+            }
         }
         return Tokenizer_fail_route(self);
     }
@@ -3082,7 +3063,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
         // Start of table parsing
         else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) {
             if (Tokenizer_CAN_RECURSE(self)) {
-                if (Tokenizer_handle_table_start(self))
+                if (Tokenizer_parse_table(self))
                     return NULL;
             }
             else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next))
@@ -3197,7 +3178,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     self->skip_style_tags = skip_style_tags;
     tokens = Tokenizer_parse(self, context, 1);
 
-    if (!tokens && !PyErr_Occurred()) {
+    if ((!tokens && !PyErr_Occurred()) || self->topstack) {
        if (!ParserError) {
             if (load_exceptions())
                 return NULL;
@@ -3206,6 +3187,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
         RESET_ROUTE();
         PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
     }
+    else if (self->topstack)
+        PyErr_SetString(ParserError,
+                        "C tokenizer exited with non-empty token stack");
     else
         PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
     return NULL;
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 8d2d428..33ba0e1 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -175,7 +175,7 @@ static PyObject* TagCloseClose;
 
 #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
 #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
-#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
+#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
 #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
 #define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
 
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 7921e7c..3ac25a5 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -1009,8 +1009,8 @@ class Tokenizer(object):
             self._emit_text(tag)
         self._emit(tokens.TagCloseClose())
 
-    def _parse_as_table_style(self, end_token):
-        """Parse until ``end_token`` as style attributes for a table."""
+    def _handle_table_style(self, end_token):
+        """Handle style attributes for a table until ``end_token``."""
         data = _TagOpenData()
         data.context = _TagOpenData.CX_ATTR_READY
         while True:
@@ -1037,14 +1037,13 @@ class Tokenizer(object):
                 self._handle_tag_data(data, this)
             self._head += 1
 
-    def _handle_table_start(self):
-        """Handle the start of a table."""
+    def _parse_table(self):
+        """Parse a wikicode table by starting with the first line."""
         reset = self._head + 1
         self._head += 2
-
         self._push(contexts.TABLE_OPEN)
         try:
-            padding = self._parse_as_table_style("\n")
+            padding = self._handle_table_style("\n")
         except BadRoute:
             self._head = reset
             self._emit_text("{|")
@@ -1063,14 +1062,8 @@ class Tokenizer(object):
         # Offset displacement done by _parse():
         self._head -= 1
 
-    def _handle_table_end(self):
-        """Return the stack in order to handle the table end."""
-        self._head += 2
-        return self._pop()
-
     def _handle_table_row(self):
         """Parse as style until end of the line, then continue."""
-        reset = self._head
         self._head += 2
         if not self._can_recurse():
             self._emit_text("|-")
@@ -1079,67 +1072,47 @@ class Tokenizer(object):
 
         self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
         try:
-            padding = self._parse_as_table_style("\n")
+            padding = self._handle_table_style("\n")
         except BadRoute:
-            self._head = reset
             self._pop()
             raise
         style = self._pop()
 
         # Don't parse the style separator:
         self._head += 1
-        try:
-            row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
-        except BadRoute:
-            self._head = reset
-            self._pop()
-            raise
+        row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
 
         self._emit_table_tag("|-", "tr", style, padding, None, row, "")
         # Offset displacement done by parse():
         self._head -= 1
 
-    def _handle_table_row_end(self):
-        """Return the stack in order to handle the table row end."""
-        return self._pop()
-
     def _handle_table_cell(self, markup, tag, line_context):
         """Parse as normal syntax unless we hit a style marker, then parse
         style as HTML attributes and the remainder as normal syntax."""
         old_context = self._context
-        reset = self._head
-        reset_for_style, padding, style = False, "", None
+        padding, style = "", None
         self._head += len(markup)
+        reset = self._head
         if not self._can_recurse():
             self._emit_text(markup)
             self._head -= 1
             return
 
-        try:
-            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
-                               line_context | contexts.TABLE_CELL_STYLE)
-        except BadRoute:
-            self._head = reset
-            self._pop()
-            raise
+        cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                           line_context | contexts.TABLE_CELL_STYLE)
         cell_context = self._context
         self._context = old_context
         reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
         if reset_for_style:
-            self._head = reset + len(markup)
+            self._head = reset
             self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                        line_context)
-            padding = self._parse_as_table_style("|")
+            padding = self._handle_table_style("|")
             style = self._pop()
             # Don't parse the style separator:
             self._head += 1
-            try:
-                cell = self._parse(contexts.TABLE_OPEN |
-                                   contexts.TABLE_CELL_OPEN | line_context)
-            except BadRoute:
-                self._head = reset
-                ret = self._pop()
-                raise
+            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                               line_context)
             cell_context = self._context
             self._context = old_context
 
@@ -1161,12 +1134,23 @@ class Tokenizer(object):
             self._context &= ~contexts.TABLE_CELL_STYLE
         return self._pop(keep_context=True)
 
+    def _handle_table_row_end(self):
+        """Return the stack in order to handle the table row end."""
+        return self._pop()
+
+    def _handle_table_end(self):
+        """Return the stack in order to handle the table end."""
+        self._head += 2
+        return self._pop()
+
     def _handle_end(self):
         """Handle the end of the stream of wikitext."""
         if self._context & contexts.FAIL:
             if self._context & contexts.TAG_BODY:
                 if is_single(self._stack[1].text):
                     return self._handle_single_tag_end()
+            if self._context & contexts.TABLE_CELL_OPEN:
+                self._pop()
             if self._context & contexts.DOUBLE:
                 self._pop()
             self._fail_route()
@@ -1327,19 +1311,19 @@ class Tokenizer(object):
             elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or
                                                   (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
                 if self._can_recurse():
-                    self._handle_table_start()
+                    self._parse_table()
                 else:
                     self._emit_text("{|")
             elif self._context & contexts.TABLE_OPEN:
-                if this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE:
+                if this == next == "|" and self._context & contexts.TABLE_TD_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE)
-                elif this == "|" and next == "|" and self._context & contexts.TABLE_TH_LINE:
+                elif this == next == "|" and self._context & contexts.TABLE_TH_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE)
-                elif this == "!" and next == "!" and self._context & contexts.TABLE_TH_LINE:
+                elif this == next == "!" and self._context & contexts.TABLE_TH_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE)
@@ -1387,6 +1371,10 @@ class Tokenizer(object):
         self._text = [segment for segment in split if segment]
         self._head = self._global = self._depth = self._cycles = 0
         try:
-            return self._parse(context)
+            tokens = self._parse(context)
         except BadRoute:  # pragma: no cover (untestable/exceptional case)
             raise ParserError("Python tokenizer exited with BadRoute")
+        if self._stacks:  # pragma: no cover (untestable/exceptional case)
+            err = "Python tokenizer exited with non-empty token stack"
+            raise ParserError(err)
+        return tokens
diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest
index e042467..16012cf 100644
--- a/tests/tokenizer/tables.mwtest
+++ b/tests/tokenizer/tables.mwtest
@@ -61,6 +61,13 @@ output: [Text(text="{| \n|- \n ")]
 
 ---
 
+name: no_table_close_row_and_cell
+label: no table close while inside a cell inside a row
+input: "{| \n|- \n|"
+output: [Text(text="{| \n|- \n|")]
+
+---
+
 name: no_table_close_attributes
 label: don't parse attributes as attributes if the table doesn't exist
 input: "{| border="1""