From 1a4c88e11f8b6403e4a15a1e24b67b3185c884c6 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Fri, 25 Jul 2014 15:54:37 -0700 Subject: [PATCH] Correctly handle no table endings Tests were not correctly testing the situations without a table close. Fixed tests and then fixed tokenizers for failing tests. Also refactored pytokenizer to more closely match the ctokenizer by only holding the `_parse` methods in the try blocks and no other code. --- mwparserfromhell/parser/tokenizer.c | 28 ++++++++++++--- mwparserfromhell/parser/tokenizer.py | 70 +++++++++++++++++++++++------------- tests/tokenizer/tables.mwtest | 49 +++++++++++++++++++++---- 3 files changed, 110 insertions(+), 37 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c902c3d..bad72ef 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2636,8 +2636,9 @@ static int Tokenizer_handle_table_start(Tokenizer* self) self->head++; table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); if (BAD_ROUTE) { + Py_DECREF(padding); + Py_DECREF(style); RESET_ROUTE(); - // offset displacement done by parse() self->head = reset; if (Tokenizer_emit_text(self, "{|")) return -1; @@ -2676,7 +2677,7 @@ static PyObject * Tokenizer_handle_table_end(Tokenizer* self) static int Tokenizer_handle_table_row(Tokenizer* self) { Py_ssize_t reset = self->head; - PyObject *padding, *style, *row; + PyObject *padding, *style, *row, *trash; self->head += 2; if (!Tokenizer_CAN_RECURSE(self)) { @@ -2690,6 +2691,8 @@ static int Tokenizer_handle_table_row(Tokenizer* self) return -1; padding = Tokenizer_parse_as_table_style(self, '\n', 0); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } @@ -2704,6 +2707,8 @@ static int Tokenizer_handle_table_row(Tokenizer* self) self->head++; row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); Py_DECREF(padding); Py_DECREF(style); self->head = reset; @@ -2712,7 +2717,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self) if (!row) { Py_DECREF(padding); Py_DECREF(style); - Py_DECREF(row); return -1; } @@ -2741,7 +2745,7 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, uint64_t old_context = self->topstack->context; uint64_t cell_context; Py_ssize_t reset = self->head; - PyObject *padding, *cell; + PyObject *padding, *cell, *trash; PyObject *style = NULL; const char *close_open_markup = NULL; self->head += strlen(markup); @@ -2755,6 +2759,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } @@ -2770,6 +2776,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, return -1; padding = Tokenizer_parse_as_table_style(self, '|', 0); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } @@ -2784,11 +2792,18 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, self->head++; cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context, 1); if (BAD_ROUTE) { + Py_DECREF(padding); + Py_DECREF(style); + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } - if (!cell) + if (!cell) { + Py_DECREF(padding); + Py_DECREF(style); return -1; + } cell_context = self->topstack->context; self->topstack->context = old_context; } @@ -3148,6 +3163,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) } else if (Tokenizer_emit_char(self, this)) return NULL; + // Raise BadRoute to table start + if (BAD_ROUTE) + return NULL; } else if (Tokenizer_emit_char(self, this)) return NULL; diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 59f2156..527d364 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1053,24 +1053,30 @@ class Tokenizer(object): reset = self._head + 1 style, table = None, None self._head += 2 + + self._push(contexts.TABLE_OPEN) try: - self._push(contexts.TABLE_OPEN) padding = self._parse_as_table_style("\n", break_on_table_end=True) - style = self._pop() - # continue to parse if it is NOT an inline table - if "\n" in padding: - self._head += 1 - table = self._parse(contexts.TABLE_OPEN) - else: - # close tag - self._head += 2 except BadRoute: - # offset displacement done by _parse() self._head = reset self._emit_text("{|") + return + style = self._pop() + # continue to parse if it is NOT an inline table + if "\n" in padding: + self._head += 1 + try: + table = self._parse(contexts.TABLE_OPEN) + except BadRoute: + self._head = reset + self._emit_text("{|") + return else: - self._emit_table_tag("{|", "table", style, padding, None, table, "|}") - self._head -= 1 + # close tag + self._head += 2 + self._emit_table_tag("{|", "table", style, padding, None, table, "|}") + # offset displacement done by _parse() + self._head -= 1 def _handle_table_end(self): """Return the stack in order to handle the table end.""" @@ -1087,15 +1093,21 @@ class Tokenizer(object): self._head -= 1 return + self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) try: - self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) padding = self._parse_as_table_style("\n") - style = self._pop() - # don't parse the style separator - self._head += 1 + except BadRoute: + self._head = reset + self._pop() + raise + style = self._pop() + # don't parse the style separator + self._head += 1 + try: row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) except BadRoute: self._head = reset + self._pop() raise self._emit_table_tag("|-", "tr", style, padding, None, row, "") # offset displacement done by parse() @@ -1119,26 +1131,34 @@ class Tokenizer(object): try: cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) - cell_context = self._context - self._context = old_context - reset_for_style = cell_context & contexts.TABLE_CELL_STYLE except BadRoute: self._head = reset + self._pop() raise + cell_context = self._context + self._context = old_context + reset_for_style = cell_context & contexts.TABLE_CELL_STYLE if reset_for_style: self._head = reset + len(markup) + self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) try: - self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) padding = self._parse_as_table_style("|") - style = self._pop() - # Don't parse the style separator - self._head += 1 + except BadRoute: + self._head = reset + self._pop() + raise + style = self._pop() + # Don't parse the style separator + self._head += 1 + try: cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) - cell_context = self._context - self._context = old_context except BadRoute: self._head = reset + ret = self._pop() raise + cell_context = self._context + self._context = old_context + close_open_markup = "|" if reset_for_style else None self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "") # keep header/cell line contexts diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 39acf0c..ecace32 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -13,23 +13,51 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- name: no_table_close_simple -label: Handle case when there is no table close. +label: No table close on inline table input: "{| " output: [Text(text="{| ")] --- +name: no_table_close_newline +label: No table close with a newline +input: "{| \n " +output: [Text(text="{| \n ")] + +--- + name: no_table_close_inside_cell -label: Handle case when there is no table close while inside of a cell. -input: "{| | " -output: [Text(text="{| | ")] +label: No table close while inside of a cell +input: "{| \n| " +output: [Text(text="{| \n| ")] + +--- + +name: no_table_close_inside_cell_after_newline +label: No table close while inside of a cell after a newline +input: "{| \n| \n " +output: [Text(text="{| \n| \n ")] + +--- + +name: no_table_close_inside_cell_with_attributes +label: No table close while inside of a cell with attributes +input: "{| \n| red | test" +output: [Text(text="{| \n| red | test")] --- name: no_table_close_inside_row -label: Handle case when there is no table close while inside of a row. -input: "{| |- " -output: [Text(text="{| |- ")] +label: No table close while inside of a row +input: "{| \n|- " +output: [Text(text="{| \n|- ")] + +--- + +name: no_table_close_inside_row_after_newline +label: No table close while inside of a row after a newline +input: "{| \n|- \n " +output: [Text(text="{| \n|- \n ")] --- @@ -40,6 +68,13 @@ output: [Text(text="{| border=\"1\"")] --- +name: no_table_close_unclosed_attributes +label: Don't parse unclosed attributes if the table doesn't exist. +input: "{| border=" +output: [Text(text="{| border=")] + +--- + name: no_table_close_row_attributes label: Don't parse row attributes as attributes if the table doesn't exist. input: "{| |- border="1""