From 9fc4b909e150cd786e97caf7daeb479733e5330e Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 24 Oct 2014 03:40:37 -0500
Subject: [PATCH] Refactor a lot of table error recovery code.

---
 mwparserfromhell/parser/contexts.py  |   4 +-
 mwparserfromhell/parser/tokenizer.c  | 100 +++++++++++++++--------------------
 mwparserfromhell/parser/tokenizer.h  |   2 +-
 mwparserfromhell/parser/tokenizer.py |  82 ++++++++++++----------------
 tests/tokenizer/tables.mwtest        |   7 +++
 5 files changed, 87 insertions(+), 108 deletions(-)

diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py
index ef44ce2..17912cb 100644
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -171,7 +171,7 @@ TABLE_ROW_OPEN = 1 << 33
 TABLE_TD_LINE = 1 << 34
 TABLE_TH_LINE = 1 << 35
 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
-TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + + TABLE_ROW_OPEN +
+TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
         TABLE_TD_LINE + TABLE_TH_LINE)
 
 # Global contexts:
@@ -184,6 +184,6 @@ FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG +
         STYLE + TABLE)
 UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
           TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
-DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
+DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN
 NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
 NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 1b68b46..301ecfc 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -2510,10 +2510,9 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
 }
 
 /*
-    Parse until ``end_token`` as style attributes for a table.
+    Handle style attributes for a table until an ending token.
 */
-static PyObject*
-Tokenizer_parse_as_table_style(Tokenizer* self, char end_token)
+static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
 {
     TagData *data = TagData_new();
     PyObject *padding, *trash;
@@ -2569,9 +2568,9 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token)
 }
 
 /*
-    Handle the start of a table.
+    Parse a wikicode table by starting with the first line.
 */
-static int Tokenizer_handle_table_start(Tokenizer* self)
+static int Tokenizer_parse_table(Tokenizer* self)
 {
     Py_ssize_t reset = self->head + 1;
     PyObject *style, *padding;
@@ -2580,7 +2579,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self)
 
     if(Tokenizer_push(self, LC_TABLE_OPEN))
         return -1;
-    padding = Tokenizer_parse_as_table_style(self, '\n');
+    padding = Tokenizer_handle_table_style(self, '\n');
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
@@ -2622,20 +2621,10 @@ static int Tokenizer_handle_table_start(Tokenizer* self)
 }
 
 /*
-    Return the stack in order to handle the table end.
-*/
-static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
-{
-    self->head += 2;
-    return Tokenizer_pop(self);
-}
-
-/*
     Parse as style until end of the line, then continue.
 */
 static int Tokenizer_handle_table_row(Tokenizer* self)
 {
-    Py_ssize_t reset = self->head;
     PyObject *padding, *style, *row, *trash;
 
     self->head += 2;
@@ -2648,11 +2637,10 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
 
     if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN))
         return -1;
-    padding = Tokenizer_parse_as_table_style(self, '\n');
+    padding = Tokenizer_handle_table_style(self, '\n');
     if (BAD_ROUTE) {
         trash = Tokenizer_pop(self);
         Py_XDECREF(trash);
-        self->head = reset;
         return 0;
     }
     if (!padding)
@@ -2666,14 +2654,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
     // Don't parse the style separator
     self->head++;
     row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1);
-    if (BAD_ROUTE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
-        Py_DECREF(padding);
-        Py_DECREF(style);
-        self->head = reset;
-        return 0;
-    }
     if (!row) {
         Py_DECREF(padding);
         Py_DECREF(style);
@@ -2688,14 +2668,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
 }
 
 /*
-    Return the stack in order to handle the table row end.
-*/
-static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
-{
-    return Tokenizer_pop(self);
-}
-
-/*
     Parse as normal syntax unless we hit a style marker, then parse style
     as HTML attributes and the remainder as normal syntax.
 */
@@ -2705,11 +2677,10 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 {
     uint64_t old_context = self->topstack->context;
     uint64_t cell_context;
-    Py_ssize_t reset = self->head;
-    PyObject *padding, *cell, *trash;
-    PyObject *style = NULL;
+    PyObject *padding, *cell, *style = NULL;
     const char *close_open_markup = NULL;
     self->head += strlen(markup);
+    Py_ssize_t reset = self->head;
 
     if (!Tokenizer_CAN_RECURSE(self)) {
         if (Tokenizer_emit_text(self, markup))
@@ -2720,12 +2691,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 
     cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                            LC_TABLE_CELL_STYLE | line_context, 1);
-    if (BAD_ROUTE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
-        self->head = reset;
-        return 0;
-    }
     if (!cell)
         return -1;
     cell_context = self->topstack->context;
@@ -2733,11 +2698,11 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 
     if (cell_context & LC_TABLE_CELL_STYLE) {
         Py_DECREF(cell);
-        self->head = reset + strlen(markup);
+        self->head = reset;
         if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                           line_context))
             return -1;
-        padding = Tokenizer_parse_as_table_style(self, '|');
+        padding = Tokenizer_handle_table_style(self, '|');
         if (!padding)
             return -1;
         style = Tokenizer_pop(self);
@@ -2749,14 +2714,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
         self->head++;
         cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                                line_context, 1);
-        if (BAD_ROUTE) {
-            Py_DECREF(padding);
-            Py_DECREF(style);
-            trash = Tokenizer_pop(self);
-            Py_XDECREF(trash);
-            self->head = reset;
-            return 0;
-        }
         if (!cell) {
             Py_DECREF(padding);
             Py_DECREF(style);
@@ -2801,6 +2758,23 @@ Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style)
 }
 
 /*
+    Return the stack in order to handle the table row end.
+*/
+static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
+{
+    return Tokenizer_pop(self);
+}
+
+/*
+    Return the stack in order to handle the table end.
+*/
+static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
+{
+    self->head += 2;
+    return Tokenizer_pop(self);
+}
+
+/*
     Handle the end of the stream of wikitext.
 */
 static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
@@ -2819,9 +2793,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
             if (single)
                 return Tokenizer_handle_single_tag_end(self);
         }
-        else if (context & AGG_DOUBLE) {
-            trash = Tokenizer_pop(self);
-            Py_XDECREF(trash);
+        else {
+            if (context & LC_TABLE_CELL_OPEN) {
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+                context = self->topstack->context;
+            }
+            if (context & AGG_DOUBLE) {
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+            }
         }
         return Tokenizer_fail_route(self);
     }
@@ -3082,7 +3063,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
         // Start of table parsing
         else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) {
             if (Tokenizer_CAN_RECURSE(self)) {
-                if (Tokenizer_handle_table_start(self))
+                if (Tokenizer_parse_table(self))
                     return NULL;
             }
             else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next))
@@ -3197,7 +3178,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     self->skip_style_tags = skip_style_tags;
     tokens = Tokenizer_parse(self, context, 1);
 
-    if (!tokens && !PyErr_Occurred()) {
+    if ((!tokens && !PyErr_Occurred()) || self->topstack) {
        if (!ParserError) {
             if (load_exceptions())
                 return NULL;
@@ -3206,6 +3187,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
         RESET_ROUTE();
         PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
     }
+    else if (self->topstack)
+        PyErr_SetString(ParserError,
+                        "C tokenizer exited with non-empty token stack");
     else
         PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
     return NULL;
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 8d2d428..33ba0e1 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -175,7 +175,7 @@ static PyObject* TagCloseClose;
 
 #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
 #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
-#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
+#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
 #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
 #define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
 
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 7921e7c..3ac25a5 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -1009,8 +1009,8 @@ class Tokenizer(object):
             self._emit_text(tag)
         self._emit(tokens.TagCloseClose())
 
-    def _parse_as_table_style(self, end_token):
-        """Parse until ``end_token`` as style attributes for a table."""
+    def _handle_table_style(self, end_token):
+        """Handle style attributes for a table until ``end_token``."""
         data = _TagOpenData()
         data.context = _TagOpenData.CX_ATTR_READY
         while True:
@@ -1037,14 +1037,13 @@ class Tokenizer(object):
                 self._handle_tag_data(data, this)
             self._head += 1
 
-    def _handle_table_start(self):
-        """Handle the start of a table."""
+    def _parse_table(self):
+        """Parse a wikicode table by starting with the first line."""
         reset = self._head + 1
         self._head += 2
-
         self._push(contexts.TABLE_OPEN)
         try:
-            padding = self._parse_as_table_style("\n")
+            padding = self._handle_table_style("\n")
         except BadRoute:
             self._head = reset
             self._emit_text("{|")
@@ -1063,14 +1062,8 @@ class Tokenizer(object):
         # Offset displacement done by _parse():
         self._head -= 1
 
-    def _handle_table_end(self):
-        """Return the stack in order to handle the table end."""
-        self._head += 2
-        return self._pop()
-
     def _handle_table_row(self):
         """Parse as style until end of the line, then continue."""
-        reset = self._head
         self._head += 2
         if not self._can_recurse():
             self._emit_text("|-")
@@ -1079,67 +1072,47 @@ class Tokenizer(object):
 
         self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
         try:
-            padding = self._parse_as_table_style("\n")
+            padding = self._handle_table_style("\n")
         except BadRoute:
-            self._head = reset
             self._pop()
             raise
         style = self._pop()
 
         # Don't parse the style separator:
         self._head += 1
-        try:
-            row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
-        except BadRoute:
-            self._head = reset
-            self._pop()
-            raise
+        row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
 
         self._emit_table_tag("|-", "tr", style, padding, None, row, "")
         # Offset displacement done by parse():
         self._head -= 1
 
-    def _handle_table_row_end(self):
-        """Return the stack in order to handle the table row end."""
-        return self._pop()
-
     def _handle_table_cell(self, markup, tag, line_context):
         """Parse as normal syntax unless we hit a style marker, then parse
         style as HTML attributes and the remainder as normal syntax."""
         old_context = self._context
-        reset = self._head
-        reset_for_style, padding, style = False, "", None
+        padding, style = "", None
         self._head += len(markup)
+        reset = self._head
         if not self._can_recurse():
             self._emit_text(markup)
             self._head -= 1
             return
 
-        try:
-            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
-                               line_context | contexts.TABLE_CELL_STYLE)
-        except BadRoute:
-            self._head = reset
-            self._pop()
-            raise
+        cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                           line_context | contexts.TABLE_CELL_STYLE)
         cell_context = self._context
         self._context = old_context
         reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
         if reset_for_style:
-            self._head = reset + len(markup)
+            self._head = reset
             self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                        line_context)
-            padding = self._parse_as_table_style("|")
+            padding = self._handle_table_style("|")
             style = self._pop()
             # Don't parse the style separator:
             self._head += 1
-            try:
-                cell = self._parse(contexts.TABLE_OPEN |
-                                   contexts.TABLE_CELL_OPEN | line_context)
-            except BadRoute:
-                self._head = reset
-                ret = self._pop()
-                raise
+            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                               line_context)
             cell_context = self._context
             self._context = old_context
 
@@ -1161,12 +1134,23 @@ class Tokenizer(object):
             self._context &= ~contexts.TABLE_CELL_STYLE
         return self._pop(keep_context=True)
 
+    def _handle_table_row_end(self):
+        """Return the stack in order to handle the table row end."""
+        return self._pop()
+
+    def _handle_table_end(self):
+        """Return the stack in order to handle the table end."""
+        self._head += 2
+        return self._pop()
+
     def _handle_end(self):
         """Handle the end of the stream of wikitext."""
         if self._context & contexts.FAIL:
             if self._context & contexts.TAG_BODY:
                 if is_single(self._stack[1].text):
                     return self._handle_single_tag_end()
+            if self._context & contexts.TABLE_CELL_OPEN:
+                self._pop()
             if self._context & contexts.DOUBLE:
                 self._pop()
             self._fail_route()
@@ -1327,19 +1311,19 @@ class Tokenizer(object):
             elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or
                                                   (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
                 if self._can_recurse():
-                    self._handle_table_start()
+                    self._parse_table()
                 else:
                     self._emit_text("{|")
             elif self._context & contexts.TABLE_OPEN:
-                if this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE:
+                if this == next == "|" and self._context & contexts.TABLE_TD_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE)
-                elif this == "|" and next == "|" and self._context & contexts.TABLE_TH_LINE:
+                elif this == next == "|" and self._context & contexts.TABLE_TH_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE)
-                elif this == "!" and next == "!" and self._context & contexts.TABLE_TH_LINE:
+                elif this == next == "!" and self._context & contexts.TABLE_TH_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE)
@@ -1387,6 +1371,10 @@ class Tokenizer(object):
         self._text = [segment for segment in split if segment]
         self._head = self._global = self._depth = self._cycles = 0
         try:
-            return self._parse(context)
+            tokens = self._parse(context)
         except BadRoute:  # pragma: no cover (untestable/exceptional case)
             raise ParserError("Python tokenizer exited with BadRoute")
+        if self._stacks:  # pragma: no cover (untestable/exceptional case)
+            err = "Python tokenizer exited with non-empty token stack"
+            raise ParserError(err)
+        return tokens
diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest
index e042467..16012cf 100644
--- a/tests/tokenizer/tables.mwtest
+++ b/tests/tokenizer/tables.mwtest
@@ -61,6 +61,13 @@ output: [Text(text="{| \n|- \n ")]
 
 ---
 
+name: no_table_close_row_and_cell
+label: no table close while inside a cell inside a row
+input: "{| \n|- \n|"
+output: [Text(text="{| \n|- \n|")]
+
+---
+
 name: no_table_close_attributes
 label: don't parse attributes as attributes if the table doesn't exist
 input: "{| border="1""