
Refactor a lot of table error recovery code.

tags/v0.4
Ben Kurtovic committed 9 years ago
commit 9fc4b909e1
5 changed files with 87 additions and 108 deletions
  1. mwparserfromhell/parser/contexts.py (+2 -2)
  2. mwparserfromhell/parser/tokenizer.c (+42 -58)
  3. mwparserfromhell/parser/tokenizer.h (+1 -1)
  4. mwparserfromhell/parser/tokenizer.py (+35 -47)
  5. tests/tokenizer/tables.mwtest (+7 -0)
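
The user-visible effect of these recovery changes is pinned down by the new no_table_close_row_and_cell case added to tables.mwtest below. As a quick sanity check against a build that includes this commit, something like the following should hold, using the library's public parse() entry point (a sketch, not part of the commit):

    # An unclosed table -- the input ends inside a cell inside a row --
    # degrades to plain text instead of leaving the tokenizer's stack dirty.
    import mwparserfromhell

    wikicode = mwparserfromhell.parse("{| \n|- \n|")
    assert str(wikicode) == "{| \n|- \n|"  # round-trips verbatim, per tables.mwtest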

mwparserfromhell/parser/contexts.py (+2 -2)

@@ -171,7 +171,7 @@ TABLE_ROW_OPEN = 1 << 33
 TABLE_TD_LINE = 1 << 34
 TABLE_TH_LINE = 1 << 35
 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
-TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + + TABLE_ROW_OPEN +
+TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
          TABLE_TD_LINE + TABLE_TH_LINE)
 
 # Global contexts:
@@ -184,6 +184,6 @@ FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG +
         STYLE + TABLE)
 UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
           TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
-DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
+DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN
 NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
 NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK
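
A note on mechanics for readers new to these constants: each context is a distinct bit, so the aggregates above are bit masks, and adding TABLE_ROW_OPEN to DOUBLE is what lets an unclosed row qualify for the double-pop recovery in _handle_end further down. A rough sketch (TABLE_ROW_OPEN's value comes from the hunk above; the other two bits are placeholders, not the project's real values):

    TEMPLATE_PARAM_KEY = 1 << 5   # placeholder bit, for illustration only
    TAG_CLOSE = 1 << 6            # placeholder bit, for illustration only
    TABLE_ROW_OPEN = 1 << 33      # actual value, from the hunk above

    DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN

    context = TABLE_ROW_OPEN
    if context & DOUBLE:          # True: unclosed rows now take this branch
        print("pop two stacks before failing the route")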

mwparserfromhell/parser/tokenizer.c (+42 -58)

@@ -2510,10 +2510,9 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
 }
 
 /*
-    Parse until ``end_token`` as style attributes for a table.
+    Handle style attributes for a table until an ending token.
 */
-static PyObject*
-Tokenizer_parse_as_table_style(Tokenizer* self, char end_token)
+static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
 {
     TagData *data = TagData_new();
     PyObject *padding, *trash;
@@ -2569,9 +2568,9 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token)
 }
 
 /*
-    Handle the start of a table.
+    Parse a wikicode table by starting with the first line.
 */
-static int Tokenizer_handle_table_start(Tokenizer* self)
+static int Tokenizer_parse_table(Tokenizer* self)
 {
     Py_ssize_t reset = self->head + 1;
     PyObject *style, *padding;
@@ -2580,7 +2579,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self)
 
     if(Tokenizer_push(self, LC_TABLE_OPEN))
         return -1;
-    padding = Tokenizer_parse_as_table_style(self, '\n');
+    padding = Tokenizer_handle_table_style(self, '\n');
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
@@ -2622,20 +2621,10 @@ static int Tokenizer_handle_table_start(Tokenizer* self)
 }
 
 /*
-    Return the stack in order to handle the table end.
-*/
-static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
-{
-    self->head += 2;
-    return Tokenizer_pop(self);
-}
-
-/*
     Parse as style until end of the line, then continue.
 */
 static int Tokenizer_handle_table_row(Tokenizer* self)
 {
-    Py_ssize_t reset = self->head;
     PyObject *padding, *style, *row, *trash;
     self->head += 2;
 
@@ -2648,11 +2637,10 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
 
     if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN))
         return -1;
-    padding = Tokenizer_parse_as_table_style(self, '\n');
+    padding = Tokenizer_handle_table_style(self, '\n');
     if (BAD_ROUTE) {
         trash = Tokenizer_pop(self);
         Py_XDECREF(trash);
-        self->head = reset;
         return 0;
     }
     if (!padding)
@@ -2666,14 +2654,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
     // Don't parse the style separator
     self->head++;
     row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1);
-    if (BAD_ROUTE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
-        Py_DECREF(padding);
-        Py_DECREF(style);
-        self->head = reset;
-        return 0;
-    }
     if (!row) {
         Py_DECREF(padding);
         Py_DECREF(style);
@@ -2688,14 +2668,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
 }
 
 /*
-    Return the stack in order to handle the table row end.
-*/
-static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
-{
-    return Tokenizer_pop(self);
-}
-
-/*
     Parse as normal syntax unless we hit a style marker, then parse style
     as HTML attributes and the remainder as normal syntax.
 */
@@ -2705,11 +2677,10 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 {
     uint64_t old_context = self->topstack->context;
     uint64_t cell_context;
-    Py_ssize_t reset = self->head;
-    PyObject *padding, *cell, *trash;
-    PyObject *style = NULL;
+    PyObject *padding, *cell, *style = NULL;
     const char *close_open_markup = NULL;
     self->head += strlen(markup);
+    Py_ssize_t reset = self->head;
 
     if (!Tokenizer_CAN_RECURSE(self)) {
         if (Tokenizer_emit_text(self, markup))
@@ -2720,12 +2691,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 
     cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                            LC_TABLE_CELL_STYLE | line_context, 1);
-    if (BAD_ROUTE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
-        self->head = reset;
-        return 0;
-    }
     if (!cell)
         return -1;
     cell_context = self->topstack->context;
@@ -2733,11 +2698,11 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 
     if (cell_context & LC_TABLE_CELL_STYLE) {
         Py_DECREF(cell);
-        self->head = reset + strlen(markup);
+        self->head = reset;
         if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                           line_context))
             return -1;
-        padding = Tokenizer_parse_as_table_style(self, '|');
+        padding = Tokenizer_handle_table_style(self, '|');
         if (!padding)
             return -1;
         style = Tokenizer_pop(self);
@@ -2749,14 +2714,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
         self->head++;
         cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                                line_context, 1);
-        if (BAD_ROUTE) {
-            Py_DECREF(padding);
-            Py_DECREF(style);
-            trash = Tokenizer_pop(self);
-            Py_XDECREF(trash);
-            self->head = reset;
-            return 0;
-        }
         if (!cell) {
             Py_DECREF(padding);
             Py_DECREF(style);
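
Review note on the reset convention changing throughout this function: reset is now captured after self->head += strlen(markup), so the style-restart path above writes self->head = reset where it previously needed reset + strlen(markup). A trivial sketch of the equivalence (values invented):

    markup, start = "||", 10
    old_reset = start                  # old convention: before the markup
    new_reset = start + len(markup)    # new convention: after the markup
    assert old_reset + len(markup) == new_reset  # same restart position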
@@ -2801,6 +2758,23 @@ Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style)
 }
 
 /*
+    Return the stack in order to handle the table row end.
+*/
+static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
+{
+    return Tokenizer_pop(self);
+}
+
+/*
+    Return the stack in order to handle the table end.
+*/
+static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
+{
+    self->head += 2;
+    return Tokenizer_pop(self);
+}
+
+/*
     Handle the end of the stream of wikitext.
 */
 static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
@@ -2819,9 +2793,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
             if (single)
                 return Tokenizer_handle_single_tag_end(self);
         }
-        else if (context & AGG_DOUBLE) {
-            trash = Tokenizer_pop(self);
-            Py_XDECREF(trash);
+        else {
+            if (context & LC_TABLE_CELL_OPEN) {
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+                context = self->topstack->context;
+            }
+            if (context & AGG_DOUBLE) {
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+            }
         }
         return Tokenizer_fail_route(self);
     }
@@ -3082,7 +3063,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
         // Start of table parsing
         else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) {
             if (Tokenizer_CAN_RECURSE(self)) {
-                if (Tokenizer_handle_table_start(self))
+                if (Tokenizer_parse_table(self))
                     return NULL;
             }
             else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next))
@@ -3197,7 +3178,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     self->skip_style_tags = skip_style_tags;
     tokens = Tokenizer_parse(self, context, 1);
 
-    if (!tokens && !PyErr_Occurred()) {
+    if ((!tokens && !PyErr_Occurred()) || self->topstack) {
         if (!ParserError) {
             if (load_exceptions())
                 return NULL;
@@ -3206,6 +3187,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
             RESET_ROUTE();
             PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
         }
+        else if (self->topstack)
+            PyErr_SetString(ParserError,
+                            "C tokenizer exited with non-empty token stack");
         else
             PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
         return NULL;


mwparserfromhell/parser/tokenizer.h (+1 -1)

@@ -175,7 +175,7 @@ static PyObject* TagCloseClose;
 
 #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
 #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
-#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
+#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
 #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
 #define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)



mwparserfromhell/parser/tokenizer.py (+35 -47)

@@ -1009,8 +1009,8 @@ class Tokenizer(object):
         self._emit_text(tag)
         self._emit(tokens.TagCloseClose())
 
-    def _parse_as_table_style(self, end_token):
-        """Parse until ``end_token`` as style attributes for a table."""
+    def _handle_table_style(self, end_token):
+        """Handle style attributes for a table until ``end_token``."""
         data = _TagOpenData()
         data.context = _TagOpenData.CX_ATTR_READY
         while True:
@@ -1037,14 +1037,13 @@ class Tokenizer(object):
                 self._handle_tag_data(data, this)
             self._head += 1
 
-    def _handle_table_start(self):
-        """Handle the start of a table."""
+    def _parse_table(self):
+        """Parse a wikicode table by starting with the first line."""
        reset = self._head + 1
        self._head += 2
-
        self._push(contexts.TABLE_OPEN)
        try:
-            padding = self._parse_as_table_style("\n")
+            padding = self._handle_table_style("\n")
         except BadRoute:
             self._head = reset
             self._emit_text("{|")
@@ -1063,14 +1062,8 @@ class Tokenizer(object):
         # Offset displacement done by _parse():
         self._head -= 1
 
-    def _handle_table_end(self):
-        """Return the stack in order to handle the table end."""
-        self._head += 2
-        return self._pop()
-
     def _handle_table_row(self):
         """Parse as style until end of the line, then continue."""
-        reset = self._head
         self._head += 2
         if not self._can_recurse():
             self._emit_text("|-")
@@ -1079,67 +1072,47 @@
 
         self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
         try:
-            padding = self._parse_as_table_style("\n")
+            padding = self._handle_table_style("\n")
         except BadRoute:
-            self._head = reset
             self._pop()
             raise
         style = self._pop()
 
         # Don't parse the style separator:
         self._head += 1
-        try:
-            row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
-        except BadRoute:
-            self._head = reset
-            self._pop()
-            raise
+        row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
 
         self._emit_table_tag("|-", "tr", style, padding, None, row, "")
         # Offset displacement done by parse():
         self._head -= 1
 
-    def _handle_table_row_end(self):
-        """Return the stack in order to handle the table row end."""
-        return self._pop()
-
     def _handle_table_cell(self, markup, tag, line_context):
         """Parse as normal syntax unless we hit a style marker, then parse
         style as HTML attributes and the remainder as normal syntax."""
         old_context = self._context
-        reset = self._head
-        reset_for_style, padding, style = False, "", None
+        padding, style = "", None
         self._head += len(markup)
+        reset = self._head
         if not self._can_recurse():
             self._emit_text(markup)
             self._head -= 1
             return
 
-        try:
-            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
-                               line_context | contexts.TABLE_CELL_STYLE)
-        except BadRoute:
-            self._head = reset
-            self._pop()
-            raise
+        cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                           line_context | contexts.TABLE_CELL_STYLE)
         cell_context = self._context
         self._context = old_context
         reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
         if reset_for_style:
-            self._head = reset + len(markup)
+            self._head = reset
             self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                        line_context)
-            padding = self._parse_as_table_style("|")
+            padding = self._handle_table_style("|")
             style = self._pop()
             # Don't parse the style separator:
             self._head += 1
-            try:
-                cell = self._parse(contexts.TABLE_OPEN |
-                                   contexts.TABLE_CELL_OPEN | line_context)
-            except BadRoute:
-                self._head = reset
-                ret = self._pop()
-                raise
+            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                               line_context)
             cell_context = self._context
             self._context = old_context
 
@@ -1161,12 +1134,23 @@ class Tokenizer(object):
             self._context &= ~contexts.TABLE_CELL_STYLE
         return self._pop(keep_context=True)
 
+    def _handle_table_row_end(self):
+        """Return the stack in order to handle the table row end."""
+        return self._pop()
+
+    def _handle_table_end(self):
+        """Return the stack in order to handle the table end."""
+        self._head += 2
+        return self._pop()
+
     def _handle_end(self):
         """Handle the end of the stream of wikitext."""
         if self._context & contexts.FAIL:
             if self._context & contexts.TAG_BODY:
                 if is_single(self._stack[1].text):
                     return self._handle_single_tag_end()
+            if self._context & contexts.TABLE_CELL_OPEN:
+                self._pop()
             if self._context & contexts.DOUBLE:
                 self._pop()
         self._fail_route()
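
The ordering here matters: popping the open cell first exposes the row context underneath, which, with TABLE_ROW_OPEN now part of DOUBLE, is popped as well before the route fails. A condensed trace for the new test input "{| \n|- \n|", with the stack mocked as a plain list (a sketch, not the project's code):

    stacks = ["table", "row", "cell"]   # innermost last when input runs out
    stacks.pop()    # TABLE_CELL_OPEN is set: discard the half-built cell
    stacks.pop()    # remaining row context is in DOUBLE: discard the row
    assert stacks == ["table"]
    # _fail_route() then fails the table route itself, which falls back
    # to emitting "{| \n|- \n|" as plain text.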
@@ -1327,19 +1311,19 @@ class Tokenizer(object):
             elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or
                                                  (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
                 if self._can_recurse():
-                    self._handle_table_start()
+                    self._parse_table()
                 else:
                     self._emit_text("{|")
             elif self._context & contexts.TABLE_OPEN:
-                if this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE:
+                if this == next == "|" and self._context & contexts.TABLE_TD_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE)
-                elif this == "|" and next == "|" and self._context & contexts.TABLE_TH_LINE:
+                elif this == next == "|" and self._context & contexts.TABLE_TH_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE)
-                elif this == "!" and next == "!" and self._context & contexts.TABLE_TH_LINE:
+                elif this == next == "!" and self._context & contexts.TABLE_TH_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE)
@@ -1387,6 +1371,10 @@ class Tokenizer(object):
         self._text = [segment for segment in split if segment]
         self._head = self._global = self._depth = self._cycles = 0
         try:
-            return self._parse(context)
+            tokens = self._parse(context)
         except BadRoute:  # pragma: no cover (untestable/exceptional case)
             raise ParserError("Python tokenizer exited with BadRoute")
+        if self._stacks:  # pragma: no cover (untestable/exceptional case)
+            err = "Python tokenizer exited with non-empty token stack"
+            raise ParserError(err)
+        return tokens
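
Both tokenizers now also fail loudly if any stack survives to the end of tokenization, rather than returning tokens built from a corrupted state. From the outside this surfaces as a ParserError, which lives in mwparserfromhell.parser; a hedged sketch:

    import mwparserfromhell
    from mwparserfromhell.parser import ParserError

    try:
        mwparserfromhell.parse("{| \n|- \n|")   # recovers cleanly; no error
    except ParserError as exc:
        # only reachable on a tokenizer bug, e.g. a stack left behind
        print("tokenizer invariant violated:", exc)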

tests/tokenizer/tables.mwtest (+7 -0)

@@ -61,6 +61,13 @@ output: [Text(text="{| \n|- \n ")]
 
 ---
 
+name: no_table_close_row_and_cell
+label: no table close while inside a cell inside a row
+input: "{| \n|- \n|"
+output: [Text(text="{| \n|- \n|")]
+
+---
+
 name: no_table_close_attributes
 label: don't parse attributes as attributes if the table doesn't exist
 input: "{| border="1""

