Started table parsing in PyTokenizer

Started parsing tables in the Python tokenizer and added the beginnings of table support. This is a big commit (ugh) and it should probably be split up into multiple smaller ones if possible, but that seems unworkable right now because of all the dependencies. It also breaks the CTokenizer tests (double ugh) because I haven't started table support there. I may want to pick through this commit line by line later, but I need to save my work for now.
9 years ago · a8d2983161
--- a/mwparserfromhell/definitions.py
+++ b/mwparserfromhell/definitions.py
@@ -52,7 +52,7 @@ INVISIBLE_TAGS = [
 # [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762
 SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
 SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]
 SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]
 MARKUP_TO_HTML = {
    "#": "li",
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -155,13 +155,19 @@ FAIL_ON_EQUALS = 1 << 29
 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
                FAIL_ON_RBRACE + FAIL_ON_EQUALS)
 # Table contexts. The tokenizer tests these against its current context
 # with bitwise AND while it is working inside a table.
 # NOTE(review): the values from 1 << 31 upward do not fit in a signed
 # 32-bit integer -- confirm that any non-Python consumer of these flags
 # stores contexts in a wide enough type.
 # Context passed to the sub-parse that is started for a "{|" table opener.
 TABLE_OPEN =                1 << 30
 # Line-scoped: while set, "||" is dispatched to the table cell handler;
 # cleared when the tokenizer reaches a newline.
 TABLE_CELL_LINE =           1 << 31
 # Line-scoped: while set, "||" and "!!" are dispatched to the header cell
 # handler; cleared when the tokenizer reaches a newline.
 TABLE_HEADER_LINE =         1 << 32
 # Cleared by the table row handler when a new row marker is emitted.
 TABLE_CELL_OPEN =           1 << 33
 # Line-scoped: while set, a single "|" is dispatched to the cell style
 # handler; cleared when the tokenizer reaches a newline.
 TABLE_CELL_STYLE_POSSIBLE = 1 << 34
 # Global contexts:
 GL_HEADING = 1 << 0
 # Aggregate contexts:
 FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE
 FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE_OPEN
 UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
          TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
 DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -1002,6 +1002,39 @@ class Tokenizer(object):
            self._fail_route()
        return self._pop()
    def _handle_table_start(self):
        """Handle the start of a table.

        Skips the two-character ``{|`` opener and parses what follows in the
        ``TABLE_OPEN`` context. On success the parsed stack is emitted as the
        body of a ``table`` style tag. If the route fails, the head is
        rewound and ``{|`` is emitted as plain text instead.
        """
        # TODO - fail all other contexts on start?
        self._head += 2
        reset = self._head - 1
        try:
            table = self._parse(contexts.TABLE_OPEN)
        except BadRoute:
            # Leave the head on the "|" of the opener; the increment applied
            # by the calling parse loop then moves it just past "{|".
            self._head = reset
            self._emit_text("{|")
        else:
            self._emit_style_tag("table", "{|", table)
            # The table-end handler has already moved the head past "|}".
            # Step back one so that the calling loop's own increment does
            # not swallow the character that follows the table.
            self._head -= 1
    def _handle_table_end(self):
        self._head += 2
        return self._pop()
    def _handle_table_row(self):
        """Handle a table row marker (``|-``).

        Emits a self-closing ``tr`` tag whose wiki markup is ``|-`` and
        clears the ``TABLE_CELL_OPEN`` context, since a new row starts here.
        """
        # Advance over the first character of "|-" only; the increment applied
        # by the calling parse loop consumes the second one. Advancing by two
        # here would drop the character that follows the marker.
        self._head += 1
        # The marker that triggers this handler is "|" followed by "-", so
        # that is the markup recorded on the tag.
        self._emit(tokens.TagOpenOpen(wiki_markup="|-"))
        self._emit_text("tr")
        self._emit(tokens.TagCloseSelfclose())
        self._context &= ~contexts.TABLE_CELL_OPEN
    def _handle_table_cell(self):
        pass
    def _handle_header_cell(self):
        pass
    def _handle_cell_style(self):
        pass
    def _verify_safe(self, this):
        """Make sure we are not trying to write an invalid character."""
        context = self._context
@@ -1144,15 +1177,48 @@ class Tokenizer(object):
                result = self._parse_style()
                if result is not None:
                    return result
            elif self._read(-1) in ("\n", self.START):
                if this in ("#", "*", ";", ":"):
            elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"):
                    self._handle_list()
                elif this == next == self._read(2) == self._read(3) == "-":
            elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-":
                    self._handle_hr()
                else:
                    self._emit_text(this)
            elif this in ("\n", ":") and self._context & contexts.DL_TERM:
                self._handle_dl_term()
            elif (this == "{" and next == "|" and (self._read(-1) in ("\n", self.START)) or
                    (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")):
                if self._can_recurse():
                    self._handle_table_start()
                else:
                    self._emit_text("{|")
            elif self._context & contexts.TABLE_OPEN:
                if this == "|" and next == "}":
                    return self._handle_table_end()
                elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE:
                    self._handle_table_cell()
                elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE:
                    self._handle_header_cell()
                elif this == "!" and next == "!" and self._context & contexts.TABLE_HEADER_LINE:
                    self._handle_header_cell()
                elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE:
                    self._handle_cell_style()
                # on newline, clear out cell line contexts
                elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE):
                    self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE)
                    self._emit_text(this)
                # newline or whitespace/newline
                elif (self._read(-1) in ("\n", self.START) or
                    (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")):
                    if this == "|" and next == "-":
                        self._handle_table_row()
                    elif this == "|" and self._can_recurse():
                        self._handle_table_cell()
                    elif this == "!" and self._can_recurse():
                        self._handle_header_cell()
                    else:
                        self._emit_text(this)
                else:
                    self._emit_text(this)
            else:
                self._emit_text(this)
            self._head += 1
--- a/tests/tokenizer/tables.mwtest
+++ b/tests/tokenizer/tables.mwtest
@@ -0,0 +1,32 @@
 name:   empty_table
 label:  Parsing an empty table.
 input:  "{|\n|}"
 output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()]
 ---
 name:   inline_table
 label:  Correctly handle tables with close on the same line.
 input:  "{||}"
 output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(), Text(text="table"), TagCloseClose()]
 ---
 name:   no_table_close_simple
 label:  Handle case when there is no table close.
 input:  "{| "
 output: [Text(text="{| ")]
 ---
 name:   leading_whitespace_table
 label:  Handle leading whitespace for a table.
 input:  "foo \n    \t {|\n|}"
 output: [Text(text="foo \n    \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()]
 ---
 name:   leading_characters_table
 label:  Don't parse as a table when leading characters are not newline or whitespace.
 input:  "foo \n  foo  \t {|\n|}"
 output: [Text(text="foo \n  foo  \t {|\n|}")]