diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index 6020ad1..af41f49 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -52,7 +52,7 @@ INVISIBLE_TAGS = [ # [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] -SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] +SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"] MARKUP_TO_HTML = { "#": "li", diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index f568fac..678a392 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -155,13 +155,19 @@ FAIL_ON_EQUALS = 1 << 29 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) +TABLE_OPEN = 1 << 30 +TABLE_CELL_LINE = 1 << 31 +TABLE_HEADER_LINE = 1 << 32 +TABLE_CELL_OPEN = 1 << 33 +TABLE_CELL_STYLE_POSSIBLE = 1 << 34 + # Global contexts: GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE_OPEN UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 073e64c..70e2d5d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1002,6 +1002,39 @@ class Tokenizer(object): self._fail_route() return self._pop() + def _handle_table_start(self): + """Handle the start of a table.""" + # TODO - fail all other contexts on start? 
+ self._head += 2 + reset = self._head - 1 + try: + table = self._parse(contexts.TABLE_OPEN) + except BadRoute: + self._head = reset + self._emit_text("{|") + else: + self._emit_style_tag("table", "{|", table) + + def _handle_table_end(self): + self._head += 2 + return self._pop() + + def _handle_table_row(self): + self._head += 2 + self._emit(tokens.TagOpenOpen(wiki_markup="|-")) + self._emit_text("tr") + self._emit(tokens.TagCloseSelfclose()) + self._context &= ~contexts.TABLE_CELL_OPEN + + def _handle_table_cell(self): + pass + + def _handle_header_cell(self): + pass + + def _handle_cell_style(self): + pass + def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" context = self._context @@ -1144,15 +1177,48 @@ class Tokenizer(object): result = self._parse_style() if result is not None: return result - elif self._read(-1) in ("\n", self.START): - if this in ("#", "*", ";", ":"): + elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"): self._handle_list() - elif this == next == self._read(2) == self._read(3) == "-": + elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-": self._handle_hr() - else: - self._emit_text(this) elif this in ("\n", ":") and self._context & contexts.DL_TERM: self._handle_dl_term() + + elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or + (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + if self._can_recurse(): + self._handle_table_start() + else: + self._emit_text("{|") + elif self._context & contexts.TABLE_OPEN: + if this == "|" and next == "}": + return self._handle_table_end() + elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE: + self._handle_table_cell() + elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_header_cell() + elif this == "!" and next == "!" 
and self._context & contexts.TABLE_HEADER_LINE: + self._handle_header_cell() + elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: + self._handle_cell_style() + # on newline, clear out cell line contexts + elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE): + self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE) + self._emit_text(this) + # newline or whitespace/newline + elif (self._read(-1) in ("\n", self.START) or + (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + if this == "|" and next == "-": + self._handle_table_row() + elif this == "|" and self._can_recurse(): + self._handle_table_cell() + elif this == "!" and self._can_recurse(): + self._handle_header_cell() + else: + self._emit_text(this) + else: + self._emit_text(this) + else: self._emit_text(this) self._head += 1 diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest new file mode 100644 index 0000000..399f7fd --- /dev/null +++ b/tests/tokenizer/tables.mwtest @@ -0,0 +1,32 @@ +name: empty_table +label: Parsing an empty table. +input: "{|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: inline_table +label: Correctly handle tables with close on the same line. +input: "{||}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: no_table_close_simple +label: Handle case when there is no table close. +input: "{| " +output: [Text(text="{| ")] + +--- + +name: leading_whitespace_table +label: Handle leading whitespace for a table. 
+input: "foo \n \t {|\n|}" +output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: leading_characters_table +label: Don't parse as a table when leading characters are not newline or whitespace. +input: "foo \n foo \t {|\n|}" +output: [Text(text="foo \n foo \t {|\n|}")]