diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b70e932..7bfd11a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1008,9 +1008,16 @@ class Tokenizer(object): data.context = _TagOpenData.CX_ATTR_READY while True: this, next = self._read(), self._read(1) - can_exit = (not data.context & (data.CX_NAME) or + table_end = break_on_table_end and this == "|" and next == "}" + can_exit = (not data.context & data.CX_QUOTED or data.context & data.CX_NOTE_SPACE) - if this is self.END: + if (this == end_token and can_exit) or table_end: + if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): + self._push_tag_buffer(data) + if this.isspace(): + data.padding_buffer["first"] += this + return (self._pop(), data.padding_buffer["first"]) + elif this is self.END or table_end or this == end_token: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die @@ -1020,16 +1027,6 @@ class Tokenizer(object): continue self._pop() self._fail_route() - elif this == end_token and can_exit: - if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): - self._push_tag_buffer(data) - if this.isspace(): - data.padding_buffer["first"] += this - return (self._pop(), data.padding_buffer["first"]) - elif break_on_table_end and this == "|" and next == "}": - if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): - self._push_tag_buffer(data) - return (self._pop(), data.padding_buffer["first"]) else: self._handle_tag_data(data, this) self._head += 1 diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index e63bd11..163579b 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -225,14 +225,14 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. input: "{| \n | name|="foo bar" | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text="" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -274,7 +274,7 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: table_row_attributes_crazy_whitespace label: Parse table row style attributes with different whitespace. input: "{| \t \n |- \t name="foo bar" \t \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -289,3 +289,32 @@ name: inline_table_attributes label: Correctly handle attributes in inline tables. input: "{| foo="tee bar" |}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="tee bar"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_incorrect_attributes +label: Parse incorrect table style attributes. +input: "{| name="foo\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_unclosed_style +label: Parse unclosed and closed bold and italics inside cells. +input: "{|\n | ''foo || '''bar ||''baz''||'''test'''\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" ''foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" '''bar "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose() Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + + +--- + +name: recursion_five_hundred_opens +label: test potentially dangerous recursion: five hundred table openings, without spaces +input: "{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|" +output: [Text(text="{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|")] + +--- + +name: recursion_one_hundred_opens +label: test potentially dangerous recursion: one hundred table openings, with spaces +input: "{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|" +output: [Text(text="{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|")] \ No newline at end of file