From a8d2983161e422e27e0de8c1261b196e7a79363b Mon Sep 17 00:00:00 2001 From: David Winegar Date: Mon, 14 Jul 2014 10:37:36 -0700 Subject: [PATCH 01/44] Started table parsing in PyTokenizer Started parsing table support and added the start of table support. This is a big commit (ugh) and it should probably be split up into multiple smaller ones if possible, but that seems unworkable as of right now because of all the dependencies. Also breaks tests of CTokenizer (double ugh) because I haven't started table support there. May want to pick line by line on this commit later but I need to save my work for now. --- mwparserfromhell/definitions.py | 2 +- mwparserfromhell/parser/contexts.py | 8 +++- mwparserfromhell/parser/tokenizer.py | 76 +++++++++++++++++++++++++++++++++--- tests/tokenizer/tables.mwtest | 32 +++++++++++++++ 4 files changed, 111 insertions(+), 7 deletions(-) create mode 100644 tests/tokenizer/tables.mwtest diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index 6020ad1..af41f49 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -52,7 +52,7 @@ INVISIBLE_TAGS = [ # [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] -SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] +SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"] MARKUP_TO_HTML = { "#": "li", diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index f568fac..678a392 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -155,13 +155,19 @@ FAIL_ON_EQUALS = 1 << 29 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) +TABLE_OPEN = 1 << 30 +TABLE_CELL_LINE = 1 << 31 +TABLE_HEADER_LINE = 1 << 32 +TABLE_CELL_OPEN = 1 << 33 +TABLE_CELL_STYLE_POSSIBLE = 1 << 34 + # Global contexts: GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + 
EXT_LINK_TITLE + HEADING + TAG + STYLE +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE_OPEN UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 073e64c..70e2d5d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1002,6 +1002,39 @@ class Tokenizer(object): self._fail_route() return self._pop() + def _handle_table_start(self): + """Handle the start of a table.""" + # TODO - fail all other contexts on start? + self._head += 2 + reset = self._head - 1 + try: + table = self._parse(contexts.TABLE_OPEN) + except BadRoute: + self._head = reset + self._emit_text("{|") + else: + self._emit_style_tag("table", "{|", table) + + def _handle_table_end(self): + self._head += 2 + return self._pop() + + def _handle_table_row(self): + self._head += 2 + self._emit(tokens.TagOpenOpen(wiki_markup="{-")) + self._emit_text("tr") + self._emit(tokens.TagCloseSelfclose()) + self._context &= ~contexts.TABLE_CELL_OPEN + + def _handle_table_cell(self): + pass + + def _handle_header_cell(self): + pass + + def _handle_cell_style(self): + pass + def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" context = self._context @@ -1144,15 +1177,48 @@ class Tokenizer(object): result = self._parse_style() if result is not None: return result - elif self._read(-1) in ("\n", self.START): - if this in ("#", "*", ";", ":"): + elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"): self._handle_list() - elif this == next == self._read(2) == self._read(3) == "-": + elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-": self._handle_hr() - else: - self._emit_text(this) elif this in ("\n", ":") and self._context & contexts.DL_TERM: 
self._handle_dl_term() + + elif (this == "{" and next == "|" and (self._read(-1) in ("\n", self.START)) or + (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + if self._can_recurse(): + self._handle_table_start() + else: + self._emit_text("{|") + elif self._context & contexts.TABLE_OPEN: + if this == "|" and next == "}": + return self._handle_table_end() + elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE: + self._handle_table_cell() + elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_header_cell() + elif this == "!" and next == "!" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_header_cell() + elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: + self._handle_cell_style() + # on newline, clear out cell line contexts + elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE): + self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE) + self._emit_text(this) + # newline or whitespace/newline + elif (self._read(-1) in ("\n", self.START) or + (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + if this == "|" and next == "-": + self._handle_table_row() + elif this == "|" and self._can_recurse(): + self._handle_table_cell() + elif this == "!" and self._can_recurse(): + self._handle_header_cell() + else: + self._emit_text(this) + else: + self._emit_text(this) + else: self._emit_text(this) self._head += 1 diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest new file mode 100644 index 0000000..399f7fd --- /dev/null +++ b/tests/tokenizer/tables.mwtest @@ -0,0 +1,32 @@ +name: empty_table +label: Parsing an empty table. 
+input: "{|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: inline_table +label: Correctly handle tables with close on the same line. +input: "{||}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: no_table_close_simple +label: Handle case when there is no table close. +input: "{| " +output: [Text(text="{| ")] + +--- + +name: leading_whitespace_table +label: Handle leading whitespace for a table. +input: "foo \n \t {|\n|}" +output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: leading_characters_table +label: Don't parse as a table when leading characters are not newline or whitespace. +input: "foo \n foo \t {|\n|}" +output: [Text(text="foo \n foo \t {|\n|}")] From b7e40d7b5aea817c23de68326627c263652cc36c Mon Sep 17 00:00:00 2001 From: David Winegar Date: Mon, 14 Jul 2014 16:03:09 -0700 Subject: [PATCH 02/44] Table cells now recurse Added another stack layer for tokenizing table cells because of styling/correctness of implementation. Added many tests cases. 
--- mwparserfromhell/parser/tokenizer.py | 68 ++++++++++++++++++++++++++---------- tests/tokenizer/tables.mwtest | 56 +++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 18 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 70e2d5d..80cb501 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1020,17 +1020,34 @@ class Tokenizer(object): return self._pop() def _handle_table_row(self): - self._head += 2 - self._emit(tokens.TagOpenOpen(wiki_markup="{-")) + self._head += 1 + self._emit(tokens.TagOpenOpen(wiki_markup="|-")) self._emit_text("tr") self._emit(tokens.TagCloseSelfclose()) - self._context &= ~contexts.TABLE_CELL_OPEN - def _handle_table_cell(self): - pass + def _handle_table_cell(self, markup, tag, line_context): + """Parse as normal syntax unless we hit a style marker, then parse as HTML attributes""" + if not self._can_recurse(): + self._emit_text(markup) + self._head += len(markup) - 1 + return - def _handle_header_cell(self): - pass + reset = self._head + self._head += len(markup) + try: + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | contexts.TABLE_CELL_STYLE_POSSIBLE | line_context) + except BadRoute: + self._head = reset + raise + else: + self._emit(tokens.TagOpenOpen(wiki_markup=markup)) + self._emit_text(tag) + self._emit(tokens.TagCloseSelfclose()) + self._emit_all(cell) + self._head -= 1 + + def _handle_table_cell_end(self): + return self._pop() def _handle_cell_style(self): pass @@ -1184,36 +1201,51 @@ class Tokenizer(object): elif this in ("\n", ":") and self._context & contexts.DL_TERM: self._handle_dl_term() - elif (this == "{" and next == "|" and (self._read(-1) in ("\n", self.START)) or - (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or + (self._read(-2) in ("\n", self.START) and 
self._read(-1).isspace())): if self._can_recurse(): self._handle_table_start() else: self._emit_text("{|") elif self._context & contexts.TABLE_OPEN: if this == "|" and next == "}": + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() return self._handle_table_end() elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE: - self._handle_table_cell() + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + self._handle_table_cell("||", "td", contexts.TABLE_CELL_LINE) elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE: - self._handle_header_cell() + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + self._handle_table_cell("||", "th", contexts.TABLE_HEADER_LINE) elif this == "!" and next == "!" and self._context & contexts.TABLE_HEADER_LINE: - self._handle_header_cell() + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + self._handle_table_cell("!!", "th", contexts.TABLE_HEADER_LINE) elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: self._handle_cell_style() # on newline, clear out cell line contexts elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE): + # TODO might not be handled due to DL_TERM code above + # TODO does this even work? 
self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE) self._emit_text(this) - # newline or whitespace/newline elif (self._read(-1) in ("\n", self.START) or - (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): if this == "|" and next == "-": + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() self._handle_table_row() - elif this == "|" and self._can_recurse(): - self._handle_table_cell() - elif this == "!" and self._can_recurse(): - self._handle_header_cell() + elif this == "|": + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + self._handle_table_cell("|", "td", contexts.TABLE_CELL_LINE) + elif this == "!": + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + self._handle_table_cell("!", "th", contexts.TABLE_HEADER_LINE) else: self._emit_text(this) else: diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 399f7fd..f818f65 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -19,6 +19,13 @@ output: [Text(text="{| ")] --- +name: no_table_close_inside_cell +label: Handle case when there is no table close while inside of a cell. +input: "{| | " +output: [Text(text="{| | ")] + +--- + name: leading_whitespace_table label: Handle leading whitespace for a table. input: "foo \n \t {|\n|}" @@ -30,3 +37,52 @@ name: leading_characters_table label: Don't parse as a table when leading characters are not newline or whitespace. input: "foo \n foo \t {|\n|}" output: [Text(text="foo \n foo \t {|\n|}")] + +--- + +name: table_row_simple +label: Simple table row. 
+input: "{|\n |- \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(), Text(text=" \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_simple +label: Simple table cell. +input: "{|\n | foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: nowiki_inside_table +label: Nowiki handles pipe characters in tables. +input: "{|\n | foo | |- {| |} || ! !! bar \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_text_outside_cell +label: Parse text inside table but outside of a cell. +input: "{|\n bar \n | foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: no_table_cell_with_leading_characters +label: Fail to create a table cell when there are leading non-whitespace characters. +input: "{|\n bar | foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar | foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: no_table_row_with_leading_characters +label: Fail to create a table row when there are leading non-whitespace characters. 
+input: "{|\n bar |- foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar |- foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: template_inside_table_cell +label: Template within table cell. +input: "{|\n |{{foo\n|bar=baz}} \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] From a13bc948fae32485087feae30b115728885a7abf Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 10:17:23 -0700 Subject: [PATCH 03/44] Started table cell attribute support Started support for parsing table style attributes. I suspect some of this is incorrect, need to add more tests to see. --- mwparserfromhell/parser/tokenizer.py | 66 +++++++++++++++++++++++++++++++----- tests/tokenizer/tables.mwtest | 35 +++++++++++++++++++ 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 80cb501..f09adc8 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1027,30 +1027,78 @@ class Tokenizer(object): def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse as HTML attributes""" + table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context if not self._can_recurse(): self._emit_text(markup) + # TODO check if this works self._head += len(markup) - 1 return reset = self._head self._head += len(markup) + style = None try: - cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | contexts.TABLE_CELL_STYLE_POSSIBLE | line_context) + (cell_context, cell) = 
self._parse(table_context | contexts.TABLE_CELL_STYLE_POSSIBLE) except BadRoute: self._head = reset raise - else: - self._emit(tokens.TagOpenOpen(wiki_markup=markup)) - self._emit_text(tag) - self._emit(tokens.TagCloseSelfclose()) - self._emit_all(cell) - self._head -= 1 + # except for handling cell style + except StopIteration: + self._head = reset + len(markup) + try: + style = self._parse_as_table_style("|") + (cell_context, cell) = self._parse(table_context) + except BadRoute: + assert False + self._head = reset + raise + self._emit(tokens.TagOpenOpen(wiki_markup=markup)) + self._emit_text(tag) + if style: + # this looks highly suspicious + if type(style[0] == tokens.Text): + style.pop(0) + self._emit_all(style) + self._emit(tokens.TagCloseSelfclose()) + self._emit_all(cell) + # keep header/cell line contexts + self._context |= cell_context & (contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_LINE) + # offset displacement done by _parse() + self._head -= 1 + + def _parse_as_table_style(self, end_token): + data = _TagOpenData() + data.context = _TagOpenData.CX_ATTR_READY + while True: + this, next = self._read(), self._read(1) + can_exit = (not data.context & (data.CX_NAME) or + data.context & data.CX_NOTE_SPACE) + if this is self.END: + if self._context & contexts.TAG_ATTR: + if data.context & data.CX_QUOTED: + # Unclosed attribute quote: reset, don't die + data.context = data.CX_ATTR_VALUE + self._pop() + self._head = data.reset + continue + self._pop() + self._fail_route() + elif this == end_token and can_exit: + if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): + self._push_tag_buffer(data) + self._head += 1 + return self._pop() + else: + self._handle_tag_data(data, this) + self._head += 1 def _handle_table_cell_end(self): - return self._pop() + """Returns the context and stack in a tuple.""" + return (self._context, self._pop()) def _handle_cell_style(self): - pass + """Pop the cell off the stack and try to parse as style""" + raise StopIteration() 
def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index f818f65..e7eb40c 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -54,6 +54,13 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text --- +name: table_cell_inline +label: Multiple inline table cells. +input: "{|\n | foo || bar || test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" @@ -86,3 +93,31 @@ name: template_inside_table_cell label: Template within table cell. input: "{|\n |{{foo\n|bar=baz}} \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes +label: Parse table cell style attributes. 
+input: "{| \n | name="foo bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_quote_with_pipe +label: Pipe inside an attribute quote should still be used as a style separator. +input: "{| \n | name="foo|bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(), Text(text="bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_name_with_pipe +label: Pipe inside an attribute name should still be used as a style separator. +input: "{| \n | name|="foo bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_pipe_after_equals +label: Pipe inside an attribute should still be used as a style separator after an equals. 
+input: "{| \n | name=|"foo|bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(), Text(text="\"foo|bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] From 0bba69d5dc32bea027a13573490263530456269d Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 10:23:44 -0700 Subject: [PATCH 04/44] Added tests/support for header cells Support for header cells was mostly in already, just needed minor changes. Added two tests as well. --- mwparserfromhell/parser/tokenizer.py | 2 +- tests/tokenizer/tables.mwtest | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index f09adc8..b899e75 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -63,7 +63,7 @@ class Tokenizer(object): START = object() END = object() MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", - ":", "/", "-", "\n", START, END] + ":", "/", "-", "!", "\n", START, END] MAX_DEPTH = 40 MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index e7eb40c..1087381 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -61,6 +61,20 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text --- +name: table_header_simple +label: Simple header cell. +input: "{|\n ! 
foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_header_inline +label: Multiple inline header cells. +input: "{|\n ! foo || bar !! test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" From 9f159ecfa2443cbacf542c174058f3cd37eeb08d Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 13:32:33 -0700 Subject: [PATCH 05/44] Add table start/row start style attribute support Started styling attributes for table row and table start. Still not entirely sure about this, definitely need to make changes regarding padding. --- mwparserfromhell/parser/tokenizer.py | 49 ++++++++++++++++++++++++++++++------ tests/tokenizer/tables.mwtest | 24 +++++++++++++++++- 2 files changed, 64 insertions(+), 9 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b899e75..c2d5240 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1007,23 +1007,53 @@ class Tokenizer(object): # TODO - fail all other contexts on start? 
self._head += 2 reset = self._head - 1 + style = None try: + self._push(contexts.TABLE_OPEN) + style = self._parse_as_table_style("\n", break_on_table_end=True) + if len(style) == 0: + self._head = reset + 1 table = self._parse(contexts.TABLE_OPEN) except BadRoute: self._head = reset self._emit_text("{|") else: - self._emit_style_tag("table", "{|", table) + self._emit(tokens.TagOpenOpen(wiki_markup="{|")) + self._emit_text("table") + if style: + self._emit_all(style) + self._emit(tokens.TagCloseOpen()) + self._emit_all(table) + self._emit(tokens.TagOpenClose()) + self._emit_text("table") + self._emit(tokens.TagCloseClose()) + # self._emit_style_tag("table", "{|", table) def _handle_table_end(self): self._head += 2 return self._pop() def _handle_table_row(self): - self._head += 1 - self._emit(tokens.TagOpenOpen(wiki_markup="|-")) - self._emit_text("tr") - self._emit(tokens.TagCloseSelfclose()) + reset = self._head + self._head += 2 + try: + self._push(contexts.TABLE_OPEN) + style = self._parse_as_table_style("\n") + if len(style) == 0: + self._head = reset + 2 + except BadRoute: + self._head = reset + raise + else: + self._emit(tokens.TagOpenOpen(wiki_markup="|-")) + self._emit_text("tr") + if style: + # this looks highly suspicious + # if type(style[0] == tokens.Text): + # style.pop(0) + self._emit_all(style) + self._emit(tokens.TagCloseSelfclose()) + self._head -= 1 def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse as HTML attributes""" @@ -1047,9 +1077,10 @@ class Tokenizer(object): self._head = reset + len(markup) try: style = self._parse_as_table_style("|") + # Don't parse the style separator + self._head += 1 (cell_context, cell) = self._parse(table_context) except BadRoute: - assert False self._head = reset raise self._emit(tokens.TagOpenOpen(wiki_markup=markup)) @@ -1066,7 +1097,7 @@ class Tokenizer(object): # offset displacement done by _parse() self._head -= 1 - def 
_parse_as_table_style(self, end_token): + def _parse_as_table_style(self, end_token, break_on_table_end=False): data = _TagOpenData() data.context = _TagOpenData.CX_ATTR_READY while True: @@ -1086,7 +1117,9 @@ class Tokenizer(object): elif this == end_token and can_exit: if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): self._push_tag_buffer(data) - self._head += 1 + # self._head += 1 + return self._pop() + elif break_on_table_end and this == "|" and next == "}": return self._pop() else: self._handle_tag_data(data, this) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 1087381..fa068fd 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -127,7 +127,7 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. input: "{| \n | name|="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] --- @@ -135,3 +135,25 @@ name: table_cell_attributes_pipe_after_equals label: Pipe inside an attribute should still be used as a style separator after an equals. 
input: "{| \n | name=|"foo|bar"| test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(), Text(text="\"foo|bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_row_attributes +label: Parse table row style attributes. +input: "{| \n |- name="foo bar"\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: table_row_attributes_crazy_whitespace +label: Parse table row style attributes with different whitespace. +input: "{| \t \n |- \t name="foo bar"\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \t \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + + +--- + +name: table_attributes +label: Parse table style attributes. 
+input: "{| name="foo bar"\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] From d356a570b32d849ba581a02b77f2aa5b8cdb8ba2 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 14:37:58 -0700 Subject: [PATCH 06/44] Added closing_wiki_markup support to Tag node Added support for allowing different wiki syntax for replacing the opening and closing tags. Added for table support. --- mwparserfromhell/nodes/tag.py | 34 +++++++++++++++++++++++++-- mwparserfromhell/parser/builder.py | 4 +++- tests/test_tag.py | 18 +++++++++++++++ tests/tokenizer/tables.mwtest | 47 ++++++++++++++++++++++---------------- 4 files changed, 80 insertions(+), 23 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 7cbe78d..0fe580f 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -35,7 +35,7 @@ class Tag(Node): def __init__(self, tag, contents=None, attrs=None, wiki_markup=None, self_closing=False, invalid=False, implicit=False, padding="", - closing_tag=None): + closing_tag=None, closing_wiki_markup=None): super(Tag, self).__init__() self._tag = tag if contents is None and not self_closing: @@ -44,6 +44,13 @@ class Tag(Node): self._contents = contents self._attrs = attrs if attrs else [] self._wiki_markup = wiki_markup + if wiki_markup and not self_closing: + if closing_wiki_markup: + self._closing_wiki_markup = closing_wiki_markup + else: + self._closing_wiki_markup = wiki_markup + else: + self._closing_wiki_markup = None self._self_closing = self_closing self._invalid = invalid self._implicit = implicit @@ -55,10 +62,11 @@ class Tag(Node): def __unicode__(self): if self.wiki_markup: + attrs = "".join([str(attr) for attr in self.attributes]) if self.attributes 
else "" if self.self_closing: return self.wiki_markup else: - return self.wiki_markup + str(self.contents) + self.wiki_markup + return self.wiki_markup + attrs + str(self.contents) + self.closing_wiki_markup result = ("``).""" return self._self_closing @@ -185,10 +206,19 @@ class Tag(Node): @wiki_markup.setter def wiki_markup(self, value): self._wiki_markup = str(value) if value else None + if not value or not self.closing_wiki_markup: + self.closing_wiki_markup = str(value) if value else None + + + @closing_wiki_markup.setter + def closing_wiki_markup(self, value): + self._closing_wiki_markup = str(value) if value and not self.self_closing else None @self_closing.setter def self_closing(self, value): self._self_closing = bool(value) + if not bool(value): + self.closing_wiki_markup = None @invalid.setter def invalid(self, value): diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 2d68036..8d1852e 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -248,6 +248,7 @@ class Builder(object): close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose) implicit, attrs, contents, closing_tag = False, [], None, None wiki_markup, invalid = token.wiki_markup, token.invalid or False + closing_wiki_markup = None self._push() while self._tokens: token = self._tokens.pop() @@ -258,6 +259,7 @@ class Builder(object): tag = self._pop() self._push() elif isinstance(token, tokens.TagOpenClose): + closing_wiki_markup = token.wiki_markup contents = self._pop() self._push() elif isinstance(token, close_tokens): @@ -270,7 +272,7 @@ class Builder(object): self_closing = False closing_tag = self._pop() return Tag(tag, contents, attrs, wiki_markup, self_closing, - invalid, implicit, padding, closing_tag) + invalid, implicit, padding, closing_tag, closing_wiki_markup) else: self._write(self._handle_token(token)) raise ParserError("_handle_tag() missed a close token") diff --git a/tests/test_tag.py 
b/tests/test_tag.py index 7577cce..950233f 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -171,6 +171,24 @@ class TestTag(TreeEqualityTestCase): self.assertFalse(node.wiki_markup) self.assertEqual("italic text", node) + def test_closing_wiki_markup(self): + """test getter/setter behavior for closing_wiki_markup attribute""" + node = Tag(wraptext("table"), wraptext("\n")) + self.assertIs(None, node.closing_wiki_markup) + node.wiki_markup = "{|" + self.assertEqual("{|", node.closing_wiki_markup) + node.closing_wiki_markup = "|}" + self.assertEqual("|}", node.closing_wiki_markup) + self.assertEqual("{|\n|}", node) + node.wiki_markup = False + self.assertFalse(node.closing_wiki_markup) + node.self_closing = True + node.wiki_markup = "{|" + self.assertIs(None, node.closing_wiki_markup) + node.wiki_markup = False + node.self_closing = False + self.assertEqual("\n
", node) + def test_self_closing(self): """test getter/setter for the self_closing attribute""" node = Tag(wraptext("ref"), wraptext("foobar")) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index fa068fd..bfdd83f 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -1,14 +1,14 @@ name: empty_table label: Parsing an empty table. input: "{|\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_table label: Correctly handle tables with close on the same line. input: "{||}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -29,7 +29,7 @@ output: [Text(text="{| | ")] name: leading_whitespace_table label: Handle leading whitespace for a table. input: "foo \n \t {|\n|}" -output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -43,112 +43,119 @@ output: [Text(text="foo \n foo \t {|\n|}")] name: table_row_simple label: Simple table row. 
input: "{|\n |- \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(), Text(text=" \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_simple label: Simple table cell. input: "{|\n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_inline label: Multiple inline table cells. 
input: "{|\n | foo || bar || test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_simple label: Simple header cell. input: "{|\n ! foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_inline label: Multiple inline header cells. input: "{|\n ! foo || bar !! 
test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_text_outside_cell label: Parse text inside table but outside of a cell. 
input: "{|\n bar \n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_cell_with_leading_characters label: Fail to create a table cell when there are leading non-whitespace characters. input: "{|\n bar | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar | foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_row_with_leading_characters label: Fail to create a table row when there are leading non-whitespace characters. input: "{|\n bar |- foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar |- foo \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: template_inside_table_cell label: Template within table cell. 
input: "{|\n |{{foo\n|bar=baz}} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes label: Parse table cell style attributes. input: "{| \n | name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. 
input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(), Text(text="bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. input: "{| \n | name|="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_pipe_after_equals label: Pipe inside an attribute should still be used as a style separator after an equals. 
input: "{| \n | name=|"foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(), Text(text="\"foo|bar\"| test \n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_templates +label: Pipe inside attributes shouldn't be style separator. +input: "{| \n | {{comment|template=baz}} | test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes label: Parse table row style attributes. 
input: "{| \n |- name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes_crazy_whitespace label: Parse table row style attributes with different whitespace. input: "{| \t \n |- \t name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \t \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \t \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -156,4 +163,4 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text name: table_attributes label: Parse table style attributes. 
input: "{| name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] From 9e4bb0c7e5b0289bc110cb41619b883b57f55954 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 15:45:53 -0700 Subject: [PATCH 07/44] Clean up and style changes Added comments, tried to keep to 80 character lines. --- mwparserfromhell/parser/contexts.py | 24 ++++++++++--- mwparserfromhell/parser/tokenizer.py | 67 +++++++++++++++++++----------------- 2 files changed, 55 insertions(+), 36 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 678a392..564ceca 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -90,6 +90,15 @@ Local (stack-specific) contexts: * :const:`FAIL_ON_RBRACE` * :const:`FAIL_ON_EQUALS` +* :const:`TABLE` + + * :const:`TABLE_OPEN` + * :const:`TABLE_CELL_OPEN` + * :const:`TABLE_CELL_STYLE_POSSIBLE` + * :const:`TABLE_TD_LINE` + * :const:`TABLE_TH_LINE` + * :const:`TABLE_CELL_LINE_CONTEXTS` + Global contexts: * :const:`GL_HEADING` @@ -156,10 +165,14 @@ SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) TABLE_OPEN = 1 << 30 -TABLE_CELL_LINE = 1 << 31 -TABLE_HEADER_LINE = 1 << 32 -TABLE_CELL_OPEN = 1 << 33 -TABLE_CELL_STYLE_POSSIBLE = 1 << 34 +TABLE_CELL_OPEN = 1 << 31 +TABLE_CELL_STYLE_POSSIBLE = 1 << 32 +TABLE_TD_LINE = 1 << 33 +TABLE_TH_LINE = 1 << 34 
+TABLE_CELL_LINE_CONTEXTS = (TABLE_TD_LINE + TABLE_TH_LINE + + TABLE_CELL_STYLE_POSSIBLE) +TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE_POSSIBLE + + TABLE_TD_LINE + TABLE_TH_LINE) # Global contexts: @@ -167,7 +180,8 @@ GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE_OPEN +FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + + STYLE + TABLE_OPEN) UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index c2d5240..4a9c0f5 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1004,18 +1004,18 @@ class Tokenizer(object): def _handle_table_start(self): """Handle the start of a table.""" - # TODO - fail all other contexts on start? self._head += 2 - reset = self._head - 1 + reset = self._head style = None try: self._push(contexts.TABLE_OPEN) style = self._parse_as_table_style("\n", break_on_table_end=True) if len(style) == 0: - self._head = reset + 1 + self._head = reset table = self._parse(contexts.TABLE_OPEN) except BadRoute: - self._head = reset + # offset displacement done by _parse() + self._head = reset - 1 self._emit_text("{|") else: self._emit(tokens.TagOpenOpen(wiki_markup="{|")) @@ -1024,16 +1024,22 @@ class Tokenizer(object): self._emit_all(style) self._emit(tokens.TagCloseOpen()) self._emit_all(table) - self._emit(tokens.TagOpenClose()) + self._emit(tokens.TagOpenClose(wiki_markup="|}")) self._emit_text("table") self._emit(tokens.TagCloseClose()) - # self._emit_style_tag("table", "{|", table) def _handle_table_end(self): + """Return the stack in order to handle the table end.""" self._head += 2 return self._pop() def _handle_table_row(self): + """Parse as style until end of the line, then continue.""" + if not 
self._can_recurse(): + self._emit_text("|-") + self._head += 2 + return + reset = self._head self._head += 2 try: @@ -1048,22 +1054,20 @@ class Tokenizer(object): self._emit(tokens.TagOpenOpen(wiki_markup="|-")) self._emit_text("tr") if style: - # this looks highly suspicious - # if type(style[0] == tokens.Text): - # style.pop(0) self._emit_all(style) self._emit(tokens.TagCloseSelfclose()) + # offset displacement done by _parse() self._head -= 1 def _handle_table_cell(self, markup, tag, line_context): - """Parse as normal syntax unless we hit a style marker, then parse as HTML attributes""" - table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context + """Parse as normal syntax unless we hit a style marker, then parse style + as HTML attributes and the remainder as normal syntax.""" if not self._can_recurse(): self._emit_text(markup) - # TODO check if this works self._head += len(markup) - 1 return + table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context reset = self._head self._head += len(markup) style = None @@ -1074,8 +1078,10 @@ class Tokenizer(object): raise # except for handling cell style except StopIteration: + self._pop() self._head = reset + len(markup) try: + self._push(table_context) style = self._parse_as_table_style("|") # Don't parse the style separator self._head += 1 @@ -1083,21 +1089,20 @@ class Tokenizer(object): except BadRoute: self._head = reset raise + self._emit(tokens.TagOpenOpen(wiki_markup=markup)) self._emit_text(tag) if style: - # this looks highly suspicious - if type(style[0] == tokens.Text): - style.pop(0) self._emit_all(style) self._emit(tokens.TagCloseSelfclose()) self._emit_all(cell) # keep header/cell line contexts - self._context |= cell_context & (contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_LINE) + self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) # offset displacement done by _parse() self._head -= 1 def _parse_as_table_style(self, end_token, 
break_on_table_end=False): + """Parse until ``end_token`` as style attributes for a table.""" data = _TagOpenData() data.context = _TagOpenData.CX_ATTR_READY while True: @@ -1117,7 +1122,6 @@ class Tokenizer(object): elif this == end_token and can_exit: if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): self._push_tag_buffer(data) - # self._head += 1 return self._pop() elif break_on_table_end and this == "|" and next == "}": return self._pop() @@ -1130,7 +1134,7 @@ class Tokenizer(object): return (self._context, self._pop()) def _handle_cell_style(self): - """Pop the cell off the stack and try to parse as style""" + """Pop the cell off the stack and try to parse as style.""" raise StopIteration() def _verify_safe(self, this): @@ -1281,7 +1285,10 @@ class Tokenizer(object): self._handle_hr() elif this in ("\n", ":") and self._context & contexts.DL_TERM: self._handle_dl_term() - + if this == "\n": + # kill potential table contexts + self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS + # Start of table parsing elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): if self._can_recurse(): @@ -1293,25 +1300,23 @@ class Tokenizer(object): if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() return self._handle_table_end() - elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE: + elif this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() - self._handle_table_cell("||", "td", contexts.TABLE_CELL_LINE) - elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE) + elif this == "|" and next == "|" and self._context & contexts.TABLE_TH_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() - 
self._handle_table_cell("||", "th", contexts.TABLE_HEADER_LINE) - elif this == "!" and next == "!" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE) + elif this == "!" and next == "!" and self._context & contexts.TABLE_TH_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() - self._handle_table_cell("!!", "th", contexts.TABLE_HEADER_LINE) + self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE) elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: self._handle_cell_style() # on newline, clear out cell line contexts - elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE): - # TODO might not be handled due to DL_TERM code above - # TODO does this even work? - self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE) + elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS: + self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS self._emit_text(this) elif (self._read(-1) in ("\n", self.START) or (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): @@ -1322,11 +1327,11 @@ class Tokenizer(object): elif this == "|": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() - self._handle_table_cell("|", "td", contexts.TABLE_CELL_LINE) + self._handle_table_cell("|", "td", contexts.TABLE_TD_LINE) elif this == "!": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() - self._handle_table_cell("!", "th", contexts.TABLE_HEADER_LINE) + self._handle_table_cell("!", "th", contexts.TABLE_TH_LINE) else: self._emit_text(this) else: From ec080018716f66efdb09332ad6de8bf7b8096e99 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 15 Jul 2014 18:19:48 -0700 Subject: [PATCH 08/44] Tables and rows now use newline as padding Tables and rows use newlines 
as padding, partly because these characters are pretty important to the integrity of the table. They might need to be in the preceding whitespace of inner tags instead as padding after, not sure. --- mwparserfromhell/nodes/tag.py | 39 +++++++++++----------- mwparserfromhell/parser/builder.py | 1 + mwparserfromhell/parser/tokenizer.py | 32 ++++++++++-------- tests/test_tag.py | 5 --- tests/tokenizer/tables.mwtest | 65 ++++++++++++++++++++++++------------ 5 files changed, 81 insertions(+), 61 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 0fe580f..b3ea85c 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -44,11 +44,10 @@ class Tag(Node): self._contents = contents self._attrs = attrs if attrs else [] self._wiki_markup = wiki_markup - if wiki_markup and not self_closing: - if closing_wiki_markup: - self._closing_wiki_markup = closing_wiki_markup - else: - self._closing_wiki_markup = wiki_markup + if closing_wiki_markup: + self._closing_wiki_markup = closing_wiki_markup + elif wiki_markup and not self_closing: + self._closing_wiki_markup = wiki_markup else: self._closing_wiki_markup = None self._self_closing = self_closing @@ -63,10 +62,12 @@ class Tag(Node): def __unicode__(self): if self.wiki_markup: attrs = "".join([str(attr) for attr in self.attributes]) if self.attributes else "" + close = self.closing_wiki_markup if self.closing_wiki_markup else "" + padding = self.padding if self.padding else "" if self.self_closing: - return self.wiki_markup + return self.wiki_markup + attrs + close + padding else: - return self.wiki_markup + attrs + str(self.contents) + self.closing_wiki_markup + return self.wiki_markup + attrs + padding + str(self.contents) + close result = ("\n", node) def test_self_closing(self): diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index bfdd83f..7cf826c 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -1,14 
+1,14 @@ name: empty_table label: Parsing an empty table. input: "{|\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_table label: Correctly handle tables with close on the same line. input: "{||}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=""), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -29,7 +29,7 @@ output: [Text(text="{| | ")] name: leading_whitespace_table label: Handle leading whitespace for a table. input: "foo \n \t {|\n|}" -output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -43,119 +43,133 @@ output: [Text(text="foo \n foo \t {|\n|}")] name: table_row_simple label: Simple table row. 
input: "{|\n |- \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_simple label: Simple table cell. input: "{|\n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_inline label: Multiple inline table cells. 
input: "{|\n | foo || bar || test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_simple label: Simple header cell. input: "{|\n ! foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_inline label: Multiple inline header cells. input: "{|\n ! foo || bar !! 
test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_text_outside_cell label: Parse text inside table but outside of a cell. 
input: "{|\n bar \n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_cell_with_leading_characters label: Fail to create a table cell when there are leading non-whitespace characters. input: "{|\n bar | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_row_with_leading_characters label: Fail to create a table row when there are leading non-whitespace characters. input: "{|\n bar |- foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: template_inside_table_cell label: Template within table cell. 
input: "{|\n |{{foo\n|bar=baz}} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes label: Parse table cell style attributes. input: "{| \n | name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. 
input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|"), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. input: "{| \n | name|="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|"), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_pipe_after_equals label: Pipe inside an attribute should still be used as a style separator after an equals. 
input: "{| \n | name=|"foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(wiki_markup="|"), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_templates label: Pipe inside attributes shouldn't be style separator. input: "{| \n | {{comment|template=baz}} | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: header_cell_attributes +label: Parse header cell style 
attributes. +input: "{| \n ! name="foo bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: inline_cell_attributes +label: Parse cell style attributes of inline cells. +input: "{| \n ! name="foo bar" | test ||color="red"| markup!!foo | time \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseSelfclose(wiki_markup="|"), Text(text=" markup"), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|"), Text(text=" time \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes label: Parse table row style attributes. 
input: "{| \n |- name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes_crazy_whitespace label: Parse table row style attributes with different whitespace. -input: "{| \t \n |- \t name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text=" \t \n "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +input: "{| \t \n |- \t name="foo bar" \t \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -163,4 +177,11 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text name: table_attributes label: Parse 
table style attributes. input: "{| name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: inline_table_attributes +label: Correctly handle attributes in inline tables. +input: "{| foo="tee bar" |}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="tee bar"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] From f1664a8d67d7544d6524bd8de3ab3e554247bc2e Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 10:00:58 -0700 Subject: [PATCH 09/44] Updated row and table handling Changed row recursion handling to make sure the tag is emitted even when hitting recursion limits. Need to test table recursion to make sure that works. Also fixed a bug in which tables were eating the trailing token. Added several tests for rows and trailing tokens with tables. 
--- mwparserfromhell/parser/tokenizer.py | 33 ++++++++++++++++----------------- tests/tokenizer/tables.mwtest | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 0829e7d..787ea0a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1027,6 +1027,8 @@ class Tokenizer(object): self._emit(tokens.TagOpenClose(wiki_markup="|}")) self._emit_text("table") self._emit(tokens.TagCloseClose()) + # offset displacement done by _parse() + self._head -= 1 def _handle_table_end(self): """Return the stack in order to handle the table end.""" @@ -1035,25 +1037,22 @@ class Tokenizer(object): def _handle_table_row(self): """Parse as style until end of the line, then continue.""" - if not self._can_recurse(): - self._emit_text("|-") - self._head += 2 - return - reset = self._head self._head += 2 - try: - self._push(contexts.TABLE_OPEN) - (style, padding) = self._parse_as_table_style("\n") - except BadRoute: - self._head = reset - raise - else: - self._emit(tokens.TagOpenOpen(wiki_markup="|-")) - self._emit_text("tr") - if style: - self._emit_all(style) - self._emit(tokens.TagCloseSelfclose(padding=padding)) + style, padding = None, "" + # If we can't recurse, still tokenize tag but parse style attrs as text + if self._can_recurse(): + try: + self._push(contexts.TABLE_OPEN) + (style, padding) = self._parse_as_table_style("\n") + except BadRoute: + self._head = reset + raise + self._emit(tokens.TagOpenOpen(wiki_markup="|-")) + self._emit_text("tr") + if style: + self._emit_all(style) + self._emit(tokens.TagCloseSelfclose(padding=padding)) def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse style diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 7cf826c..2770227 100644 --- a/tests/tokenizer/tables.mwtest +++ 
b/tests/tokenizer/tables.mwtest @@ -26,6 +26,13 @@ output: [Text(text="{| | ")] --- +name: no_table_close_inside_row +label: Handle case when there is no table close while inside of a row. +input: "{| |- " +output: [Text(text="{| |- ")] + +--- + name: leading_whitespace_table label: Handle leading whitespace for a table. input: "foo \n \t {|\n|}" @@ -33,6 +40,27 @@ output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="t --- +name: whitespace_after_table +label: Handle whitespace after a table close. +input: "{|\n|}\n \t " +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text="\n \t ")] + +--- + +name: different_whitespace_after_table +label: Handle spaces after a table close. +input: "{|\n|} \n " +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n ")] + +--- + +name: characters_after_table +label: Handle characters after a table close. +input: "{|\n|} tsta" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" tsta")] + +--- + name: leading_characters_table label: Don't parse as a table when leading characters are not newline or whitespace. input: "foo \n foo \t {|\n|}" @@ -47,6 +75,13 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- +name: table_row_multiple +label: Simple table row. 
+input: "{|\n |- \n|- \n |-\n |}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding="\n"), Text(text=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + name: table_cell_simple label: Simple table cell. input: "{|\n | foo \n|}" @@ -171,7 +206,6 @@ label: Parse table row style attributes with different whitespace. input: "{| \t \n |- \t name="foo bar" \t \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] - --- name: table_attributes From 842af20c38c65188061811959eac8b6e263fd1f2 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 12:23:38 -0700 Subject: [PATCH 10/44] fixed hacky table cell style exception, added tests Removed the `StopIteration()` exception for handling table style and instead call `_handle_table_cell_end()` with a new parameter. Also added some random tests for table openings. 
--- mwparserfromhell/parser/tokenizer.py | 22 ++++++++-------------- tests/tokenizer/tables.mwtest | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 787ea0a..0de2831 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1067,24 +1067,21 @@ class Tokenizer(object): self._head += len(markup) style = None try: - (cell_context, cell) = self._parse(table_context | contexts.TABLE_CELL_STYLE_POSSIBLE) + cell_context, cell, reset_for_style = self._parse(table_context | contexts.TABLE_CELL_STYLE_POSSIBLE) except BadRoute: self._head = reset raise - # except for handling cell style - except StopIteration: - self._pop() + if reset_for_style: self._head = reset + len(markup) try: self._push(table_context) (style, padding) = self._parse_as_table_style("|") # Don't parse the style separator self._head += 1 - (cell_context, cell) = self._parse(table_context) + cell_context, cell, reset_for_style = self._parse(table_context) except BadRoute: self._head = reset raise - self._emit(tokens.TagOpenOpen(wiki_markup=markup)) self._emit_text(tag) if style: @@ -1132,13 +1129,10 @@ class Tokenizer(object): self._handle_tag_data(data, this) self._head += 1 - def _handle_table_cell_end(self): - """Returns the context and stack in a tuple.""" - return (self._context, self._pop()) - - def _handle_cell_style(self): - """Pop the cell off the stack and try to parse as style.""" - raise StopIteration() + def _handle_table_cell_end(self, reset_for_style=False): + """Returns the context, stack, and whether to reset the cell for style + in a tuple.""" + return self._context, self._pop(), reset_for_style def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" @@ -1316,7 +1310,7 @@ class Tokenizer(object): return self._handle_table_cell_end() self._handle_table_cell("!!", "th", 
contexts.TABLE_TH_LINE) elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: - self._handle_cell_style() + return self._handle_table_cell_end(reset_for_style=True) # on newline, clear out cell line contexts elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS: self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 2770227..184e695 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -33,6 +33,34 @@ output: [Text(text="{| |- ")] --- +name: no_table_close_attributes +label: Don't parse attributes as attributes if the table doesn't exist. +input: "{| border="1"" +output: [Text(text="{| border=\"1\"")] + +--- + +name: no_table_close_row_attributes +label: Don't parse row attributes as attributes if the table doesn't exist. +input: "{| |- border="1"" +output: [Text(text="{| |- border=\"1\"")] + +--- + +name: no_table_close_cell +label: Don't parse cells if the table doesn't close. +input: "{| | border="1"| test || red | foo" +output: [Text(text="{| | border=\"1\"| test || red | foo")] + +--- + +name: crazy_no_table_close +label: Lots of opened wiki syntax without closes. +input: "{{{ {{ {| Date: Wed, 16 Jul 2014 12:28:40 -0700 Subject: [PATCH 11/44] Reorder table tokenizer methods for forward declaration Make sure py tokenizer methods only call methods that have been declared earlier. Not necessary but makes it much easier to maintain/write the C tokenizer if methods are in the same order. 
--- mwparserfromhell/parser/tokenizer.py | 68 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 0de2831..db4a8cf 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1002,6 +1002,40 @@ class Tokenizer(object): self._fail_route() return self._pop() + def _parse_as_table_style(self, end_token, break_on_table_end=False): + """Parse until ``end_token`` as style attributes for a table.""" + data = _TagOpenData() + data.context = _TagOpenData.CX_ATTR_READY + while True: + this, next = self._read(), self._read(1) + can_exit = (not data.context & (data.CX_NAME) or + data.context & data.CX_NOTE_SPACE) + if this is self.END: + if self._context & contexts.TAG_ATTR: + if data.context & data.CX_QUOTED: + # Unclosed attribute quote: reset, don't die + data.context = data.CX_ATTR_VALUE + self._pop() + self._head = data.reset + continue + self._pop() + self._fail_route() + elif this == end_token and can_exit: + if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): + self._push_tag_buffer(data) + if this.isspace(): + data.padding_buffer["first"] += this + return (self._pop(), data.padding_buffer["first"]) + elif break_on_table_end and this == "|" and next == "}": + if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): + self._push_tag_buffer(data) + if this.isspace(): + data.padding_buffer["first"] += this + return (self._pop(), data.padding_buffer["first"]) + else: + self._handle_tag_data(data, this) + self._head += 1 + def _handle_table_start(self): """Handle the start of a table.""" self._head += 2 @@ -1095,40 +1129,6 @@ class Tokenizer(object): # offset displacement done by _parse() self._head -= 1 - def _parse_as_table_style(self, end_token, break_on_table_end=False): - """Parse until ``end_token`` as style attributes for a table.""" - data = _TagOpenData() - data.context = 
_TagOpenData.CX_ATTR_READY - while True: - this, next = self._read(), self._read(1) - can_exit = (not data.context & (data.CX_NAME) or - data.context & data.CX_NOTE_SPACE) - if this is self.END: - if self._context & contexts.TAG_ATTR: - if data.context & data.CX_QUOTED: - # Unclosed attribute quote: reset, don't die - data.context = data.CX_ATTR_VALUE - self._pop() - self._head = data.reset - continue - self._pop() - self._fail_route() - elif this == end_token and can_exit: - if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): - self._push_tag_buffer(data) - if this.isspace(): - data.padding_buffer["first"] += this - return (self._pop(), data.padding_buffer["first"]) - elif break_on_table_end and this == "|" and next == "}": - if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): - self._push_tag_buffer(data) - if this.isspace(): - data.padding_buffer["first"] += this - return (self._pop(), data.padding_buffer["first"]) - else: - self._handle_tag_data(data, this) - self._head += 1 - def _handle_table_cell_end(self, reset_for_style=False): """Returns the context, stack, and whether to reset the cell for style in a tuple.""" From 457b2240457a7ed256c7bdf290d9672a4575f435 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 13:07:11 -0700 Subject: [PATCH 12/44] Add padding to table cell tags Padding now included on all wiki table cells. With wiki table cells that include attributes, `wiki_markup` is also included (unchanged). 
--- mwparserfromhell/parser/tokenizer.py | 12 +++++----- tests/tokenizer/tables.mwtest | 44 ++++++++++++++++++++++++------------ 2 files changed, 35 insertions(+), 21 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index db4a8cf..c404ebb 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1002,7 +1002,7 @@ class Tokenizer(object): self._fail_route() return self._pop() - def _parse_as_table_style(self, end_token, break_on_table_end=False): + def _parse_as_table_style(self, end_token, break_on_table_end=False): """Parse until ``end_token`` as style attributes for a table.""" data = _TagOpenData() data.context = _TagOpenData.CX_ATTR_READY @@ -1099,7 +1099,7 @@ class Tokenizer(object): table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context reset = self._head self._head += len(markup) - style = None + reset_for_style, padding = False, "" try: cell_context, cell, reset_for_style = self._parse(table_context | contexts.TABLE_CELL_STYLE_POSSIBLE) except BadRoute: @@ -1112,17 +1112,17 @@ class Tokenizer(object): (style, padding) = self._parse_as_table_style("|") # Don't parse the style separator self._head += 1 - cell_context, cell, reset_for_style = self._parse(table_context) + cell_context, cell, unused = self._parse(table_context) except BadRoute: self._head = reset raise self._emit(tokens.TagOpenOpen(wiki_markup=markup)) self._emit_text(tag) - if style: + if reset_for_style: self._emit_all(style) - self._emit(tokens.TagCloseSelfclose(wiki_markup="|")) + self._emit(tokens.TagCloseSelfclose(wiki_markup="|", padding=padding)) else: - self._emit(tokens.TagCloseSelfclose()) + self._emit(tokens.TagCloseSelfclose(padding=padding)) self._emit_all(cell) # keep header/cell line contexts self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 
184e695..3f3a68d 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -113,42 +113,42 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: table_cell_simple label: Simple table cell. input: "{|\n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_inline label: Multiple inline table cells. input: "{|\n | foo || bar || test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_simple label: Simple header cell. input: "{|\n ! 
foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_inline label: Multiple inline header cells. input: "{|\n ! foo || bar !! test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! 
!!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_text_outside_cell label: Parse text inside table but outside of a cell. input: "{|\n bar \n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -169,56 +169,70 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: template_inside_table_cell label: Template within table cell. 
input: "{|\n |{{foo\n|bar=baz}} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes label: Parse table cell style attributes. input: "{| \n | name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_empty_attributes +label: Parse table cell with style markers but no attributes. 
+input: "{| \n | | test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_with_dash +label: Parse a situation in which a cell line looks like a row line. +input: "{|\n ||- \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|"), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. 
-input: "{| \n | name|="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text="" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|"), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +input: "{| \n | name|="foo bar" | test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text="" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_pipe_after_equals label: Pipe inside an attribute should still be used as a style separator after an equals. input: "{| \n | name=|"foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(wiki_markup="|"), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_templates label: Pipe inside attributes shouldn't be style separator. 
input: "{| \n | {{comment|template=baz}} | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: header_cell_attributes label: Parse header cell style attributes. input: "{| \n ! 
name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_cell_attributes label: Parse cell style attributes of inline cells. input: "{| \n ! name="foo bar" | test ||color="red"| markup!!foo | time \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|"), Text(text=" test "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseSelfclose(wiki_markup="|"), Text(text=" markup"), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|"), Text(text=" time \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), 
TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- From 8b5d6f9a3b8892ee9b05e0cf0025475e14f814e0 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 14:31:40 -0700 Subject: [PATCH 13/44] Changes to table close handling Fix problem in which fake table closes were causing a problem inside cells. Changed inline table handling to fix this. 
--- mwparserfromhell/parser/tokenizer.py | 29 ++++++++++++++++------------- tests/tokenizer/tables.mwtest | 28 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index c404ebb..b70e932 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1029,8 +1029,6 @@ class Tokenizer(object): elif break_on_table_end and this == "|" and next == "}": if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): self._push_tag_buffer(data) - if this.isspace(): - data.padding_buffer["first"] += this return (self._pop(), data.padding_buffer["first"]) else: self._handle_tag_data(data, this) @@ -1040,13 +1038,17 @@ class Tokenizer(object): """Handle the start of a table.""" self._head += 2 reset = self._head - style = None + style, table = None, None try: self._push(contexts.TABLE_OPEN) (style, padding) = self._parse_as_table_style("\n", break_on_table_end=True) - # Have to do this in the case of inline tables - self._head += 1 if "\n" in padding else 0 - table = self._parse(contexts.TABLE_OPEN) + # continue to parse if it is NOT an inline table + if "\n" in padding: + self._head += 1 + table = self._parse(contexts.TABLE_OPEN) + else: + # close tag + self._head += 2 except BadRoute: # offset displacement done by _parse() self._head = reset - 1 @@ -1057,7 +1059,8 @@ class Tokenizer(object): if style: self._emit_all(style) self._emit(tokens.TagCloseOpen(padding=padding)) - self._emit_all(table) + if table: + self._emit_all(table) self._emit(tokens.TagOpenClose(wiki_markup="|}")) self._emit_text("table") self._emit(tokens.TagCloseClose()) @@ -1293,11 +1296,7 @@ class Tokenizer(object): else: self._emit_text("{|") elif self._context & contexts.TABLE_OPEN: - if this == "|" and next == "}": - if self._context & contexts.TABLE_CELL_OPEN: - return self._handle_table_cell_end() - return self._handle_table_end() - elif this == "|" and 
next == "|" and self._context & contexts.TABLE_TD_LINE: + if this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE) @@ -1317,7 +1316,11 @@ class Tokenizer(object): self._emit_text(this) elif (self._read(-1) in ("\n", self.START) or (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): - if this == "|" and next == "-": + if this == "|" and next == "}": + if self._context & contexts.TABLE_CELL_OPEN: + return self._handle_table_cell_end() + return self._handle_table_end() + elif this == "|" and next == "-": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() self._handle_table_row() diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 3f3a68d..e63bd11 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -89,6 +89,13 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- +name: characters_after_inline_table +label: Handle characters after an inline table close. +input: "{| |} tsta" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" tsta")] + +--- + name: leading_characters_table label: Don't parse as a table when leading characters are not newline or whitespace. input: "foo \n foo \t {|\n|}" @@ -124,6 +131,27 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- +name: table_cell_fake_close +label: Looks like a table close but is not. 
+input: "{|\n | |} \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text="} \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_more_fake_close +label: Looks like a table close but is not. +input: "{|\n || |} \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" |} \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_extra_close +label: Process second close as text. +input: "{| \n |} \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n|}")] + +--- + name: table_header_simple label: Simple header cell. input: "{|\n ! foo \n|}" From 151a73e4371c26dea5b20169a3acd26ca3f7f711 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 15:03:26 -0700 Subject: [PATCH 14/44] Fix issue with incorrect table attributes Fix problem in which invalid table attributes were being parsed incorrectly. Added tests. 
--- mwparserfromhell/parser/tokenizer.py | 21 +++++++++------------ tests/tokenizer/tables.mwtest | 35 ++++++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b70e932..7bfd11a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1008,9 +1008,16 @@ class Tokenizer(object): data.context = _TagOpenData.CX_ATTR_READY while True: this, next = self._read(), self._read(1) - can_exit = (not data.context & (data.CX_NAME) or + table_end = break_on_table_end and this == "|" and next == "}" + can_exit = (not data.context & data.CX_QUOTED or data.context & data.CX_NOTE_SPACE) - if this is self.END: + if (this == end_token and can_exit) or table_end: + if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): + self._push_tag_buffer(data) + if this.isspace(): + data.padding_buffer["first"] += this + return (self._pop(), data.padding_buffer["first"]) + elif this is self.END or table_end or this == end_token: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die @@ -1020,16 +1027,6 @@ class Tokenizer(object): continue self._pop() self._fail_route() - elif this == end_token and can_exit: - if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): - self._push_tag_buffer(data) - if this.isspace(): - data.padding_buffer["first"] += this - return (self._pop(), data.padding_buffer["first"]) - elif break_on_table_end and this == "|" and next == "}": - if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): - self._push_tag_buffer(data) - return (self._pop(), data.padding_buffer["first"]) else: self._handle_tag_data(data, this) self._head += 1 diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index e63bd11..163579b 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -225,14 +225,14 @@ output: 
[TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. 
input: "{| \n | name|="foo bar" | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text="" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text="=\"foo bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -274,7 +274,7 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: table_row_attributes_crazy_whitespace label: Parse table row style attributes with different whitespace. 
input: "{| \t \n |- \t name="foo bar" \t \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -289,3 +289,32 @@ name: inline_table_attributes label: Correctly handle attributes in inline tables. input: "{| foo="tee bar" |}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="tee bar"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_incorrect_attributes +label: Parse incorrect table style attributes. +input: "{| name="foo\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_unclosed_style +label: Parse unclosed and closed bold and italics inside cells. 
+input: "{|\n | ''foo || '''bar ||''baz''||'''test'''\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" ''foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" '''bar "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose() Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + + +--- + +name: recursion_five_hundred_opens +label: test potentially dangerous recursion: five hundred table openings, without spaces +input: "{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|" +output: 
[Text(text="{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|")] + +--- + +name: recursion_one_hundred_opens +label: test potentially dangerous recursion: one hundred table openings, with spaces +input: "{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|" +output: [Text(text="{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|")] \ No newline at end of file From e6ec5dc4de743f62889c65272448bdb1041fea29 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Wed, 16 Jul 2014 18:11:12 -0700 Subject: [PATCH 15/44] Refactor methods to avoid returning tuples Various changes to avoid returning tuples - working on the C tokenizer made me realize this was a bad idea for compatability/similarity between the two. 
--- mwparserfromhell/parser/contexts.py | 17 ++++++++--------- mwparserfromhell/parser/tokenizer.py | 30 +++++++++++++++++++----------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 564ceca..3827708 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -164,15 +164,14 @@ FAIL_ON_EQUALS = 1 << 29 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) -TABLE_OPEN = 1 << 30 -TABLE_CELL_OPEN = 1 << 31 -TABLE_CELL_STYLE_POSSIBLE = 1 << 32 -TABLE_TD_LINE = 1 << 33 -TABLE_TH_LINE = 1 << 34 -TABLE_CELL_LINE_CONTEXTS = (TABLE_TD_LINE + TABLE_TH_LINE + - TABLE_CELL_STYLE_POSSIBLE) -TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE_POSSIBLE + - TABLE_TD_LINE + TABLE_TH_LINE) +TABLE_OPEN = 1 << 30 +TABLE_CELL_OPEN = 1 << 31 +TABLE_CELL_STYLE = 1 << 32 +TABLE_TD_LINE = 1 << 33 +TABLE_TH_LINE = 1 << 34 +TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE +TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_TD_LINE + + TABLE_TH_LINE) # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 7bfd11a..7fda2d5 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1016,7 +1016,7 @@ class Tokenizer(object): self._push_tag_buffer(data) if this.isspace(): data.padding_buffer["first"] += this - return (self._pop(), data.padding_buffer["first"]) + return data.padding_buffer["first"] elif this is self.END or table_end or this == end_token: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: @@ -1038,7 +1038,8 @@ class Tokenizer(object): style, table = None, None try: self._push(contexts.TABLE_OPEN) - (style, padding) = self._parse_as_table_style("\n", break_on_table_end=True) + padding = self._parse_as_table_style("\n", break_on_table_end=True) + 
style = self._pop() # continue to parse if it is NOT an inline table if "\n" in padding: self._head += 1 @@ -1078,7 +1079,8 @@ class Tokenizer(object): if self._can_recurse(): try: self._push(contexts.TABLE_OPEN) - (style, padding) = self._parse_as_table_style("\n") + padding = self._parse_as_table_style("\n") + style = self._pop() except BadRoute: self._head = reset raise @@ -1099,9 +1101,11 @@ class Tokenizer(object): table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context reset = self._head self._head += len(markup) - rest_for_style, padding = False, "" + reset_for_style, padding = False, "" try: - cell_context, cell, reset_for_style = self._parse(table_context | contexts.TABLE_CELL_STYLE_POSSIBLE) + cell_context = self._parse(table_context | contexts.TABLE_CELL_STYLE) + cell = self._pop() + reset_for_style = cell_context & contexts.TABLE_CELL_STYLE except BadRoute: self._head = reset raise @@ -1109,10 +1113,12 @@ class Tokenizer(object): self._head = reset + len(markup) try: self._push(table_context) - (style, padding) = self._parse_as_table_style("|") + padding = self._parse_as_table_style("|") + style = self._pop() # Don't parse the style separator self._head += 1 - cell_context, cell, unused = self._parse(table_context) + cell_context = self._parse(table_context) + cell = self._pop() except BadRoute: self._head = reset raise @@ -1130,9 +1136,11 @@ class Tokenizer(object): self._head -= 1 def _handle_table_cell_end(self, reset_for_style=False): - """Returns the context, stack, and whether to reset the cell for style - in a tuple.""" - return self._context, self._pop(), reset_for_style + """Returns the current context, with the TABLE_CELL_STYLE flag set if + it is necessary to reset and parse style attributes.""" + if reset_for_style: + return self._context | contexts.TABLE_CELL_STYLE + return self._context & ~contexts.TABLE_CELL_STYLE def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" @@ 
-1305,7 +1313,7 @@ class Tokenizer(object): if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE) - elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: + elif this == "|" and self._context & contexts.TABLE_CELL_STYLE: return self._handle_table_cell_end(reset_for_style=True) # on newline, clear out cell line contexts elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS: From 406dd3a157e72d3f37e80661cebc65cc544a321f Mon Sep 17 00:00:00 2001 From: David Winegar Date: Thu, 17 Jul 2014 16:07:43 -0700 Subject: [PATCH 16/44] All tokenizer end methods return a stack For C compatability, switch table cell end to return the stack. Now context is kept by using `keep_context` when calling `self._pop()`. --- mwparserfromhell/parser/contexts.py | 4 ++-- mwparserfromhell/parser/tokenizer.py | 20 ++++++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 3827708..6dd5319 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -94,7 +94,7 @@ Local (stack-specific) contexts: * :const:`TABLE_OPEN` * :const:`TABLE_CELL_OPEN` - * :const:`TABLE_CELL_STYLE_POSSIBLE` + * :const:`TABLE_CELL_STYLE` * :const:`TABLE_TD_LINE` * :const:`TABLE_TH_LINE` * :const:`TABLE_CELL_LINE_CONTEXTS` @@ -180,7 +180,7 @@ GL_HEADING = 1 << 0 # Aggregate contexts: FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + - STYLE + TABLE_OPEN) + STYLE + TABLE) UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 7fda2d5..9e22b28 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1098,13 +1098,14 @@ 
class Tokenizer(object): self._head += len(markup) - 1 return - table_context = contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context + old_context = self._context reset = self._head self._head += len(markup) reset_for_style, padding = False, "" try: - cell_context = self._parse(table_context | contexts.TABLE_CELL_STYLE) - cell = self._pop() + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) + cell_context = self._context + self._context = old_context reset_for_style = cell_context & contexts.TABLE_CELL_STYLE except BadRoute: self._head = reset @@ -1112,13 +1113,14 @@ class Tokenizer(object): if reset_for_style: self._head = reset + len(markup) try: - self._push(table_context) + self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) padding = self._parse_as_table_style("|") style = self._pop() # Don't parse the style separator self._head += 1 - cell_context = self._parse(table_context) - cell = self._pop() + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) + cell_context = self._context + self._context = old_context except BadRoute: self._head = reset raise @@ -1139,8 +1141,10 @@ class Tokenizer(object): """Returns the current context, with the TABLE_CELL_STYLE flag set if it is necessary to reset and parse style attributes.""" if reset_for_style: - return self._context | contexts.TABLE_CELL_STYLE - return self._context & ~contexts.TABLE_CELL_STYLE + self._context |= contexts.TABLE_CELL_STYLE + else: + self._context &= ~contexts.TABLE_CELL_STYLE + return self._pop(keep_context=True) def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" From 2d945b30e53d41b0a4d448ddee56d1580274b7c6 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Thu, 17 Jul 2014 16:21:20 -0700 Subject: [PATCH 17/44] Use uint64_t for context For the C tokenizer, include `` and use `uint64_t` instead of `int` for context. 
Changes to tables mean that context can be larger than 32 bits, and it is possible for `int` to only have 16 bits anyways (though this is very unlikely). --- mwparserfromhell/parser/tokenizer.c | 29 +++++++++++++++-------------- mwparserfromhell/parser/tokenizer.h | 7 ++++--- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 814ad50..90f51b0 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -241,7 +241,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) /* Add a new token stack, context, and textbuffer to the list. */ -static int Tokenizer_push(Tokenizer* self, int context) +static int Tokenizer_push(Tokenizer* self, uint64_t context) { Stack* top = malloc(sizeof(Stack)); @@ -333,7 +333,7 @@ static PyObject* Tokenizer_pop(Tokenizer* self) static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) { PyObject* stack; - int context; + uint64_t context; if (Tokenizer_push_textbuffer(self)) return NULL; @@ -351,7 +351,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) */ static void* Tokenizer_fail_route(Tokenizer* self) { - int context = self->topstack->context; + uint64_t context = self->topstack->context; PyObject* stack = Tokenizer_pop(self); Py_XDECREF(stack); @@ -1034,7 +1034,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) { // Built from Tokenizer_parse()'s end sentinels: Py_UNICODE after = Tokenizer_READ(self, 2); - int ctx = self->topstack->context; + uint64_t ctx = self->topstack->context; return (!this || this == '\n' || this == '[' || this == ']' || this == '<' || this == '>' || (this == '\'' && next == '\'') || @@ -1629,9 +1629,9 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data) static int Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) { - int ctx = data->context; - int end_of_value = (ctx & 
TAG_ATTR_VALUE && - !(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE))); + uint64_t ctx = data->context; + uint64_t end_of_value = (ctx & TAG_ATTR_VALUE && + !(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE))); if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) { if (Tokenizer_push_tag_buffer(self, data)) @@ -2153,7 +2153,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag, static int Tokenizer_parse_italics(Tokenizer* self) { Py_ssize_t reset = self->head; - int context; + uint64_t context; PyObject *stack; stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1); @@ -2273,7 +2273,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self) */ static PyObject* Tokenizer_parse_style(Tokenizer* self) { - int context = self->topstack->context, ticks = 2, i; + uint64_t context = self->topstack->context, ticks = 2, i; self->head += 2; while (Tokenizer_READ(self, 0) == '\'') { @@ -2428,7 +2428,7 @@ static int Tokenizer_handle_dl_term(Tokenizer* self) /* Handle the end of the stream of wikitext. */ -static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) +static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) { PyObject *token, *text, *trash; int single; @@ -2457,7 +2457,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) Make sure we are not trying to write an invalid character. Return 0 if everything is safe, or -1 if the route must be failed. */ -static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) +static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) { if (context & LC_FAIL_NEXT) return -1; @@ -2536,9 +2536,9 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) Parse the wikicode string, using context for when to stop. If push is true, we will push a new context, otherwise we won't and context will be ignored. 
*/ -static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) +static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) { - int this_context; + uint64_t this_context; Py_UNICODE this, next, next_next, last; PyObject* temp; @@ -2697,7 +2697,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) { PyObject *text, *temp, *tokens; - int context = 0, skip_style_tags = 0; + uint64_t context = 0; + int skip_style_tags = 0; if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { Py_XDECREF(self->text); diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index dde6464..e9b1a92 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -29,6 +29,7 @@ SOFTWARE. #include #include #include +#include #if PY_MAJOR_VERSION >= 3 #define IS_PY3K @@ -191,7 +192,7 @@ struct Textbuffer { struct Stack { PyObject* stack; - int context; + uint64_t context; struct Textbuffer* textbuffer; struct Stack* next; }; @@ -202,7 +203,7 @@ typedef struct { } HeadingData; typedef struct { - int context; + uint64_t context; struct Textbuffer* pad_first; struct Textbuffer* pad_before_eq; struct Textbuffer* pad_after_eq; @@ -267,7 +268,7 @@ static int Tokenizer_parse_entity(Tokenizer*); static int Tokenizer_parse_comment(Tokenizer*); static int Tokenizer_handle_dl_term(Tokenizer*); static int Tokenizer_parse_tag(Tokenizer*); -static PyObject* Tokenizer_parse(Tokenizer*, int, int); +static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); static int load_exceptions(void); From 0128b1f78a346dbe774800bd17b1b0f92bb9ca30 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Fri, 18 Jul 2014 17:41:24 -0700 Subject: [PATCH 18/44] Implement CTokenizer for tables CTokenizer is completely implemented in this commit - it didn't make much sense 
to me to split it up. All tests passing, memory test shows no leaks on Linux. --- mwparserfromhell/parser/tokenizer.c | 503 ++++++++++++++++++++++++++++++++++- mwparserfromhell/parser/tokenizer.h | 108 ++++---- mwparserfromhell/parser/tokenizer.py | 2 +- 3 files changed, 551 insertions(+), 62 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 90f51b0..1d2964e 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2454,6 +2454,399 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) } /* + Parse until ``end_token`` as style attributes for a table. +*/ +static PyObject* Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, + int break_on_table_end) +{ + TagData *data = TagData_new(); + PyObject *padding, *trash; + Py_UNICODE this, next; + int can_exit, table_end; + + if (!data) + return NULL; + data->context = TAG_ATTR_READY; + + while (1) { + this = Tokenizer_READ(self, 0); + next = Tokenizer_READ(self, 1); + can_exit = (!(data->context & TAG_QUOTED) || data->context & TAG_NOTE_SPACE); + table_end = (break_on_table_end && this == '|' && next == '}'); + if ((this == end_token && can_exit) || table_end) { + if (data->context & (TAG_ATTR_NAME | TAG_ATTR_VALUE)) { + if (Tokenizer_push_tag_buffer(self, data)) { + TagData_dealloc(data); + return NULL; + } + } + if (Py_UNICODE_ISSPACE(this)) + Textbuffer_write(&(data->pad_first), this); + padding = Textbuffer_render(data->pad_first); + TagData_dealloc(data); + if (!padding) + return NULL; + return padding; + } + else if (!this || table_end || this == end_token) { + if (self->topstack->context & LC_TAG_ATTR) { + if (data->context & TAG_QUOTED) { + // Unclosed attribute quote: reset, don't die + data->context = TAG_ATTR_VALUE; + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + self->head = data->reset; + continue; + } + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + } + 
TagData_dealloc(data); + return Tokenizer_fail_route(self); + } + else { + if (Tokenizer_handle_tag_data(self, data, this) || BAD_ROUTE) { + TagData_dealloc(data); + return NULL; + } + } + self->head++; + } +} + +/* + Handle the start of a table. +*/ +static int Tokenizer_handle_table_start(Tokenizer* self) +{ + self->head += 2; + Py_ssize_t reset = self->head; + PyObject *style, *open_open_kwargs, *close_open_kwargs, *open_close_kwargs, + *padding, *newline_character, *open_wiki_markup, *close_wiki_markup; + PyObject *table = NULL; + + if(Tokenizer_push(self, LC_TABLE_OPEN)) + return -1; + padding = Tokenizer_parse_as_table_style(self, '\n', 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset - 1; + if (Tokenizer_emit_text(self, "{|")) + return -1; + return 0; + } + if (!padding) + return -1; + style = Tokenizer_pop(self); + if (!style) { + Py_DECREF(padding); + return -1; + } + + newline_character = PyUnicode_FromString("\n"); + if (!newline_character) { + Py_DECREF(padding); + Py_DECREF(style); + return -1; + } + // continue to parse if it is NOT an inline table + if (PyUnicode_Contains(padding, newline_character)) { + Py_DECREF(newline_character); + self->head++; + table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); + if (BAD_ROUTE) { + RESET_ROUTE(); + // offset displacement done by parse() + self->head = reset - 1; + if (Tokenizer_emit_text(self, "{|")) + return -1; + return 0; + } + if (!table) { + Py_DECREF(padding); + Py_DECREF(style); + return -1; + } + } else { + Py_DECREF(newline_character); + // close tag + self->head += 2; + } + + open_open_kwargs = PyDict_New(); + if (!open_open_kwargs) + goto fail_decref_all; + open_wiki_markup = PyUnicode_FromString("{|"); + if (!open_wiki_markup) { + Py_DECREF(open_open_kwargs); + goto fail_decref_all; + } + PyDict_SetItemString(open_open_kwargs, "wiki_markup", open_wiki_markup); + Py_DECREF(open_wiki_markup); + if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs)) + goto fail_decref_all; + if 
(Tokenizer_emit_text(self, "table")) + goto fail_decref_all; + + if (style) { + if (Tokenizer_emit_all(self, style)) + goto fail_decref_padding_table; + Py_DECREF(style); + } + + close_open_kwargs = PyDict_New(); + if (!close_open_kwargs) + goto fail_decref_padding_table; + PyDict_SetItemString(close_open_kwargs, "padding", padding); + Py_DECREF(padding); + if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs)) + goto fail_decref_table; + + if (table) { + if (Tokenizer_emit_all(self, table)) + goto fail_decref_table; + Py_DECREF(table); + } + + open_close_kwargs = PyDict_New(); + if (!open_close_kwargs) + return -1; + close_wiki_markup = PyUnicode_FromString("|}"); + if (!close_wiki_markup) { + Py_DECREF(open_close_kwargs); + return -1; + } + PyDict_SetItemString(open_close_kwargs, "wiki_markup", close_wiki_markup); + Py_DECREF(close_wiki_markup); + if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs)) + return -1; + if (Tokenizer_emit_text(self, "table")) + return -1; + if (Tokenizer_emit(self, TagCloseClose)) + return -1; + // offset displacement done by _parse() + self->head--; + return 0; + + fail_decref_all: + Py_DECREF(style); + fail_decref_padding_table: + Py_DECREF(padding); + fail_decref_table: + Py_XDECREF(table); + return -1; +} + +/* + Return the stack in order to handle the table end. +*/ +static PyObject * Tokenizer_handle_table_end(Tokenizer* self) +{ + self->head += 2; + return Tokenizer_pop(self); +} + +/* + Parse as style until end of the line, then continue. 
+*/ +static int Tokenizer_handle_table_row(Tokenizer* self) +{ + Py_ssize_t reset = self->head; + self->head += 2; + PyObject *padding, *open_kwargs, *close_kwargs, *wiki_markup; + PyObject *style = NULL; + + // If we can't recurse, still tokenize tag but parse style attrs as text + if (Tokenizer_CAN_RECURSE(self)) { + if(Tokenizer_push(self, LC_TABLE_OPEN)) + return -1; + padding = Tokenizer_parse_as_table_style(self, '\n', 0); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!padding) + return -1; + style = Tokenizer_pop(self); + if (!style) { + Py_DECREF(padding); + return -1; + } + } else { + padding = PyUnicode_FromString(""); + if (!padding) + return -1; + } + + open_kwargs = PyDict_New(); + if (!open_kwargs) + goto fail_decref_all; + wiki_markup = PyUnicode_FromString("|-"); + if (!wiki_markup) { + Py_DECREF(open_kwargs); + goto fail_decref_all; + } + PyDict_SetItemString(open_kwargs, "wiki_markup", wiki_markup); + Py_DECREF(wiki_markup); + if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_kwargs)) + goto fail_decref_all; + if (Tokenizer_emit_text(self, "tr")) + goto fail_decref_all; + + if (style) { + if (Tokenizer_emit_all(self, style)) + goto fail_decref_all; + Py_DECREF(style); + } + + close_kwargs = PyDict_New(); + if (!close_kwargs) + goto fail_decref_all; + PyDict_SetItemString(close_kwargs, "padding", padding); + Py_DECREF(padding); + if (Tokenizer_emit_kwargs(self, TagCloseSelfclose, close_kwargs)) + return -1; + return 0; + + fail_decref_all: + Py_XDECREF(style); + Py_DECREF(padding); + return -1; +} + +/* + Parse as normal syntax unless we hit a style marker, then parse style + as HTML attributes and the remainder as normal syntax. 
+*/ +static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, + const char *tag, uint64_t line_context) +{ + if (!Tokenizer_CAN_RECURSE(self)) { + if (Tokenizer_emit_text(self, markup)) + return -1; + self->head += strlen(markup) - 1; + return 0; + } + + uint64_t old_context = self->topstack->context; + uint64_t cell_context; + Py_ssize_t reset = self->head; + self->head += strlen(markup); + PyObject *padding; + PyObject *cell, *open_kwargs, *close_kwargs, *open_wiki_markup, *close_wiki_markup; + PyObject *style = NULL; + + cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!cell) + return -1; + cell_context = self->topstack->context; + self->topstack->context = old_context; + + if (cell_context & LC_TABLE_CELL_STYLE) { + Py_DECREF(cell); + self->head = reset + strlen(markup); + if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context)) + return -1; + padding = Tokenizer_parse_as_table_style(self, '|', 0); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!padding) + return -1; + style = Tokenizer_pop(self); + if (!style) { + Py_DECREF(padding); + return -1; + } + // Don't parse the style separator + self->head++; + cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context, 1); + if (BAD_ROUTE) { + self->head = reset; + return 0; + } + if (!cell) + return -1; + cell_context = self->topstack->context; + self->topstack->context = old_context; + } + else { + padding = PyUnicode_FromString(""); + if (!padding) { + Py_DECREF(cell); + return -1; + } + } + + open_kwargs = PyDict_New(); + if (!open_kwargs) + goto fail_decref_all; + close_kwargs = PyDict_New(); + if (!close_kwargs) + goto fail_decref_all; + open_wiki_markup = PyUnicode_FromString(markup); + if (!open_wiki_markup) + goto fail_decref_all; + PyDict_SetItemString(open_kwargs, "wiki_markup", open_wiki_markup); + 
Py_DECREF(open_wiki_markup); + if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_kwargs)) + goto fail_decref_all; + if (Tokenizer_emit_text(self, tag)) + goto fail_decref_all; + + if (style) { + if (Tokenizer_emit_all(self, style)) + goto fail_decref_all; + close_wiki_markup = PyUnicode_FromString("|"); + if (!close_wiki_markup) + goto fail_decref_all; + PyDict_SetItemString(close_kwargs, "wiki_markup", close_wiki_markup); + Py_DECREF(close_wiki_markup); + Py_DECREF(style); + } + + PyDict_SetItemString(close_kwargs, "padding", padding); + Py_DECREF(padding); + if (Tokenizer_emit_kwargs(self, TagCloseSelfclose, close_kwargs)) + goto fail_decref_cell; + if (Tokenizer_emit_all(self, cell)) + goto fail_decref_cell; + Py_DECREF(cell); + // keep header/cell line contexts + self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE); + // offset displacement done by parse() + self->head--; + return 0; + + fail_decref_all: + Py_XDECREF(style); + Py_DECREF(padding); + Py_XDECREF(open_kwargs); + Py_XDECREF(close_kwargs); + fail_decref_cell: + Py_DECREF(cell); + return -1; +} + +/* + Returns the context, stack, and whether to reset the cell for style + in a tuple. +*/ +static PyObject* Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) +{ + if (reset_for_style) + self->topstack->context |= LC_TABLE_CELL_STYLE; + else + self->topstack->context &= ~LC_TABLE_CELL_STYLE; + return Tokenizer_pop_keeping_context(self); +} + +/* Make sure we are not trying to write an invalid character. Return 0 if everything is safe, or -1 if the route must be failed. */ @@ -2533,6 +2926,24 @@ static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE d } /* + Returns whether the current head has leading whitespace. + TODO: treat comments and templates as whitespace, allow fail on non-newline spaces. 
+*/ +static int Tokenizer_has_leading_whitespace(Tokenizer* self) +{ + int offset = 1; + Py_UNICODE current_character; + while (1) { + current_character = Tokenizer_READ_BACKWARDS(self, offset); + if (!current_character || current_character == '\n') + return 1; + else if (!Py_UNICODE_ISSPACE(current_character)) + return 0; + offset++; + } +} + +/* Parse the wikicode string, using context for when to stop. If push is true, we will push a new context, otherwise we won't and context will be ignored. */ @@ -2667,24 +3078,94 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) if (temp != Py_None) return temp; } - else if (!last || last == '\n') { - if (this == '#' || this == '*' || this == ';' || this == ':') { - if (Tokenizer_handle_list(self)) + else if ((!last || last == '\n') && (this == '#' || this == '*' || this == ';' || this == ':')) { + if (Tokenizer_handle_list(self)) + return NULL; + } + else if ((!last || last == '\n') && (this == '-' && this == next && + this == Tokenizer_READ(self, 2) && + this == Tokenizer_READ(self, 3))) { + if (Tokenizer_handle_hr(self)) + return NULL; + } + else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) { + if (Tokenizer_handle_dl_term(self)) + return NULL; + // kill potential table contexts + if (this == '\n') + self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; + } + + // Start of table parsing + else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) { + if (Tokenizer_CAN_RECURSE(self)) { + if (Tokenizer_handle_table_start(self)) + return NULL; + } + else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next)) + return NULL; + else + self->head++; + } + else if (this_context & LC_TABLE_OPEN) { + if (this == '|' && next == '|' && this_context & LC_TABLE_TD_LINE) { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_cell(self, "||", "td", LC_TABLE_TD_LINE)) + return 
NULL; + } + else if (this == '|' && next == '|' && this_context & LC_TABLE_TH_LINE) { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_cell(self, "||", "th", LC_TABLE_TH_LINE)) return NULL; } - else if (this == '-' && this == next && - this == Tokenizer_READ(self, 2) && - this == Tokenizer_READ(self, 3)) { - if (Tokenizer_handle_hr(self)) + else if (this == '!' && next == '!' && this_context & LC_TABLE_TH_LINE) { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_cell(self, "!!", "th", LC_TABLE_TH_LINE)) + return NULL; + } + else if (this == '|' && this_context & LC_TABLE_CELL_STYLE) { + return Tokenizer_handle_table_cell_end(self, 1); + } + // on newline, clear out cell line contexts + else if (this == '\n' && this_context & LC_TABLE_CELL_LINE_CONTEXTS) { + self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; + if (Tokenizer_emit_char(self, this)) + return NULL; + } + else if (Tokenizer_has_leading_whitespace(self)) { + if (this == '|' && next == '}') { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else + return Tokenizer_handle_table_end(self); + } + else if (this == '|' && next == '-') { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_row(self)) + return NULL; + } + else if (this == '|') { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_cell(self, "|", "td", LC_TABLE_TD_LINE)) + return NULL; + } + else if (this == '!') { + if (this_context & LC_TABLE_CELL_OPEN) + return Tokenizer_handle_table_cell_end(self, 0); + else if (Tokenizer_handle_table_cell(self, "!", "th", LC_TABLE_TH_LINE)) + return NULL; + } + else if (Tokenizer_emit_char(self, this)) return NULL; } else if (Tokenizer_emit_char(self, this)) return 
NULL; } - else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) { - if (Tokenizer_handle_dl_term(self)) - return NULL; - } else if (Tokenizer_emit_char(self, this)) return NULL; self->head++; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index e9b1a92..de7b7d4 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -44,9 +44,9 @@ SOFTWARE. static const char MARKERS[] = { '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', - '-', '\n', '\0'}; + '-', '!', '\n', '\0'}; -#define NUM_MARKERS 18 +#define NUM_MARKERS 19 #define TEXTBUFFER_BLOCKSIZE 1024 #define MAX_DEPTH 40 #define MAX_CYCLES 100000 @@ -110,60 +110,68 @@ static PyObject* TagCloseClose; /* Local contexts: */ -#define LC_TEMPLATE 0x00000007 -#define LC_TEMPLATE_NAME 0x00000001 -#define LC_TEMPLATE_PARAM_KEY 0x00000002 -#define LC_TEMPLATE_PARAM_VALUE 0x00000004 - -#define LC_ARGUMENT 0x00000018 -#define LC_ARGUMENT_NAME 0x00000008 -#define LC_ARGUMENT_DEFAULT 0x00000010 - -#define LC_WIKILINK 0x00000060 -#define LC_WIKILINK_TITLE 0x00000020 -#define LC_WIKILINK_TEXT 0x00000040 - -#define LC_EXT_LINK 0x00000180 -#define LC_EXT_LINK_URI 0x00000080 -#define LC_EXT_LINK_TITLE 0x00000100 - -#define LC_HEADING 0x00007E00 -#define LC_HEADING_LEVEL_1 0x00000200 -#define LC_HEADING_LEVEL_2 0x00000400 -#define LC_HEADING_LEVEL_3 0x00000800 -#define LC_HEADING_LEVEL_4 0x00001000 -#define LC_HEADING_LEVEL_5 0x00002000 -#define LC_HEADING_LEVEL_6 0x00004000 - -#define LC_TAG 0x00078000 -#define LC_TAG_OPEN 0x00008000 -#define LC_TAG_ATTR 0x00010000 -#define LC_TAG_BODY 0x00020000 -#define LC_TAG_CLOSE 0x00040000 - -#define LC_STYLE 0x00780000 -#define LC_STYLE_ITALICS 0x00080000 -#define LC_STYLE_BOLD 0x00100000 -#define LC_STYLE_PASS_AGAIN 0x00200000 -#define LC_STYLE_SECOND_PASS 0x00400000 - -#define LC_DLTERM 0x00800000 - -#define LC_SAFETY_CHECK 0x3F000000 -#define LC_HAS_TEXT 0x01000000 -#define 
LC_FAIL_ON_TEXT 0x02000000 -#define LC_FAIL_NEXT 0x04000000 -#define LC_FAIL_ON_LBRACE 0x08000000 -#define LC_FAIL_ON_RBRACE 0x10000000 -#define LC_FAIL_ON_EQUALS 0x20000000 - +#define LC_TEMPLATE 0x0000000000000007 +#define LC_TEMPLATE_NAME 0x0000000000000001 +#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002 +#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004 + +#define LC_ARGUMENT 0x0000000000000018 +#define LC_ARGUMENT_NAME 0x0000000000000008 +#define LC_ARGUMENT_DEFAULT 0x0000000000000010 + +#define LC_WIKILINK 0x0000000000000060 +#define LC_WIKILINK_TITLE 0x0000000000000020 +#define LC_WIKILINK_TEXT 0x0000000000000040 + +#define LC_EXT_LINK 0x0000000000000180 +#define LC_EXT_LINK_URI 0x0000000000000080 +#define LC_EXT_LINK_TITLE 0x0000000000000100 + +#define LC_HEADING 0x0000000000007E00 +#define LC_HEADING_LEVEL_1 0x0000000000000200 +#define LC_HEADING_LEVEL_2 0x0000000000000400 +#define LC_HEADING_LEVEL_3 0x0000000000000800 +#define LC_HEADING_LEVEL_4 0x0000000000001000 +#define LC_HEADING_LEVEL_5 0x0000000000002000 +#define LC_HEADING_LEVEL_6 0x0000000000004000 + +#define LC_TAG 0x0000000000078000 +#define LC_TAG_OPEN 0x0000000000008000 +#define LC_TAG_ATTR 0x0000000000010000 +#define LC_TAG_BODY 0x0000000000020000 +#define LC_TAG_CLOSE 0x0000000000040000 + +#define LC_STYLE 0x0000000000780000 +#define LC_STYLE_ITALICS 0x0000000000080000 +#define LC_STYLE_BOLD 0x0000000000100000 +#define LC_STYLE_PASS_AGAIN 0x0000000000200000 +#define LC_STYLE_SECOND_PASS 0x0000000000400000 + +#define LC_DLTERM 0x0000000000800000 + +#define LC_SAFETY_CHECK 0x000000003F000000 +#define LC_HAS_TEXT 0x0000000001000000 +#define LC_FAIL_ON_TEXT 0x0000000002000000 +#define LC_FAIL_NEXT 0x0000000004000000 +#define LC_FAIL_ON_LBRACE 0x0000000008000000 +#define LC_FAIL_ON_RBRACE 0x0000000010000000 +#define LC_FAIL_ON_EQUALS 0x0000000020000000 + +// TODO realign all +#define LC_TABLE 0x00000007C0000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000700000000 +#define LC_TABLE_OPEN 
0x0000000040000000 +#define LC_TABLE_CELL_OPEN 0x0000000080000000 +#define LC_TABLE_CELL_STYLE 0x0000000100000000 +#define LC_TABLE_TD_LINE 0x0000000200000000 +#define LC_TABLE_TH_LINE 0x0000000400000000 /* Global contexts: */ #define GL_HEADING 0x1 /* Aggregate contexts: */ -#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9e22b28..e8f21c0 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1134,7 +1134,7 @@ class Tokenizer(object): self._emit_all(cell) # keep header/cell line contexts self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) - # offset displacement done by _parse() + # offset displacement done by parse() self._head -= 1 def _handle_table_cell_end(self, reset_for_style=False): From 94a9e32494fd8c3f1ce5e39a5ef1738967244ac2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 21 Jul 2014 15:51:59 -0400 Subject: [PATCH 19/44] Add missing comma to test output. --- tests/tokenizer/tables.mwtest | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 163579b..9572733 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -302,7 +302,7 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_fir name: table_cell_unclosed_style label: Parse unclosed and closed bold and italics inside cells. 
input: "{|\n | ''foo || '''bar ||''baz''||'''test'''\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" ''foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" '''bar "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose() Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" ''foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" '''bar "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -317,4 +317,4 @@ output: [Text(text="{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{| name: recursion_one_hundred_opens label: test potentially dangerous recursion: one hundred table openings, with spaces input: "{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| 
{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|" -output: [Text(text="{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|")] \ No newline at end of file +output: [Text(text="{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|")] From 7bbeb6899a653cbca35c75f66edddfc6289b7564 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 10:41:34 -0700 Subject: [PATCH 20/44] Fix ordering of tag representation Self-closing wiki syntax tags have incorrectly ordered wiki syntax and padding, fixed the ordering. 
--- mwparserfromhell/nodes/tag.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index b3ea85c..c5f9d84 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -65,7 +65,7 @@ class Tag(Node): close = self.closing_wiki_markup if self.closing_wiki_markup else "" padding = self.padding if self.padding else "" if self.self_closing: - return self.wiki_markup + attrs + close + padding + return self.wiki_markup + attrs + padding + close else: return self.wiki_markup + attrs + padding + str(self.contents) + close From 64869fe84be7a5aa5b1c14f5f12c06232402ab9c Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 12:23:44 -0700 Subject: [PATCH 21/44] Remove style test Remove style test to properly implement implicit style closes later. --- tests/tokenizer/tables.mwtest | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 9572733..c684451 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -299,14 +299,6 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_fir --- -name: table_cell_unclosed_style -label: Parse unclosed and closed bold and italics inside cells. 
-input: "{|\n | ''foo || '''bar ||''baz''||'''test'''\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" ''foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" '''bar "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'"), Text(text="i"), TagCloseOpen(), Text(text="baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] - - ---- - name: recursion_five_hundred_opens label: test potentially dangerous recursion: five hundred table openings, without spaces input: "{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|" From 213c105666a669349dfa607a163da245df9af466 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 14:31:37 -0700 Subject: [PATCH 22/44] Table tags are no longer self-closing Table tags no longer self-closing. Rows and cells now contain their contents. Also refactored out an `emit_table_tag` method. Note: this will require changes to the Tag node and possibly the builder, those changes will be in the next commit. 
--- mwparserfromhell/parser/contexts.py | 9 +- mwparserfromhell/parser/tokenizer.c | 289 +++++++++++++++++------------------ mwparserfromhell/parser/tokenizer.h | 11 +- mwparserfromhell/parser/tokenizer.py | 83 +++++----- tests/tokenizer/tables.mwtest | 44 +++--- 5 files changed, 218 insertions(+), 218 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 6dd5319..ef44ce2 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -167,11 +167,12 @@ SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + TABLE_OPEN = 1 << 30 TABLE_CELL_OPEN = 1 << 31 TABLE_CELL_STYLE = 1 << 32 -TABLE_TD_LINE = 1 << 33 -TABLE_TH_LINE = 1 << 34 +TABLE_ROW_OPEN = 1 << 33 +TABLE_TD_LINE = 1 << 34 +TABLE_TH_LINE = 1 << 35 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE -TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_TD_LINE + - TABLE_TH_LINE) +TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + + TABLE_ROW_OPEN + + TABLE_TD_LINE + TABLE_TH_LINE) # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 1d2964e..c062404 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2454,6 +2454,88 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) } /* + Emit a table tag. 
+*/ +static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, + const char* tag, PyObject* style, PyObject* padding, + const char* close_open_markup, PyObject* contents, + const char* open_close_markup) +{ + PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs, *close_open_markup_unicode, + *open_close_kwargs, *open_close_markup_unicode; + + open_open_kwargs = PyDict_New(); + if (!open_open_kwargs) + goto fail_decref_all; + open_open_markup_unicode = PyUnicode_FromString(open_open_markup); + if (!open_open_markup_unicode) { + Py_DECREF(open_open_kwargs); + goto fail_decref_all; + } + PyDict_SetItemString(open_open_kwargs, "wiki_markup", open_open_markup_unicode); + Py_DECREF(open_open_markup_unicode); + if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs)) + goto fail_decref_all; + if (Tokenizer_emit_text(self, tag)) + goto fail_decref_all; + + if (style) { + if (Tokenizer_emit_all(self, style)) + goto fail_decref_all; + Py_DECREF(style); + } + + close_open_kwargs = PyDict_New(); + if (!close_open_kwargs) + goto fail_decref_padding_contents; + if (close_open_markup && strlen(close_open_markup) != 0) { + close_open_markup_unicode = PyUnicode_FromString(close_open_markup); + if (!close_open_markup_unicode) { + Py_DECREF(close_open_kwargs); + goto fail_decref_padding_contents; + } + PyDict_SetItemString(close_open_kwargs, "wiki_markup", close_open_markup_unicode); + Py_DECREF(close_open_markup_unicode); + } + PyDict_SetItemString(close_open_kwargs, "padding", padding); + Py_DECREF(padding); + if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs)) + goto fail_decref_contents; + + if (contents) { + if (Tokenizer_emit_all(self, contents)) + goto fail_decref_contents; + Py_DECREF(contents); + } + + open_close_kwargs = PyDict_New(); + if (!open_close_kwargs) + return -1; + open_close_markup_unicode = PyUnicode_FromString(open_close_markup); + if (!open_close_markup_unicode) { + 
Py_DECREF(open_close_kwargs); + return -1; + } + PyDict_SetItemString(open_close_kwargs, "wiki_markup", open_close_markup_unicode); + Py_DECREF(open_close_markup_unicode); + if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs)) + return -1; + if (Tokenizer_emit_text(self, tag)) + return -1; + if (Tokenizer_emit(self, TagCloseClose)) + return -1; + return 0; + + fail_decref_all: + Py_XDECREF(style); + fail_decref_padding_contents: + Py_DECREF(padding); + fail_decref_contents: + Py_DECREF(contents); + return -1; +} + +/* Parse until ``end_token`` as style attributes for a table. */ static PyObject* Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, @@ -2521,8 +2603,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self) { self->head += 2; Py_ssize_t reset = self->head; - PyObject *style, *open_open_kwargs, *close_open_kwargs, *open_close_kwargs, - *padding, *newline_character, *open_wiki_markup, *close_wiki_markup; + PyObject *style, *padding, *newline_character; PyObject *table = NULL; if(Tokenizer_push(self, LC_TABLE_OPEN)) @@ -2573,68 +2654,11 @@ static int Tokenizer_handle_table_start(Tokenizer* self) self->head += 2; } - open_open_kwargs = PyDict_New(); - if (!open_open_kwargs) - goto fail_decref_all; - open_wiki_markup = PyUnicode_FromString("{|"); - if (!open_wiki_markup) { - Py_DECREF(open_open_kwargs); - goto fail_decref_all; - } - PyDict_SetItemString(open_open_kwargs, "wiki_markup", open_wiki_markup); - Py_DECREF(open_wiki_markup); - if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs)) - goto fail_decref_all; - if (Tokenizer_emit_text(self, "table")) - goto fail_decref_all; - - if (style) { - if (Tokenizer_emit_all(self, style)) - goto fail_decref_padding_table; - Py_DECREF(style); - } - - close_open_kwargs = PyDict_New(); - if (!close_open_kwargs) - goto fail_decref_padding_table; - PyDict_SetItemString(close_open_kwargs, "padding", padding); - Py_DECREF(padding); - if (Tokenizer_emit_kwargs(self, TagCloseOpen, 
close_open_kwargs)) - goto fail_decref_table; - - if (table) { - if (Tokenizer_emit_all(self, table)) - goto fail_decref_table; - Py_DECREF(table); - } - - open_close_kwargs = PyDict_New(); - if (!open_close_kwargs) - return -1; - close_wiki_markup = PyUnicode_FromString("|}"); - if (!close_wiki_markup) { - Py_DECREF(open_close_kwargs); - return -1; - } - PyDict_SetItemString(open_close_kwargs, "wiki_markup", close_wiki_markup); - Py_DECREF(close_wiki_markup); - if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs)) - return -1; - if (Tokenizer_emit_text(self, "table")) - return -1; - if (Tokenizer_emit(self, TagCloseClose)) + if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL, table, "|}")) return -1; // offset displacement done by _parse() self->head--; return 0; - - fail_decref_all: - Py_DECREF(style); - fail_decref_padding_table: - Py_DECREF(padding); - fail_decref_table: - Py_XDECREF(table); - return -1; } /* @@ -2651,67 +2675,60 @@ static PyObject * Tokenizer_handle_table_end(Tokenizer* self) */ static int Tokenizer_handle_table_row(Tokenizer* self) { + if (!Tokenizer_CAN_RECURSE(self)) { + if (Tokenizer_emit_text(self, "|-")) + return -1; + self->head += 1; + return 0; + } + Py_ssize_t reset = self->head; self->head += 2; - PyObject *padding, *open_kwargs, *close_kwargs, *wiki_markup; - PyObject *style = NULL; + PyObject *padding, *style, *row; - // If we can't recurse, still tokenize tag but parse style attrs as text - if (Tokenizer_CAN_RECURSE(self)) { - if(Tokenizer_push(self, LC_TABLE_OPEN)) - return -1; - padding = Tokenizer_parse_as_table_style(self, '\n', 0); - if (BAD_ROUTE) { - self->head = reset; - return 0; - } - if (!padding) - return -1; - style = Tokenizer_pop(self); - if (!style) { - Py_DECREF(padding); - return -1; - } - } else { - padding = PyUnicode_FromString(""); - if (!padding) - return -1; + if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) + return -1; + padding = 
Tokenizer_parse_as_table_style(self, '\n', 0); + if (BAD_ROUTE) { + self->head = reset; + return 0; } - - open_kwargs = PyDict_New(); - if (!open_kwargs) - goto fail_decref_all; - wiki_markup = PyUnicode_FromString("|-"); - if (!wiki_markup) { - Py_DECREF(open_kwargs); - goto fail_decref_all; + if (!padding) + return -1; + style = Tokenizer_pop(self); + if (!style) { + Py_DECREF(padding); + return -1; } - PyDict_SetItemString(open_kwargs, "wiki_markup", wiki_markup); - Py_DECREF(wiki_markup); - if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_kwargs)) - goto fail_decref_all; - if (Tokenizer_emit_text(self, "tr")) - goto fail_decref_all; - - if (style) { - if (Tokenizer_emit_all(self, style)) - goto fail_decref_all; + // don't parse the style separator + self->head++; + row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); + if (BAD_ROUTE) { + Py_DECREF(padding); Py_DECREF(style); + self->head = reset; + return 0; + } + if (!row) { + Py_DECREF(padding); + Py_DECREF(style); + Py_DECREF(row); + return -1; } - close_kwargs = PyDict_New(); - if (!close_kwargs) - goto fail_decref_all; - PyDict_SetItemString(close_kwargs, "padding", padding); - Py_DECREF(padding); - if (Tokenizer_emit_kwargs(self, TagCloseSelfclose, close_kwargs)) + if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, "")) return -1; + // offset displacement done by _parse() + self->head--; return 0; +} - fail_decref_all: - Py_XDECREF(style); - Py_DECREF(padding); - return -1; +/* + Return the stack in order to handle the table row end. 
+*/ +static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) +{ + return Tokenizer_pop(self); } /* @@ -2732,9 +2749,9 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, uint64_t cell_context; Py_ssize_t reset = self->head; self->head += strlen(markup); - PyObject *padding; - PyObject *cell, *open_kwargs, *close_kwargs, *open_wiki_markup, *close_wiki_markup; + PyObject *padding, *cell; PyObject *style = NULL; + const char *close_open_markup = NULL; cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); if (BAD_ROUTE) { @@ -2783,54 +2800,16 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, } } - open_kwargs = PyDict_New(); - if (!open_kwargs) - goto fail_decref_all; - close_kwargs = PyDict_New(); - if (!close_kwargs) - goto fail_decref_all; - open_wiki_markup = PyUnicode_FromString(markup); - if (!open_wiki_markup) - goto fail_decref_all; - PyDict_SetItemString(open_kwargs, "wiki_markup", open_wiki_markup); - Py_DECREF(open_wiki_markup); - if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_kwargs)) - goto fail_decref_all; - if (Tokenizer_emit_text(self, tag)) - goto fail_decref_all; - if (style) { - if (Tokenizer_emit_all(self, style)) - goto fail_decref_all; - close_wiki_markup = PyUnicode_FromString("|"); - if (!close_wiki_markup) - goto fail_decref_all; - PyDict_SetItemString(close_kwargs, "wiki_markup", close_wiki_markup); - Py_DECREF(close_wiki_markup); - Py_DECREF(style); + close_open_markup = "|"; } - - PyDict_SetItemString(close_kwargs, "padding", padding); - Py_DECREF(padding); - if (Tokenizer_emit_kwargs(self, TagCloseSelfclose, close_kwargs)) - goto fail_decref_cell; - if (Tokenizer_emit_all(self, cell)) - goto fail_decref_cell; - Py_DECREF(cell); + if (Tokenizer_emit_table_tag(self, markup, tag, style, padding, close_open_markup, cell, "")) + return -1; // keep header/cell line contexts self->topstack->context |= cell_context & 
(LC_TABLE_TH_LINE | LC_TABLE_TD_LINE); // offset displacement done by parse() self->head--; return 0; - - fail_decref_all: - Py_XDECREF(style); - Py_DECREF(padding); - Py_XDECREF(open_kwargs); - Py_XDECREF(close_kwargs); - fail_decref_cell: - Py_DECREF(cell); - return -1; } /* @@ -3139,12 +3118,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) if (this == '|' && next == '}') { if (this_context & LC_TABLE_CELL_OPEN) return Tokenizer_handle_table_cell_end(self, 0); + if (this_context & LC_TABLE_ROW_OPEN) + return Tokenizer_handle_table_row_end(self); else return Tokenizer_handle_table_end(self); } else if (this == '|' && next == '-') { if (this_context & LC_TABLE_CELL_OPEN) return Tokenizer_handle_table_cell_end(self, 0); + if (this_context & LC_TABLE_ROW_OPEN) + return Tokenizer_handle_table_row_end(self); else if (Tokenizer_handle_table_row(self)) return NULL; } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index de7b7d4..57a0121 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -157,14 +157,15 @@ static PyObject* TagCloseClose; #define LC_FAIL_ON_RBRACE 0x0000000010000000 #define LC_FAIL_ON_EQUALS 0x0000000020000000 -// TODO realign all -#define LC_TABLE 0x00000007C0000000 -#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000700000000 +#define LC_TABLE 0x0000000FC0000000 +#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 #define LC_TABLE_OPEN 0x0000000040000000 #define LC_TABLE_CELL_OPEN 0x0000000080000000 #define LC_TABLE_CELL_STYLE 0x0000000100000000 -#define LC_TABLE_TD_LINE 0x0000000200000000 -#define LC_TABLE_TH_LINE 0x0000000400000000 +#define LC_TABLE_ROW_OPEN 0x0000000200000000 +#define LC_TABLE_TD_LINE 0x0000000400000000 +#define LC_TABLE_TH_LINE 0x0000000800000000 + /* Global contexts: */ #define GL_HEADING 0x1 diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e8f21c0..6ae6050 100644 --- 
a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1002,6 +1002,23 @@ class Tokenizer(object): self._fail_route() return self._pop() + def _emit_table_tag(self, open_open_markup, tag, style, padding, + close_open_markup, contents, open_close_markup): + """Emit a table tag.""" + self._emit(tokens.TagOpenOpen(wiki_markup=open_open_markup)) + self._emit_text(tag) + if style: + self._emit_all(style) + if close_open_markup: + self._emit(tokens.TagCloseOpen(wiki_markup=close_open_markup, padding=padding)) + else: + self._emit(tokens.TagCloseOpen(padding=padding)) + if contents: + self._emit_all(contents) + self._emit(tokens.TagOpenClose(wiki_markup=open_close_markup)) + self._emit_text(tag) + self._emit(tokens.TagCloseClose()) + def _parse_as_table_style(self, end_token, break_on_table_end=False): """Parse until ``end_token`` as style attributes for a table.""" data = _TagOpenData() @@ -1052,17 +1069,7 @@ class Tokenizer(object): self._head = reset - 1 self._emit_text("{|") else: - self._emit(tokens.TagOpenOpen(wiki_markup="{|")) - self._emit_text("table") - if style: - self._emit_all(style) - self._emit(tokens.TagCloseOpen(padding=padding)) - if table: - self._emit_all(table) - self._emit(tokens.TagOpenClose(wiki_markup="|}")) - self._emit_text("table") - self._emit(tokens.TagCloseClose()) - # offset displacement done by _parse() + self._emit_table_tag("{|", "table", style, padding, None, table, "|}") self._head -= 1 def _handle_table_end(self): @@ -1072,23 +1079,31 @@ class Tokenizer(object): def _handle_table_row(self): """Parse as style until end of the line, then continue.""" + if not self._can_recurse(): + self._emit_text("|-") + self._head += 1 + return + reset = self._head self._head += 2 style, padding = None, "" - # If we can't recurse, still tokenize tag but parse style attrs as text - if self._can_recurse(): - try: - self._push(contexts.TABLE_OPEN) - padding = self._parse_as_table_style("\n") - style = self._pop() - except 
BadRoute: - self._head = reset - raise - self._emit(tokens.TagOpenOpen(wiki_markup="|-")) - self._emit_text("tr") - if style: - self._emit_all(style) - self._emit(tokens.TagCloseSelfclose(padding=padding)) + try: + self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) + padding = self._parse_as_table_style("\n") + style = self._pop() + # don't parse the style separator + self._head += 1 + row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) + except BadRoute: + self._head = reset + raise + self._emit_table_tag("|-", "tr", style, padding, None, row, "") + # offset displacement done by parse() + self._head -= 1 + + def _handle_table_row_end(self): + """Return the stack in order to handle the table row end.""" + return self._pop() def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse style @@ -1101,7 +1116,7 @@ class Tokenizer(object): old_context = self._context reset = self._head self._head += len(markup) - reset_for_style, padding = False, "" + reset_for_style, padding, style = False, "", None try: cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) cell_context = self._context @@ -1124,14 +1139,8 @@ class Tokenizer(object): except BadRoute: self._head = reset raise - self._emit(tokens.TagOpenOpen(wiki_markup=markup)) - self._emit_text(tag) - if reset_for_style: - self._emit_all(style) - self._emit(tokens.TagCloseSelfclose(wiki_markup="|", padding=padding)) - else: - self._emit(tokens.TagCloseSelfclose(padding=padding)) - self._emit_all(cell) + close_open_markup = "|" if reset_for_style else None + self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "") # keep header/cell line contexts self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) # offset displacement done by parse() @@ -1140,6 +1149,8 @@ class Tokenizer(object): def _handle_table_cell_end(self, 
reset_for_style=False): """Returns the current context, with the TABLE_CELL_STYLE flag set if it is necessary to reset and parse style attributes.""" + if self._context & (contexts.FAIL & ~contexts.TABLE): + raise BadRoute if reset_for_style: self._context |= contexts.TABLE_CELL_STYLE else: @@ -1328,10 +1339,14 @@ class Tokenizer(object): if this == "|" and next == "}": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() + if self._context & contexts.TABLE_ROW_OPEN: + return self._handle_table_row_end() return self._handle_table_end() elif this == "|" and next == "-": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() + if self._context & contexts.TABLE_ROW_OPEN: + return self._handle_table_row_end() self._handle_table_row() elif this == "|": if self._context & contexts.TABLE_CELL_OPEN: diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index c684451..455da67 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -106,42 +106,42 @@ output: [Text(text="foo \n foo \t {|\n|}")] name: table_row_simple label: Simple table row. input: "{|\n |- \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_multiple label: Simple table row.
input: "{|\n |- \n|- \n |-\n |}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseSelfclose(padding="\n"), Text(text=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_simple label: Simple table cell. input: "{|\n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_inline label: Multiple inline table cells. 
input: "{|\n | foo || bar || test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" bar "),TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_fake_close label: Looks like a table close but is not. input: "{|\n | |} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text="} \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text="} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_more_fake_close label: Looks like a table close but is not. 
input: "{|\n || |} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" |} \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" |} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -155,28 +155,28 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: table_header_simple label: Simple header cell. input: "{|\n ! foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_header_inline label: Multiple inline header cells. input: "{|\n ! foo || bar !! 
test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" bar "),TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseSelfclose(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! bar \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! 
!!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_text_outside_cell label: Parse text inside table but outside of a cell. input: "{|\n bar \n | foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -197,84 +197,84 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: template_inside_table_cell label: Template within table cell. 
input: "{|\n |{{foo\n|bar=baz}} \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes label: Parse table cell style attributes. input: "{| \n | name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: 
table_cell_empty_attributes label: Parse table cell with style markers but no attributes. input: "{| \n | | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_with_dash label: Parse a situation in which a cell line looks like a row line. input: "{|\n ||- \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_quote_with_pipe label: Pipe inside an attribute quote should still be used as a style separator. 
input: "{| \n | name="foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe label: Pipe inside an attribute name should still be used as a style separator. 
input: "{| \n | name|="foo bar" | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_pipe_after_equals label: Pipe inside an attribute should still be used as a style separator after an equals. 
input: "{| \n | name=|"foo|bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseOpen(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_templates label: Pipe inside attributes shouldn't be style separator. 
input: "{| \n | {{comment|template=baz}} | test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: header_cell_attributes label: Parse header cell style attributes. input: "{| \n ! 
name="foo bar"| test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_cell_attributes label: Parse cell style attributes of inline cells. input: "{| \n ! 
name="foo bar" | test ||color="red"| markup!!foo | time \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseSelfclose(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), 
TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes label: Parse table row style attributes. input: "{| \n |- name="foo bar"\n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes_crazy_whitespace label: Parse table row style attributes with different whitespace. 
input: "{| \t \n |- \t name="foo bar" \t \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseSelfclose(padding=" \t \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=" \t \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- From 1b3e3c365704bed8b0b9d8601c9ca5cbe8e7e0f6 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 15:17:51 -0700 Subject: [PATCH 23/44] Change wiki tags to use style separators For wiki syntax tables, add `wiki_style_separator` as an attribute for the Tag node. Also reorder `closing_wiki_markup` property and tests to match its place in the constructor. 
--- mwparserfromhell/nodes/tag.py | 78 +++++++++++++++++++++++--------------- mwparserfromhell/parser/builder.py | 6 ++- tests/test_tag.py | 40 ++++++++++++------- 3 files changed, 79 insertions(+), 45 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index c5f9d84..e9531e7 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -35,7 +35,8 @@ class Tag(Node): def __init__(self, tag, contents=None, attrs=None, wiki_markup=None, self_closing=False, invalid=False, implicit=False, padding="", - closing_tag=None, closing_wiki_markup=None): + closing_tag=None, wiki_style_separator=None, + closing_wiki_markup=None): super(Tag, self).__init__() self._tag = tag if contents is None and not self_closing: @@ -44,12 +45,6 @@ class Tag(Node): self._contents = contents self._attrs = attrs if attrs else [] self._wiki_markup = wiki_markup - if closing_wiki_markup: - self._closing_wiki_markup = closing_wiki_markup - elif wiki_markup and not self_closing: - self._closing_wiki_markup = wiki_markup - else: - self._closing_wiki_markup = None self._self_closing = self_closing self._invalid = invalid self._implicit = implicit @@ -58,16 +53,28 @@ class Tag(Node): self._closing_tag = closing_tag else: self._closing_tag = tag + self._wiki_style_separator = wiki_style_separator + if closing_wiki_markup is not None: + self._closing_wiki_markup = closing_wiki_markup + elif wiki_markup and not self_closing: + self._closing_wiki_markup = wiki_markup + else: + self._closing_wiki_markup = None def __unicode__(self): if self.wiki_markup: - attrs = "".join([str(attr) for attr in self.attributes]) if self.attributes else "" - close = self.closing_wiki_markup if self.closing_wiki_markup else "" - padding = self.padding if self.padding else "" + if self.attributes: + attrs = "".join([str(attr) for attr in self.attributes]) + else: + attrs = "" + padding = self.padding or "" + separator = self.wiki_style_separator or "" + close = 
self.closing_wiki_markup or "" if self.self_closing: - return self.wiki_markup + attrs + padding + close + return self.wiki_markup + attrs + padding + separator else: - return self.wiki_markup + attrs + padding + str(self.contents) + close + return self.wiki_markup + attrs + padding + separator + \ + str(self.contents) + close result = ("``).""" return self._self_closing @@ -197,6 +190,27 @@ class Tag(Node): """ return self._closing_tag + @property + def wiki_style_separator(self): + """The separator between the padding and content in a wiki markup tag. + + Essentially the wiki equivalent of the TagCloseOpen. + """ + return self._wiki_style_separator + + @property + def closing_wiki_markup(self): + """The wikified version of the closing tag to show instead of HTML. + + If set to a value, this will be displayed instead of the close tag + brackets. If :attr:`self_closing` is ``True`` then this is not + displayed. If :attr:`wiki_markup` is set and this has not been set, this + is set to the value of :attr:`wiki_markup`. If this has been set and + :attr:`wiki_markup` is set to a ``False`` value, this is set to + ``None``.
+ """ + return self._closing_wiki_markup + @tag.setter def tag(self, value): self._tag = self._closing_tag = parse_anything(value) @@ -211,10 +225,6 @@ class Tag(Node): if not value or not self.closing_wiki_markup: self.closing_wiki_markup = str(value) if value else None - @closing_wiki_markup.setter - def closing_wiki_markup(self, value): - self._closing_wiki_markup = str(value) if value else None - @self_closing.setter def self_closing(self, value): self._self_closing = bool(value) @@ -241,6 +251,14 @@ class Tag(Node): def closing_tag(self, value): self._closing_tag = parse_anything(value) + @wiki_style_separator.setter + def wiki_style_separator(self, value): + self._wiki_style_separator = str(value) if value else None + + @closing_wiki_markup.setter + def closing_wiki_markup(self, value): + self._closing_wiki_markup = str(value) if value else None + def has(self, name): """Return whether any attribute in the tag has the given *name*. diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 32cbb93..99a54d1 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -248,13 +248,14 @@ class Builder(object): close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose) implicit, attrs, contents, closing_tag = False, [], None, None wiki_markup, invalid = token.wiki_markup, token.invalid or False - closing_wiki_markup = None + wiki_style_separator, closing_wiki_markup = None, wiki_markup self._push() while self._tokens: token = self._tokens.pop() if isinstance(token, tokens.TagAttrStart): attrs.append(self._handle_attribute(token)) elif isinstance(token, tokens.TagCloseOpen): + wiki_style_separator = token.wiki_markup padding = token.padding or "" tag = self._pop() self._push() @@ -273,7 +274,8 @@ class Builder(object): self_closing = False closing_tag = self._pop() return Tag(tag, contents, attrs, wiki_markup, self_closing, - invalid, implicit, padding, closing_tag, closing_wiki_markup) + invalid, 
implicit, padding, closing_tag, + wiki_style_separator, closing_wiki_markup) else: self._write(self._handle_token(token)) raise ParserError("_handle_tag() missed a close token") diff --git a/tests/test_tag.py b/tests/test_tag.py index 2d67723..c2c751b 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -171,19 +171,6 @@ class TestTag(TreeEqualityTestCase): self.assertFalse(node.wiki_markup) self.assertEqual("italic text", node) - def test_closing_wiki_markup(self): - """test getter/setter behavior for closing_wiki_markup attribute""" - node = Tag(wraptext("table"), wraptext("\n")) - self.assertIs(None, node.closing_wiki_markup) - node.wiki_markup = "{|" - self.assertEqual("{|", node.closing_wiki_markup) - node.closing_wiki_markup = "|}" - self.assertEqual("|}", node.closing_wiki_markup) - self.assertEqual("{|\n|}", node) - node.wiki_markup = False - self.assertFalse(node.closing_wiki_markup) - self.assertEqual("\n
", node) - def test_self_closing(self): """test getter/setter for the self_closing attribute""" node = Tag(wraptext("ref"), wraptext("foobar")) @@ -239,6 +226,33 @@ class TestTag(TreeEqualityTestCase): self.assertWikicodeEqual(parsed, node.closing_tag) self.assertEqual("foobar", node) + def test_wiki_style_separator(self): + """test getter/setter for wiki_style_separator attribute""" + node = Tag(wraptext("table"), wraptext("\n")) + self.assertIs(None, node.wiki_style_separator) + node.wiki_style_separator = "|" + self.assertEqual("|", node.wiki_style_separator) + node.wiki_markup = "{" + self.assertEqual("{|\n{", node) + node2 = Tag(wraptext("table"), wraptext("\n"), wiki_style_separator="|") + self.assertEqual("|", node.wiki_style_separator) + + def test_closing_wiki_markup(self): + """test getter/setter for closing_wiki_markup attribute""" + node = Tag(wraptext("table"), wraptext("\n")) + self.assertIs(None, node.closing_wiki_markup) + node.wiki_markup = "{|" + self.assertEqual("{|", node.closing_wiki_markup) + node.closing_wiki_markup = "|}" + self.assertEqual("|}", node.closing_wiki_markup) + self.assertEqual("{|\n|}", node) + node.wiki_markup = False + self.assertFalse(node.closing_wiki_markup) + self.assertEqual("\n
", node) + node2 = Tag(wraptext("table"), wraptext("\n"), wiki_markup="{|", + closing_wiki_markup="|}") + self.assertEqual("|}", node2.closing_wiki_markup) + def test_has(self): """test Tag.has()""" node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")]) From c63108039b4bb56348bd54ba0b59fe77c5f19eec Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 16:01:32 -0700 Subject: [PATCH 24/44] Fix C code to make declarations before statements Python 3.4 compiles C extensions with the `-Werror=declaration-after-statement` flag that enforces C90 more strictly than previous versions. Move all statements after declarations to make sure this extension builds on 3.4. --- mwparserfromhell/parser/tokenizer.c | 34 +++++++++++++++++----------------- mwparserfromhell/parser/tokenizer.py | 26 ++++++++++++-------------- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c062404..c902c3d 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2601,17 +2601,17 @@ static PyObject* Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, */ static int Tokenizer_handle_table_start(Tokenizer* self) { - self->head += 2; - Py_ssize_t reset = self->head; + Py_ssize_t reset = self->head + 1; PyObject *style, *padding, *newline_character; PyObject *table = NULL; + self->head += 2; if(Tokenizer_push(self, LC_TABLE_OPEN)) return -1; padding = Tokenizer_parse_as_table_style(self, '\n', 1); if (BAD_ROUTE) { RESET_ROUTE(); - self->head = reset - 1; + self->head = reset; if (Tokenizer_emit_text(self, "{|")) return -1; return 0; @@ -2638,7 +2638,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self) if (BAD_ROUTE) { RESET_ROUTE(); // offset displacement done by parse() - self->head = reset - 1; + self->head = reset; if (Tokenizer_emit_text(self, "{|")) return -1; return 0; @@ -2675,17 +2675,17 @@ static PyObject * 
Tokenizer_handle_table_end(Tokenizer* self) */ static int Tokenizer_handle_table_row(Tokenizer* self) { + Py_ssize_t reset = self->head; + PyObject *padding, *style, *row; + self->head += 2; + if (!Tokenizer_CAN_RECURSE(self)) { if (Tokenizer_emit_text(self, "|-")) return -1; - self->head += 1; + self->head -= 1; return 0; } - Py_ssize_t reset = self->head; - self->head += 2; - PyObject *padding, *style, *row; - if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) return -1; padding = Tokenizer_parse_as_table_style(self, '\n', 0); @@ -2738,20 +2738,20 @@ static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, const char *tag, uint64_t line_context) { - if (!Tokenizer_CAN_RECURSE(self)) { - if (Tokenizer_emit_text(self, markup)) - return -1; - self->head += strlen(markup) - 1; - return 0; - } - uint64_t old_context = self->topstack->context; uint64_t cell_context; Py_ssize_t reset = self->head; - self->head += strlen(markup); PyObject *padding, *cell; PyObject *style = NULL; const char *close_open_markup = NULL; + self->head += strlen(markup); + + if (!Tokenizer_CAN_RECURSE(self)) { + if (Tokenizer_emit_text(self, markup)) + return -1; + self->head--; + return 0; + } cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); if (BAD_ROUTE) { diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 6ae6050..59f2156 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1050,9 +1050,9 @@ class Tokenizer(object): def _handle_table_start(self): """Handle the start of a table.""" - self._head += 2 - reset = self._head + reset = self._head + 1 style, table = None, None + self._head += 2 try: self._push(contexts.TABLE_OPEN) padding = self._parse_as_table_style("\n", break_on_table_end=True) @@ -1066,7 +1066,7 @@ class Tokenizer(object): self._head += 2 
except BadRoute: # offset displacement done by _parse() - self._head = reset - 1 + self._head = reset self._emit_text("{|") else: self._emit_table_tag("{|", "table", style, padding, None, table, "|}") @@ -1079,14 +1079,14 @@ class Tokenizer(object): def _handle_table_row(self): """Parse as style until end of the line, then continue.""" + reset = self._head + style, padding = None, "" + self._head += 2 if not self._can_recurse(): self._emit_text("|-") - self._head += 1 + self._head -= 1 return - reset = self._head - self._head += 2 - style, padding = None, "" try: self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) padding = self._parse_as_table_style("\n") @@ -1108,15 +1108,15 @@ class Tokenizer(object): def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse style as HTML attributes and the remainder as normal syntax.""" + old_context = self._context + reset = self._head + reset_for_style, padding, style = False, "", None + self._head += len(markup) if not self._can_recurse(): self._emit_text(markup) - self._head += len(markup) - 1 + self._head -= 1 return - old_context = self._context - reset = self._head - self._head += len(markup) - reset_for_style, padding, style = False, "", None try: cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) cell_context = self._context @@ -1149,8 +1149,6 @@ class Tokenizer(object): def _handle_table_cell_end(self, reset_for_style=False): """Returns the current context, with the TABLE_CELL_STYLE flag set if it is necessary to reset and parse style attributes.""" - if self._context & (contexts.FAIL & ~contexts.TABLE): - raise BadRoute if reset_for_style: self._context |= contexts.TABLE_CELL_STYLE else: From 8dc70bc20b4f4f0926db267ed4430ff175bcb37b Mon Sep 17 00:00:00 2001 From: David Winegar Date: Tue, 22 Jul 2014 16:31:56 -0700 Subject: [PATCH 25/44] Add test coverage Add some table tests to increase 
coverage. Also reorder some tests. --- tests/test_tag.py | 4 +++- tests/tokenizer/tables.mwtest | 51 ++++++++++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/tests/test_tag.py b/tests/test_tag.py index c2c751b..b33b0c2 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -249,9 +249,11 @@ class TestTag(TreeEqualityTestCase): node.wiki_markup = False self.assertFalse(node.closing_wiki_markup) self.assertEqual("\n
", node) - node2 = Tag(wraptext("table"), wraptext("\n"), wiki_markup="{|", + node2 = Tag(wraptext("table"), wraptext("\n"), + attrs=[agen("id", "foo")], wiki_markup="{|", closing_wiki_markup="|}") self.assertEqual("|}", node2.closing_wiki_markup) + self.assertEqual('{| id="foo"\n|}', node2) def test_has(self): """test Tag.has()""" diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 455da67..39acf0c 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -106,7 +106,7 @@ output: [Text(text="foo \n foo \t {|\n|}")] name: table_row_simple label: Simple table row. input: "{|\n |- \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Tag Text(text="table"), TagCloseClose()] +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- @@ -131,6 +131,41 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- +name: table_cell_multiple +label: Multiple table cells (non-inline). 
+input: "{|\n| foo \n| bar \n| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_header_simple +label: Simple header cell. +input: "{|\n ! foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_header_inline +label: Multiple inline header cells. +input: "{|\n ! foo || bar !! test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_header_multiple +label: Multiple table header cells (non-inline). +input: "{|\n! foo \n! bar \n! 
test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: nested_cells_and_rows +label: Combination of cells and rows in a table. +input: "{|\n|- \n| foo \n|- \n| bar\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + name: table_cell_fake_close label: Looks like a table close but is not. input: "{|\n | |} \n|}" @@ -152,20 +187,6 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- -name: table_header_simple -label: Simple header cell. -input: "{|\n ! 
foo \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] - ---- - -name: table_header_inline -label: Multiple inline header cells. -input: "{|\n ! foo || bar !! test \n|}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] - ---- - name: nowiki_inside_table label: Nowiki handles pipe characters in tables. input: "{|\n | foo | |- {| |} || ! !! 
bar \n|}" From c802b1f8143018e8d014c682eb98c14d11b06c54 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Fri, 25 Jul 2014 15:53:35 -0700 Subject: [PATCH 26/44] Change context to uint64_t One-line fix --- mwparserfromhell/parser/tokenizer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 57a0121..8d2d428 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -53,7 +53,8 @@ static const char MARKERS[] = { #define MAX_BRACES 255 #define MAX_ENTITY_SIZE 8 -static int route_state = 0, route_context = 0; +static int route_state = 0; +static uint64_t route_context = 0; #define BAD_ROUTE route_state #define BAD_ROUTE_CONTEXT route_context #define FAIL_ROUTE(context) route_state = 1; route_context = context From 1a4c88e11f8b6403e4a15a1e24b67b3185c884c6 Mon Sep 17 00:00:00 2001 From: David Winegar Date: Fri, 25 Jul 2014 15:54:37 -0700 Subject: [PATCH 27/44] Correctly handle no table endings Tests were not correctly testing the situations without a table close. Fixed tests and then fixed tokenizers for failing tests. Also refactored pytokenizer to more closely match the ctokenizer by only holding the `_parse` methods in the try blocks and no other code. 
--- mwparserfromhell/parser/tokenizer.c | 28 ++++++++++++--- mwparserfromhell/parser/tokenizer.py | 70 +++++++++++++++++++++++------------- tests/tokenizer/tables.mwtest | 49 +++++++++++++++++++++---- 3 files changed, 110 insertions(+), 37 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c902c3d..bad72ef 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2636,8 +2636,9 @@ static int Tokenizer_handle_table_start(Tokenizer* self) self->head++; table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); if (BAD_ROUTE) { + Py_DECREF(padding); + Py_DECREF(style); RESET_ROUTE(); - // offset displacement done by parse() self->head = reset; if (Tokenizer_emit_text(self, "{|")) return -1; @@ -2676,7 +2677,7 @@ static PyObject * Tokenizer_handle_table_end(Tokenizer* self) static int Tokenizer_handle_table_row(Tokenizer* self) { Py_ssize_t reset = self->head; - PyObject *padding, *style, *row; + PyObject *padding, *style, *row, *trash; self->head += 2; if (!Tokenizer_CAN_RECURSE(self)) { @@ -2690,6 +2691,8 @@ static int Tokenizer_handle_table_row(Tokenizer* self) return -1; padding = Tokenizer_parse_as_table_style(self, '\n', 0); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } @@ -2704,6 +2707,8 @@ static int Tokenizer_handle_table_row(Tokenizer* self) self->head++; row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); Py_DECREF(padding); Py_DECREF(style); self->head = reset; @@ -2712,7 +2717,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self) if (!row) { Py_DECREF(padding); Py_DECREF(style); - Py_DECREF(row); return -1; } @@ -2741,7 +2745,7 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, uint64_t old_context = self->topstack->context; uint64_t cell_context; Py_ssize_t reset = self->head; - PyObject *padding, *cell; 
+ PyObject *padding, *cell, *trash; PyObject *style = NULL; const char *close_open_markup = NULL; self->head += strlen(markup); @@ -2755,6 +2759,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } @@ -2770,6 +2776,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, return -1; padding = Tokenizer_parse_as_table_style(self, '|', 0); if (BAD_ROUTE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } @@ -2784,11 +2792,18 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, self->head++; cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context, 1); if (BAD_ROUTE) { + Py_DECREF(padding); + Py_DECREF(style); + trash = Tokenizer_pop(self); + Py_XDECREF(trash); self->head = reset; return 0; } - if (!cell) + if (!cell) { + Py_DECREF(padding); + Py_DECREF(style); return -1; + } cell_context = self->topstack->context; self->topstack->context = old_context; } @@ -3148,6 +3163,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) } else if (Tokenizer_emit_char(self, this)) return NULL; + // Raise BadRoute to table start + if (BAD_ROUTE) + return NULL; } else if (Tokenizer_emit_char(self, this)) return NULL; diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 59f2156..527d364 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1053,24 +1053,30 @@ class Tokenizer(object): reset = self._head + 1 style, table = None, None self._head += 2 + + self._push(contexts.TABLE_OPEN) try: - self._push(contexts.TABLE_OPEN) padding = self._parse_as_table_style("\n", break_on_table_end=True) - style = self._pop() - # continue to parse if it 
is NOT an inline table - if "\n" in padding: - self._head += 1 - table = self._parse(contexts.TABLE_OPEN) - else: - # close tag - self._head += 2 except BadRoute: - # offset displacement done by _parse() self._head = reset self._emit_text("{|") + return + style = self._pop() + # continue to parse if it is NOT an inline table + if "\n" in padding: + self._head += 1 + try: + table = self._parse(contexts.TABLE_OPEN) + except BadRoute: + self._head = reset + self._emit_text("{|") + return else: - self._emit_table_tag("{|", "table", style, padding, None, table, "|}") - self._head -= 1 + # close tag + self._head += 2 + self._emit_table_tag("{|", "table", style, padding, None, table, "|}") + # offset displacement done by _parse() + self._head -= 1 def _handle_table_end(self): """Return the stack in order to handle the table end.""" @@ -1087,15 +1093,21 @@ class Tokenizer(object): self._head -= 1 return + self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) try: - self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) padding = self._parse_as_table_style("\n") - style = self._pop() - # don't parse the style separator - self._head += 1 + except BadRoute: + self._head = reset + self._pop() + raise + style = self._pop() + # don't parse the style separator + self._head += 1 + try: row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) except BadRoute: self._head = reset + self._pop() raise self._emit_table_tag("|-", "tr", style, padding, None, row, "") # offset displacement done by parse() @@ -1119,26 +1131,34 @@ class Tokenizer(object): try: cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) - cell_context = self._context - self._context = old_context - reset_for_style = cell_context & contexts.TABLE_CELL_STYLE except BadRoute: self._head = reset + self._pop() raise + cell_context = self._context + self._context = old_context + reset_for_style = cell_context & contexts.TABLE_CELL_STYLE if 
reset_for_style: self._head = reset + len(markup) + self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) try: - self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) padding = self._parse_as_table_style("|") - style = self._pop() - # Don't parse the style separator - self._head += 1 + except BadRoute: + self._head = reset + self._pop() + raise + style = self._pop() + # Don't parse the style separator + self._head += 1 + try: cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) - cell_context = self._context - self._context = old_context except BadRoute: self._head = reset + ret = self._pop() raise + cell_context = self._context + self._context = old_context + close_open_markup = "|" if reset_for_style else None self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "") # keep header/cell line contexts diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 39acf0c..ecace32 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -13,23 +13,51 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- name: no_table_close_simple -label: Handle case when there is no table close. +label: No table close on inline table input: "{| " output: [Text(text="{| ")] --- +name: no_table_close_newline +label: No table close with a newline +input: "{| \n " +output: [Text(text="{| \n ")] + +--- + name: no_table_close_inside_cell -label: Handle case when there is no table close while inside of a cell. 
-input: "{| | " -output: [Text(text="{| | ")] +label: No table close while inside of a cell +input: "{| \n| " +output: [Text(text="{| \n| ")] + +--- + +name: no_table_close_inside_cell_after_newline +label: No table close while inside of a cell after a newline +input: "{| \n| \n " +output: [Text(text="{| \n| \n ")] + +--- + +name: no_table_close_inside_cell_with_attributes +label: No table close while inside of a cell with attributes +input: "{| \n| red | test" +output: [Text(text="{| \n| red | test")] --- name: no_table_close_inside_row -label: Handle case when there is no table close while inside of a row. -input: "{| |- " -output: [Text(text="{| |- ")] +label: No table close while inside of a row +input: "{| \n|- " +output: [Text(text="{| \n|- ")] + +--- + +name: no_table_close_inside_row_after_newline +label: No table close while inside of a row after a newline +input: "{| \n|- \n " +output: [Text(text="{| \n|- \n ")] --- @@ -40,6 +68,13 @@ output: [Text(text="{| border=\"1\"")] --- +name: no_table_close_unclosed_attributes +label: Don't parse unclosed attributes if the table doesn't exist. +input: "{| border=" +output: [Text(text="{| border=")] + +--- + name: no_table_close_row_attributes label: Don't parse row attributes as attributes if the table doesn't exist. input: "{| |- border="1"" From e446c51347f061670e78d47840a34c1028317798 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 Oct 2014 01:51:44 -0500 Subject: [PATCH 28/44] Adjust table test labels for consistency. --- tests/tokenizer/tables.mwtest | 102 +++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index ecace32..b411045 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -1,355 +1,355 @@ name: empty_table -label: Parsing an empty table. 
+label: parsing an empty table input: "{|\n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_table -label: Correctly handle tables with close on the same line. +label: correctly handle tables with close on the same line input: "{||}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=""), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_close_simple -label: No table close on inline table +label: no table close on inline table input: "{| " output: [Text(text="{| ")] --- name: no_table_close_newline -label: No table close with a newline +label: no table close with a newline input: "{| \n " output: [Text(text="{| \n ")] --- name: no_table_close_inside_cell -label: No table close while inside of a cell +label: no table close while inside of a cell input: "{| \n| " output: [Text(text="{| \n| ")] --- name: no_table_close_inside_cell_after_newline -label: No table close while inside of a cell after a newline +label: no table close while inside of a cell after a newline input: "{| \n| \n " output: [Text(text="{| \n| \n ")] --- name: no_table_close_inside_cell_with_attributes -label: No table close while inside of a cell with attributes +label: no table close while inside of a cell with attributes input: "{| \n| red | test" output: [Text(text="{| \n| red | test")] --- name: no_table_close_inside_row -label: No table close while inside of a row +label: no table close while inside of a row input: "{| \n|- " output: [Text(text="{| \n|- ")] --- name: no_table_close_inside_row_after_newline -label: No table close while inside of a row after a newline +label: no table close while inside of a row after a newline input: "{| \n|- \n " output: [Text(text="{| \n|- \n ")] --- name: no_table_close_attributes -label: Don't parse attributes as attributes if the table doesn't exist. 
+label: don't parse attributes as attributes if the table doesn't exist
input: "{| border="1""
output: [Text(text="{| border=\"1\"")]

---

name: no_table_close_unclosed_attributes
-label: Don't parse unclosed attributes if the table doesn't exist.
+label: don't parse unclosed attributes if the table doesn't exist
input: "{| border="
output: [Text(text="{| border=")]

---

name: no_table_close_row_attributes
-label: Don't parse row attributes as attributes if the table doesn't exist.
+label: don't parse row attributes as attributes if the table doesn't exist
input: "{| |- border="1""
output: [Text(text="{| |- border=\"1\"")]

---

name: no_table_close_cell
-label: Don't parse cells if the table doesn't close.
+label: don't parse cells if the table doesn't close
input: "{| | border="1"| test || red | foo"
output: [Text(text="{| | border=\"1\"| test || red | foo")]

---

name: crazy_no_table_close
-label: Lost of opened wiki syntax without closes.
+label: lots of opened wiki syntax without closes
input: "{{{ {{ {| | |- {| |} || ! !! bar \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_text_outside_cell
-label: Parse text inside table but outside of a cell.
+label: parse text inside table but outside of a cell input: "{|\n bar \n | foo \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_cell_with_leading_characters -label: Fail to create a table cell when there are leading non-whitespace characters. +label: fail to create a table cell when there are leading non-whitespace characters input: "{|\n bar | foo \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: no_table_row_with_leading_characters -label: Fail to create a table row when there are leading non-whitespace characters. +label: fail to create a table row when there are leading non-whitespace characters input: "{|\n bar |- foo \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: template_inside_table_cell -label: Template within table cell. +label: template within table cell input: "{|\n |{{foo\n|bar=baz}} \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes -label: Parse table cell style attributes. 
+label: parse table cell style attributes input: "{| \n | name="foo bar"| test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_empty_attributes -label: Parse table cell with style markers but no attributes. +label: parse table cell with style markers but no attributes input: "{| \n | | test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_with_dash -label: Parse a situation in which a cell line looks like a row line. +label: parse a situation in which a cell line looks like a row line input: "{|\n ||- \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_quote_with_pipe -label: Pipe inside an attribute quote should still be used as a style separator. 
+label: pipe inside an attribute quote should still be used as a style separator input: "{| \n | name="foo|bar"| test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_name_with_pipe -label: Pipe inside an attribute name should still be used as a style separator. +label: pipe inside an attribute name should still be used as a style separator input: "{| \n | name|="foo bar" | test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_pipe_after_equals -label: Pipe inside an attribute should still be used as a style separator after an equals. 
+label: pipe inside an attribute should still be used as a style separator after an equals input: "{| \n | name=|"foo|bar"| test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseOpen(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_cell_attributes_templates -label: Pipe inside attributes shouldn't be style separator. +label: pipe inside attributes shouldn't be style separator input: "{| \n | {{comment|template=baz}} | test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: header_cell_attributes -label: Parse header cell style attributes. +label: parse header cell style attributes input: "{| \n ! 
name="foo bar"| test \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_cell_attributes -label: Parse cell style attributes of inline cells. +label: parse cell style attributes of inline cells input: "{| \n ! name="foo bar" | test ||color="red"| markup!!foo | time \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes -label: Parse table row style attributes. 
+label: parse table row style attributes input: "{| \n |- name="foo bar"\n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_row_attributes_crazy_whitespace -label: Parse table row style attributes with different whitespace. +label: parse table row style attributes with different whitespace input: "{| \t \n |- \t name="foo bar" \t \n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=" \t \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_attributes -label: Parse table style attributes. +label: parse table style attributes input: "{| name="foo bar"\n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: inline_table_attributes -label: Correctly handle attributes in inline tables. 
+label: correctly handle attributes in inline tables input: "{| foo="tee bar" |}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="tee bar"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] --- name: table_incorrect_attributes -label: Parse incorrect table style attributes. +label: parse incorrect table style attributes input: "{| name="foo\n|}" output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] From b7c46a6dca5ed71326a7a8e9c3f7071a9297524b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 Oct 2014 20:44:57 -0500 Subject: [PATCH 29/44] Add tables to changelog. --- CHANGELOG | 1 + docs/changelog.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index b4b01d6..9c05482 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,7 @@ v0.4 (unreleased): - The parser is now distributed with Windows binaries, fixing an issue that prevented Windows users from using the C tokenizer. +- Added support for parsing wikicode tables. - Added a script to test for memory leaks in scripts/memtest.py. - Added a script to do releases in scripts/release.sh. - skip_style_tags can now be passed to mwparserfromhell.parse() (previously, diff --git a/docs/changelog.rst b/docs/changelog.rst index 9fdfef2..1854fa0 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,6 +9,7 @@ Unreleased - The parser is now distributed with Windows binaries, fixing an issue that prevented Windows users from using the C tokenizer. +- Added support for parsing wikicode tables. - Added a script to test for memory leaks in :file:`scripts/memtest.py`. 
- Added a script to do releases in :file:`scripts/release.sh`. - *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse() From bd85805f8fc693b8c4b2b32f700b74d4eb4e774b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 Oct 2014 20:49:16 -0500 Subject: [PATCH 30/44] Add integration tests for token roundtripping. --- tests/_test_tokenizer.py | 11 ++++++++--- tests/test_roundtripping.py | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) create mode 100644 tests/test_roundtripping.py diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index bfd4857..e44280b 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -25,8 +25,9 @@ import codecs from os import listdir, path import sys -from mwparserfromhell.compat import py3k +from mwparserfromhell.compat import py3k, str from mwparserfromhell.parser import tokens +from mwparserfromhell.parser.builder import Builder class _TestParseError(Exception): """Raised internally when a test could not be parsed.""" @@ -50,8 +51,12 @@ class TokenizerTestCase(object): *label* for the method's docstring. 
""" def inner(self): - expected = data["output"] - actual = self.tokenizer().tokenize(data["input"]) + if hasattr(self, "roundtrip"): + expected = data["input"] + actual = str(Builder().build(data["output"])) + else: + expected = data["output"] + actual = self.tokenizer().tokenize(data["input"]) self.assertEqual(expected, actual) if not py3k: inner.__name__ = funcname.encode("utf8") diff --git a/tests/test_roundtripping.py b/tests/test_roundtripping.py new file mode 100644 index 0000000..5360387 --- /dev/null +++ b/tests/test_roundtripping.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012-2014 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from __future__ import unicode_literals + +try: + import unittest2 as unittest +except ImportError: + import unittest + +from ._test_tokenizer import TokenizerTestCase + +class TestRoundtripping(TokenizerTestCase, unittest.TestCase): + """Test cases for roundtripping tokens back to wikitext.""" + + @classmethod + def setUpClass(cls): + cls.roundtrip = True + + +if __name__ == "__main__": + unittest.main(verbosity=2) From 7489253e3289dd821144e324f375d31039cc4a6f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 Oct 2014 21:45:17 -0500 Subject: [PATCH 31/44] Break at 80 cols for most lines. --- mwparserfromhell/parser/tokenizer.c | 64 ++++++++++++++++++++++-------------- mwparserfromhell/parser/tokenizer.py | 18 ++++++---- 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index bad72ef..ce46388 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2456,13 +2456,15 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) /* Emit a table tag. 
*/ -static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, - const char* tag, PyObject* style, PyObject* padding, - const char* close_open_markup, PyObject* contents, - const char* open_close_markup) +static int +Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, + const char* tag, PyObject* style, PyObject* padding, + const char* close_open_markup, PyObject* contents, + const char* open_close_markup) { - PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs, *close_open_markup_unicode, - *open_close_kwargs, *open_close_markup_unicode; + PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs, + *close_open_markup_unicode, *open_close_kwargs, + *open_close_markup_unicode; open_open_kwargs = PyDict_New(); if (!open_open_kwargs) @@ -2472,7 +2474,8 @@ static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_marku Py_DECREF(open_open_kwargs); goto fail_decref_all; } - PyDict_SetItemString(open_open_kwargs, "wiki_markup", open_open_markup_unicode); + PyDict_SetItemString(open_open_kwargs, "wiki_markup", + open_open_markup_unicode); Py_DECREF(open_open_markup_unicode); if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs)) goto fail_decref_all; @@ -2494,7 +2497,8 @@ static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_marku Py_DECREF(close_open_kwargs); goto fail_decref_padding_contents; } - PyDict_SetItemString(close_open_kwargs, "wiki_markup", close_open_markup_unicode); + PyDict_SetItemString(close_open_kwargs, "wiki_markup", + close_open_markup_unicode); Py_DECREF(close_open_markup_unicode); } PyDict_SetItemString(close_open_kwargs, "padding", padding); @@ -2516,7 +2520,8 @@ static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_marku Py_DECREF(open_close_kwargs); return -1; } - PyDict_SetItemString(open_close_kwargs, "wiki_markup", open_close_markup_unicode); + PyDict_SetItemString(open_close_kwargs, 
"wiki_markup", + open_close_markup_unicode); Py_DECREF(open_close_markup_unicode); if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs)) return -1; @@ -2538,8 +2543,9 @@ static int Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_marku /* Parse until ``end_token`` as style attributes for a table. */ -static PyObject* Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, - int break_on_table_end) +static PyObject* +Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, + int break_on_table_end) { TagData *data = TagData_new(); PyObject *padding, *trash; @@ -2655,7 +2661,8 @@ static int Tokenizer_handle_table_start(Tokenizer* self) self->head += 2; } - if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL, table, "|}")) + if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL, + table, "|}")) return -1; // offset displacement done by _parse() self->head--; @@ -2665,7 +2672,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self) /* Return the stack in order to handle the table end. */ -static PyObject * Tokenizer_handle_table_end(Tokenizer* self) +static PyObject* Tokenizer_handle_table_end(Tokenizer* self) { self->head += 2; return Tokenizer_pop(self); @@ -2720,7 +2727,8 @@ static int Tokenizer_handle_table_row(Tokenizer* self) return -1; } - if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, "")) + if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, + "")) return -1; // offset displacement done by _parse() self->head--; @@ -2739,8 +2747,9 @@ static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) Parse as normal syntax unless we hit a style marker, then parse style as HTML attributes and the remainder as normal syntax. 
*/ -static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, - const char *tag, uint64_t line_context) +static int +Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, + const char *tag, uint64_t line_context) { uint64_t old_context = self->topstack->context; uint64_t cell_context; @@ -2757,7 +2766,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, return 0; } - cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); + cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | + LC_TABLE_CELL_STYLE | line_context, 1); if (BAD_ROUTE) { trash = Tokenizer_pop(self); Py_XDECREF(trash); @@ -2772,7 +2782,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if (cell_context & LC_TABLE_CELL_STYLE) { Py_DECREF(cell); self->head = reset + strlen(markup); - if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context)) + if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | + line_context)) return -1; padding = Tokenizer_parse_as_table_style(self, '|', 0); if (BAD_ROUTE) { @@ -2790,7 +2801,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, } // Don't parse the style separator self->head++; - cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context, 1); + cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | + line_context, 1); if (BAD_ROUTE) { Py_DECREF(padding); Py_DECREF(style); @@ -2818,10 +2830,12 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if (style) { close_open_markup = "|"; } - if (Tokenizer_emit_table_tag(self, markup, tag, style, padding, close_open_markup, cell, "")) + if (Tokenizer_emit_table_tag(self, markup, tag, style, padding, + close_open_markup, cell, "")) return -1; // keep header/cell line contexts - self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE); + 
self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | + LC_TABLE_TD_LINE); // offset displacement done by parse() self->head--; return 0; @@ -2831,7 +2845,8 @@ static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, Returns the context, stack, and whether to reset the cell for style in a tuple. */ -static PyObject* Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) +static PyObject* +Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) { if (reset_for_style) self->topstack->context |= LC_TABLE_CELL_STYLE; @@ -2844,7 +2859,8 @@ static PyObject* Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_ Make sure we are not trying to write an invalid character. Return 0 if everything is safe, or -1 if the route must be failed. */ -static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) +static int +Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) { if (context & LC_FAIL_NEXT) return -1; @@ -2895,7 +2911,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE d } else if (context & LC_FAIL_ON_LBRACE) { if (data == '{' || (Tokenizer_READ_BACKWARDS(self, 1) == '{' && - Tokenizer_READ_BACKWARDS(self, 2) == '{')) { + Tokenizer_READ_BACKWARDS(self, 2) == '{')) { if (context & LC_TEMPLATE) self->topstack->context |= LC_FAIL_ON_EQUALS; else diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 527d364..ad4895e 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1010,7 +1010,8 @@ class Tokenizer(object): if style: self._emit_all(style) if close_open_markup: - self._emit(tokens.TagCloseOpen(wiki_markup=close_open_markup, padding=padding)) + self._emit(tokens.TagCloseOpen(wiki_markup=close_open_markup, + padding=padding)) else: self._emit(tokens.TagCloseOpen(padding=padding)) if contents: @@ -1130,7 +1131,8 @@ class Tokenizer(object): return 
try: - cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context | contexts.TABLE_CELL_STYLE) + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | + line_context | contexts.TABLE_CELL_STYLE) except BadRoute: self._head = reset self._pop() @@ -1140,7 +1142,8 @@ class Tokenizer(object): reset_for_style = cell_context & contexts.TABLE_CELL_STYLE if reset_for_style: self._head = reset + len(markup) - self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) + self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | + line_context) try: padding = self._parse_as_table_style("|") except BadRoute: @@ -1151,7 +1154,8 @@ class Tokenizer(object): # Don't parse the style separator self._head += 1 try: - cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) + cell = self._parse(contexts.TABLE_OPEN | + contexts.TABLE_CELL_OPEN | line_context) except BadRoute: self._head = reset ret = self._pop() @@ -1160,9 +1164,11 @@ class Tokenizer(object): self._context = old_context close_open_markup = "|" if reset_for_style else None - self._emit_table_tag(markup, tag, style, padding, close_open_markup, cell, "") + self._emit_table_tag(markup, tag, style, padding, close_open_markup, + cell, "") # keep header/cell line contexts - self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) + self._context |= cell_context & (contexts.TABLE_TH_LINE | + contexts.TABLE_TD_LINE) # offset displacement done by parse() self._head -= 1 From 92cf8f2c03a8b339baa9e5a31c18c80ce635b2fb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 15:13:53 -0500 Subject: [PATCH 32/44] Add a couple more tests involving templates. 
--- tests/tokenizer/tables.mwtest | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index b411045..4e4fe74 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -90,7 +90,7 @@ output: [Text(text="{| | border=\"1\"| test || red | foo")] --- name: crazy_no_table_close -label: lost of opened wiki syntax without closes +label: lots of opened wiki syntax without closes input: "{{{ {{ {| Date: Wed, 22 Oct 2014 15:38:13 -0500 Subject: [PATCH 33/44] Add a test for tokenizer line 1384. --- tests/tokenizer/tables.mwtest | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 4e4fe74..59ad934 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -369,6 +369,13 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_fir --- +name: inappropriate_marker_at_line_start +label: an inappropriate marker (a right bracket) at the start of a line in the table +input: "{|\n}" +output: [Text(text="{|\n}")] + +--- + name: recursion_five_hundred_opens label: test potentially dangerous recursion: five hundred table openings, without spaces input: "{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|" From 457355d4bf976986f3471a2e1de39e9762a5dac3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 18:52:58 -0500 Subject: [PATCH 34/44] Remove try/except that is impossible to fail inside of. 
--- mwparserfromhell/parser/tokenizer.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index ad4895e..9787c5f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1119,8 +1119,8 @@ class Tokenizer(object): return self._pop() def _handle_table_cell(self, markup, tag, line_context): - """Parse as normal syntax unless we hit a style marker, then parse style - as HTML attributes and the remainder as normal syntax.""" + """Parse as normal syntax unless we hit a style marker, then parse + style as HTML attributes and the remainder as normal syntax.""" old_context = self._context reset = self._head reset_for_style, padding, style = False, "", None @@ -1144,12 +1144,7 @@ class Tokenizer(object): self._head = reset + len(markup) self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) - try: - padding = self._parse_as_table_style("|") - except BadRoute: - self._head = reset - self._pop() - raise + padding = self._parse_as_table_style("|") style = self._pop() # Don't parse the style separator self._head += 1 From 5d29bff918ad80b150bfc51aa407019ff51229e2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 19:04:11 -0500 Subject: [PATCH 35/44] Remove an incorrect usage of Py_XDECREF(). 
--- mwparserfromhell/parser/tokenizer.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index ce46388..10a03a9 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -676,11 +676,8 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) RESET_ROUTE(); for (i = 0; i < braces; i++) text[i] = '{'; text[braces] = '\0'; - if (Tokenizer_emit_text_then_stack(self, text)) { - Py_XDECREF(text); + if (Tokenizer_emit_text_then_stack(self, text)) return -1; - } - Py_XDECREF(text); return 0; } else From 504b8bace08429e6a778f1fa69331cb5e849c043 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 19:22:50 -0500 Subject: [PATCH 36/44] Add test code for a missing branch of Tag.wiki_markup.setter; cleanup. --- mwparserfromhell/nodes/tag.py | 2 +- tests/test_tag.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index e9531e7..e3c7260 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -223,7 +223,7 @@ class Tag(Node): def wiki_markup(self, value): self._wiki_markup = str(value) if value else None if not value or not self.closing_wiki_markup: - self.closing_wiki_markup = str(value) if value else None + self._closing_wiki_markup = self._wiki_markup @self_closing.setter def self_closing(self, value): diff --git a/tests/test_tag.py b/tests/test_tag.py index b33b0c2..3beea98 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -246,6 +246,9 @@ class TestTag(TreeEqualityTestCase): node.closing_wiki_markup = "|}" self.assertEqual("|}", node.closing_wiki_markup) self.assertEqual("{|\n|}", node) + node.wiki_markup = "!!" + self.assertEqual("|}", node.closing_wiki_markup) + self.assertEqual("!!\n|}", node) node.wiki_markup = False self.assertFalse(node.closing_wiki_markup) self.assertEqual("\n
", node) From 913ff590c8e90f771e16e150b239147bd32f1c8d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 20:34:36 -0500 Subject: [PATCH 37/44] Cleanup; add a missing test. --- mwparserfromhell/parser/tokenizer.c | 6 ------ mwparserfromhell/parser/tokenizer.py | 2 +- tests/tokenizer/tags_wikimarkup.mwtest | 7 +++++++ 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 10a03a9..faed5d7 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2783,12 +2783,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, line_context)) return -1; padding = Tokenizer_parse_as_table_style(self, '|', 0); - if (BAD_ROUTE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); - self->head = reset; - return 0; - } if (!padding) return -1; style = Tokenizer_pop(self); diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9787c5f..dd5d6d9 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1325,7 +1325,7 @@ class Tokenizer(object): elif this in ("\n", ":") and self._context & contexts.DL_TERM: self._handle_dl_term() if this == "\n": - # kill potential table contexts + # Kill potential table contexts self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS # Start of table parsing elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index 04f617a..c709ba7 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -447,6 +447,13 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tag --- +name: dt_dd_mix4 +label: another example of correct dt/dd usage, with a trigger for a specific parse route +input: ";foo]:bar" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), 
TagCloseSelfclose(), Text(text="foo]"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")] + +--- + name: ul_ol_dt_dd_mix label: an assortment of uls, ols, dds, and dts input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo" From e1ebb59b9e1be3fe2ffd64c679e02983234d20ae Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 22 Oct 2014 22:59:42 -0500 Subject: [PATCH 38/44] Ensure token list is copied before being fed to the builder. --- tests/_test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py index e44280b..17d588b 100644 --- a/tests/_test_tokenizer.py +++ b/tests/_test_tokenizer.py @@ -53,7 +53,7 @@ class TokenizerTestCase(object): def inner(self): if hasattr(self, "roundtrip"): expected = data["input"] - actual = str(Builder().build(data["output"])) + actual = str(Builder().build(data["output"][:])) else: expected = data["output"] actual = self.tokenizer().tokenize(data["input"]) From 640005dbb2eb641572f9880aaa72c3c6347802f9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 23 Oct 2014 21:27:21 -0500 Subject: [PATCH 39/44] Tokenizer cleanup; make inline table syntax invalid as it should be. --- mwparserfromhell/parser/tokenizer.c | 56 ++++++++++++++--------------- mwparserfromhell/parser/tokenizer.py | 70 +++++++++++++++++------------------- 2 files changed, 61 insertions(+), 65 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index faed5d7..c53a420 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2423,34 +2423,6 @@ static int Tokenizer_handle_dl_term(Tokenizer* self) } /* - Handle the end of the stream of wikitext. 
-*/ -static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) -{ - PyObject *token, *text, *trash; - int single; - - if (context & AGG_FAIL) { - if (context & LC_TAG_BODY) { - token = PyList_GET_ITEM(self->topstack->stack, 1); - text = PyObject_GetAttrString(token, "text"); - if (!text) - return NULL; - single = IS_SINGLE(text); - Py_DECREF(text); - if (single) - return Tokenizer_handle_single_tag_end(self); - } - else if (context & AGG_DOUBLE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); - } - return Tokenizer_fail_route(self); - } - return Tokenizer_pop(self); -} - -/* Emit a table tag. */ static int @@ -2847,6 +2819,34 @@ Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) } /* + Handle the end of the stream of wikitext. +*/ +static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) +{ + PyObject *token, *text, *trash; + int single; + + if (context & AGG_FAIL) { + if (context & LC_TAG_BODY) { + token = PyList_GET_ITEM(self->topstack->stack, 1); + text = PyObject_GetAttrString(token, "text"); + if (!text) + return NULL; + single = IS_SINGLE(text); + Py_DECREF(text); + if (single) + return Tokenizer_handle_single_tag_end(self); + } + else if (context & AGG_DOUBLE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + } + return Tokenizer_fail_route(self); + } + return Tokenizer_pop(self); +} + +/* Make sure we are not trying to write an invalid character. Return 0 if everything is safe, or -1 if the route must be failed. 
*/ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index dd5d6d9..7921e7c 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -991,17 +991,6 @@ class Tokenizer(object): else: self._emit_text("\n") - def _handle_end(self): - """Handle the end of the stream of wikitext.""" - if self._context & contexts.FAIL: - if self._context & contexts.TAG_BODY: - if is_single(self._stack[1].text): - return self._handle_single_tag_end() - if self._context & contexts.DOUBLE: - self._pop() - self._fail_route() - return self._pop() - def _emit_table_tag(self, open_open_markup, tag, style, padding, close_open_markup, contents, open_close_markup): """Emit a table tag.""" @@ -1020,22 +1009,21 @@ class Tokenizer(object): self._emit_text(tag) self._emit(tokens.TagCloseClose()) - def _parse_as_table_style(self, end_token, break_on_table_end=False): + def _parse_as_table_style(self, end_token): """Parse until ``end_token`` as style attributes for a table.""" data = _TagOpenData() data.context = _TagOpenData.CX_ATTR_READY while True: - this, next = self._read(), self._read(1) - table_end = break_on_table_end and this == "|" and next == "}" + this = self._read() can_exit = (not data.context & data.CX_QUOTED or data.context & data.CX_NOTE_SPACE) - if (this == end_token and can_exit) or table_end: + if this == end_token and can_exit: if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): self._push_tag_buffer(data) if this.isspace(): data.padding_buffer["first"] += this return data.padding_buffer["first"] - elif this is self.END or table_end or this == end_token: + elif this is self.END or this == end_token: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die @@ -1052,31 +1040,27 @@ class Tokenizer(object): def _handle_table_start(self): """Handle the start of a table.""" reset = self._head + 1 - style, table = None, None self._head += 2 
self._push(contexts.TABLE_OPEN) try: - padding = self._parse_as_table_style("\n", break_on_table_end=True) + padding = self._parse_as_table_style("\n") except BadRoute: self._head = reset self._emit_text("{|") return style = self._pop() - # continue to parse if it is NOT an inline table - if "\n" in padding: - self._head += 1 - try: - table = self._parse(contexts.TABLE_OPEN) - except BadRoute: - self._head = reset - self._emit_text("{|") - return - else: - # close tag - self._head += 2 + + self._head += 1 + try: + table = self._parse(contexts.TABLE_OPEN) + except BadRoute: + self._head = reset + self._emit_text("{|") + return + self._emit_table_tag("{|", "table", style, padding, None, table, "|}") - # offset displacement done by _parse() + # Offset displacement done by _parse(): self._head -= 1 def _handle_table_end(self): @@ -1087,7 +1071,6 @@ class Tokenizer(object): def _handle_table_row(self): """Parse as style until end of the line, then continue.""" reset = self._head - style, padding = None, "" self._head += 2 if not self._can_recurse(): self._emit_text("|-") @@ -1102,7 +1085,8 @@ class Tokenizer(object): self._pop() raise style = self._pop() - # don't parse the style separator + + # Don't parse the style separator: self._head += 1 try: row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) @@ -1110,8 +1094,9 @@ class Tokenizer(object): self._head = reset self._pop() raise + self._emit_table_tag("|-", "tr", style, padding, None, row, "") - # offset displacement done by parse() + # Offset displacement done by parse(): self._head -= 1 def _handle_table_row_end(self): @@ -1146,7 +1131,7 @@ class Tokenizer(object): line_context) padding = self._parse_as_table_style("|") style = self._pop() - # Don't parse the style separator + # Don't parse the style separator: self._head += 1 try: cell = self._parse(contexts.TABLE_OPEN | @@ -1161,10 +1146,10 @@ class Tokenizer(object): close_open_markup = "|" if reset_for_style else None self._emit_table_tag(markup, 
tag, style, padding, close_open_markup, cell, "") - # keep header/cell line contexts + # Keep header/cell line contexts: self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE) - # offset displacement done by parse() + # Offset displacement done by parse(): self._head -= 1 def _handle_table_cell_end(self, reset_for_style=False): @@ -1176,6 +1161,17 @@ class Tokenizer(object): self._context &= ~contexts.TABLE_CELL_STYLE return self._pop(keep_context=True) + def _handle_end(self): + """Handle the end of the stream of wikitext.""" + if self._context & contexts.FAIL: + if self._context & contexts.TAG_BODY: + if is_single(self._stack[1].text): + return self._handle_single_tag_end() + if self._context & contexts.DOUBLE: + self._pop() + self._fail_route() + return self._pop() + def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" context = self._context From 4d4045902d1b56369c962a79a8e6a95e09a068c5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 23 Oct 2014 21:27:55 -0500 Subject: [PATCH 40/44] Update table tests to reflect new grammar. 
--- tests/tokenizer/tables.mwtest | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index 59ad934..e042467 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -6,9 +6,9 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding --- name: inline_table -label: correctly handle tables with close on the same line +label: tables with a close on the same line are not valid input: "{||}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=""), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [Text(text="{||}")] --- @@ -127,7 +127,7 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding name: characters_after_inline_table label: handle characters after an inline table close input: "{| |} tsta" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" tsta")] +output: [Text(text="{| |} tsta")] --- @@ -342,9 +342,9 @@ output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_fir --- name: inline_table_attributes -label: correctly handle attributes in inline tables +label: handle attributes in inline tables input: "{| foo="tee bar" |}" -output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"),TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="tee bar"), TagCloseOpen(padding=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] +output: [Text(text='{| foo="tee bar" |}')] --- @@ -376,14 +376,28 @@ output: [Text(text="{|\n}")] --- -name: recursion_five_hundred_opens -label: test potentially dangerous recursion: five hundred table openings, without spaces -input: 
"{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|" -output: [Text(text="{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|{|")] +name: fake_close_near_start +label: a fake closing token at the end of the first line in the table +input: "{| class="wikitable" style="text-align: center; width=100%;|}\n|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"text-align:"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="center;"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="width"), TagAttrEquals(), Text(text="100%;|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), 
TagCloseClose()] --- -name: recursion_one_hundred_opens -label: test potentially dangerous recursion: one hundred table openings, with spaces -input: "{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|" -output: [Text(text="{| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {| {|")] +name: fake_close_near_start_2 +label: a fake closing token at the end of the first line in the table +input: "{| class="wikitable|}"\n|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: junk_after_table_start +label: ignore more junk on the first line of the table +input: "{| class="wikitable" | foobar\n|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="foobar"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), 
Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: junk_after_table_row +label: ignore junk on the first line of a table row +input: "{|\n|- foo="bar" | baz\n|blerp\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="bar"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="baz"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="blerp\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] From fb261450d8fa0d3e666fe48a000a6afd6694c89a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 23 Oct 2014 21:40:50 -0500 Subject: [PATCH 41/44] Port tokenizer updates to C. --- mwparserfromhell/parser/tokenizer.c | 80 ++++++++++++++----------------------- 1 file changed, 31 insertions(+), 49 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c53a420..1b68b46 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2513,13 +2513,12 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, Parse until ``end_token`` as style attributes for a table. 
*/ static PyObject* -Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, - int break_on_table_end) +Tokenizer_parse_as_table_style(Tokenizer* self, char end_token) { TagData *data = TagData_new(); PyObject *padding, *trash; - Py_UNICODE this, next; - int can_exit, table_end; + Py_UNICODE this; + int can_exit; if (!data) return NULL; @@ -2527,10 +2526,8 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, while (1) { this = Tokenizer_READ(self, 0); - next = Tokenizer_READ(self, 1); can_exit = (!(data->context & TAG_QUOTED) || data->context & TAG_NOTE_SPACE); - table_end = (break_on_table_end && this == '|' && next == '}'); - if ((this == end_token && can_exit) || table_end) { + if (this == end_token && can_exit) { if (data->context & (TAG_ATTR_NAME | TAG_ATTR_VALUE)) { if (Tokenizer_push_tag_buffer(self, data)) { TagData_dealloc(data); @@ -2545,7 +2542,7 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, return NULL; return padding; } - else if (!this || table_end || this == end_token) { + else if (!this || this == end_token) { if (self->topstack->context & LC_TAG_ATTR) { if (data->context & TAG_QUOTED) { // Unclosed attribute quote: reset, don't die @@ -2577,13 +2574,13 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token, static int Tokenizer_handle_table_start(Tokenizer* self) { Py_ssize_t reset = self->head + 1; - PyObject *style, *padding, *newline_character; + PyObject *style, *padding; PyObject *table = NULL; self->head += 2; if(Tokenizer_push(self, LC_TABLE_OPEN)) return -1; - padding = Tokenizer_parse_as_table_style(self, '\n', 1); + padding = Tokenizer_parse_as_table_style(self, '\n'); if (BAD_ROUTE) { RESET_ROUTE(); self->head = reset; @@ -2599,41 +2596,27 @@ static int Tokenizer_handle_table_start(Tokenizer* self) return -1; } - newline_character = PyUnicode_FromString("\n"); - if (!newline_character) { + self->head++; + table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); + if (BAD_ROUTE) { + 
RESET_ROUTE(); Py_DECREF(padding); Py_DECREF(style); - return -1; - } - // continue to parse if it is NOT an inline table - if (PyUnicode_Contains(padding, newline_character)) { - Py_DECREF(newline_character); - self->head++; - table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); - if (BAD_ROUTE) { - Py_DECREF(padding); - Py_DECREF(style); - RESET_ROUTE(); - self->head = reset; - if (Tokenizer_emit_text(self, "{|")) - return -1; - return 0; - } - if (!table) { - Py_DECREF(padding); - Py_DECREF(style); + self->head = reset; + if (Tokenizer_emit_text(self, "{|")) return -1; - } - } else { - Py_DECREF(newline_character); - // close tag - self->head += 2; + return 0; + } + if (!table) { + Py_DECREF(padding); + Py_DECREF(style); + return -1; } if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL, table, "|}")) return -1; - // offset displacement done by _parse() + // Offset displacement done by _parse() self->head--; return 0; } @@ -2665,7 +2648,7 @@ static int Tokenizer_handle_table_row(Tokenizer* self) if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) return -1; - padding = Tokenizer_parse_as_table_style(self, '\n', 0); + padding = Tokenizer_parse_as_table_style(self, '\n'); if (BAD_ROUTE) { trash = Tokenizer_pop(self); Py_XDECREF(trash); @@ -2679,7 +2662,8 @@ static int Tokenizer_handle_table_row(Tokenizer* self) Py_DECREF(padding); return -1; } - // don't parse the style separator + + // Don't parse the style separator self->head++; row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); if (BAD_ROUTE) { @@ -2696,10 +2680,9 @@ static int Tokenizer_handle_table_row(Tokenizer* self) return -1; } - if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, - "")) + if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, "")) return -1; - // offset displacement done by _parse() + // Offset displacement done by _parse() self->head--; return 0; } @@ -2754,7 +2737,7 @@ 
Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context)) return -1; - padding = Tokenizer_parse_as_table_style(self, '|', 0); + padding = Tokenizer_parse_as_table_style(self, '|'); if (!padding) return -1; style = Tokenizer_pop(self); @@ -2796,10 +2779,9 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if (Tokenizer_emit_table_tag(self, markup, tag, style, padding, close_open_markup, cell, "")) return -1; - // keep header/cell line contexts - self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | - LC_TABLE_TD_LINE); - // offset displacement done by parse() + // Keep header/cell line contexts + self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE); + // Offset displacement done by parse() self->head--; return 0; } @@ -3092,7 +3074,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) { if (Tokenizer_handle_dl_term(self)) return NULL; - // kill potential table contexts + // Kill potential table contexts if (this == '\n') self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; } @@ -3130,7 +3112,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) else if (this == '|' && this_context & LC_TABLE_CELL_STYLE) { return Tokenizer_handle_table_cell_end(self, 1); } - // on newline, clear out cell line contexts + // On newline, clear out cell line contexts else if (this == '\n' && this_context & LC_TABLE_CELL_LINE_CONTEXTS) { self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; if (Tokenizer_emit_char(self, this)) From 8480381a31b5da4571e32f75a18f9f15e03d770c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 23 Oct 2014 21:53:55 -0500 Subject: [PATCH 42/44] Credit for table parsing code. 
[skip ci] --- CHANGELOG | 2 +- docs/changelog.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 9c05482..3471531 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,7 +2,7 @@ v0.4 (unreleased): - The parser is now distributed with Windows binaries, fixing an issue that prevented Windows users from using the C tokenizer. -- Added support for parsing wikicode tables. +- Added support for parsing wikicode tables (patches by David Winegar). - Added a script to test for memory leaks in scripts/memtest.py. - Added a script to do releases in scripts/release.sh. - skip_style_tags can now be passed to mwparserfromhell.parse() (previously, diff --git a/docs/changelog.rst b/docs/changelog.rst index 1854fa0..b3e7548 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,7 +9,7 @@ Unreleased - The parser is now distributed with Windows binaries, fixing an issue that prevented Windows users from using the C tokenizer. -- Added support for parsing wikicode tables. +- Added support for parsing wikicode tables (patches by David Winegar). - Added a script to test for memory leaks in :file:`scripts/memtest.py`. - Added a script to do releases in :file:`scripts/release.sh`. - *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse() From 9fc4b909e150cd786e97caf7daeb479733e5330e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 24 Oct 2014 03:40:37 -0500 Subject: [PATCH 43/44] Refactor a lot of table error recovery code. 
--- mwparserfromhell/parser/contexts.py | 4 +- mwparserfromhell/parser/tokenizer.c | 100 +++++++++++++++-------------------- mwparserfromhell/parser/tokenizer.h | 2 +- mwparserfromhell/parser/tokenizer.py | 82 ++++++++++++---------------- tests/tokenizer/tables.mwtest | 7 +++ 5 files changed, 87 insertions(+), 108 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index ef44ce2..17912cb 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -171,7 +171,7 @@ TABLE_ROW_OPEN = 1 << 33 TABLE_TD_LINE = 1 << 34 TABLE_TH_LINE = 1 << 35 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE -TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + + TABLE_ROW_OPEN + +TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + TABLE_TD_LINE + TABLE_TH_LINE) # Global contexts: @@ -184,6 +184,6 @@ FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE) UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) -DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE +DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 1b68b46..301ecfc 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2510,10 +2510,9 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, } /* - Parse until ``end_token`` as style attributes for a table. + Handle style attributes for a table until an ending token. 
*/ -static PyObject* -Tokenizer_parse_as_table_style(Tokenizer* self, char end_token) +static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token) { TagData *data = TagData_new(); PyObject *padding, *trash; @@ -2569,9 +2568,9 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token) } /* - Handle the start of a table. + Parse a wikicode table by starting with the first line. */ -static int Tokenizer_handle_table_start(Tokenizer* self) +static int Tokenizer_parse_table(Tokenizer* self) { Py_ssize_t reset = self->head + 1; PyObject *style, *padding; @@ -2580,7 +2579,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self) if(Tokenizer_push(self, LC_TABLE_OPEN)) return -1; - padding = Tokenizer_parse_as_table_style(self, '\n'); + padding = Tokenizer_handle_table_style(self, '\n'); if (BAD_ROUTE) { RESET_ROUTE(); self->head = reset; @@ -2622,20 +2621,10 @@ static int Tokenizer_handle_table_start(Tokenizer* self) } /* - Return the stack in order to handle the table end. -*/ -static PyObject* Tokenizer_handle_table_end(Tokenizer* self) -{ - self->head += 2; - return Tokenizer_pop(self); -} - -/* Parse as style until end of the line, then continue. 
*/ static int Tokenizer_handle_table_row(Tokenizer* self) { - Py_ssize_t reset = self->head; PyObject *padding, *style, *row, *trash; self->head += 2; @@ -2648,11 +2637,10 @@ static int Tokenizer_handle_table_row(Tokenizer* self) if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) return -1; - padding = Tokenizer_parse_as_table_style(self, '\n'); + padding = Tokenizer_handle_table_style(self, '\n'); if (BAD_ROUTE) { trash = Tokenizer_pop(self); Py_XDECREF(trash); - self->head = reset; return 0; } if (!padding) @@ -2666,14 +2654,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self) // Don't parse the style separator self->head++; row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); - if (BAD_ROUTE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); - Py_DECREF(padding); - Py_DECREF(style); - self->head = reset; - return 0; - } if (!row) { Py_DECREF(padding); Py_DECREF(style); @@ -2688,14 +2668,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self) } /* - Return the stack in order to handle the table row end. -*/ -static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) -{ - return Tokenizer_pop(self); -} - -/* Parse as normal syntax unless we hit a style marker, then parse style as HTML attributes and the remainder as normal syntax. 
*/ @@ -2705,11 +2677,10 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, { uint64_t old_context = self->topstack->context; uint64_t cell_context; - Py_ssize_t reset = self->head; - PyObject *padding, *cell, *trash; - PyObject *style = NULL; + PyObject *padding, *cell, *style = NULL; const char *close_open_markup = NULL; self->head += strlen(markup); + Py_ssize_t reset = self->head; if (!Tokenizer_CAN_RECURSE(self)) { if (Tokenizer_emit_text(self, markup)) @@ -2720,12 +2691,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1); - if (BAD_ROUTE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); - self->head = reset; - return 0; - } if (!cell) return -1; cell_context = self->topstack->context; @@ -2733,11 +2698,11 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if (cell_context & LC_TABLE_CELL_STYLE) { Py_DECREF(cell); - self->head = reset + strlen(markup); + self->head = reset; if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context)) return -1; - padding = Tokenizer_parse_as_table_style(self, '|'); + padding = Tokenizer_handle_table_style(self, '|'); if (!padding) return -1; style = Tokenizer_pop(self); @@ -2749,14 +2714,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, self->head++; cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context, 1); - if (BAD_ROUTE) { - Py_DECREF(padding); - Py_DECREF(style); - trash = Tokenizer_pop(self); - Py_XDECREF(trash); - self->head = reset; - return 0; - } if (!cell) { Py_DECREF(padding); Py_DECREF(style); @@ -2801,6 +2758,23 @@ Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) } /* + Return the stack in order to handle the table row end. 
+*/ +static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) +{ + return Tokenizer_pop(self); +} + +/* + Return the stack in order to handle the table end. +*/ +static PyObject* Tokenizer_handle_table_end(Tokenizer* self) +{ + self->head += 2; + return Tokenizer_pop(self); +} + +/* Handle the end of the stream of wikitext. */ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) @@ -2819,9 +2793,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) if (single) return Tokenizer_handle_single_tag_end(self); } - else if (context & AGG_DOUBLE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); + else { + if (context & LC_TABLE_CELL_OPEN) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + context = self->topstack->context; + } + if (context & AGG_DOUBLE) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + } } return Tokenizer_fail_route(self); } @@ -3082,7 +3063,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) // Start of table parsing else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) { if (Tokenizer_CAN_RECURSE(self)) { - if (Tokenizer_handle_table_start(self)) + if (Tokenizer_parse_table(self)) return NULL; } else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next)) @@ -3197,7 +3178,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) self->skip_style_tags = skip_style_tags; tokens = Tokenizer_parse(self, context, 1); - if (!tokens && !PyErr_Occurred()) { + if ((!tokens && !PyErr_Occurred()) || self->topstack) { if (!ParserError) { if (load_exceptions()) return NULL; @@ -3206,6 +3187,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) RESET_ROUTE(); PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); } + else if (self->topstack) + PyErr_SetString(ParserError, + "C tokenizer exited with non-empty token stack"); else PyErr_SetString(ParserError, "C tokenizer exited 
unexpectedly"); return NULL; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 8d2d428..33ba0e1 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -175,7 +175,7 @@ static PyObject* TagCloseClose; #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) -#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN) #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) #define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 7921e7c..3ac25a5 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1009,8 +1009,8 @@ class Tokenizer(object): self._emit_text(tag) self._emit(tokens.TagCloseClose()) - def _parse_as_table_style(self, end_token): - """Parse until ``end_token`` as style attributes for a table.""" + def _handle_table_style(self, end_token): + """Handle style attributes for a table until ``end_token``.""" data = _TagOpenData() data.context = _TagOpenData.CX_ATTR_READY while True: @@ -1037,14 +1037,13 @@ class Tokenizer(object): self._handle_tag_data(data, this) self._head += 1 - def _handle_table_start(self): - """Handle the start of a table.""" + def _parse_table(self): + """Parse a wikicode table by starting with the first line.""" reset = self._head + 1 self._head += 2 - self._push(contexts.TABLE_OPEN) try: - padding = self._parse_as_table_style("\n") + padding = self._handle_table_style("\n") except BadRoute: self._head = reset self._emit_text("{|") @@ -1063,14 +1062,8 @@ class 
Tokenizer(object): # Offset displacement done by _parse(): self._head -= 1 - def _handle_table_end(self): - """Return the stack in order to handle the table end.""" - self._head += 2 - return self._pop() - def _handle_table_row(self): """Parse as style until end of the line, then continue.""" - reset = self._head self._head += 2 if not self._can_recurse(): self._emit_text("|-") @@ -1079,67 +1072,47 @@ class Tokenizer(object): self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) try: - padding = self._parse_as_table_style("\n") + padding = self._handle_table_style("\n") except BadRoute: - self._head = reset self._pop() raise style = self._pop() # Don't parse the style separator: self._head += 1 - try: - row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) - except BadRoute: - self._head = reset - self._pop() - raise + row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) self._emit_table_tag("|-", "tr", style, padding, None, row, "") # Offset displacement done by parse(): self._head -= 1 - def _handle_table_row_end(self): - """Return the stack in order to handle the table row end.""" - return self._pop() - def _handle_table_cell(self, markup, tag, line_context): """Parse as normal syntax unless we hit a style marker, then parse style as HTML attributes and the remainder as normal syntax.""" old_context = self._context - reset = self._head - reset_for_style, padding, style = False, "", None + padding, style = "", None self._head += len(markup) + reset = self._head if not self._can_recurse(): self._emit_text(markup) self._head -= 1 return - try: - cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | - line_context | contexts.TABLE_CELL_STYLE) - except BadRoute: - self._head = reset - self._pop() - raise + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | + line_context | contexts.TABLE_CELL_STYLE) cell_context = self._context self._context = old_context reset_for_style = cell_context & 
contexts.TABLE_CELL_STYLE if reset_for_style: - self._head = reset + len(markup) + self._head = reset self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | line_context) - padding = self._parse_as_table_style("|") + padding = self._handle_table_style("|") style = self._pop() # Don't parse the style separator: self._head += 1 - try: - cell = self._parse(contexts.TABLE_OPEN | - contexts.TABLE_CELL_OPEN | line_context) - except BadRoute: - self._head = reset - ret = self._pop() - raise + cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN | + line_context) cell_context = self._context self._context = old_context @@ -1161,12 +1134,23 @@ class Tokenizer(object): self._context &= ~contexts.TABLE_CELL_STYLE return self._pop(keep_context=True) + def _handle_table_row_end(self): + """Return the stack in order to handle the table row end.""" + return self._pop() + + def _handle_table_end(self): + """Return the stack in order to handle the table end.""" + self._head += 2 + return self._pop() + def _handle_end(self): """Handle the end of the stream of wikitext.""" if self._context & contexts.FAIL: if self._context & contexts.TAG_BODY: if is_single(self._stack[1].text): return self._handle_single_tag_end() + if self._context & contexts.TABLE_CELL_OPEN: + self._pop() if self._context & contexts.DOUBLE: self._pop() self._fail_route() @@ -1327,19 +1311,19 @@ class Tokenizer(object): elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): if self._can_recurse(): - self._handle_table_start() + self._parse_table() else: self._emit_text("{|") elif self._context & contexts.TABLE_OPEN: - if this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE: + if this == next == "|" and self._context & contexts.TABLE_TD_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE) - elif 
this == "|" and next == "|" and self._context & contexts.TABLE_TH_LINE: + elif this == next == "|" and self._context & contexts.TABLE_TH_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE) - elif this == "!" and next == "!" and self._context & contexts.TABLE_TH_LINE: + elif this == next == "!" and self._context & contexts.TABLE_TH_LINE: if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE) @@ -1387,6 +1371,10 @@ class Tokenizer(object): self._text = [segment for segment in split if segment] self._head = self._global = self._depth = self._cycles = 0 try: - return self._parse(context) + tokens = self._parse(context) except BadRoute: # pragma: no cover (untestable/exceptional case) raise ParserError("Python tokenizer exited with BadRoute") + if self._stacks: # pragma: no cover (untestable/exceptional case) + err = "Python tokenizer exited with non-empty token stack" + raise ParserError(err) + return tokens diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest index e042467..16012cf 100644 --- a/tests/tokenizer/tables.mwtest +++ b/tests/tokenizer/tables.mwtest @@ -61,6 +61,13 @@ output: [Text(text="{| \n|- \n ")] --- +name: no_table_close_row_and_cell +label: no table close while inside a cell inside a row +input: "{| \n|- \n|" +output: [Text(text="{| \n|- \n|")] + +--- + name: no_table_close_attributes label: don't parse attributes as attributes if the table doesn't exist input: "{| border="1"" From a15f6172c09ee22aae4899547975eec4b2b0ced3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 24 Oct 2014 03:43:22 -0500 Subject: [PATCH 44/44] Minor bugfix. 
--- mwparserfromhell/parser/tokenizer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 301ecfc..38e3a4c 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2677,10 +2677,12 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, { uint64_t old_context = self->topstack->context; uint64_t cell_context; + Py_ssize_t reset; PyObject *padding, *cell, *style = NULL; const char *close_open_markup = NULL; + self->head += strlen(markup); - Py_ssize_t reset = self->head; + reset = self->head; if (!Tokenizer_CAN_RECURSE(self)) { if (Tokenizer_emit_text(self, markup))