From a8d2983161e422e27e0de8c1261b196e7a79363b Mon Sep 17 00:00:00 2001 From: David Winegar Date: Mon, 14 Jul 2014 10:37:36 -0700 Subject: [PATCH] Started table parsing in PyTokenizer Started parsing tables in the Python tokenizer and added initial table support. This is a big commit (ugh) and it should probably be split up into multiple smaller ones if possible, but that seems unworkable as of right now because of all the dependencies. Also breaks tests of CTokenizer (double ugh) because I haven't started table support there. May want to pick through this commit line by line later, but I need to save my work for now. --- mwparserfromhell/definitions.py | 2 +- mwparserfromhell/parser/contexts.py | 8 +++- mwparserfromhell/parser/tokenizer.py | 76 +++++++++++++++++++++++++++++++++--- tests/tokenizer/tables.mwtest | 32 +++++++++++++++ 4 files changed, 111 insertions(+), 7 deletions(-) create mode 100644 tests/tokenizer/tables.mwtest diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index 6020ad1..af41f49 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -52,7 +52,7 @@ INVISIBLE_TAGS = [ # [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] -SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] +SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"] MARKUP_TO_HTML = { "#": "li", diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index f568fac..678a392 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -155,13 +155,19 @@ FAIL_ON_EQUALS = 1 << 29 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) +TABLE_OPEN = 1 << 30 +TABLE_CELL_LINE = 1 << 31 +TABLE_HEADER_LINE = 1 << 32 +TABLE_CELL_OPEN = 1 << 33 +TABLE_CELL_STYLE_POSSIBLE = 1 << 34 + # Global contexts: GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + 
EXT_LINK_TITLE + HEADING + TAG + STYLE +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE + TABLE_OPEN UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 073e64c..70e2d5d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1002,6 +1002,39 @@ class Tokenizer(object): self._fail_route() return self._pop() + def _handle_table_start(self): + """Handle the start of a table.""" + # TODO - fail all other contexts on start? + self._head += 2 + reset = self._head - 1 + try: + table = self._parse(contexts.TABLE_OPEN) + except BadRoute: + self._head = reset + self._emit_text("{|") + else: + self._emit_style_tag("table", "{|", table) + + def _handle_table_end(self): + self._head += 2 + return self._pop() + + def _handle_table_row(self): + self._head += 2 + self._emit(tokens.TagOpenOpen(wiki_markup="{-")) + self._emit_text("tr") + self._emit(tokens.TagCloseSelfclose()) + self._context &= ~contexts.TABLE_CELL_OPEN + + def _handle_table_cell(self): + pass + + def _handle_header_cell(self): + pass + + def _handle_cell_style(self): + pass + def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" context = self._context @@ -1144,15 +1177,48 @@ class Tokenizer(object): result = self._parse_style() if result is not None: return result - elif self._read(-1) in ("\n", self.START): - if this in ("#", "*", ";", ":"): + elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"): self._handle_list() - elif this == next == self._read(2) == self._read(3) == "-": + elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-": self._handle_hr() - else: - self._emit_text(this) elif this in ("\n", ":") and self._context & contexts.DL_TERM: 
self._handle_dl_term() + + elif (this == "{" and next == "|" and (self._read(-1) in ("\n", self.START)) or + (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + if self._can_recurse(): + self._handle_table_start() + else: + self._emit_text("{|") + elif self._context & contexts.TABLE_OPEN: + if this == "|" and next == "}": + return self._handle_table_end() + elif this == "|" and next == "|" and self._context & contexts.TABLE_CELL_LINE: + self._handle_table_cell() + elif this == "|" and next == "|" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_header_cell() + elif this == "!" and next == "!" and self._context & contexts.TABLE_HEADER_LINE: + self._handle_header_cell() + elif this == "|" and self._context & contexts.TABLE_CELL_STYLE_POSSIBLE: + self._handle_cell_style() + # on newline, clear out cell line contexts + elif this == "\n" and self._context & (contexts.TABLE_CELL_LINE | contexts.TABLE_HEADER_LINE | contexts.TABLE_CELL_STYLE_POSSIBLE): + self._context &= (~contexts.TABLE_CELL_LINE & ~contexts.TABLE_HEADER_LINE & ~contexts.TABLE_CELL_STYLE_POSSIBLE) + self._emit_text(this) + # newline or whitespace/newline + elif (self._read(-1) in ("\n", self.START) or + (self._read(-2) in ("\n", self.START) and self._read(-1).strip() == "")): + if this == "|" and next == "-": + self._handle_table_row() + elif this == "|" and self._can_recurse(): + self._handle_table_cell() + elif this == "!" and self._can_recurse(): + self._handle_header_cell() + else: + self._emit_text(this) + else: + self._emit_text(this) + else: self._emit_text(this) self._head += 1 diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest new file mode 100644 index 0000000..399f7fd --- /dev/null +++ b/tests/tokenizer/tables.mwtest @@ -0,0 +1,32 @@ +name: empty_table +label: Parsing an empty table. 
+input: "{|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: inline_table +label: Correctly handle tables with close on the same line. +input: "{||}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: no_table_close_simple +label: Handle case when there is no table close. +input: "{| " +output: [Text(text="{| ")] + +--- + +name: leading_whitespace_table +label: Handle leading whitespace for a table. +input: "foo \n \t {|\n|}" +output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(), Text(text="\n"), TagOpenClose(), Text(text="table"), TagCloseClose()] + +--- + +name: leading_characters_table +label: Don't parse as a table when leading characters are not newline or whitespace. +input: "foo \n foo \t {|\n|}" +output: [Text(text="foo \n foo \t {|\n|}")]