From b24ddaea1020df3ba0a81413feed981cf34267d8 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 9 Jul 2013 22:23:06 -0400 Subject: [PATCH] Tokenizer support for implicitly self-closing tags. --- mwparserfromhell/parser/tokenizer.py | 35 +++++++++++++++++++++++++---------- tests/tokenizer/tags.mwtest | 7 +++++++ 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 7247148..308852d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,7 +26,7 @@ import re from . import contexts, tokens from ..compat import htmlentities -from ..tag_defs import is_parsable +from ..tag_defs import is_parsable, is_single, is_single_only __all__ = ["Tokenizer"] @@ -596,6 +596,29 @@ class Tokenizer(object): self._emit(tokens.TagCloseClose()) return self._pop() + def _handle_single_end(self): + """Handle the stream end when inside a single-supporting HTML tag.""" + gen = enumerate(self._stack) + index = next(i for i, t in gen if isinstance(t, tokens.TagCloseOpen)) + padding = self._stack[index].padding + token = tokens.TagCloseSelfclose(padding=padding, implicit=True) + self._stack[index] = token + return self._pop() + + def _handle_end(self): + """Handle the end of the stream of wikitext.""" + fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | + contexts.HEADING | contexts.COMMENT | contexts.TAG) + double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) + if self._context & fail: + if self._context & contexts.TAG_BODY: + if is_single(self._stack[1].text): + return self._handle_single_end() + if self._context & double_fail: + self._pop() + self._fail_route() + return self._pop() + def _verify_safe(self, this): """Make sure we are not trying to write an invalid character.""" context = self._context @@ -658,10 +681,6 @@ class Tokenizer(object): unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | 
contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | contexts.TAG_CLOSE) - fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | - contexts.HEADING | contexts.COMMENT | contexts.TAG) - double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) - if push: self._push(context) while True: @@ -676,11 +695,7 @@ class Tokenizer(object): self._head += 1 continue if this is self.END: - if self._context & fail: - if self._context & double_fail: - self._pop() - self._fail_route() - return self._pop() + return self._handle_end() next = self._read(1) if self._context & contexts.COMMENT: if this == next == "-" and self._read(2) == ">": diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index a07f6c5..6dd67ff 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -520,3 +520,10 @@ name: single_only_close label: a tag that can only be single; just a close tag input: "foo
bar{{baz}}" output: [Text(text="foo"), TagOpenOpen(showtag=True, invalid=True), Text(text="br"), TagCloseSelfclose(padding=""), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()] + +--- + +name: single_only_double +label: a tag that can only be single; a tag with slashes at the beginning and end +input: "foo
bar{{baz}}" +output: [Text(text="foo"), TagOpenOpen(showtag=True, invalid=True), Text(text="br"), TagCloseSelfclose(padding="", implicit=True), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()]