diff --git a/CHANGELOG b/CHANGELOG index bebacbf..b52a70f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -16,6 +16,8 @@ v0.5 (unreleased): on incompletely-constructed StringMixIn subclasses). - Fixed Wikicode.matches()'s behavior on iterables besides lists and tuples. - Fixed len() sometimes raising ValueError on empty node lists. +- Fixed a rare parsing bug involving self-closing tags inside the attributes of + unpaired tags. - Fixed release script after changes to PyPI. v0.4.4 (released December 30, 2016): diff --git a/docs/changelog.rst b/docs/changelog.rst index c558579..b02437f 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -27,6 +27,8 @@ Unreleased - Fixed :meth:`.Wikicode.matches`\ 's behavior on iterables besides lists and tuples. - Fixed ``len()`` sometimes raising ``ValueError`` on empty node lists. +- Fixed a rare parsing bug involving self-closing tags inside the attributes of + unpaired tags. - Fixed release script after changes to PyPI. v0.4.4 diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index f8e52ec..90ee19d 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -1548,6 +1548,14 @@ static PyObject* Tokenizer_handle_single_tag_end(Tokenizer* self) if (depth == 0) break; } + is_instance = PyObject_IsInstance(token, TagCloseSelfclose); + if (is_instance == -1) + return NULL; + else if (is_instance == 1) { + depth--; + if (depth == 0) // Should never happen + return NULL; + } } if (!token || depth > 0) return NULL; diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b3e5883..d7a0282 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -819,6 +819,12 @@ class Tokenizer(object): depth -= 1 if depth == 0: break + elif isinstance(token, tokens.TagCloseSelfclose): + depth -= 1 + if depth == 0: # pragma: no cover (untestable/exceptional) + 
raise ParserError( + "_handle_single_tag_end() got an unexpected " + "TagCloseSelfclose") else: # pragma: no cover (untestable/exceptional case) raise ParserError("_handle_single_tag_end() missed a TagCloseOpen") padding = stack[index].padding diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 3c07ac9..40815a6 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -646,3 +646,10 @@ name: non_ascii_full label: an open/close tag pair containing non-ASCII characters input: "<éxamplé></éxamplé>" output: [TagOpenOpen(), Text(text="éxamplé"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="éxamplé"), TagCloseClose()] + +--- + +name: single_nested_selfclosing +label: a single (unpaired) tag with a self-closing tag in the middle (see issue #147) +input: "<li a <br/> c>foobar" +output: [TagOpenOpen(), Text(text="li"), TagAttrStart(pad_first=" ", pad_after_eq="", pad_before_eq=" "), Text(text="a"), TagAttrStart(pad_first="", pad_after_eq="", pad_before_eq=" "), TagOpenOpen(), Text(text="br"), TagCloseSelfclose(padding=""), TagAttrStart(pad_first="", pad_after_eq="", pad_before_eq=""), Text(text="c"), TagCloseSelfclose(padding="", implicit=True), Text(text="foobar")]