diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py
index 94e0db3..4384ace 100644
--- a/mwparserfromhell/compat.py
+++ b/mwparserfromhell/compat.py
@@ -20,7 +20,6 @@ if py3k:
     range = range
     maxsize = sys.maxsize
     import html.entities as htmlentities
-    zip = zip
 
 else:
     bytes = str
@@ -28,6 +27,5 @@ else:
     range = xrange
     maxsize = sys.maxint
     import htmlentitydefs as htmlentities
-    from itertools import izip as zip
 
 del sys
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index d8a505f..41ce5ac 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -1896,18 +1896,26 @@ static PyObject* Tokenizer_handle_single_tag_end(Tokenizer* self)
 {
     PyObject *token = 0, *padding, *kwargs;
     Py_ssize_t len, index;
-    int is_instance;
+    int depth = 1, is_instance;
 
     len = PyList_GET_SIZE(self->topstack->stack);
-    for (index = len - 1; index >= 0; index--) {
+    for (index = 2; index < len; index++) {
         token = PyList_GET_ITEM(self->topstack->stack, index);
-        is_instance = PyObject_IsInstance(token, TagCloseOpen);
+        is_instance = PyObject_IsInstance(token, TagOpenOpen);
         if (is_instance == -1)
             return NULL;
         else if (is_instance == 1)
-            break;
+            depth++;
+        is_instance = PyObject_IsInstance(token, TagCloseOpen);
+        if (is_instance == -1)
+            return NULL;
+        else if (is_instance == 1) {
+            depth--;
+            if (depth == 0)
+                break;
+        }
     }
-    if (!token)
+    if (!token || depth > 0)
         return NULL;
     padding = PyObject_GetAttrString(token, "padding");
     if (!padding)
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index aa7499a..e69a823 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -25,7 +25,7 @@ from math import log
 import re
 
 from . import contexts, tokens
-from ..compat import htmlentities, range, zip
+from ..compat import htmlentities, range
 from ..definitions import (get_html_tag, is_parsable, is_single,
                            is_single_only, is_scheme)
 
@@ -752,11 +752,18 @@ class Tokenizer(object):
     def _handle_single_tag_end(self):
         """Handle the stream end when inside a single-supporting HTML tag."""
         stack = self._stack
-        gen = zip(range(len(stack) - 1, -1, -1), reversed(stack))
-        index = next(i for i, t in gen if isinstance(t, tokens.TagCloseOpen))
+        # We need to find the index of the TagCloseOpen token corresponding to
+        # the TagOpenOpen token located at index 0:
+        depth = 1
+        for index, token in enumerate(stack[2:], 2):
+            if isinstance(token, tokens.TagOpenOpen):
+                depth += 1
+            elif isinstance(token, tokens.TagCloseOpen):
+                depth -= 1
+                if depth == 0:
+                    break
         padding = stack[index].padding
-        token = tokens.TagCloseSelfclose(padding=padding, implicit=True)
-        stack[index] = token
+        stack[index] = tokens.TagCloseSelfclose(padding=padding, implicit=True)
         return self._pop()
 
     def _really_parse_tag(self):
diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest
index bf19f4d..5e1a409 100644
--- a/tests/tokenizer/integration.mwtest
+++ b/tests/tokenizer/integration.mwtest
@@ -178,3 +178,10 @@ name: external_link_inside_wikilink_title
 label: an external link inside a wikilink title, which is invalid
 input: "[[File:Example.png http://example.com]]"
 output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()]
+
+---
+
+name: italics_inside_external_link_inside_incomplete_list
+label: italic text inside an external link inside an incomplete list
+input: "<li>[http://www.example.com ''example'']"
+output: [TagOpenOpen(), Text(text="li"), TagCloseSelfclose(padding="", implicit=True), ExternalLinkOpen(brackets=True), Text(text="http://www.example.com"), ExternalLinkSeparator(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="example"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()]