diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 38e3a4c..7d07ed8 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -69,15 +69,19 @@ static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, /* Sanitize the name of a tag so it can be compared with others for equality. */ -static PyObject* strip_tag_name(PyObject* token) +static PyObject* strip_tag_name(PyObject* token, int take_attr) { PyObject *text, *rstripped, *lowered; - text = PyObject_GetAttrString(token, "text"); - if (!text) - return NULL; - rstripped = PyObject_CallMethod(text, "rstrip", NULL); - Py_DECREF(text); + if (take_attr) { + text = PyObject_GetAttrString(token, "text"); + if (!text) + return NULL; + rstripped = PyObject_CallMethod(text, "rstrip", NULL); + Py_DECREF(text); + } + else + rstripped = PyObject_CallMethod(token, "rstrip", NULL); if (!rstripped) return NULL; lowered = PyObject_CallMethod(rstripped, "lower", NULL); @@ -1812,8 +1816,9 @@ static PyObject* Tokenizer_handle_tag_close_close(Tokenizer* self) valid = 0; break; case 1: { - so = strip_tag_name(first); - sc = strip_tag_name(PyList_GET_ITEM(self->topstack->stack, 1)); + so = strip_tag_name(first, 1); + sc = strip_tag_name( + PyList_GET_ITEM(self->topstack->stack, 1), 1); if (so && sc) { if (PyUnicode_Compare(so, sc)) valid = 0; @@ -1848,7 +1853,11 @@ static PyObject* Tokenizer_handle_tag_close_close(Tokenizer* self) */ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self) { + Textbuffer* buffer; + PyObject *buf_tmp, *end_tag, *start_tag; Py_UNICODE this, next; + Py_ssize_t reset; + int cmp; while (1) { this = Tokenizer_READ(self, 0); @@ -1856,10 +1865,48 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self) if (!this) return Tokenizer_fail_route(self); else if (this == '<' && next == '/') { - if (Tokenizer_handle_tag_open_close(self)) + self->head += 2; + reset = self->head - 1; + buffer = Textbuffer_new(); + if (!buffer) return NULL; - self->head++; - return Tokenizer_parse(self, 0, 0); + while ((this = Tokenizer_READ(self, 0))) { + if (this == '>') { + buf_tmp = Textbuffer_render(buffer); + if (!buf_tmp) + return NULL; + end_tag = strip_tag_name(buf_tmp, 0); + Py_DECREF(buf_tmp); + if (!end_tag) + return NULL; + start_tag = strip_tag_name( + PyList_GET_ITEM(self->topstack->stack, 1), 1); + if (!start_tag) + return NULL; + cmp = PyUnicode_Compare(start_tag, end_tag); + Py_DECREF(end_tag); + Py_DECREF(start_tag); + if (cmp) + goto no_matching_end; + if (Tokenizer_emit(self, TagOpenClose)) + return NULL; + if (Tokenizer_emit_textbuffer(self, buffer, 0)) + return NULL; + if (Tokenizer_emit(self, TagCloseClose)) + return NULL; + return Tokenizer_pop(self); + } + if (!this || this == '\n') { + no_matching_end: + Textbuffer_dealloc(buffer); + self->head = reset; + if (Tokenizer_emit_text(self, "head++; + } } else if (this == '&') { if (Tokenizer_parse_entity(self)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 3ac25a5..607cc69 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -735,14 +735,22 @@ class Tokenizer(object): def _handle_blacklisted_tag(self): """Handle the body of an HTML tag that is parser-blacklisted.""" + strip = lambda text: text.rstrip().lower() while True: this, next = self._read(), self._read(1) if this is self.END: self._fail_route() elif this == "<" and next == "/": - self._handle_tag_open_close() - self._head += 1 - return self._parse(push=False) + self._head += 3 + if self._read() != ">" or (strip(self._read(-1)) != + strip(self._stack[1].text)): + self._head -= 1 + self._emit_text("foo" output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text=""), TagOpenClose(), Text(text="nowiki"), TagCloseClose()] + +--- + +name: unparsable_with_intermediates_normalize +label: an unparsable tag with intermediate tags inside of it, requiring normalization +input: "" +output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text=""), TagOpenClose(), Text(text="nowIKI "), TagCloseClose()]