diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 38e3a4c..7d07ed8 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -69,15 +69,19 @@ static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2,
/*
Sanitize the name of a tag so it can be compared with others for equality.
*/
-static PyObject* strip_tag_name(PyObject* token)
+static PyObject* strip_tag_name(PyObject* token, int take_attr)
{
PyObject *text, *rstripped, *lowered;
- text = PyObject_GetAttrString(token, "text");
- if (!text)
- return NULL;
- rstripped = PyObject_CallMethod(text, "rstrip", NULL);
- Py_DECREF(text);
+ if (take_attr) {
+ text = PyObject_GetAttrString(token, "text");
+ if (!text)
+ return NULL;
+ rstripped = PyObject_CallMethod(text, "rstrip", NULL);
+ Py_DECREF(text);
+ }
+ else
+ rstripped = PyObject_CallMethod(token, "rstrip", NULL);
if (!rstripped)
return NULL;
lowered = PyObject_CallMethod(rstripped, "lower", NULL);
@@ -1812,8 +1816,9 @@ static PyObject* Tokenizer_handle_tag_close_close(Tokenizer* self)
valid = 0;
break;
case 1: {
- so = strip_tag_name(first);
- sc = strip_tag_name(PyList_GET_ITEM(self->topstack->stack, 1));
+ so = strip_tag_name(first, 1);
+ sc = strip_tag_name(
+ PyList_GET_ITEM(self->topstack->stack, 1), 1);
if (so && sc) {
if (PyUnicode_Compare(so, sc))
valid = 0;
@@ -1848,7 +1853,11 @@ static PyObject* Tokenizer_handle_tag_close_close(Tokenizer* self)
*/
static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
{
+ Textbuffer* buffer;
+ PyObject *buf_tmp, *end_tag, *start_tag;
Py_UNICODE this, next;
+ Py_ssize_t reset;
+ int cmp;
while (1) {
this = Tokenizer_READ(self, 0);
@@ -1856,10 +1865,48 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
if (!this)
return Tokenizer_fail_route(self);
else if (this == '<' && next == '/') {
- if (Tokenizer_handle_tag_open_close(self))
+ self->head += 2;
+ reset = self->head - 1;
+ buffer = Textbuffer_new();
+ if (!buffer)
return NULL;
- self->head++;
- return Tokenizer_parse(self, 0, 0);
+ while ((this = Tokenizer_READ(self, 0))) {
+ if (this == '>') {
+ buf_tmp = Textbuffer_render(buffer);
+ if (!buf_tmp)
+ return NULL;
+ end_tag = strip_tag_name(buf_tmp, 0);
+ Py_DECREF(buf_tmp);
+ if (!end_tag)
+ return NULL;
+ start_tag = strip_tag_name(
+ PyList_GET_ITEM(self->topstack->stack, 1), 1);
+ if (!start_tag)
+ return NULL;
+ cmp = PyUnicode_Compare(start_tag, end_tag);
+ Py_DECREF(end_tag);
+ Py_DECREF(start_tag);
+ if (cmp)
+ goto no_matching_end;
+ if (Tokenizer_emit(self, TagOpenClose))
+ return NULL;
+ if (Tokenizer_emit_textbuffer(self, buffer, 0))
+ return NULL;
+ if (Tokenizer_emit(self, TagCloseClose))
+ return NULL;
+ return Tokenizer_pop(self);
+ }
+ if (!this || this == '\n') {
+ no_matching_end:
+ Textbuffer_dealloc(buffer);
+ self->head = reset;
+                    if (Tokenizer_emit_text(self, "</"))
+ return NULL;
+ break;
+ }
+ Textbuffer_write(&buffer, this);
+ self->head++;
+ }
}
else if (this == '&') {
if (Tokenizer_parse_entity(self))
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 3ac25a5..607cc69 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -735,14 +735,22 @@ class Tokenizer(object):
def _handle_blacklisted_tag(self):
"""Handle the body of an HTML tag that is parser-blacklisted."""
+ strip = lambda text: text.rstrip().lower()
while True:
this, next = self._read(), self._read(1)
if this is self.END:
self._fail_route()
elif this == "<" and next == "/":
- self._handle_tag_open_close()
- self._head += 1
- return self._parse(push=False)
+ self._head += 3
+ if self._read() != ">" or (strip(self._read(-1)) !=
+ strip(self._stack[1].text)):
+ self._head -= 1
+                    self._emit_text("</")
+ continue
+ self._emit(tokens.TagOpenClose())
+ self._emit_text(self._read(-1))
+ self._emit(tokens.TagCloseClose())
+ return self._pop()
elif this == "&":
self._parse_entity()
else:
diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest
index 55b18f7..ff39f26 100644
--- a/tests/tokenizer/tags.mwtest
+++ b/tests/tokenizer/tags.mwtest
@@ -614,7 +614,21 @@ output: [TagOpenOpen(), Text(text="NoWiKi"), TagCloseOpen(padding=""), Text(text
---
+name: unparsable_incomplete_close
+label: an unparsable tag with an incomplete close afterwards
+input: "<nowiki>foo</nowiki <nowiki>foo</nowiki>"
+output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="foo</nowiki <nowiki>foo"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()]
+
+---
+
 name: unparsable_with_intermediates
 label: an unparsable tag with intermediate tags inside of it
 input: "<nowiki><ref></ref></nowiki>"
 output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="<ref></ref>"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()]
+
+---
+
+name: unparsable_with_intermediates_normalize
+label: an unparsable tag with intermediate tags inside of it, requiring normalization
+input: ""
+output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text=""), TagOpenClose(), Text(text="nowIKI "), TagCloseClose()]