Browse Source

Fix handling of tag closes within <nowiki> (fixes #89).

tags/v0.4
Ben Kurtovic 9 years ago
parent
commit
a00c645bd8
3 changed files with 83 additions and 14 deletions
  1. +58
    -11
      mwparserfromhell/parser/tokenizer.c
  2. +11
    -3
      mwparserfromhell/parser/tokenizer.py
  3. +14
    -0
      tests/tokenizer/tags.mwtest

+ 58
- 11
mwparserfromhell/parser/tokenizer.c View File

@@ -69,15 +69,19 @@ static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2,
/*
Sanitize the name of a tag so it can be compared with others for equality.
*/
static PyObject* strip_tag_name(PyObject* token)
static PyObject* strip_tag_name(PyObject* token, int take_attr)
{
PyObject *text, *rstripped, *lowered;

text = PyObject_GetAttrString(token, "text");
if (!text)
return NULL;
rstripped = PyObject_CallMethod(text, "rstrip", NULL);
Py_DECREF(text);
if (take_attr) {
text = PyObject_GetAttrString(token, "text");
if (!text)
return NULL;
rstripped = PyObject_CallMethod(text, "rstrip", NULL);
Py_DECREF(text);
}
else
rstripped = PyObject_CallMethod(token, "rstrip", NULL);
if (!rstripped)
return NULL;
lowered = PyObject_CallMethod(rstripped, "lower", NULL);
@@ -1812,8 +1816,9 @@ static PyObject* Tokenizer_handle_tag_close_close(Tokenizer* self)
valid = 0;
break;
case 1: {
so = strip_tag_name(first);
sc = strip_tag_name(PyList_GET_ITEM(self->topstack->stack, 1));
so = strip_tag_name(first, 1);
sc = strip_tag_name(
PyList_GET_ITEM(self->topstack->stack, 1), 1);
if (so && sc) {
if (PyUnicode_Compare(so, sc))
valid = 0;
@@ -1848,7 +1853,11 @@ static PyObject* Tokenizer_handle_tag_close_close(Tokenizer* self)
*/
static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
{
Textbuffer* buffer;
PyObject *buf_tmp, *end_tag, *start_tag;
Py_UNICODE this, next;
Py_ssize_t reset;
int cmp;

while (1) {
this = Tokenizer_READ(self, 0);
@@ -1856,10 +1865,48 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
if (!this)
return Tokenizer_fail_route(self);
else if (this == '<' && next == '/') {
if (Tokenizer_handle_tag_open_close(self))
self->head += 2;
reset = self->head - 1;
buffer = Textbuffer_new();
if (!buffer)
return NULL;
self->head++;
return Tokenizer_parse(self, 0, 0);
while ((this = Tokenizer_READ(self, 0))) {
if (this == '>') {
buf_tmp = Textbuffer_render(buffer);
if (!buf_tmp)
return NULL;
end_tag = strip_tag_name(buf_tmp, 0);
Py_DECREF(buf_tmp);
if (!end_tag)
return NULL;
start_tag = strip_tag_name(
PyList_GET_ITEM(self->topstack->stack, 1), 1);
if (!start_tag)
return NULL;
cmp = PyUnicode_Compare(start_tag, end_tag);
Py_DECREF(end_tag);
Py_DECREF(start_tag);
if (cmp)
goto no_matching_end;
if (Tokenizer_emit(self, TagOpenClose))
return NULL;
if (Tokenizer_emit_textbuffer(self, buffer, 0))
return NULL;
if (Tokenizer_emit(self, TagCloseClose))
return NULL;
return Tokenizer_pop(self);
}
if (!this || this == '\n') {
no_matching_end:
Textbuffer_dealloc(buffer);
self->head = reset;
if (Tokenizer_emit_text(self, "</"))
return NULL;
break;
}
Textbuffer_write(&buffer, this);
self->head++;
}
}
else if (this == '&') {
if (Tokenizer_parse_entity(self))


+ 11
- 3
mwparserfromhell/parser/tokenizer.py View File

@@ -735,14 +735,22 @@ class Tokenizer(object):

def _handle_blacklisted_tag(self):
"""Handle the body of an HTML tag that is parser-blacklisted."""
strip = lambda text: text.rstrip().lower()
while True:
this, next = self._read(), self._read(1)
if this is self.END:
self._fail_route()
elif this == "<" and next == "/":
self._handle_tag_open_close()
self._head += 1
return self._parse(push=False)
self._head += 3
if self._read() != ">" or (strip(self._read(-1)) !=
strip(self._stack[1].text)):
self._head -= 1
self._emit_text("</")
continue
self._emit(tokens.TagOpenClose())
self._emit_text(self._read(-1))
self._emit(tokens.TagCloseClose())
return self._pop()
elif this == "&":
self._parse_entity()
else:


+ 14
- 0
tests/tokenizer/tags.mwtest View File

@@ -614,7 +614,21 @@ output: [TagOpenOpen(), Text(text="NoWiKi"), TagCloseOpen(padding=""), Text(text

---

name: unparsable_incomplete_close
label: an unparsable tag with an incomplete close afterwards
input: "<nowiki>foo</nowiki"
output: [Text(text="<nowiki>foo</nowiki")]

---

name: unparsable_with_intermediates
label: an unparsable tag with intermediate tags inside of it
input: "<nowiki><ref></ref></nowiki>"
output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="<ref></ref>"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()]

---

name: unparsable_with_intermediates_normalize
label: an unparsable tag with intermediate tags inside of it, requiring normalization
input: "<nowiki><ref></ref></nowIKI >"
output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="<ref></ref>"), TagOpenClose(), Text(text="nowIKI "), TagCloseClose()]

Loading…
Cancel
Save