Browse Source

Support HTML entities inside parser-blacklisted tags (closes #36)

tags/v0.3
Ben Kurtovic 10 years ago
parent
commit
be5d2cbb07
4 changed files with 46 additions and 3 deletions
  1. +7
    -2
      mwparserfromhell/parser/tokenizer.c
  2. +4
    -1
      mwparserfromhell/parser/tokenizer.py
  3. +14
    -0
      tests/tokenizer/html_entities.mwtest
  4. +21
    -0
      tests/tokenizer/tags.mwtest

+ 7
- 2
mwparserfromhell/parser/tokenizer.c View File

@@ -1578,16 +1578,21 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
while (1) {
this = Tokenizer_READ(self, 0);
next = Tokenizer_READ(self, 1);
self->head++;
if (this == *"")
return Tokenizer_fail_route(self);
else if (this == *"<" && next == *"/") {
if (Tokenizer_handle_tag_open_close(self))
return NULL;
self->head++;
return Tokenizer_parse(self, 0, 0);
}
if (Tokenizer_emit_char(self, this))
else if (this == *"&") {
if (Tokenizer_parse_entity(self))
return NULL;
}
else if (Tokenizer_emit_char(self, this))
return NULL;
self->head++;
}
}



+ 4
- 1
mwparserfromhell/parser/tokenizer.py View File

@@ -552,14 +552,17 @@ class Tokenizer(object):
"""Handle the body of an HTML tag that is parser-blacklisted."""
while True:
this, next = self._read(), self._read(1)
self._head += 1
if this is self.END:
self._fail_route()
elif this == "<" and next == "/":
self._handle_tag_open_close()
self._head += 1
return self._parse(push=False)
elif this == "&":
self._parse_entity()
else:
self._emit_text(this)
self._head += 1

def _handle_single_only_tag_end(self):
"""Handle the end of an implicitly closing single-only HTML tag."""


+ 14
- 0
tests/tokenizer/html_entities.mwtest View File

@@ -117,6 +117,20 @@ output: [Text(text="&;")]

---

name: invalid_partial_amp_pound
label: invalid entities: just an ampersand, pound sign
input: "&#"
output: [Text(text="&#")]

---

name: invalid_partial_amp_pound_x
label: invalid entities: just an ampersand, pound sign, x
input: "&#x"
output: [Text(text="&#x")]

---

name: invalid_partial_amp_pound_semicolon
label: invalid entities: an ampersand, pound sign, and semicolon
input: "&#;"


+ 21
- 0
tests/tokenizer/tags.mwtest View File

@@ -467,6 +467,27 @@ output: [TemplateOpen(), Text(text="t1"), TemplateClose(), Text(text="<nowiki>")

---

name: unparsable_entity
label: an HTML entity inside unparsable text is still parsed
input: "{{t1}}<nowiki>{{t2}}&nbsp;{{t3}}</nowiki>{{t4}}"
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="{{t2}}"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="{{t3}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t4"), TemplateClose()]

---

name: unparsable_entity_incomplete
label: an incomplete HTML entity inside unparsable text
input: "<nowiki>&</nowiki>"
output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="&"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()]

---

name: unparsable_entity_incomplete_2
label: an incomplete HTML entity inside unparsable text whose tag is never closed
input: "<nowiki>&"
output: [Text(text="<nowiki>&")]

---

name: single_open_close
label: a tag that supports being single; both an open and a close tag
input: "foo<li>bar{{baz}}</li>"


Loading…
Cancel
Save