From be5d2cbb07da98f9babec7e1b799b40f374dfe52 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 14 Aug 2013 22:24:14 -0400 Subject: [PATCH] Support HTML entities inside parser-blacklisted tags (closes #36) --- mwparserfromhell/parser/tokenizer.c | 9 +++++++-- mwparserfromhell/parser/tokenizer.py | 5 ++++- tests/tokenizer/html_entities.mwtest | 14 ++++++++++++++ tests/tokenizer/tags.mwtest | 21 +++++++++++++++++++++ 4 files changed, 46 insertions(+), 3 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index eff000a..912cfd7 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1578,16 +1578,21 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self) while (1) { this = Tokenizer_READ(self, 0); next = Tokenizer_READ(self, 1); - self->head++; if (this == *"") return Tokenizer_fail_route(self); else if (this == *"<" && next == *"/") { if (Tokenizer_handle_tag_open_close(self)) return NULL; + self->head++; return Tokenizer_parse(self, 0, 0); } - if (Tokenizer_emit_char(self, this)) + else if (this == *"&") { + if (Tokenizer_parse_entity(self)) + return NULL; + } + else if (Tokenizer_emit_char(self, this)) return NULL; + self->head++; } } diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 38ffa80..583d2f8 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -552,14 +552,17 @@ class Tokenizer(object): """Handle the body of an HTML tag that is parser-blacklisted.""" while True: this, next = self._read(), self._read(1) - self._head += 1 if this is self.END: self._fail_route() elif this == "<" and next == "/": self._handle_tag_open_close() + self._head += 1 return self._parse(push=False) + elif this == "&": + self._parse_entity() else: self._emit_text(this) + self._head += 1 def _handle_single_only_tag_end(self): """Handle the end of an implicitly closing single-only HTML tag.""" 
diff --git a/tests/tokenizer/html_entities.mwtest b/tests/tokenizer/html_entities.mwtest index 625dd60..53bedbd 100644 --- a/tests/tokenizer/html_entities.mwtest +++ b/tests/tokenizer/html_entities.mwtest @@ -117,6 +117,20 @@ output: [Text(text="&;")] --- +name: invalid_partial_amp_pound +label: invalid entities: just an ampersand, pound sign +input: "&#" +output: [Text(text="&#")] + +--- + +name: invalid_partial_amp_pound_x +label: invalid entities: just an ampersand, pound sign, x +input: "&#x" +output: [Text(text="&#x")] + +--- + name: invalid_partial_amp_pound_semicolon label: invalid entities: an ampersand, pound sign, and semicolon input: "&#;" diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 50c844e..dc02a51 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -467,6 +467,27 @@ output: [TemplateOpen(), Text(text="t1"), TemplateClose(), Text(text="<nowiki>") --- +name: unparsable_entity +label: a HTML entity inside unparsable text is still parsed +input: "{{t1}}<nowiki>{{t2}}&nbsp;{{t3}}</nowiki>{{t4}}" +output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="{{t2}}"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="{{t3}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t4"), TemplateClose()] + +--- + +name: unparsable_entity_incomplete +label: an incomplete HTML entity inside unparsable text +input: "<nowiki>&</nowiki>" +output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="&"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()] + +--- + +name: unparsable_entity_incomplete_2 +label: an incomplete HTML entity inside unparsable text +input: "<nowiki>&" +output: [Text(text="<nowiki>&")] + +--- + name: single_open_close label: a tag that supports being single; both an open and a close tag input: "foo<li>bar{{baz}}</li>"