From 010bd346530759ccf56cd1137d2a140b78d9dd37 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Fri, 17 Aug 2012 00:56:41 -0400
Subject: [PATCH] Support HTMLEntities.

---
 mwparserfromhell/parser/builder.py   |  3 ++
 mwparserfromhell/parser/tokenizer.py | 56 ++++++++++++++++++++++++++++++++----
 2 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py
index 929c269..9edc987 100644
--- a/mwparserfromhell/parser/builder.py
+++ b/mwparserfromhell/parser/builder.py
@@ -98,9 +98,12 @@ class Builder(object):
             token = self._tokens.pop(0)
             if isinstance(token, tokens.HTMLEntityHex):
                 text = self._tokens.pop(0)
+                self._tokens.pop(0) # Remove HTMLEntityEnd
                 return HTMLEntity(text.text, named=False, hexadecimal=True,
                                   hex_char=token.char)
+            self._tokens.pop(0) # Remove HTMLEntityEnd
             return HTMLEntity(token.text, named=False, hexadecimal=False)
+        self._tokens.pop(0) # Remove HTMLEntityEnd
         return HTMLEntity(token.text, named=True, hexadecimal=False)
 
     def _handle_heading(self, token):
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 5befcf0..fc09462 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -20,6 +20,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import htmlentitydefs
+import string
+
 from . import contexts
 from . import tokens
 
@@ -49,9 +52,8 @@ class Tokenizer(object):
     def _context(self, value):
         self._stacks[-1][1] = value
 
-    def _push(self):
-        stack, context = [], 0
-        self._stacks.append([stack, context])
+    def _push(self, context=0):
+        self._stacks.append([[], context])
 
     def _pop(self):
         return self._stacks.pop()[0]
@@ -137,9 +139,51 @@ class Tokenizer(object):
         self._context |= contexts.TEMPLATE_PARAM_VALUE
         self._write(tokens.TemplateParamEquals())
 
+    def _parse_entity(self):
+        reset = self._head
+        self._head += 1
+        try:
+            self._push()
+            self._write(tokens.HTMLEntityStart())
+            numeric = hexadecimal = False
+            if self._at_head("#"):
+                numeric = True
+                self._write(tokens.HTMLEntityNumeric())
+                if self._read(1).lower() == "x":
+                    hexadecimal = True
+                    self._write(tokens.HTMLEntityHex(char=self._read(1)))
+                    self._head += 2
+                else:
+                    self._head += 1
+            text = []
+            valid = string.hexdigits if hexadecimal else string.digits
+            if not numeric and not hexadecimal:
+                valid += string.ascii_letters
+            while True:
+                if self._at_head(";"):
+                    text = "".join(text)
+                    if numeric:
+                        test = int(text, 16) if hexadecimal else int(text)
+                        if test < 1 or test > 0x10FFFF:
+                            raise BadRoute(self._pop())
+                    else:
+                        if text not in htmlentitydefs.entitydefs:
+                            raise BadRoute(self._pop())
+                    self._write(tokens.Text(text=text))
+                    self._write(tokens.HTMLEntityEnd())
+                    break
+                if self._read() is self.END or self._read() not in valid:
+                    raise BadRoute(self._pop())
+                text.append(self._read())
+                self._head += 1
+        except BadRoute:
+            self._head = reset
+            self._write(tokens.Text(text=self._read()))
+        else:
+            self._write_all(self._pop())
+
     def _parse_until(self, stop, context=0):
-        self._push()
-        self._context = context
+        self._push(context)
         while True:
             self._verify_context_pre_stop()
             if self._catch_stop(stop):
@@ -151,6 +195,8 @@ class Tokenizer(object):
                 self._handle_template_param()
             elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY:
                 self._handle_template_param_value()
+            elif self._at_head("&"):
+                self._parse_entity()
             else:
                 self._write(tokens.Text(text=self._read()))
             self._head += 1