diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index fc09462..df0cf12 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -34,6 +34,7 @@ class BadRoute(Exception):
 class Tokenizer(object):
     START = object()
     END = object()
+    SENTINELS = ["{", "}", "[", "]", "|", "=", "&", END]
 
     def __init__(self):
         self._text = None
@@ -52,28 +53,38 @@ class Tokenizer(object):
     def _context(self, value):
         self._stacks[-1][1] = value
 
+    @property
+    def _textbuffer(self):
+        return self._stacks[-1][2]
+
+    @_textbuffer.setter
+    def _textbuffer(self, value):
+        self._stacks[-1][2] = value
+
     def _push(self, context=0):
-        self._stacks.append([[], context])
+        self._stacks.append([[], context, []])
 
     def _pop(self):
-        return self._stacks.pop()[0]
-
-    def _write(self, token, stack=None):
-        if stack is None:
-            stack = self._stack
-        if not stack:
-            stack.append(token)
+        top = self._stacks.pop()
+        stack, text = top[0], top[2]
+        if text:
+            stack.append(tokens.Text(text="".join(text)))
+        return stack
+
+    def _write(self, data, text=False):
+        if text:
+            self._textbuffer.append(data)
             return
-        last = stack[-1]
-        if isinstance(token, tokens.Text) and isinstance(last, tokens.Text):
-            last.text += token.text
-        else:
-            stack.append(token)
+        if self._textbuffer:
+            self._stack.append(tokens.Text(text="".join(self._textbuffer)))
+            self._textbuffer = []
+        self._stack.append(data)
 
-    def _write_all(self, tokenlist, stack=None):
-        if stack is None:
-            stack = self._stack
-        stack.extend(tokenlist)
+    def _write_all(self, tokenlist):
+        if self._textbuffer:
+            self._stack.append(tokens.Text(text="".join(self._textbuffer)))
+            self._textbuffer = []
+        self._stack.extend(tokenlist)
 
     def _read(self, delta=0, wrap=False):
         index = self._head + delta
@@ -84,50 +95,34 @@ class Tokenizer(object):
         return self._text[index]
 
     def _at_head(self, chars):
+        length = len(chars)
+        if length == 1:
+            return self._read() == chars
         return all([self._read(i) == chars[i] for i in xrange(len(chars))])
 
-    def _verify_context_pre_stop(self):
-        if self._read() is self.END:
-            if self._context & contexts.TEMPLATE:
-                raise BadRoute(self._pop())
-
-    def _catch_stop(self, stop):
-        if self._read() is self.END:
-            return True
-        try:
-            iter(stop)
-        except TypeError:
-            if self._read() is stop:
-                return True
-        else:
-            if all([self._read(i) == stop[i] for i in xrange(len(stop))]):
-                self._head += len(stop) - 1
-                return True
-        return False
-
-    def _verify_context_post_stop(self):
-        if self._context & contexts.TEMPLATE_NAME and self._stack:
-            head = self._stack[-1]
-            if isinstance(head, tokens.Text):
-                if head.text.strip() and head.text.endswith("\n"):
-                    if self._read() not in ["|", "=", "\n"]:
-                        raise BadRoute(self._pop())
-
     def _parse_template(self):
         reset = self._head
         self._head += 2
         try:
-            template = self._parse_until("}}", contexts.TEMPLATE_NAME)
+            template = self._parse(contexts.TEMPLATE_NAME)
         except BadRoute:
             self._head = reset
-            self._write(tokens.Text(text=self._read()))
+            self._write(self._read(), text=True)
         else:
             self._write(tokens.TemplateOpen())
             self._write_all(template)
             self._write(tokens.TemplateClose())
 
+    def _verify_template_name(self):
+        if self._stack:
+            text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
+            text = "".join([token.text for token in text])
+            if text.strip() and "\n" in text:
+                raise BadRoute(self._pop())
+
     def _handle_template_param(self):
         if self._context & contexts.TEMPLATE_NAME:
+            self._verify_template_name()
             self._context ^= contexts.TEMPLATE_NAME
         if self._context & contexts.TEMPLATE_PARAM_VALUE:
             self._context ^= contexts.TEMPLATE_PARAM_VALUE
@@ -139,6 +134,12 @@ class Tokenizer(object):
         self._context |= contexts.TEMPLATE_PARAM_VALUE
         self._write(tokens.TemplateParamEquals())
 
+    def _handle_template_end(self):
+        if self._context & contexts.TEMPLATE_NAME:
+            self._verify_template_name()
+        self._head += 1
+        return self._pop()
+
     def _parse_entity(self):
         reset = self._head
         self._head += 1
@@ -178,29 +179,35 @@ class Tokenizer(object):
             self._head += 1
         except BadRoute:
             self._head = reset
-            self._write(tokens.Text(text=self._read()))
+            self._write(self._read(), text=True)
         else:
             self._write_all(self._pop())
 
-    def _parse_until(self, stop, context=0):
+    def _parse(self, context=0):
        self._push(context)
         while True:
-            self._verify_context_pre_stop()
-            if self._catch_stop(stop):
+            if self._read() not in self.SENTINELS:
+                self._write(self._read(), text=True)
+                self._head += 1
+                continue
+            if self._read() is self.END:
+                if self._context & contexts.TEMPLATE:
+                    raise BadRoute(self._pop())
                 return self._pop()
-            self._verify_context_post_stop()
             if self._at_head("{{"):
                 self._parse_template()
             elif self._at_head("|") and self._context & contexts.TEMPLATE:
                 self._handle_template_param()
             elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY:
                 self._handle_template_param_value()
+            elif self._at_head("}}") and self._context & contexts.TEMPLATE:
+                return self._handle_template_end()
             elif self._at_head("&"):
                 self._parse_entity()
             else:
-                self._write(tokens.Text(text=self._read()))
+                self._write(self._read(), text=True)
             self._head += 1
 
     def tokenize(self, text):
         self._text = list(text)
-        return self._parse_until(stop=self.END)
+        return self._parse()
diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py
index 6c77a5f..a465227 100644
--- a/mwparserfromhell/parser/tokens.py
+++ b/mwparserfromhell/parser/tokens.py
@@ -26,6 +26,9 @@ class Token(object):
     def __init__(self, **kwargs):
         super(Token, self).__setattr__("_kwargs", kwargs)
 
+    def __repr__(self):
+        return type(self).__name__
+
     def __getattr__(self, key):
         return self._kwargs[key]
 
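
Reviewer's note: the heart of this patch is swapping per-character `tokens.Text` concatenation (the old `_write` did `last.text += token.text` for every character) for a per-stack text buffer that is joined exactly once, when a non-text token arrives or the stack is popped. A minimal standalone sketch of that pattern, with illustrative names (`BufferedStack`, `flush`) that are not part of the patch:

```python
# Sketch of the textbuffer technique introduced above (illustrative only).
class Text(object):
    def __init__(self, text):
        self.text = text

    def __repr__(self):
        return "Text(%r)" % (self.text,)


class BufferedStack(object):
    def __init__(self):
        self.stack = []   # finished tokens
        self.buffer = []  # pending characters, joined lazily

    def write_text(self, char):
        # O(1) list append; no string rebuild per character
        self.buffer.append(char)

    def flush(self):
        # Collapse all pending characters into a single Text token
        if self.buffer:
            self.stack.append(Text("".join(self.buffer)))
            self.buffer = []

    def write_token(self, token):
        self.flush()
        self.stack.append(token)

    def pop(self):
        self.flush()
        return self.stack


stack = BufferedStack()
for ch in "foo":
    stack.write_text(ch)
stack.write_token("TemplateOpen")
for ch in "bar":
    stack.write_text(ch)
print(stack.pop())  # [Text('foo'), 'TemplateOpen', Text('bar')]
```

Because `str` is immutable, the old `last.text += token.text` rebuilt the accumulated string on every character, which can degrade quadratically on long text runs; buffering and joining once is linear. The `SENTINELS` fast path in `_parse` serves the same goal: ordinary characters skip the marker dispatch entirely.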
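And a hedged usage sketch of the patched entry point, assuming the package's `tokens` and `contexts` modules are wired up as before; token names not visible in this diff (e.g. `TemplateParamSeparator`) and the exact output are inferred, not verified:

```python
# Illustrative only; relies on the rest of mwparserfromhell being importable.
from mwparserfromhell.parser.tokenizer import Tokenizer

toks = Tokenizer().tokenize("{{foo|bar=baz}}")
print(toks)
# Expected shape, with the new Token.__repr__ printing class names:
# [TemplateOpen, Text, TemplateParamSeparator, Text,
#  TemplateParamEquals, Text, TemplateClose]
```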