
Massive speedup and refactoring.

tags/v0.1
Ben Kurtovic 12 years ago
commit 2f4ce003b4
2 changed files with 63 additions and 53 deletions
  1. +60 -53   mwparserfromhell/parser/tokenizer.py
  2. +3  -0    mwparserfromhell/parser/tokens.py

+60 -53   mwparserfromhell/parser/tokenizer.py

@@ -34,6 +34,7 @@ class BadRoute(Exception):
 class Tokenizer(object):
     START = object()
     END = object()
+    SENTINELS = ["{", "}", "[", "]", "|", "=", "&", END]
 
     def __init__(self):
         self._text = None
@@ -52,28 +53,38 @@ class Tokenizer(object):
     def _context(self, value):
         self._stacks[-1][1] = value
 
+    @property
+    def _textbuffer(self):
+        return self._stacks[-1][2]
+
+    @_textbuffer.setter
+    def _textbuffer(self, value):
+        self._stacks[-1][2] = value
+
     def _push(self, context=0):
-        self._stacks.append([[], context])
+        self._stacks.append([[], context, []])
 
     def _pop(self):
-        return self._stacks.pop()[0]
-
-    def _write(self, token, stack=None):
-        if stack is None:
-            stack = self._stack
-        if not stack:
-            stack.append(token)
+        top = self._stacks.pop()
+        stack, text = top[0], top[2]
+        if text:
+            stack.append(tokens.Text(text="".join(text)))
+        return stack
+
+    def _write(self, data, text=False):
+        if text:
+            self._textbuffer.append(data)
             return
-        last = stack[-1]
-        if isinstance(token, tokens.Text) and isinstance(last, tokens.Text):
-            last.text += token.text
-        else:
-            stack.append(token)
+        if self._textbuffer:
+            self._stack.append(tokens.Text(text="".join(self._textbuffer)))
+            self._textbuffer = []
+        self._stack.append(data)
 
-    def _write_all(self, tokenlist, stack=None):
-        if stack is None:
-            stack = self._stack
-        stack.extend(tokenlist)
+    def _write_all(self, tokenlist):
+        if self._textbuffer:
+            self._stack.append(tokens.Text(text="".join(self._textbuffer)))
+            self._textbuffer = []
+        self._stack.extend(tokenlist)
 
     def _read(self, delta=0, wrap=False):
         index = self._head + delta
@@ -84,50 +95,34 @@ class Tokenizer(object):
         return self._text[index]
 
     def _at_head(self, chars):
+        length = len(chars)
+        if length == 1:
+            return self._read() == chars
         return all([self._read(i) == chars[i] for i in xrange(len(chars))])
 
-    def _verify_context_pre_stop(self):
-        if self._read() is self.END:
-            if self._context & contexts.TEMPLATE:
-                raise BadRoute(self._pop())
-
-    def _catch_stop(self, stop):
-        if self._read() is self.END:
-            return True
-        try:
-            iter(stop)
-        except TypeError:
-            if self._read() is stop:
-                return True
-        else:
-            if all([self._read(i) == stop[i] for i in xrange(len(stop))]):
-                self._head += len(stop) - 1
-                return True
-        return False
-
-    def _verify_context_post_stop(self):
-        if self._context & contexts.TEMPLATE_NAME and self._stack:
-            head = self._stack[-1]
-            if isinstance(head, tokens.Text):
-                if head.text.strip() and head.text.endswith("\n"):
-                    if self._read() not in ["|", "=", "\n"]:
-                        raise BadRoute(self._pop())
-
     def _parse_template(self):
         reset = self._head
         self._head += 2
         try:
-            template = self._parse_until("}}", contexts.TEMPLATE_NAME)
+            template = self._parse(contexts.TEMPLATE_NAME)
         except BadRoute:
             self._head = reset
-            self._write(tokens.Text(text=self._read()))
+            self._write(self._read(), text=True)
         else:
             self._write(tokens.TemplateOpen())
             self._write_all(template)
             self._write(tokens.TemplateClose())
 
+    def _verify_template_name(self):
+        if self._stack:
+            text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
+            text = "".join([token.text for token in text])
+            if text.strip() and "\n" in text:
+                raise BadRoute(self._pop())
+
     def _handle_template_param(self):
         if self._context & contexts.TEMPLATE_NAME:
+            self._verify_template_name()
             self._context ^= contexts.TEMPLATE_NAME
         if self._context & contexts.TEMPLATE_PARAM_VALUE:
             self._context ^= contexts.TEMPLATE_PARAM_VALUE
@@ -139,6 +134,12 @@ class Tokenizer(object):
         self._context |= contexts.TEMPLATE_PARAM_VALUE
         self._write(tokens.TemplateParamEquals())
 
+    def _handle_template_end(self):
+        if self._context & contexts.TEMPLATE_NAME:
+            self._verify_template_name()
+        self._head += 1
+        return self._pop()
+
     def _parse_entity(self):
         reset = self._head
         self._head += 1
@@ -178,29 +179,35 @@ class Tokenizer(object):
             self._head += 1
         except BadRoute:
             self._head = reset
-            self._write(tokens.Text(text=self._read()))
+            self._write(self._read(), text=True)
         else:
             self._write_all(self._pop())
 
-    def _parse_until(self, stop, context=0):
+    def _parse(self, context=0):
         self._push(context)
         while True:
-            self._verify_context_pre_stop()
-            if self._catch_stop(stop):
+            if self._read() not in self.SENTINELS:
+                self._write(self._read(), text=True)
+                self._head += 1
+                continue
+            if self._read() is self.END:
+                if self._context & contexts.TEMPLATE:
+                    raise BadRoute(self._pop())
                 return self._pop()
-            self._verify_context_post_stop()
             if self._at_head("{{"):
                 self._parse_template()
             elif self._at_head("|") and self._context & contexts.TEMPLATE:
                 self._handle_template_param()
             elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY:
                 self._handle_template_param_value()
+            elif self._at_head("}}") and self._context & contexts.TEMPLATE:
+                return self._handle_template_end()
             elif self._at_head("&"):
                 self._parse_entity()
             else:
-                self._write(tokens.Text(text=self._read()))
+                self._write(self._read(), text=True)
             self._head += 1
 
     def tokenize(self, text):
         self._text = list(text)
-        return self._parse_until(stop=self.END)
+        return self._parse()
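
For context on the "massive speedup": _write() no longer builds a Text token per character and merges adjacent Text tokens on every call. Each stack frame now carries a plain list (_textbuffer) that collects raw characters, and those characters are joined into a single Text token only when a non-text token is written or the stack is popped. A minimal standalone sketch of that idea (hypothetical names, not the library's actual classes):

class Text(object):
    def __init__(self, text):
        self.text = text

class MiniTokenizer(object):
    def __init__(self):
        self.stack = []        # finished tokens
        self.textbuffer = []   # pending raw characters

    def write_text(self, char):
        # O(1) append; no token objects or string concatenation yet
        self.textbuffer.append(char)

    def flush(self):
        # join all buffered characters into one Text token at once
        if self.textbuffer:
            self.stack.append(Text("".join(self.textbuffer)))
            self.textbuffer = []

    def write_token(self, token):
        self.flush()
        self.stack.append(token)

tok = MiniTokenizer()
for ch in "foo bar":
    tok.write_text(ch)
tok.flush()
print([t.text for t in tok.stack])  # ['foo bar']: one Text token, one join

The diff applies the same idea through the _textbuffer property on each stack frame, which is why _pop() and _write() now flush the buffer before returning or appending.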

+3 -0     mwparserfromhell/parser/tokens.py

@@ -26,6 +26,9 @@ class Token(object):
     def __init__(self, **kwargs):
         super(Token, self).__setattr__("_kwargs", kwargs)
 
+    def __repr__(self):
+        return type(self).__name__
+
     def __getattr__(self, key):
         return self._kwargs[key]
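
The new __repr__ simply reports a token's class name. A small usage sketch, assuming the tokens module from this tree is importable (later releases may format tokens differently):

from mwparserfromhell.parser import tokens

print(repr(tokens.TemplateOpen()))    # TemplateOpen
print(repr(tokens.Text(text="foo")))  # Text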



