Browse Source

Another speedup by reducing calls to _read().

tags/v0.1
Ben Kurtovic 12 years ago
parent
commit
3fd13100da
1 changed file with 18 additions and 20 deletions
  1. +18
    -20
      mwparserfromhell/parser/tokenizer.py

+ 18
- 20
mwparserfromhell/parser/tokenizer.py View File

@@ -88,15 +88,10 @@ class Tokenizer(object):
index = self._head + delta index = self._head + delta
if index < 0 and (not wrap or abs(index) > len(self._text)): if index < 0 and (not wrap or abs(index) > len(self._text)):
return self.START return self.START
if index >= len(self._text):
try:
return self._text[index]
except IndexError:
return self.END return self.END
return self._text[index]

def _at_head(self, chars):
length = len(chars)
if length == 1:
return self._read() == chars
return all([self._read(i) == chars[i] for i in xrange(len(chars))])


def _parse_template(self): def _parse_template(self):
reset = self._head reset = self._head
@@ -146,7 +141,7 @@ class Tokenizer(object):
self._push() self._push()
self._write(tokens.HTMLEntityStart()) self._write(tokens.HTMLEntityStart())
numeric = hexadecimal = False numeric = hexadecimal = False
if self._at_head("#"):
if self._read() == "#":
numeric = True numeric = True
self._write(tokens.HTMLEntityNumeric()) self._write(tokens.HTMLEntityNumeric())
if self._read(1).lower() == "x": if self._read(1).lower() == "x":
@@ -160,7 +155,8 @@ class Tokenizer(object):
if not numeric and not hexadecimal: if not numeric and not hexadecimal:
valid += string.ascii_letters valid += string.ascii_letters
while True: while True:
if self._at_head(";"):
this = self._read()
if this == ";":
text = "".join(text) text = "".join(text)
if numeric: if numeric:
test = int(text, 16) if hexadecimal else int(text) test = int(text, 16) if hexadecimal else int(text)
@@ -172,9 +168,9 @@ class Tokenizer(object):
self._write(tokens.Text(text=text)) self._write(tokens.Text(text=text))
self._write(tokens.HTMLEntityEnd()) self._write(tokens.HTMLEntityEnd())
break break
if self._read() is self.END or self._read() not in valid:
if this is self.END or this not in valid:
raise BadRoute(self._pop()) raise BadRoute(self._pop())
text.append(self._read())
text.append(this)
self._head += 1 self._head += 1
except BadRoute: except BadRoute:
self._head = reset self._head = reset
@@ -185,26 +181,28 @@ class Tokenizer(object):
def _parse(self, context=0): def _parse(self, context=0):
self._push(context) self._push(context)
while True: while True:
if self._read() not in self.SENTINELS:
this = self._read()
if this not in self.SENTINELS:
self._write(self._read(), text=True) self._write(self._read(), text=True)
self._head += 1 self._head += 1
continue continue
if self._read() is self.END:
if this is self.END:
if self._context & contexts.TEMPLATE: if self._context & contexts.TEMPLATE:
raise BadRoute(self._pop()) raise BadRoute(self._pop())
return self._pop() return self._pop()
if self._at_head("{{"):
next = self._read(1)
if this == next == "{":
self._parse_template() self._parse_template()
elif self._at_head("|") and self._context & contexts.TEMPLATE:
elif this == "|" and self._context & contexts.TEMPLATE:
self._handle_template_param() self._handle_template_param()
elif self._at_head("=") and self._context & contexts.TEMPLATE_PARAM_KEY:
elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
self._handle_template_param_value() self._handle_template_param_value()
elif self._at_head("}}") and self._context & contexts.TEMPLATE:
elif this == next == "}" and self._context & contexts.TEMPLATE:
return self._handle_template_end() return self._handle_template_end()
elif self._at_head("&"):
elif this == "&":
self._parse_entity() self._parse_entity()
else: else:
self._write(self._read(), text=True)
self._write(this, text=True)
self._head += 1 self._head += 1


def tokenize(self, text): def tokenize(self, text):


Loading…
Cancel
Save