Browse Source

Faster parsing: split the text on sentinels instead of every letter.

tags/v0.1
Ben Kurtovic 11 years ago
parent
commit
278594a8cf
2 changed files with 34 additions and 28 deletions
  1. +27
    -27
      mwparserfromhell/parser/tokenizer.py
  2. +7
    -1
      mwparserfromhell/parser/tokens.py

+ 27
- 27
mwparserfromhell/parser/tokenizer.py View File

@@ -21,6 +21,7 @@
# SOFTWARE.

import htmlentitydefs
import re
import string

from . import contexts
@@ -35,6 +36,7 @@ class Tokenizer(object):
# Unique marker objects; END is compared by identity (`this is self.END`).
START = object()
END = object()
# Items singled out for special handling — presumably consulted by _parse(),
# which is not fully visible here; verify against the full file.
SENTINELS = ["{", "}", "[", "]", "|", "=", "&", END]
# Pattern tokenize() passes to re.split(). The character class sits inside a
# capturing group, so each delimiter is kept as its own segment. Note that ";"
# is split on here although it does not appear in SENTINELS.
REGEX = r"([{}\[\]|=&;])"

def __init__(self):
self._text = None
@@ -135,48 +137,45 @@ class Tokenizer(object):
return self._pop()

def _parse_entity(self):
# Try to tokenize an HTML entity: a named one checked against
# htmlentitydefs.entitydefs, or a numeric one ("#" prefix, optionally followed
# by "x" for hexadecimal). Tokens are written to a freshly pushed stack; if
# BadRoute is raised the head is set back to `reset` and whatever
# self._read() returns is written out as text instead.
#
# NOTE(review): this listing comes from a rendered diff without +/- markers
# and without indentation. Statements from two implementations appear to be
# interleaved (a `while True:` loop consuming one item at a time, and a
# version that slices a whole segment with `this[skip:]`); they are not
# sequential steps. Confirm the real body against the repository before
# reusing any of it.
reset = self._head
self._head += 1
try:
self._push()
self._write(tokens.HTMLEntityStart())
# Look one item ahead of the head; running off the end aborts the route.
this = self._read(1)
if this is self.END:
raise BadRoute(self._pop())
numeric = hexadecimal = False
if self._read() == "#":
# Number of leading characters of `this` that are prefix, not entity text.
skip = 0
if this.startswith("#"):
numeric = True
self._write(tokens.HTMLEntityNumeric())
if self._read(1).lower() == "x":
if this[1:].lower().startswith("x"):
hexadecimal = True
self._write(tokens.HTMLEntityHex(char=self._read(1)))
self._head += 2
# Keep the original "x"/"X" character on the token.
self._write(tokens.HTMLEntityHex(char=this[1]))
skip = 2
else:
self._head += 1
text = []
skip = 1
text = this[skip:]
# Allowed characters depend on the entity kind: hex digits, decimal digits,
# or (for named entities) decimal digits plus ASCII letters.
valid = string.hexdigits if hexadecimal else string.digits
if not numeric and not hexadecimal:
valid += string.ascii_letters
while True:
this = self._read()
if this == ";":
text = "".join(text)
if numeric:
test = int(text, 16) if hexadecimal else int(text)
# Accept only values from 1 through 0x10FFFF.
if test < 1 or test > 0x10FFFF:
raise BadRoute(self._pop())
else:
if text not in htmlentitydefs.entitydefs:
raise BadRoute(self._pop())
self._write(tokens.Text(text=text))
self._write(tokens.HTMLEntityEnd())
break
if this is self.END or this not in valid:
# Empty text, or any character outside `valid`, aborts the route.
if not text or not all([char in valid for char in text]):
raise BadRoute(self._pop())
# The item two positions ahead of the head must be the closing ";".
if self._read(2) != ";":
raise BadRoute(self._pop())
if numeric:
test = int(text, 16) if hexadecimal else int(text)
# Accept only values from 1 through 0x10FFFF.
if test < 1 or test > 0x10FFFF:
raise BadRoute(self._pop())
text.append(this)
self._head += 1
else:
# Named entities must be keys of htmlentitydefs.entitydefs.
if text not in htmlentitydefs.entitydefs:
raise BadRoute(self._pop())
self._write(tokens.Text(text=text))
self._write(tokens.HTMLEntityEnd())
except BadRoute:
# Failed route: rewind and emit the item at the head as plain text.
self._head = reset
self._write(self._read(), text=True)
else:
# Success: move the tokens from the popped stack onto the current one.
self._write_all(self._pop())
self._head += 2

def _parse(self, context=0):
self._push(context)
@@ -206,5 +205,6 @@ class Tokenizer(object):
self._head += 1

def tokenize(self, text):
    """Split *text* into segments, store them, and return the parse result.

    The text is split with ``self.REGEX``. Because that pattern wraps its
    character class in a capturing group, ``re.split`` returns each
    delimiter as its own item. Splitting also yields empty strings (for
    example between two adjacent delimiters); those are discarded.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``self._parse()`` returns for the stored segments.
    """
    pieces = re.split(self.REGEX, text, flags=re.I)
    self._text = [piece for piece in pieces if piece]
    return self._parse()

+ 7
- 1
mwparserfromhell/parser/tokens.py View File

@@ -27,7 +27,13 @@ class Token(object):
super(Token, self).__setattr__("_kwargs", kwargs)

def __repr__(self):
    """Return the class name followed by the token's keyword arguments.

    The result looks like ``Name(key=value, ...)``, or ``Name()`` when the
    token has no keyword arguments. Text values longer than 100 characters
    are cut to their first 97 characters plus ``"..."``; every other value
    is shown with its full ``repr()``.
    """
    # Matches str and unicode on Python 2, and str on Python 3.
    text_types = (str, type(u""))
    args = []
    for key, value in self._kwargs.items():
        # Only text is truncated: len() and `value[:97] + "..."` are not
        # defined for arbitrary values such as ints or lists.
        if isinstance(value, text_types) and len(value) > 100:
            value = value[:97] + "..."
        args.append(key + "=" + repr(value))
    return u"{0}({1})".format(type(self).__name__, u", ".join(args))

def __getattr__(self, key):
# Fallback attribute lookup: Python calls this only when normal lookup fails,
# and the name is then read from the _kwargs mapping.
# NOTE(review): assuming _kwargs is a plain dict, an unknown name surfaces as
# KeyError rather than AttributeError, which getattr(obj, name, default) does
# not catch — confirm this is intended.
return self._kwargs[key]


Loading…
Cancel
Save