Browse Source

Split on all characters that we use.

tags/v0.1
Ben Kurtovic 12 years ago
parent
commit
40b6480aa7
1 changed files with 23 additions and 19 deletions
  1. +23
    -19
      mwparserfromhell/parser/tokenizer.py

+ 23
- 19
mwparserfromhell/parser/tokenizer.py View File

@@ -35,8 +35,9 @@ class BadRoute(Exception):
class Tokenizer(object): class Tokenizer(object):
START = object() START = object()
END = object() END = object()
SENTINELS = ["{", "}", "[", "]", "|", "=", "&", END]
REGEX = r"([{}\[\]|=&;])"
SENTINELS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";",
":", "/", "-", END]
regex = re.compile(r"([{}\[\]<>|=&#*;:/-])", flags=re.IGNORECASE)


def __init__(self): def __init__(self):
self._text = None self._text = None
@@ -137,45 +138,48 @@ class Tokenizer(object):
return self._pop() return self._pop()


def _parse_entity(self): def _parse_entity(self):
reset = self._head
self._push() self._push()
try: try:
self._write(tokens.HTMLEntityStart()) self._write(tokens.HTMLEntityStart())
this = self._read(1)
if this is self.END:
self._head += 1
this = self._read()
if not this or this is self.END:
raise BadRoute(self._pop()) raise BadRoute(self._pop())
numeric = hexadecimal = False numeric = hexadecimal = False
skip = 0
if this.startswith("#"):
if this == "#":
numeric = True numeric = True
self._write(tokens.HTMLEntityNumeric()) self._write(tokens.HTMLEntityNumeric())
if this[1:].lower().startswith("x"):
self._head += 1
this = self._read()
if not this or this is self.END:
raise BadRoute(self._pop())
if this[0].lower() == "x":
hexadecimal = True hexadecimal = True
self._write(tokens.HTMLEntityHex(char=this[1]))
skip = 2
else:
skip = 1
text = this[skip:]
self._write(tokens.HTMLEntityHex(char=this[0]))
this = this[1:]
valid = string.hexdigits if hexadecimal else string.digits valid = string.hexdigits if hexadecimal else string.digits
if not numeric and not hexadecimal: if not numeric and not hexadecimal:
valid += string.ascii_letters valid += string.ascii_letters
if not text or not all([char in valid for char in text]):
if not all([char in valid for char in this]):
raise BadRoute(self._pop()) raise BadRoute(self._pop())
if self._read(2) != ";":
self._head += 1
if self._read() != ";":
raise BadRoute(self._pop()) raise BadRoute(self._pop())
if numeric: if numeric:
test = int(text, 16) if hexadecimal else int(text)
test = int(this, 16) if hexadecimal else int(this)
if test < 1 or test > 0x10FFFF: if test < 1 or test > 0x10FFFF:
raise BadRoute(self._pop()) raise BadRoute(self._pop())
else: else:
if text not in htmlentitydefs.entitydefs:
if this not in htmlentitydefs.entitydefs:
raise BadRoute(self._pop()) raise BadRoute(self._pop())
self._write(tokens.Text(text=text))
self._write(tokens.Text(text=this))
self._write(tokens.HTMLEntityEnd()) self._write(tokens.HTMLEntityEnd())
except BadRoute: except BadRoute:
self._head = reset
self._write_text(self._read()) self._write_text(self._read())
else: else:
self._write_all(self._pop()) self._write_all(self._pop())
self._head += 2


def _parse(self, context=0): def _parse(self, context=0):
self._push(context) self._push(context)
@@ -205,6 +209,6 @@ class Tokenizer(object):
self._head += 1 self._head += 1


def tokenize(self, text): def tokenize(self, text):
split = re.split(self.REGEX, text, flags=re.I)
split = self.regex.split(text)
self._text = [segment for segment in split if segment] self._text = [segment for segment in split if segment]
return self._parse() return self._parse()

Loading…
Cancel
Save