|
@@ -35,8 +35,9 @@ class BadRoute(Exception): |
|
|
class Tokenizer(object): |
|
|
class Tokenizer(object): |
|
|
START = object() |
|
|
START = object() |
|
|
END = object() |
|
|
END = object() |
|
|
SENTINELS = ["{", "}", "[", "]", "|", "=", "&", END] |
|
|
|
|
|
REGEX = r"([{}\[\]|=&;])" |
|
|
|
|
|
|
|
|
SENTINELS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", |
|
|
|
|
|
":", "/", "-", END] |
|
|
|
|
|
regex = re.compile(r"([{}\[\]<>|=&#*;:/-])", flags=re.IGNORECASE) |
|
|
|
|
|
|
|
|
def __init__(self): |
|
|
def __init__(self): |
|
|
self._text = None |
|
|
self._text = None |
|
@@ -137,45 +138,48 @@ class Tokenizer(object): |
|
|
return self._pop() |
|
|
return self._pop() |
|
|
|
|
|
|
|
|
def _parse_entity(self):
    """Try to parse an HTML entity starting at the current head.

    On success, the entity's tokens (start marker, optional numeric and
    hex markers, the entity text, end marker) are written to the enclosing
    token stack. On failure, the head is restored to where it was on entry
    and the segment at that position is written as plain text instead.

    NOTE(review): presumably the head points at the "&" segment when this
    is called, and the caller advances past the closing ";" afterwards --
    confirm against the call site, which is not visible here.
    """
    # Remember the entry position so a failed parse can be rolled back.
    reset = self._head
    self._push()
    try:
        self._write(tokens.HTMLEntityStart())
        self._head += 1
        this = self._read()
        if not this or this is self.END:
            raise BadRoute(self._pop())
        numeric = hexadecimal = False
        if this == "#":
            # "#" is its own segment, so the digits follow in the next one.
            numeric = True
            self._write(tokens.HTMLEntityNumeric())
            self._head += 1
            this = self._read()
            if not this or this is self.END:
                raise BadRoute(self._pop())
            if this[0].lower() == "x":
                hexadecimal = True
                self._write(tokens.HTMLEntityHex(char=this[0]))
                this = this[1:]
                if not this:
                    # "&#x;" has no digits; without this check all() on an
                    # empty string passes and int("", 16) raises ValueError.
                    raise BadRoute(self._pop())
        valid = string.hexdigits if hexadecimal else string.digits
        if not numeric and not hexadecimal:
            valid += string.ascii_letters
        if not all(char in valid for char in this):
            raise BadRoute(self._pop())
        self._head += 1
        if self._read() != ";":
            raise BadRoute(self._pop())
        if numeric:
            test = int(this, 16) if hexadecimal else int(this)
            # Reject code points outside the range 1..0x10FFFF.
            if test < 1 or test > 0x10FFFF:
                raise BadRoute(self._pop())
        else:
            if this not in htmlentitydefs.entitydefs:
                raise BadRoute(self._pop())
        self._write(tokens.Text(text=this))
        self._write(tokens.HTMLEntityEnd())
    except BadRoute:
        # Roll back and emit the segment at the entry position as text.
        self._head = reset
        self._write_text(self._read())
    else:
        self._write_all(self._pop())
|
|
|
|
|
|
|
|
|
|
|
def _parse(self, context=0): |
|
|
def _parse(self, context=0): |
|
|
self._push(context) |
|
|
self._push(context) |
|
@@ -205,6 +209,6 @@ class Tokenizer(object): |
|
|
self._head += 1 |
|
|
self._head += 1 |
|
|
|
|
|
|
|
|
def tokenize(self, text):
    """Split *text* into segments and return the result of parsing them.

    The text is split with the class's precompiled ``regex``; because the
    pattern is wrapped in a capturing group, the delimiter characters are
    kept as their own segments. Empty segments are discarded, the list is
    stored in ``self._text``, and the value of ``self._parse()`` is
    returned.
    """
    split = self.regex.split(text)
    self._text = [segment for segment in split if segment]
    return self._parse()