|
@@ -42,9 +42,9 @@ class _TagOpenData(object): |
|
|
CX_ATTR_NAME = 1 << 2 |
|
|
CX_ATTR_NAME = 1 << 2 |
|
|
CX_ATTR_VALUE = 1 << 3 |
|
|
CX_ATTR_VALUE = 1 << 3 |
|
|
CX_QUOTED = 1 << 4 |
|
|
CX_QUOTED = 1 << 4 |
|
|
CX_NEED_SPACE = 1 << 5 |
|
|
|
|
|
CX_NEED_EQUALS = 1 << 6 |
|
|
|
|
|
CX_NEED_QUOTE = 1 << 7 |
|
|
|
|
|
|
|
|
CX_NOTE_SPACE = 1 << 5 |
|
|
|
|
|
CX_NOTE_EQUALS = 1 << 6 |
|
|
|
|
|
CX_NOTE_QUOTE = 1 << 7 |
|
|
|
|
|
|
|
|
def __init__(self): |
|
|
def __init__(self): |
|
|
self.context = self.CX_NAME |
|
|
self.context = self.CX_NAME |
|
@@ -58,10 +58,10 @@ class Tokenizer(object): |
|
|
START = object() |
|
|
START = object() |
|
|
END = object() |
|
|
END = object() |
|
|
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", |
|
|
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", |
|
|
"/", "-", "!", "\n", END] |
|
|
|
|
|
|
|
|
"/", "\\", '"', "-", "!", "\n", END] |
|
|
MAX_DEPTH = 40 |
|
|
MAX_DEPTH = 40 |
|
|
MAX_CYCLES = 100000 |
|
|
MAX_CYCLES = 100000 |
|
|
regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE) |
|
|
|
|
|
|
|
|
regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE) |
|
|
tag_splitter = re.compile(r"([\s\"\\])") |
|
|
tag_splitter = re.compile(r"([\s\"\\])") |
|
|
|
|
|
|
|
|
def __init__(self): |
|
|
def __init__(self): |
|
@@ -445,7 +445,7 @@ class Tokenizer(object): |
|
|
while True: |
|
|
while True: |
|
|
this, next = self._read(), self._read(1) |
|
|
this, next = self._read(), self._read(1) |
|
|
can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or |
|
|
can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or |
|
|
data.context & data.CX_NEED_SPACE) |
|
|
|
|
|
|
|
|
data.context & data.CX_NOTE_SPACE) |
|
|
if this is self.END: |
|
|
if this is self.END: |
|
|
if self._context & contexts.TAG_ATTR: |
|
|
if self._context & contexts.TAG_ATTR: |
|
|
if data.context & data.CX_QUOTED: |
|
|
if data.context & data.CX_QUOTED: |
|
@@ -488,11 +488,11 @@ class Tokenizer(object): |
|
|
if data.context & data.CX_NAME: |
|
|
if data.context & data.CX_NAME: |
|
|
if chunk in self.MARKERS or chunk.isspace(): |
|
|
if chunk in self.MARKERS or chunk.isspace(): |
|
|
self._fail_route() # Tags must start with text, not spaces |
|
|
self._fail_route() # Tags must start with text, not spaces |
|
|
data.context = data.CX_NEED_SPACE |
|
|
|
|
|
|
|
|
data.context = data.CX_NOTE_SPACE |
|
|
elif chunk.isspace(): |
|
|
elif chunk.isspace(): |
|
|
self._handle_tag_space(data, chunk) |
|
|
self._handle_tag_space(data, chunk) |
|
|
continue |
|
|
continue |
|
|
elif data.context & data.CX_NEED_SPACE: |
|
|
|
|
|
|
|
|
elif data.context & data.CX_NOTE_SPACE: |
|
|
if data.context & data.CX_QUOTED: |
|
|
if data.context & data.CX_QUOTED: |
|
|
data.context = data.CX_ATTR_VALUE |
|
|
data.context = data.CX_ATTR_VALUE |
|
|
self._pop() |
|
|
self._pop() |
|
@@ -504,43 +504,43 @@ class Tokenizer(object): |
|
|
self._push(contexts.TAG_ATTR) |
|
|
self._push(contexts.TAG_ATTR) |
|
|
elif data.context & data.CX_ATTR_NAME: |
|
|
elif data.context & data.CX_ATTR_NAME: |
|
|
if chunk == "=": |
|
|
if chunk == "=": |
|
|
if not data.context & data.CX_NEED_EQUALS: |
|
|
|
|
|
|
|
|
if not data.context & data.CX_NOTE_EQUALS: |
|
|
data.padding_buffer.append("") # No padding before '=' |
|
|
data.padding_buffer.append("") # No padding before '=' |
|
|
data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE |
|
|
|
|
|
|
|
|
data.context = data.CX_ATTR_VALUE | data.CX_NOTE_QUOTE |
|
|
self._emit(tokens.TagAttrEquals()) |
|
|
self._emit(tokens.TagAttrEquals()) |
|
|
continue |
|
|
continue |
|
|
if data.context & data.CX_NEED_EQUALS: |
|
|
|
|
|
|
|
|
if data.context & data.CX_NOTE_EQUALS: |
|
|
self._push_tag_buffer(data) |
|
|
self._push_tag_buffer(data) |
|
|
data.padding_buffer.append("") # No padding before tag |
|
|
data.padding_buffer.append("") # No padding before tag |
|
|
data.context = data.CX_ATTR_NAME |
|
|
data.context = data.CX_ATTR_NAME |
|
|
self._push(contexts.TAG_ATTR) |
|
|
self._push(contexts.TAG_ATTR) |
|
|
elif data.context & data.CX_ATTR_VALUE: |
|
|
elif data.context & data.CX_ATTR_VALUE: |
|
|
### handle backslashes here |
|
|
|
|
|
if data.context & data.CX_NEED_QUOTE: |
|
|
|
|
|
data.context ^= data.CX_NEED_QUOTE |
|
|
|
|
|
if chunk == '"': |
|
|
|
|
|
|
|
|
escaped = self._read(-1) == "\\" and self._read(-2) != "\\" |
|
|
|
|
|
if data.context & data.CX_NOTE_QUOTE: |
|
|
|
|
|
data.context ^= data.CX_NOTE_QUOTE |
|
|
|
|
|
if chunk == '"' and not escaped: |
|
|
data.context |= data.CX_QUOTED |
|
|
data.context |= data.CX_QUOTED |
|
|
self._push(self._context) |
|
|
self._push(self._context) |
|
|
data.reset = self._head |
|
|
data.reset = self._head |
|
|
continue |
|
|
continue |
|
|
elif data.context & data.CX_QUOTED: |
|
|
elif data.context & data.CX_QUOTED: |
|
|
if chunk == '"': |
|
|
|
|
|
data.context |= data.CX_NEED_SPACE |
|
|
|
|
|
|
|
|
if chunk == '"' and not escaped: |
|
|
|
|
|
data.context |= data.CX_NOTE_SPACE |
|
|
continue |
|
|
continue |
|
|
self._handle_tag_text(chunk) |
|
|
self._handle_tag_text(chunk) |
|
|
|
|
|
|
|
|
def _handle_tag_space(self, data, text): |
|
|
def _handle_tag_space(self, data, text): |
|
|
"""Handle whitespace (*text*) inside of an HTML open tag.""" |
|
|
"""Handle whitespace (*text*) inside of an HTML open tag.""" |
|
|
ctx = data.context |
|
|
ctx = data.context |
|
|
end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (data.CX_QUOTED | data.CX_NEED_QUOTE) |
|
|
|
|
|
if end_of_value or (ctx & data.CX_QUOTED and ctx & data.CX_NEED_SPACE): |
|
|
|
|
|
|
|
|
end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (data.CX_QUOTED | data.CX_NOTE_QUOTE) |
|
|
|
|
|
if end_of_value or (ctx & data.CX_QUOTED and ctx & data.CX_NOTE_SPACE): |
|
|
self._push_tag_buffer(data) |
|
|
self._push_tag_buffer(data) |
|
|
data.context = data.CX_ATTR_READY |
|
|
data.context = data.CX_ATTR_READY |
|
|
elif ctx & data.CX_NEED_SPACE: |
|
|
|
|
|
|
|
|
elif ctx & data.CX_NOTE_SPACE: |
|
|
data.context = data.CX_ATTR_READY |
|
|
data.context = data.CX_ATTR_READY |
|
|
elif ctx & data.CX_ATTR_NAME: |
|
|
elif ctx & data.CX_ATTR_NAME: |
|
|
data.context |= data.CX_NEED_EQUALS |
|
|
|
|
|
if ctx & data.CX_QUOTED and not ctx & data.CX_NEED_SPACE: |
|
|
|
|
|
|
|
|
data.context |= data.CX_NOTE_EQUALS |
|
|
|
|
|
if ctx & data.CX_QUOTED and not ctx & data.CX_NOTE_SPACE: |
|
|
self._emit_text(text) |
|
|
self._emit_text(text) |
|
|
else: |
|
|
else: |
|
|
data.padding_buffer.append(text) |
|
|
data.padding_buffer.append(text) |
|
|