Browse Source

Improve/fix the way padding is handled.

tags/v0.3
Ben Kurtovic 11 years ago
parent
commit
50beda0914
2 changed files with 13 additions and 16 deletions
  1. +11
    -14
      mwparserfromhell/parser/tokenizer.py
  2. +2
    -2
      tests/tokenizer/integration.mwtest

+ 11
- 14
mwparserfromhell/parser/tokenizer.py View File

@@ -48,7 +48,7 @@ class _TagOpenData(object):


def __init__(self): def __init__(self):
self.context = self.CX_NAME self.context = self.CX_NAME
self.padding_buffer = []
self.padding_buffer = {"first": "", "before_eq": "", "after_eq": ""}
self.reset = 0 self.reset = 0




@@ -62,7 +62,7 @@ class Tokenizer(object):
MAX_DEPTH = 40 MAX_DEPTH = 40
MAX_CYCLES = 100000 MAX_CYCLES = 100000
regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE) regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
tag_splitter = re.compile(r"([\s\"\\])")
tag_splitter = re.compile(r"([\s\"\\]+)")


def __init__(self): def __init__(self):
self._text = None self._text = None
@@ -475,12 +475,10 @@ class Tokenizer(object):
self._emit_first(tokens.TagAttrQuote()) self._emit_first(tokens.TagAttrQuote())
self._emit_all(self._pop()) self._emit_all(self._pop())
buf = data.padding_buffer buf = data.padding_buffer
while len(buf) < 3:
buf.append("")
self._emit_first(tokens.TagAttrStart(pad_after_eq=buf.pop(),
pad_before_eq=buf.pop(), pad_first=buf.pop()))
self._emit_first(tokens.TagAttrStart(pad_first=buf["first"],
pad_before_eq=buf["before_eq"], pad_after_eq=buf["after_eq"]))
self._emit_all(self._pop()) self._emit_all(self._pop())
data.padding_buffer = []
data.padding_buffer = {key: "" for key in data.padding_buffer}


def _handle_tag_data(self, data, text): def _handle_tag_data(self, data, text):
"""Handle all sorts of *text* data inside of an HTML open tag.""" """Handle all sorts of *text* data inside of an HTML open tag."""
@@ -506,14 +504,11 @@ class Tokenizer(object):
self._push(contexts.TAG_ATTR) self._push(contexts.TAG_ATTR)
elif data.context & data.CX_ATTR_NAME: elif data.context & data.CX_ATTR_NAME:
if chunk == "=": if chunk == "=":
if not data.context & data.CX_NOTE_EQUALS:
data.padding_buffer.append("") # No padding before '='
data.context = data.CX_ATTR_VALUE | data.CX_NOTE_QUOTE data.context = data.CX_ATTR_VALUE | data.CX_NOTE_QUOTE
self._emit(tokens.TagAttrEquals()) self._emit(tokens.TagAttrEquals())
continue continue
if data.context & data.CX_NOTE_EQUALS: if data.context & data.CX_NOTE_EQUALS:
self._push_tag_buffer(data) self._push_tag_buffer(data)
data.padding_buffer.append("") # No padding before tag
data.context = data.CX_ATTR_NAME data.context = data.CX_ATTR_NAME
self._push(contexts.TAG_ATTR) self._push(contexts.TAG_ATTR)
elif data.context & data.CX_ATTR_VALUE: elif data.context & data.CX_ATTR_VALUE:
@@ -542,10 +537,13 @@ class Tokenizer(object):
data.context = data.CX_ATTR_READY data.context = data.CX_ATTR_READY
elif ctx & data.CX_ATTR_NAME: elif ctx & data.CX_ATTR_NAME:
data.context |= data.CX_NOTE_EQUALS data.context |= data.CX_NOTE_EQUALS
data.padding_buffer["before_eq"] += text
if ctx & data.CX_QUOTED and not ctx & data.CX_NOTE_SPACE: if ctx & data.CX_QUOTED and not ctx & data.CX_NOTE_SPACE:
self._emit_text(text) self._emit_text(text)
else:
data.padding_buffer.append(text)
elif data.context & data.CX_ATTR_READY:
data.padding_buffer["first"] += text
elif data.context & data.CX_ATTR_VALUE:
data.padding_buffer["after_eq"] += text


def _handle_tag_text(self, text): def _handle_tag_text(self, text):
"""Handle regular *text* inside of an HTML open tag.""" """Handle regular *text* inside of an HTML open tag."""
@@ -578,8 +576,7 @@ class Tokenizer(object):
"""Handle the closing of a open tag (``<foo>``).""" """Handle the closing of a open tag (``<foo>``)."""
if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
self._push_tag_buffer(data) self._push_tag_buffer(data)
padding = data.padding_buffer[0] if data.padding_buffer else ""
self._emit(token(padding=padding))
self._emit(token(padding=data.padding_buffer["first"]))
self._head += 1 self._head += 1


def _handle_tag_open_close(self): def _handle_tag_open_close(self):


+ 2
- 2
tests/tokenizer/integration.mwtest View File

@@ -35,8 +35,8 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t


name: rich_tags name: rich_tags
label: an HTML tag with tons of other things in it label: an HTML tag with tons of other things in it
input: "{{dubious claim}}<ref name={{abc}} foo="bar {{baz}}" abc={{de}}f ghi=j{{k}}{{l}} mno="{{p}} [[q]] {{r}}">[[Source]]</ref>"
output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()]
input: "{{dubious claim}}<ref name={{abc}} foo="bar {{baz}}" abc={{de}}f ghi=j{{k}}{{l}} \n mno = "{{p}} [[q]] {{r}}">[[Source]]</ref>"
output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" \n ", pad_before_eq=" ", pad_after_eq=" "), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()]


--- ---




Loading…
Cancel
Save