Browse Source

Improve/fix the way padding is handled.

tags/v0.3
Ben Kurtovic 11 years ago
parent
commit
50beda0914
2 changed files with 13 additions and 16 deletions
  1. +11
    -14
      mwparserfromhell/parser/tokenizer.py
  2. +2
    -2
      tests/tokenizer/integration.mwtest

+ 11
- 14
mwparserfromhell/parser/tokenizer.py View File

@@ -48,7 +48,7 @@ class _TagOpenData(object):

def __init__(self):
self.context = self.CX_NAME
self.padding_buffer = []
self.padding_buffer = {"first": "", "before_eq": "", "after_eq": ""}
self.reset = 0


@@ -62,7 +62,7 @@ class Tokenizer(object):
MAX_DEPTH = 40
MAX_CYCLES = 100000
regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
tag_splitter = re.compile(r"([\s\"\\])")
tag_splitter = re.compile(r"([\s\"\\]+)")

def __init__(self):
self._text = None
@@ -475,12 +475,10 @@ class Tokenizer(object):
self._emit_first(tokens.TagAttrQuote())
self._emit_all(self._pop())
buf = data.padding_buffer
while len(buf) < 3:
buf.append("")
self._emit_first(tokens.TagAttrStart(pad_after_eq=buf.pop(),
pad_before_eq=buf.pop(), pad_first=buf.pop()))
self._emit_first(tokens.TagAttrStart(pad_first=buf["first"],
pad_before_eq=buf["before_eq"], pad_after_eq=buf["after_eq"]))
self._emit_all(self._pop())
data.padding_buffer = []
data.padding_buffer = {key: "" for key in data.padding_buffer}

def _handle_tag_data(self, data, text):
"""Handle all sorts of *text* data inside of an HTML open tag."""
@@ -506,14 +504,11 @@ class Tokenizer(object):
self._push(contexts.TAG_ATTR)
elif data.context & data.CX_ATTR_NAME:
if chunk == "=":
if not data.context & data.CX_NOTE_EQUALS:
data.padding_buffer.append("") # No padding before '='
data.context = data.CX_ATTR_VALUE | data.CX_NOTE_QUOTE
self._emit(tokens.TagAttrEquals())
continue
if data.context & data.CX_NOTE_EQUALS:
self._push_tag_buffer(data)
data.padding_buffer.append("") # No padding before tag
data.context = data.CX_ATTR_NAME
self._push(contexts.TAG_ATTR)
elif data.context & data.CX_ATTR_VALUE:
@@ -542,10 +537,13 @@ class Tokenizer(object):
data.context = data.CX_ATTR_READY
elif ctx & data.CX_ATTR_NAME:
data.context |= data.CX_NOTE_EQUALS
data.padding_buffer["before_eq"] += text
if ctx & data.CX_QUOTED and not ctx & data.CX_NOTE_SPACE:
self._emit_text(text)
else:
data.padding_buffer.append(text)
elif data.context & data.CX_ATTR_READY:
data.padding_buffer["first"] += text
elif data.context & data.CX_ATTR_VALUE:
data.padding_buffer["after_eq"] += text

def _handle_tag_text(self, text):
"""Handle regular *text* inside of an HTML open tag."""
@@ -578,8 +576,7 @@ class Tokenizer(object):
"""Handle the closing of a open tag (``<foo>``)."""
if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
self._push_tag_buffer(data)
padding = data.padding_buffer[0] if data.padding_buffer else ""
self._emit(token(padding=padding))
self._emit(token(padding=data.padding_buffer["first"]))
self._head += 1

def _handle_tag_open_close(self):


+ 2
- 2
tests/tokenizer/integration.mwtest View File

@@ -35,8 +35,8 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t

name: rich_tags
label: an HTML tag with tons of other things in it
input: "{{dubious claim}}<ref name={{abc}} foo="bar {{baz}}" abc={{de}}f ghi=j{{k}}{{l}} mno="{{p}} [[q]] {{r}}">[[Source]]</ref>"
output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()]
input: "{{dubious claim}}<ref name={{abc}} foo="bar {{baz}}" abc={{de}}f ghi=j{{k}}{{l}} \n mno = "{{p}} [[q]] {{r}}">[[Source]]</ref>"
output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" \n ", pad_before_eq=" ", pad_after_eq=" "), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---



Loading…
Cancel
Save