diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 833b597..94f92c5 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -59,8 +59,8 @@ class Tag(TagDefinitions, Node): return open_ + str(self.contents) + close result = "<" + str(self.tag) - if self.attrs: - result += " " + " ".join([str(attr) for attr in self.attrs]) + if self.attributes: + result += " " + " ".join([str(attr) for attr in self.attributes]) if self.self_closing: result += self.open_padding + "/>" else: @@ -73,7 +73,7 @@ class Tag(TagDefinitions, Node): if self.showtag: for child in getter(self.tag): yield self.tag, child - for attr in self.attrs: + for attr in self.attributes: for child in getter(attr.name): yield attr.name, child if attr.value: @@ -89,12 +89,13 @@ class Tag(TagDefinitions, Node): def __showtree__(self, write, get, mark): tagnodes = self.tag.nodes - if (not self.attrs and len(tagnodes) == 1 and isinstance(tagnodes[0], Text)): + if not self.attributes and (len(tagnodes) == 1 and + isinstance(tagnodes[0], Text)): write("<" + str(tagnodes[0]) + ">") else: write("<") get(self.tag) - for attr in self.attrs: + for attr in self.attributes: get(attr.name) if not attr.value: continue diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 053c930..d87da9a 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -71,7 +71,8 @@ Local (stack-specific) contexts: * :py:const:`TAG_OPEN_ATTR_NAME` * :py:const:`TAG_OPEN_ATTR_BODY` - * :py:const:`TAG_OPEN_ATTR_BODY_QUOTED` + * :py:const:`TAG_OPEN_ATTR_QUOTED` + * :py:const:`TAG_OPEN_ATTR_IGNORE` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -83,38 +84,39 @@ Global contexts: # Local contexts: -TEMPLATE = 0b00000000000000000111 -TEMPLATE_NAME = 0b00000000000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000000000100 - -ARGUMENT = 0b00000000000000011000 -ARGUMENT_NAME = 0b00000000000000001000 -ARGUMENT_DEFAULT = 0b00000000000000010000 - -WIKILINK = 0b00000000000001100000 -WIKILINK_TITLE = 0b00000000000000100000 -WIKILINK_TEXT = 0b00000000000001000000 - -HEADING = 0b00000001111110000000 -HEADING_LEVEL_1 = 0b00000000000010000000 -HEADING_LEVEL_2 = 0b00000000000100000000 -HEADING_LEVEL_3 = 0b00000000001000000000 -HEADING_LEVEL_4 = 0b00000000010000000000 -HEADING_LEVEL_5 = 0b00000000100000000000 -HEADING_LEVEL_6 = 0b00000001000000000000 - -COMMENT = 0b00000010000000000000 - -TAG = 0b11111100000000000000 -TAG_OPEN = 0b00111100000000000000 -TAG_OPEN_NAME = 0b00000100000000000000 -TAG_OPEN_ATTR = 0b00111000000000000000 -TAG_OPEN_ATTR_NAME = 0b00001000000000000000 -TAG_OPEN_ATTR_BODY = 0b00010000000000000000 -TAG_OPEN_ATTR_BODY_QUOTED = 0b00100000000000000000 -TAG_BODY = 0b01000000000000000000 -TAG_CLOSE = 0b10000000000000000000 +TEMPLATE = 0b000000000000000000111 +TEMPLATE_NAME = 0b000000000000000000001 +TEMPLATE_PARAM_KEY = 0b000000000000000000010 +TEMPLATE_PARAM_VALUE = 0b000000000000000000100 + +ARGUMENT = 0b000000000000000011000 +ARGUMENT_NAME = 0b000000000000000001000 +ARGUMENT_DEFAULT = 0b000000000000000010000 + +WIKILINK = 0b000000000000001100000 +WIKILINK_TITLE = 0b000000000000000100000 +WIKILINK_TEXT = 0b000000000000001000000 + +HEADING = 0b000000001111110000000 +HEADING_LEVEL_1 = 0b000000000000010000000 +HEADING_LEVEL_2 = 0b000000000000100000000 +HEADING_LEVEL_3 = 0b000000000001000000000 +HEADING_LEVEL_4 = 0b000000000010000000000 +HEADING_LEVEL_5 = 0b000000000100000000000 +HEADING_LEVEL_6 = 0b000000001000000000000 + +COMMENT = 0b000000010000000000000 + +TAG = 0b111111100000000000000 +TAG_OPEN = 0b001111100000000000000 +TAG_OPEN_NAME = 0b000000100000000000000 +TAG_OPEN_ATTR = 0b001111000000000000000 +TAG_OPEN_ATTR_NAME = 0b000001000000000000000 +TAG_OPEN_ATTR_BODY = 0b000010000000000000000 +TAG_OPEN_ATTR_QUOTED = 0b000100000000000000000 +TAG_OPEN_ATTR_IGNORE = 0b001000000000000000000 +TAG_BODY = 0b010000000000000000000 +TAG_CLOSE = 0b100000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 46c4399..1d31fa4 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -457,11 +457,13 @@ class Tokenizer(object): self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY - padding = "" # TODO + + ## If the last element was TagAttrStart, remove it, add " " to its padding, then return that + padding = "" return padding def _actually_handle_chunk(self, chunks, is_new): - if is_new and not self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: padding = 0 while chunks: if chunks[0] == "": @@ -470,18 +472,24 @@ class Tokenizer(object): else: break self._write(tokens.TagAttrStart(padding=" " * padding)) + elif self._context & contexts.TAG_OPEN_ATTR_IGNORE: + self._context ^= contexts.TAG_OPEN_ATTR_IGNORE + chunks.pop(0) + return + elif self._context & contexts.TAG_OPEN_ATTR_QUOTED: + self._write_text(" ") # Quoted chunks don't lose their spaces if chunks: chunk = chunks.pop(0) if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY self._context |= contexts.TAG_OPEN_ATTR_NAME - if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if self._context & contexts.TAG_OPEN_ATTR_QUOTED: if re.search(r'[^\\]"', chunk[:-1]): self._fail_route() if re.search(r'[^\\]"$', chunk): self._write_text(chunk[:-1]) - self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED + self._context ^= contexts.TAG_OPEN_ATTR_QUOTED self._context |= contexts.TAG_OPEN_ATTR_NAME return True # Back to _handle_tag_attribute_body() self._write_text(chunk) @@ -491,6 +499,8 @@ class Tokenizer(object): self._write_text(text) return chunks = text.split(" ") + is_new = False + is_quoted = False if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) tag = self._get_tag_type_from_stack() @@ -500,9 +510,7 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) - - is_new = False - is_quoted = False + is_new = True while chunks: result = self._actually_handle_chunk(chunks, is_new) is_quoted = result or is_quoted @@ -530,7 +538,7 @@ class Tokenizer(object): self._head += 1 reset = self._head try: - attr = self._parse(contexts.TAG_OPEN_ATTR_BODY_QUOTED) + attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | contexts.TAG_OPEN_ATTR_IGNORE) except BadRoute: self._head = reset self._write_text(next) @@ -538,6 +546,7 @@ class Tokenizer(object): self._write(tokens.TagAttrQuote()) self._write_text(next[1:]) self._write_all(attr) + return self._context ^= contexts.TAG_OPEN_ATTR_BODY self._context |= contexts.TAG_OPEN_ATTR_NAME while chunks: @@ -588,7 +597,7 @@ class Tokenizer(object): contexts.HEADING | contexts.COMMENT | contexts.TAG) double_fail = ( contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | - contexts.TAG_OPEN_ATTR_BODY_QUOTED) + contexts.TAG_OPEN_ATTR_QUOTED) if self._context & double_fail: self._pop() if self._context & fail: @@ -645,7 +654,7 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_BODY_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop()