
Fix some usage of attrs; shorten a context, fix some behavior I broke.

tags/v0.3
Ben Kurtovic committed 11 years ago · commit a58c480639
3 changed files with 60 additions and 48 deletions:
  1. mwparserfromhell/nodes/tag.py (+6, -5)
  2. mwparserfromhell/parser/contexts.py (+35, -33)
  3. mwparserfromhell/parser/tokenizer.py (+19, -10)

mwparserfromhell/nodes/tag.py (+6, -5)

@@ -59,8 +59,8 @@ class Tag(TagDefinitions, Node):
                 return open_ + str(self.contents) + close
 
         result = "<" + str(self.tag)
-        if self.attrs:
-            result += " " + " ".join([str(attr) for attr in self.attrs])
+        if self.attributes:
+            result += " " + " ".join([str(attr) for attr in self.attributes])
         if self.self_closing:
             result += self.open_padding + "/>"
         else:
@@ -73,7 +73,7 @@ class Tag(TagDefinitions, Node):
         if self.showtag:
             for child in getter(self.tag):
                 yield self.tag, child
-        for attr in self.attrs:
+        for attr in self.attributes:
             for child in getter(attr.name):
                 yield attr.name, child
             if attr.value:
@@ -89,12 +89,13 @@ class Tag(TagDefinitions, Node):
 
     def __showtree__(self, write, get, mark):
         tagnodes = self.tag.nodes
-        if (not self.attrs and len(tagnodes) == 1 and isinstance(tagnodes[0], Text)):
+        if not self.attributes and (len(tagnodes) == 1 and
+                                    isinstance(tagnodes[0], Text)):
             write("<" + str(tagnodes[0]) + ">")
         else:
             write("<")
             get(self.tag)
-            for attr in self.attrs:
+            for attr in self.attributes:
                 get(attr.name)
                 if not attr.value:
                     continue
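
The rename from `attrs` to `attributes` is mechanical, but the string-building logic it touches is easy to check in isolation. A minimal sketch of what `__unicode__` produces (a simplified stand-in class; the real `Tag` holds wikicode nodes and more fields, so the constructor here is an illustrative assumption, not the library's API):

    # Hypothetical, simplified mirror of Tag.__unicode__ above.
    class FakeTag:
        def __init__(self, tag, attributes=(), self_closing=False,
                     open_padding="", contents=""):
            self.tag = tag
            self.attributes = list(attributes)  # renamed from `attrs`
            self.self_closing = self_closing
            self.open_padding = open_padding
            self.contents = contents

        def __str__(self):
            result = "<" + str(self.tag)
            if self.attributes:
                result += " " + " ".join([str(attr) for attr in self.attributes])
            if self.self_closing:
                return result + self.open_padding + "/>"
            return (result + self.open_padding + ">" + str(self.contents) +
                    "</" + str(self.tag) + ">")

    print(FakeTag("ref", ["name=foo"], contents="bar"))  # <ref name=foo>bar</ref>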


mwparserfromhell/parser/contexts.py (+35, -33)

@@ -71,7 +71,8 @@ Local (stack-specific) contexts:
 
 * :py:const:`TAG_OPEN_ATTR_NAME`
 * :py:const:`TAG_OPEN_ATTR_BODY`
-* :py:const:`TAG_OPEN_ATTR_BODY_QUOTED`
+* :py:const:`TAG_OPEN_ATTR_QUOTED`
+* :py:const:`TAG_OPEN_ATTR_IGNORE`
 
 * :py:const:`TAG_BODY`
 * :py:const:`TAG_CLOSE`
@@ -83,38 +84,39 @@ Global contexts:
 
 # Local contexts:
 
-TEMPLATE =                  0b00000000000000000111
-TEMPLATE_NAME =             0b00000000000000000001
-TEMPLATE_PARAM_KEY =        0b00000000000000000010
-TEMPLATE_PARAM_VALUE =      0b00000000000000000100
-
-ARGUMENT =                  0b00000000000000011000
-ARGUMENT_NAME =             0b00000000000000001000
-ARGUMENT_DEFAULT =          0b00000000000000010000
-
-WIKILINK =                  0b00000000000001100000
-WIKILINK_TITLE =            0b00000000000000100000
-WIKILINK_TEXT =             0b00000000000001000000
-
-HEADING =                   0b00000001111110000000
-HEADING_LEVEL_1 =           0b00000000000010000000
-HEADING_LEVEL_2 =           0b00000000000100000000
-HEADING_LEVEL_3 =           0b00000000001000000000
-HEADING_LEVEL_4 =           0b00000000010000000000
-HEADING_LEVEL_5 =           0b00000000100000000000
-HEADING_LEVEL_6 =           0b00000001000000000000
-
-COMMENT =                   0b00000010000000000000
-
-TAG =                       0b11111100000000000000
-TAG_OPEN =                  0b00111100000000000000
-TAG_OPEN_NAME =             0b00000100000000000000
-TAG_OPEN_ATTR =             0b00111000000000000000
-TAG_OPEN_ATTR_NAME =        0b00001000000000000000
-TAG_OPEN_ATTR_BODY =        0b00010000000000000000
-TAG_OPEN_ATTR_BODY_QUOTED = 0b00100000000000000000
-TAG_BODY =                  0b01000000000000000000
-TAG_CLOSE =                 0b10000000000000000000
+TEMPLATE =                  0b000000000000000000111
+TEMPLATE_NAME =             0b000000000000000000001
+TEMPLATE_PARAM_KEY =        0b000000000000000000010
+TEMPLATE_PARAM_VALUE =      0b000000000000000000100
+
+ARGUMENT =                  0b000000000000000011000
+ARGUMENT_NAME =             0b000000000000000001000
+ARGUMENT_DEFAULT =          0b000000000000000010000
+
+WIKILINK =                  0b000000000000001100000
+WIKILINK_TITLE =            0b000000000000000100000
+WIKILINK_TEXT =             0b000000000000001000000
+
+HEADING =                   0b000000001111110000000
+HEADING_LEVEL_1 =           0b000000000000010000000
+HEADING_LEVEL_2 =           0b000000000000100000000
+HEADING_LEVEL_3 =           0b000000000001000000000
+HEADING_LEVEL_4 =           0b000000000010000000000
+HEADING_LEVEL_5 =           0b000000000100000000000
+HEADING_LEVEL_6 =           0b000000001000000000000
+
+COMMENT =                   0b000000010000000000000
+
+TAG =                       0b111111100000000000000
+TAG_OPEN =                  0b001111100000000000000
+TAG_OPEN_NAME =             0b000000100000000000000
+TAG_OPEN_ATTR =             0b001111000000000000000
+TAG_OPEN_ATTR_NAME =        0b000001000000000000000
+TAG_OPEN_ATTR_BODY =        0b000010000000000000000
+TAG_OPEN_ATTR_QUOTED =      0b000100000000000000000
+TAG_OPEN_ATTR_IGNORE =      0b001000000000000000000
+TAG_BODY =                  0b010000000000000000000
+TAG_CLOSE =                 0b100000000000000000000
 
 
 # Global contexts:
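
Each context is a single bit, so the constants above compose with plain bitwise arithmetic: `|` enters a context, `&` tests for it, and `^` leaves it. A standalone illustration using the new values:

    TAG_OPEN_ATTR_NAME   = 0b000001000000000000000
    TAG_OPEN_ATTR_QUOTED = 0b000100000000000000000

    context = 0
    context |= TAG_OPEN_ATTR_NAME        # enter the attribute-name context
    assert context & TAG_OPEN_ATTR_NAME  # nonzero means the bit is set
    context ^= TAG_OPEN_ATTR_NAME        # XOR clears a bit known to be set
    context |= TAG_OPEN_ATTR_QUOTED
    assert not context & TAG_OPEN_ATTR_NAME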


mwparserfromhell/parser/tokenizer.py (+19, -10)

@@ -457,11 +457,13 @@ class Tokenizer(object):
         self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))
         self._context ^= contexts.TAG_OPEN_NAME
         self._context |= contexts.TAG_BODY
-        padding = ""  # TODO
+
+        ## If the last element was TagAttrStart, remove it, add " " to its padding, then return that
+        padding = ""
         return padding
 
     def _actually_handle_chunk(self, chunks, is_new):
-        if is_new and not self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED:
+        if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED:
             padding = 0
             while chunks:
                 if chunks[0] == "":
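
For reference, splitting on a single space keeps runs of spaces as empty chunks, which is what the `padding` loop above counts before it emits `TagAttrStart`. In isolation:

    chunks = "ref   name=foo".split(" ")  # ['ref', '', '', 'name=foo']
    chunks.pop(0)                         # the tag name is handled first
    padding = 0
    while chunks and chunks[0] == "":     # same effect as the loop above
        chunks.pop(0)
        padding += 1
    assert padding == 2 and chunks == ["name=foo"]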
@@ -470,18 +472,24 @@
                 else:
                     break
             self._write(tokens.TagAttrStart(padding=" " * padding))
+        elif self._context & contexts.TAG_OPEN_ATTR_IGNORE:
+            self._context ^= contexts.TAG_OPEN_ATTR_IGNORE
+            chunks.pop(0)
+            return
+        elif self._context & contexts.TAG_OPEN_ATTR_QUOTED:
+            self._write_text(" ")  # Quoted chunks don't lose their spaces
 
         if chunks:
             chunk = chunks.pop(0)
             if self._context & contexts.TAG_OPEN_ATTR_BODY:
                 self._context ^= contexts.TAG_OPEN_ATTR_BODY
                 self._context |= contexts.TAG_OPEN_ATTR_NAME
-            if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED:
+            if self._context & contexts.TAG_OPEN_ATTR_QUOTED:
                 if re.search(r'[^\\]"', chunk[:-1]):
                     self._fail_route()
                 if re.search(r'[^\\]"$', chunk):
                     self._write_text(chunk[:-1])
-                    self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED
+                    self._context ^= contexts.TAG_OPEN_ATTR_QUOTED
                     self._context |= contexts.TAG_OPEN_ATTR_NAME
                     return True  # Back to _handle_tag_attribute_body()
             self._write_text(chunk)
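
The two regular expressions above split three cases apart: an unescaped quote before the chunk's last character is illegal, an unescaped quote at the very end closes the value, and anything else is ordinary text. A standalone check of that behavior (the chunk values are made-up examples):

    import re

    def classify(chunk):
        if re.search(r'[^\\]"', chunk[:-1]):  # unescaped quote mid-chunk
            return "fail"
        if re.search(r'[^\\]"$', chunk):      # unescaped quote at the end
            return "close"
        return "text"

    assert classify('world"') == "close"
    assert classify('wo"rld') == "fail"
    assert classify(r'say\"hi') == "text"     # escaped quotes pass through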
@@ -491,6 +499,8 @@
             self._write_text(text)
             return
         chunks = text.split(" ")
+        is_new = False
+        is_quoted = False
         if self._context & contexts.TAG_OPEN_NAME:
             self._write_text(chunks.pop(0))
             tag = self._get_tag_type_from_stack()
@@ -500,9 +510,7 @@
             self._context ^= contexts.TAG_OPEN_NAME
             self._context |= contexts.TAG_OPEN_ATTR_NAME
             self._actually_handle_chunk(chunks, True)
-
-        is_new = False
-        is_quoted = False
+            is_new = True
         while chunks:
             result = self._actually_handle_chunk(chunks, is_new)
             is_quoted = result or is_quoted
@@ -530,7 +538,7 @@
         self._head += 1
         reset = self._head
         try:
-            attr = self._parse(contexts.TAG_OPEN_ATTR_BODY_QUOTED)
+            attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | contexts.TAG_OPEN_ATTR_IGNORE)
         except BadRoute:
             self._head = reset
             self._write_text(next)
@@ -538,6 +546,7 @@
             self._write(tokens.TagAttrQuote())
             self._write_text(next[1:])
             self._write_all(attr)
+            return
         self._context ^= contexts.TAG_OPEN_ATTR_BODY
         self._context |= contexts.TAG_OPEN_ATTR_NAME
         while chunks:
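
This hunk sits inside the tokenizer's usual backtracking idiom: remember `self._head`, speculatively `_parse` under a new context, and on `BadRoute` rewind and emit the text verbatim; the added `return` stops the quoted branch from falling through into the unquoted handling below it. A minimal standalone reduction of that idiom (names are illustrative, not the parser's real internals):

    class BadRoute(Exception):
        pass

    def speculate(state, parse, fallback):
        reset = state["head"]       # remember where parsing started
        try:
            return parse(state)     # try the speculative route
        except BadRoute:
            state["head"] = reset   # rewind the head on failure...
            return fallback(state)  # ...and fall back to literal text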
@@ -588,7 +597,7 @@
             contexts.HEADING | contexts.COMMENT | contexts.TAG)
         double_fail = (
             contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE |
-            contexts.TAG_OPEN_ATTR_BODY_QUOTED)
+            contexts.TAG_OPEN_ATTR_QUOTED)
         if self._context & double_fail:
             self._pop()
         if self._context & fail:
@@ -645,7 +654,7 @@
         elif this == "<" and next != "/" and (
                 not self._context & (contexts.TAG ^ contexts.TAG_BODY)):
             self._parse_tag()
-        elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_BODY_QUOTED):
+        elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED):
             if this == "\n":
                 if self._context & contexts.TAG_CLOSE:
                     self._pop()
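
Because `TAG_OPEN_ATTR_QUOTED` is one of the bits that make up `TAG_OPEN`, XOR-ing the two produces "every open-tag context except the quoted one", so characters like a newline keep their special handling everywhere but inside quoted attribute values. Checking that in isolation with the values from contexts.py:

    TAG_OPEN             = 0b001111100000000000000
    TAG_OPEN_NAME        = 0b000000100000000000000
    TAG_OPEN_ATTR_QUOTED = 0b000100000000000000000

    mask = TAG_OPEN ^ TAG_OPEN_ATTR_QUOTED  # TAG_OPEN minus the quoted bit
    assert not mask & TAG_OPEN_ATTR_QUOTED
    assert mask & TAG_OPEN_NAME             # the other open-tag bits remain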

