From d9f23b8faaedb94d667372fb2a892307cf15a38a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 22 Dec 2012 21:58:21 -0500 Subject: [PATCH] Really basic, messy, and fragile tag attribute support. --- mwparserfromhell/parser/contexts.py | 73 +++++++++++++++++++----------------- mwparserfromhell/parser/tokenizer.py | 65 ++++++++++++++++++-------------- 2 files changed, 75 insertions(+), 63 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index a67bd76..053c930 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -65,11 +65,13 @@ Local (stack-specific) contexts: * :py:const:`TAG` * :py:const:`TAG_OPEN` - * :py:const:`TAG_ATTR` - * :py:const:`TAG_ATTR_NAME` - * :py:const:`TAG_ATTR_BODY` - * :py:const:`TAG_ATTR_BODY_QUOTED` + * :py:const:`TAG_OPEN_NAME` + * :py:const:`TAG_OPEN_ATTR` + + * :py:const:`TAG_OPEN_ATTR_NAME` + * :py:const:`TAG_OPEN_ATTR_BODY` + * :py:const:`TAG_OPEN_ATTR_BODY_QUOTED` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -81,37 +83,38 @@ Global contexts: # Local contexts: -TEMPLATE = 0b00000000000000000111 -TEMPLATE_NAME = 0b00000000000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000000000100 - -ARGUMENT = 0b00000000000000011000 -ARGUMENT_NAME = 0b00000000000000001000 -ARGUMENT_DEFAULT = 0b00000000000000010000 - -WIKILINK = 0b00000000000001100000 -WIKILINK_TITLE = 0b00000000000000100000 -WIKILINK_TEXT = 0b00000000000001000000 - -HEADING = 0b00000001111110000000 -HEADING_LEVEL_1 = 0b00000000000010000000 -HEADING_LEVEL_2 = 0b00000000000100000000 -HEADING_LEVEL_3 = 0b00000000001000000000 -HEADING_LEVEL_4 = 0b00000000010000000000 -HEADING_LEVEL_5 = 0b00000000100000000000 -HEADING_LEVEL_6 = 0b00000001000000000000 - -COMMENT = 0b00000010000000000000 - -TAG = 0b11111100000000000000 -TAG_OPEN = 0b00000100000000000000 -TAG_ATTR = 0b00111000000000000000 -TAG_ATTR_NAME = 0b00001000000000000000 -TAG_ATTR_BODY = 0b00010000000000000000 -TAG_ATTR_BODY_QUOTED = 0b00100000000000000000 -TAG_BODY = 0b01000000000000000000 -TAG_CLOSE = 0b10000000000000000000 +TEMPLATE = 0b00000000000000000111 +TEMPLATE_NAME = 0b00000000000000000001 +TEMPLATE_PARAM_KEY = 0b00000000000000000010 +TEMPLATE_PARAM_VALUE = 0b00000000000000000100 + +ARGUMENT = 0b00000000000000011000 +ARGUMENT_NAME = 0b00000000000000001000 +ARGUMENT_DEFAULT = 0b00000000000000010000 + +WIKILINK = 0b00000000000001100000 +WIKILINK_TITLE = 0b00000000000000100000 +WIKILINK_TEXT = 0b00000000000001000000 + +HEADING = 0b00000001111110000000 +HEADING_LEVEL_1 = 0b00000000000010000000 +HEADING_LEVEL_2 = 0b00000000000100000000 +HEADING_LEVEL_3 = 0b00000000001000000000 +HEADING_LEVEL_4 = 0b00000000010000000000 +HEADING_LEVEL_5 = 0b00000000100000000000 +HEADING_LEVEL_6 = 0b00000001000000000000 + +COMMENT = 0b00000010000000000000 + +TAG = 0b11111100000000000000 +TAG_OPEN = 0b00111100000000000000 +TAG_OPEN_NAME = 0b00000100000000000000 +TAG_OPEN_ATTR = 0b00111000000000000000 +TAG_OPEN_ATTR_NAME = 0b00001000000000000000 +TAG_OPEN_ATTR_BODY = 0b00010000000000000000 +TAG_OPEN_ATTR_BODY_QUOTED = 0b00100000000000000000 +TAG_BODY = 0b01000000000000000000 +TAG_CLOSE = 0b10000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 99f5a7b..f65cbc1 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -426,7 +426,7 @@ class Tokenizer(object): reset = self._head self._head += 1 try: - tokens = self._parse(contexts.TAG_OPEN) + tokens = self._parse(contexts.TAG_OPEN_NAME) except BadRoute: self._head = reset self._write_text("<") @@ -438,34 +438,48 @@ class Tokenizer(object): if not self._stack: return None # Tag has an empty name? text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] - text = "".join([token.text for token in text]).strip().lower() + text = "".join([token.text for token in text]).rstrip().lower() try: return Tag.TRANSLATIONS[text] except KeyError: return Tag.TAG_UNKNOWN def _actually_close_tag_opening(self): - if self._context & contexts.TAG_ATTR: - if self._context & contexts.TAG_ATTR_BODY: - self._context ^= contexts.TAG_ATTR_BODY - if self._context & contexts.TAG_ATTR_BODY_QUOTED: - self._context ^= contexts.TAG_ATTR_BODY_QUOTED - else: - self._context ^= contexts.TAG_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR: + if self._context & contexts.TAG_OPEN_ATTR_NAME: + self._context ^= contexts.TAG_OPEN_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR_BODY: + self._context ^= contexts.TAG_OPEN_ATTR_BODY + if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED else: tag = self._get_tag_type_from_stack() - if tag is None: + if not tag: self._fail_route() self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) - - self._context ^= contexts.TAG_OPEN + self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY padding = "" # TODO return padding - # def _handle_attribute(self): - # if not self._context & contexts.TAG_ATTR: - # self._handle_tag_close_name() + def _handle_tag_chunk(self, text): + if " " not in text: + self._write_text(text) + return + chunks = text.split(" ") + if self._context & contexts.TAG_OPEN_NAME: + self._write_text(chunks.pop(0)) + tag = self._get_tag_type_from_stack() + if not tag: + self._fail_route() + self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._context ^= contexts.TAG_OPEN_NAME + self._context |= contexts.TAG_OPEN_ATTR_NAME + self._write(tokens.TagAttrStart()) + for i, chunk in enumerate(chunks): + if i > 0: + self._write(tokens.TagAttrStart()) + self._write_text(chunk) # def _handle_attribute_name(self): # ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED @@ -505,7 +519,10 @@ class Tokenizer(object): while True: this = self._read() if this not in self.MARKERS: - self._write_text(this) + if self._context & contexts.TAG_OPEN: + self._handle_tag_chunk(this) + else: + self._write_text(this) self._head += 1 continue if this is self.END: @@ -567,25 +584,17 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - # elif this == " " and (self._context & contexts.TAG_OPEN and not - # self._context & contexts.TAG_ATTR_BODY_QUOTED): - # self._handle_attribute() - # elif this == "=" and self._context & contexts.TAG_ATTR_NAME: - # self._handle_attribute_name() - # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: - # self._handle_quoted_attribute_close() - elif self._context & contexts.TAG_OPEN and ( - not self._context & contexts.TAG_ATTR_BODY_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_BODY_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() self._fail_route() elif this == ">": self._handle_tag_close_open() - elif this == "/": + elif this == "/" and next == ">": return self._handle_tag_selfclose() - else: - self._write_text(this) + # elif this == "=": + # self._handle_tag_attr_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): self._handle_tag_open_close()