From d1a9ba9a34f544d241b7595655e74a68c5b3f60b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 1 Dec 2012 13:42:08 -0500 Subject: [PATCH 01/77] Starting tag work. - Translation dict, contexts, parse_* and handle_* hooks in tokenizer. --- mwparserfromhell/nodes/tag.py | 36 +++++++++++++++++ mwparserfromhell/parser/contexts.py | 65 +++++++++++++++++++----------- mwparserfromhell/parser/tokenizer.c | 1 - mwparserfromhell/parser/tokenizer.py | 77 +++++++++++++++++++++++++++++++++++- 4 files changed, 155 insertions(+), 24 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 5873a49..c32f398 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -73,6 +73,42 @@ class Tag(Node): TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE)) TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE + TRANSLATIONS = { + "i": TAG_ITALIC, + "em": TAG_ITALIC, + "b": TAG_BOLD, + "strong": TAG_BOLD, + "u": TAG_UNDERLINE, + "s": TAG_STRIKETHROUGH, + "ul": TAG_UNORDERED_LIST, + "ol": TAG_ORDERED_LIST, + "dt": TAG_DEF_TERM, + "dd": TAG_DEF_ITEM, + "blockquote": TAG_BLOCKQUOTE, + "hl": TAG_RULE, + "br": TAG_BREAK, + "abbr": TAG_ABBR, + "pre": TAG_PRE, + "tt": TAG_MONOSPACE, + "code": TAG_CODE, + "span": TAG_SPAN, + "div": TAG_DIV, + "font": TAG_FONT, + "small": TAG_SMALL, + "big": TAG_BIG, + "center": TAG_CENTER, + "ref": TAG_REF, + "gallery": TAG_GALLERY, + "math": TAG_MATH, + "nowiki": TAG_NOWIKI, + "noinclude": TAG_NOINCLUDE, + "includeonly": TAG_INCLUDEONLY, + "onlyinclude": TAG_ONLYINCLUDE, + "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, + "source": TAG_SYNTAXHIGHLIGHT, + "poem": TAG_POEM, + } + def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, self_closing=False, open_padding=0, close_padding=0): super(Tag, self).__init__() diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 9d41870..a67bd76 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -62,35 +62,56 @@ Local (stack-specific) contexts: * :py:const:`COMMENT` -Global contexts: +* :py:const:`TAG` -* :py:const:`GL_HEADING` -""" + * :py:const:`TAG_OPEN` + * :py:const:`TAG_ATTR` -# Local contexts: + * :py:const:`TAG_ATTR_NAME` + * :py:const:`TAG_ATTR_BODY` + * :py:const:`TAG_ATTR_BODY_QUOTED` -TEMPLATE = 0b00000000000111 -TEMPLATE_NAME = 0b00000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000100 + * :py:const:`TAG_BODY` + * :py:const:`TAG_CLOSE` -ARGUMENT = 0b00000000011000 -ARGUMENT_NAME = 0b00000000001000 -ARGUMENT_DEFAULT = 0b00000000010000 +Global contexts: -WIKILINK = 0b00000001100000 -WIKILINK_TITLE = 0b00000000100000 -WIKILINK_TEXT = 0b00000001000000 +* :py:const:`GL_HEADING` +""" -HEADING = 0b01111110000000 -HEADING_LEVEL_1 = 0b00000010000000 -HEADING_LEVEL_2 = 0b00000100000000 -HEADING_LEVEL_3 = 0b00001000000000 -HEADING_LEVEL_4 = 0b00010000000000 -HEADING_LEVEL_5 = 0b00100000000000 -HEADING_LEVEL_6 = 0b01000000000000 +# Local contexts: -COMMENT = 0b10000000000000 +TEMPLATE = 0b00000000000000000111 +TEMPLATE_NAME = 0b00000000000000000001 +TEMPLATE_PARAM_KEY = 0b00000000000000000010 +TEMPLATE_PARAM_VALUE = 0b00000000000000000100 + +ARGUMENT = 0b00000000000000011000 +ARGUMENT_NAME = 0b00000000000000001000 +ARGUMENT_DEFAULT = 0b00000000000000010000 + +WIKILINK = 0b00000000000001100000 +WIKILINK_TITLE = 0b00000000000000100000 +WIKILINK_TEXT = 0b00000000000001000000 + +HEADING = 0b00000001111110000000 +HEADING_LEVEL_1 = 0b00000000000010000000 
+HEADING_LEVEL_2 = 0b00000000000100000000 +HEADING_LEVEL_3 = 0b00000000001000000000 +HEADING_LEVEL_4 = 0b00000000010000000000 +HEADING_LEVEL_5 = 0b00000000100000000000 +HEADING_LEVEL_6 = 0b00000001000000000000 + +COMMENT = 0b00000010000000000000 + +TAG = 0b11111100000000000000 +TAG_OPEN = 0b00000100000000000000 +TAG_ATTR = 0b00111000000000000000 +TAG_ATTR_NAME = 0b00001000000000000000 +TAG_ATTR_BODY = 0b00010000000000000000 +TAG_ATTR_BODY_QUOTED = 0b00100000000000000000 +TAG_BODY = 0b01000000000000000000 +TAG_CLOSE = 0b10000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index cc1b4dd..71b6cc3 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -767,7 +767,6 @@ Tokenizer_parse_heading(Tokenizer* self) self->global ^= GL_HEADING; return 0; } - level = PyInt_FromSsize_t(heading->level); if (!level) { Py_DECREF(heading->title); diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 5b0e976..f640aa2 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -27,6 +27,7 @@ import string from . import contexts from . import tokens +from ..nodes.tag import Tag from ..compat import htmlentities __all__ = ["Tokenizer"] @@ -420,6 +421,57 @@ class Tokenizer(object): self._write(tokens.CommentEnd()) self._head += 2 + def _parse_tag(self): + """Parse an HTML tag at the head of the wikicode string.""" + self._head += 1 + reset = self._head + self._push() + try: + t_open, type_, self_close, o_pad = self._parse(contexts.TAG_OPEN) + if not self_close: + t_body = self._parse(contexts.TAG_BODY) + t_close, c_pad = self._parse(contexts.TAG_CLOSE) + except BadRoute: + self._head = reset + self._pop() + self._write_text("<") + else: + self._pop() + self._write(tokens.TagOpenOpen(type=type_, showtag=False)) + self._write_all(t_open) + if self_close: + self._write(tokens.TagCloseSelfclose(padding=o_pad)) + else: + self._write(tokens.TagCloseOpen(padding=o_pad)) + self._write_all(t_body) + self._write(tokens.TagOpenClose()) + self._write_all(t_close) + self._write(tokens.TagCloseClose(padding=c_pad)) + + def _handle_attribute(self): + if not self._context & contexts.TAG_ATTR: + ## check name is valid + + def _handle_attribute_name(self): + ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED + pass + + def _handle_quoted_attribute_close(self): + pass + + def _handle_tag_close_open(self): + pass ## .padding + + def _handle_tag_selfclose(self): + pass ## .padding + + def _handle_tag_close_open(self): + pass + + def _handle_tag_close_close(self): + ## check that the closing name is the same as the opening name + pass ## .padding + def _parse(self, context=0): """Parse the wikicode string, using *context* for when to stop.""" self._push(context) @@ -432,7 +484,7 @@ class Tokenizer(object): if this is self.END: fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | contexts.HEADING | - contexts.COMMENT) + contexts.COMMENT | contexts.TAG) if self._context & contexts.TEMPLATE_PARAM_KEY: self._pop() if self._context & fail: @@ -484,6 +536,29 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) + elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY): + self._parse_tag() + elif this == " " and (self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + self._handle_attribute() + elif this == "=" and self._context & 
contexts.TAG_ATTR_NAME: + self._handle_attribute_name() + elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: + self._handle_quoted_attribute_close() + elif this == "\n" and (self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + self._fail_route() + elif this == ">" and (self._context & contexts.TAG_ATTR_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + return self._handle_tag_close_open() + elif this == "/" and next == ">" and ( + self._context & contexts.TAG_ATTR_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + return self._handle_tag_selfclose() + elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: + self._handle_tag_close_open() + elif this == ">" and self._context & contexts.TAG_CLOSE: + self._handle_tag_close_close() else: self._write_text(this) self._head += 1 From 05ec7a1a92fdf2549e8722aabd4a36a4825f3227 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 8 Dec 2012 22:04:03 -0500 Subject: [PATCH 02/77] Improve padding support for Tags; more code for tags in tokenizer. --- mwparserfromhell/nodes/extras/attribute.py | 27 +++++-- mwparserfromhell/nodes/tag.py | 18 ++--- mwparserfromhell/parser/tokenizer.py | 116 ++++++++++++++++++----------- 3 files changed, 100 insertions(+), 61 deletions(-) diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 648bca0..58a99a8 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -36,18 +36,20 @@ class Attribute(StringMixIn): whose value is ``"foo"``. """ - def __init__(self, name, value=None, quoted=True): + def __init__(self, name, value=None, quoted=True, padding=""): super(Attribute, self).__init__() self._name = name self._value = value self._quoted = quoted + self._padding = padding def __unicode__(self): + base = self.padding + str(self.name) if self.value: if self.quoted: - return str(self.name) + '="' + str(self.value) + '"' - return str(self.name) + "=" + str(self.value) - return str(self.name) + return base + '="' + str(self.value) + '"' + return base + "=" + str(self.value) + return base @property def name(self): @@ -64,14 +66,23 @@ class Attribute(StringMixIn): """Whether the attribute's value is quoted with double quotes.""" return self._quoted + @property + def padding(self): + """Spacing to insert right before the attribute.""" + return self._padding + @name.setter - def name(self, newval): - self._name = parse_anything(newval) + def name(self, value): + self._name = parse_anything(value) @value.setter def value(self, newval): self._value = parse_anything(newval) @quoted.setter - def quoted(self, newval): - self._quoted = bool(newval) + def quoted(self, value): + self._quoted = bool(value) + + @padding.setter + def padding(self, value): + self._padding = str(value) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index c32f398..681a17a 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -110,7 +110,7 @@ class Tag(Node): } def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, - self_closing=False, open_padding=0, close_padding=0): + self_closing=False, open_padding="", close_padding=""): super(Tag, self).__init__() self._type = type_ self._tag = tag @@ -136,10 +136,10 @@ class Tag(Node): if self.attrs: result += " " + " ".join([str(attr) for attr in self.attrs]) if self.self_closing: - result += " " * self.open_padding + "/>" + result += 
self.open_padding + "/>" else: - result += " " * self.open_padding + ">" + str(self.contents) - result += "" + result += self.open_padding + ">" + str(self.contents) + result += "" return result def __iternodes__(self, getter): @@ -232,17 +232,17 @@ class Tag(Node): @property def self_closing(self): - """Whether the tag is self-closing with no content.""" + """Whether the tag is self-closing with no content (like ``
``).""" return self._self_closing @property def open_padding(self): - """How much spacing to insert before the first closing >.""" + """Spacing to insert before the first closing >.""" return self._open_padding @property def close_padding(self): - """How much spacing to insert before the last closing >.""" + """Spacing to insert before the last closing > (excl. self-closing).""" return self._close_padding @type.setter @@ -270,8 +270,8 @@ class Tag(Node): @open_padding.setter def open_padding(self, value): - self._open_padding = int(value) + self._open_padding = str(value) @close_padding.setter def close_padding(self, value): - self._close_padding = int(value) + self._close_padding = str(value) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index f640aa2..80d7610 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -425,52 +425,77 @@ class Tokenizer(object): """Parse an HTML tag at the head of the wikicode string.""" self._head += 1 reset = self._head - self._push() try: - t_open, type_, self_close, o_pad = self._parse(contexts.TAG_OPEN) - if not self_close: - t_body = self._parse(contexts.TAG_BODY) - t_close, c_pad = self._parse(contexts.TAG_CLOSE) + tokens = self._parse(contexts.TAG_OPEN) except BadRoute: self._head = reset - self._pop() self._write_text("<") else: - self._pop() - self._write(tokens.TagOpenOpen(type=type_, showtag=False)) - self._write_all(t_open) - if self_close: - self._write(tokens.TagCloseSelfclose(padding=o_pad)) - else: - self._write(tokens.TagCloseOpen(padding=o_pad)) - self._write_all(t_body) - self._write(tokens.TagOpenClose()) - self._write_all(t_close) - self._write(tokens.TagCloseClose(padding=c_pad)) + self._write_all(tokens) - def _handle_attribute(self): - if not self._context & contexts.TAG_ATTR: - ## check name is valid + def _get_tag_type_from_stack(self): + self._push_textbuffer() + if not self._stack: + return None # Tag has an empty name? 
+ text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] + text = "".join([token.text for token in text]).strip().lower() + try: + return Tag.TRANSLATIONS[text] + except KeyError: + return Tag.TAG_UNKNOWN + + def _handle_tag_close_name(self): + tag = self._get_tag_type_from_stack() + if tag is None: + self._fail_route() + self._write(tokens.TagOpenOpen(type=tag, showtag=False)) - def _handle_attribute_name(self): - ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED - pass + # def _handle_attribute(self): + # if not self._context & contexts.TAG_ATTR: + # self._handle_tag_close_name() - def _handle_quoted_attribute_close(self): - pass + # def _handle_attribute_name(self): + # ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED + # pass + + # def _handle_quoted_attribute_close(self): + # pass def _handle_tag_close_open(self): - pass ## .padding + if not self._context & contexts.TAG_ATTR: + self._handle_tag_close_name() + + self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* + self._context |= contexts.TAG_BODY + + padding = "" # TODO + self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): - pass ## .padding + self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* + self._context |= contexts.TAG_BODY - def _handle_tag_close_open(self): - pass + padding = "" # TODO + self._write(tokens.TagCloseSelfclose(padding=padding)) + self._pop() + + def _handle_tag_open_close(self): + self._context ^= contexts.TAG_BODY + self._context |= contexts.TAG_CLOSE + self._write(tokens.TagOpenClose()) + self._push() + self._head += 1 def _handle_tag_close_close(self): - ## check that the closing name is the same as the opening name - pass ## .padding + tag = self._get_tag_type_from_stack() + closing = self._pop() + if tag != self._stack[0].type: + # Closing and opening tags are not the same, so fail this route: + self._fail_route() + self._write_all(closing) + padding = "" # TODO + self._write(tokens.TagCloseClose(padding=padding)) + return self._pop() def _parse(self, context=0): """Parse the wikicode string, using *context* for when to stop.""" @@ -485,7 +510,8 @@ class Tokenizer(object): fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | contexts.HEADING | contexts.COMMENT | contexts.TAG) - if self._context & contexts.TEMPLATE_PARAM_KEY: + double_fail = contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE + if self._context & double_fail: self._pop() if self._context & fail: self._fail_route() @@ -538,27 +564,29 @@ class Tokenizer(object): self._write_text(this) elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY): self._parse_tag() - elif this == " " and (self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - self._handle_attribute() - elif this == "=" and self._context & contexts.TAG_ATTR_NAME: - self._handle_attribute_name() - elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: - self._handle_quoted_attribute_close() + # elif this == " " and (self._context & contexts.TAG_OPEN and not + # self._context & contexts.TAG_ATTR_BODY_QUOTED): + # self._handle_attribute() + # elif this == "=" and self._context & contexts.TAG_ATTR_NAME: + # self._handle_attribute_name() + # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: + # self._handle_quoted_attribute_close() elif this == "\n" and (self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): + if self._context & contexts.TAG_CLOSE: + self._pop() 
self._fail_route() - elif this == ">" and (self._context & contexts.TAG_ATTR_OPEN and not + elif this == ">" and (self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): - return self._handle_tag_close_open() + self._handle_tag_close_open() elif this == "/" and next == ">" and ( - self._context & contexts.TAG_ATTR_OPEN and not + self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): return self._handle_tag_selfclose() elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: - self._handle_tag_close_open() + self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: - self._handle_tag_close_close() + return self._handle_tag_close_close() else: self._write_text(this) self._head += 1 From 7e46601b1d358a09dfa8641b03d6bb2a5eeb63c3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 00:20:21 -0500 Subject: [PATCH 03/77] Tags should fully work now in tokenizer and builder. Still need to do attributes. --- mwparserfromhell/nodes/tag.py | 5 +-- mwparserfromhell/parser/builder.py | 2 ++ mwparserfromhell/parser/tokenizer.py | 62 ++++++++++++++++++++---------------- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 681a17a..48effa1 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -70,8 +70,9 @@ class Tag(Node): TAG_POEM = 202 # Lists of tags: + TAGS_ALL = set(range(300)) TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE)) - TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE + TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE TRANSLATIONS = { "i": TAG_ITALIC, @@ -248,7 +249,7 @@ class Tag(Node): @type.setter def type(self, value): value = int(value) - if value not in self.TAGS_INVISIBLE | self.TAGS_VISIBLE: + if value not in self.TAGS_ALL: raise ValueError(value) self._type = value diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 61a8209..648842c 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -219,7 +219,9 @@ class Builder(object): self_closing=True, open_padding=token.padding) elif isinstance(token, tokens.TagOpenClose): contents = self._pop() + self._push() elif isinstance(token, tokens.TagCloseClose): + self._pop() return Tag(type_, tag, contents, attrs, showtag, False, open_pad, token.padding) else: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 80d7610..2e72951 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -423,8 +423,8 @@ class Tokenizer(object): def _parse_tag(self): """Parse an HTML tag at the head of the wikicode string.""" - self._head += 1 reset = self._head + self._head += 1 try: tokens = self._parse(contexts.TAG_OPEN) except BadRoute: @@ -444,11 +444,24 @@ class Tokenizer(object): except KeyError: return Tag.TAG_UNKNOWN - def _handle_tag_close_name(self): - tag = self._get_tag_type_from_stack() - if tag is None: - self._fail_route() - self._write(tokens.TagOpenOpen(type=tag, showtag=False)) + def _actually_close_tag_opening(self): + if self._context & contexts.TAG_ATTR: + if self._context & contexts.TAG_ATTR_BODY: + self._context ^= contexts.TAG_ATTR_BODY + if self._context & contexts.TAG_ATTR_BODY_QUOTED: + self._context ^= contexts.TAG_ATTR_BODY_QUOTED + else: + self._context ^= contexts.TAG_ATTR_NAME + else: + tag = self._get_tag_type_from_stack() + if tag is 
None: + self._fail_route() + self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + + self._context ^= contexts.TAG_OPEN + self._context |= contexts.TAG_BODY + padding = "" # TODO + return padding # def _handle_attribute(self): # if not self._context & contexts.TAG_ATTR: @@ -462,28 +475,18 @@ class Tokenizer(object): # pass def _handle_tag_close_open(self): - if not self._context & contexts.TAG_ATTR: - self._handle_tag_close_name() - - self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* - self._context |= contexts.TAG_BODY - - padding = "" # TODO + padding = self._actually_close_tag_opening() self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): - self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* - self._context |= contexts.TAG_BODY - - padding = "" # TODO + padding = self._actually_close_tag_opening() self._write(tokens.TagCloseSelfclose(padding=padding)) - self._pop() + self._head += 1 + return self._pop() def _handle_tag_open_close(self): - self._context ^= contexts.TAG_BODY - self._context |= contexts.TAG_CLOSE self._write(tokens.TagOpenClose()) - self._push() + self._push(contexts.TAG_CLOSE) self._head += 1 def _handle_tag_close_close(self): @@ -562,7 +565,8 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) - elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY): + elif this == "<" and next != "/" and ( + not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() # elif this == " " and (self._context & contexts.TAG_OPEN and not # self._context & contexts.TAG_ATTR_BODY_QUOTED): @@ -571,17 +575,19 @@ class Tokenizer(object): # self._handle_attribute_name() # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: # self._handle_quoted_attribute_close() - elif this == "\n" and (self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): + elif this == "\n" and ( + self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): if self._context & contexts.TAG_CLOSE: self._pop() self._fail_route() - elif this == ">" and (self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): + elif this == ">" and ( + self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): self._handle_tag_close_open() elif this == "/" and next == ">" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): + self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): return self._handle_tag_selfclose() elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() From f78bcf832a08b81d7a9a03f344d2bd82bf97b6c0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 00:29:37 -0500 Subject: [PATCH 04/77] Keep .type and .tag synchronized in Tags when using their setters. 
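
For reference, a rough standalone sketch of the two lookups the setters rely
on (illustrative only: trimmed stand-ins for Tag.TRANSLATIONS and the type
constants, not the real class):

    TAG_UNKNOWN, TAG_ITALIC, TAG_BOLD = 0, 1, 2
    TRANSLATIONS = {"i": TAG_ITALIC, "em": TAG_ITALIC, "b": TAG_BOLD}

    def tag_for_type(value):
        # Reverse lookup, as in the type setter: any key mapping to the
        # given constant is an acceptable tag name.
        for key in TRANSLATIONS:
            if TRANSLATIONS[key] == value:
                return key

    def type_for_tag(text):
        # Forward lookup, as in the tag setter, with an unknown fallback.
        return TRANSLATIONS.get(text.strip().lower(), TAG_UNKNOWN)

    assert tag_for_type(TAG_BOLD) == "b"
    assert type_for_tag(" EM ") == TAG_ITALIC
    assert type_for_tag("q") == TAG_UNKNOWN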
---
 mwparserfromhell/nodes/tag.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index 48effa1..b1eb133 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -252,10 +252,17 @@ class Tag(Node):
         if value not in self.TAGS_ALL:
             raise ValueError(value)
         self._type = value
+        for key in self.TRANSLATIONS:
+            if self.TRANSLATIONS[key] == value:
+                self._tag = parse_anything(key)
 
     @tag.setter
     def tag(self, value):
         self._tag = parse_anything(value)
+        try:
+            self._type = self.TRANSLATIONS[text]
+        except KeyError:
+            self._type = self.TAG_UNKNOWN
 
     @contents.setter
     def contents(self, value):

From 827c544721e223c2f9a5eaf90d5742b2d45de449 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 9 Dec 2012 01:38:45 -0500
Subject: [PATCH 05/77] Should correctly handle closing tags with strange spacing.

---
 mwparserfromhell/nodes/tag.py        | 29 ++++++++++++++++++-----------
 mwparserfromhell/parser/builder.py   |  3 +--
 mwparserfromhell/parser/tokenizer.py |  6 +++---
 3 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index b1eb133..1f3bdf9 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -111,7 +111,7 @@ class Tag(Node):
     }
 
     def __init__(self, type_, tag, contents=None, attrs=None, showtag=True,
-                 self_closing=False, open_padding="", close_padding=""):
+                 self_closing=False, open_padding="", closing_tag=None):
         super(Tag, self).__init__()
         self._type = type_
         self._tag = tag
@@ -123,7 +123,10 @@ class Tag(Node):
         self._showtag = showtag
         self._self_closing = self_closing
         self._open_padding = open_padding
-        self._close_padding = close_padding
+        if closing_tag:
+            self._closing_tag = closing_tag
+        else:
+            self._closing_tag = tag
 
     def __unicode__(self):
         if not self.showtag:
@@ -140,7 +143,7 @@ class Tag(Node):
             result += self.open_padding + "/>"
         else:
             result += self.open_padding + ">" + str(self.contents)
-            result += "</" + str(self.tag) + self.close_padding + ">"
+            result += "</" + str(self.tag) + ">"
         return result
 
     def __iternodes__(self, getter):
@@ -245,9 +248,13 @@ class Tag(Node):
 
     @property
-    def close_padding(self):
-        """Spacing to insert before the last closing > (excl. self-closing)."""
-        return self._close_padding
+    def closing_tag(self):
+        """The closing tag, as a :py:class:`~.Wikicode` object.
+
+        This will usually equal :py:attr:`tag`, unless there is additional
+        spacing, comments, or the like. 
+ """ + return self._closing_tag @type.setter def type(self, value): @@ -254,11 +261,11 @@ class Tag(Node): self._type = value for key in self.TRANSLATIONS: if self.TRANSLATIONS[key] == value: - self._tag = parse_anything(key) + self._tag = self._closing_tag = parse_anything(key) @tag.setter def tag(self, value): - self._tag = parse_anything(value) + self._tag = self._closing_tag = parse_anything(value) try: self._type = self.TRANSLATIONS[text] except KeyError: @@ -280,6 +287,6 @@ class Tag(Node): def open_padding(self, value): self._open_padding = str(value) - @close_padding.setter - def close_padding(self, value): - self._close_padding = str(value) + @closing_tag.setter + def closing_tag(self, value): + self._closing_tag = parse_anything(value) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 648842c..90274fa 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -221,9 +221,8 @@ class Builder(object): contents = self._pop() self._push() elif isinstance(token, tokens.TagCloseClose): - self._pop() return Tag(type_, tag, contents, attrs, showtag, False, - open_pad, token.padding) + open_pad, self._pop()) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2e72951..9e9465d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -496,8 +496,7 @@ class Tokenizer(object): # Closing and opening tags are not the same, so fail this route: self._fail_route() self._write_all(closing) - padding = "" # TODO - self._write(tokens.TagCloseClose(padding=padding)) + self._write(tokens.TagCloseClose()) return self._pop() def _parse(self, context=0): @@ -589,7 +588,8 @@ class Tokenizer(object): self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): return self._handle_tag_selfclose() - elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: + elif this == "<" and next == "/" and ( + self._context & contexts.TAG_BODY): self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() From a21c69fa1e0fc6111b98a5028e8c214f21139dd0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 01:47:41 -0500 Subject: [PATCH 06/77] Split off tag definitions into a new file. --- mwparserfromhell/nodes/tag.py | 104 ++----------------------------------- mwparserfromhell/tag_defs.py | 118 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 100 deletions(-) create mode 100644 mwparserfromhell/tag_defs.py diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 1f3bdf9..ea98bb6 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -24,92 +24,14 @@ from __future__ import unicode_literals from . 
import Node, Text
 from ..compat import str
+from ..tag_defs import TagDefinitions
 from ..utils import parse_anything
 
 __all__ = ["Tag"]
 
-class Tag(Node):
+class Tag(TagDefinitions, Node):
     """Represents an HTML-style tag in wikicode, like ``<ref>``."""
 
     def __init__(self, type_, tag, contents=None, attrs=None, showtag=True,
                  self_closing=False, open_padding="", closing_tag=None):
         super(Tag, self).__init__()
@@ -130,7 +52,7 @@ class Tag(TagDefinitions, Node):
     def __unicode__(self):
         if not self.showtag:
-            open_, close = self._translate()
+            open_, close = self.WIKICODE[self.type]
             if self.self_closing:
                 return open_
             else:
                 return open_ + str(self.contents) + close
 
         result = "<" + str(self.tag)
@@ -188,24 +110,6 @@ class Tag(TagDefinitions, Node):
         get(self.tag)
         write(">")
 
-    def _translate(self):
-        """If the HTML-style tag has a wikicode representation, return that.
-
-        For example, ``<b>Foo</b>`` can be represented as ``'''Foo'''``. This
-        returns a tuple of the character starting the sequence and the
-        character ending it. 
-        
- """ - translations = { - self.TAG_ITALIC: ("''", "''"), - self.TAG_BOLD: ("'''", "'''"), - self.TAG_UNORDERED_LIST: ("*", ""), - self.TAG_ORDERED_LIST: ("#", ""), - self.TAG_DEF_TERM: (";", ""), - self.TAG_DEF_ITEM: (":", ""), - self.TAG_RULE: ("----", ""), - } - return translations[self.type] - @property def type(self): """The tag type.""" @@ -241,7 +145,7 @@ class Tag(Node): @property def open_padding(self): - """Spacing to insert before the first closing >.""" + """Spacing to insert before the first closing ``>``.""" return self._open_padding @property diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py new file mode 100644 index 0000000..74d3a81 --- /dev/null +++ b/mwparserfromhell/tag_defs.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +class TagDefinitions(object): + """Contains numerical definitions for valid HTML (and wikicode) tags. + + Base class for :py:class:`~.Tag` objects. 
+ """ + + TAG_UNKNOWN = 0 + + # Basic HTML: + TAG_ITALIC = 1 + TAG_BOLD = 2 + TAG_UNDERLINE = 3 + TAG_STRIKETHROUGH = 4 + TAG_UNORDERED_LIST = 5 + TAG_ORDERED_LIST = 6 + TAG_DEF_TERM = 7 + TAG_DEF_ITEM = 8 + TAG_BLOCKQUOTE = 9 + TAG_RULE = 10 + TAG_BREAK = 11 + TAG_ABBR = 12 + TAG_PRE = 13 + TAG_MONOSPACE = 14 + TAG_CODE = 15 + TAG_SPAN = 16 + TAG_DIV = 17 + TAG_FONT = 18 + TAG_SMALL = 19 + TAG_BIG = 20 + TAG_CENTER = 21 + + # MediaWiki parser hooks: + TAG_REF = 101 + TAG_GALLERY = 102 + TAG_MATH = 103 + TAG_NOWIKI = 104 + TAG_NOINCLUDE = 105 + TAG_INCLUDEONLY = 106 + TAG_ONLYINCLUDE = 107 + + # Additional parser hooks: + TAG_SYNTAXHIGHLIGHT = 201 + TAG_POEM = 202 + + # Lists of tags: + TAGS_ALL = set(range(300)) + TAGS_INVISIBLE = {TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE} + TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE + + TRANSLATIONS = { + "i": TAG_ITALIC, + "em": TAG_ITALIC, + "b": TAG_BOLD, + "strong": TAG_BOLD, + "u": TAG_UNDERLINE, + "s": TAG_STRIKETHROUGH, + "ul": TAG_UNORDERED_LIST, + "ol": TAG_ORDERED_LIST, + "dt": TAG_DEF_TERM, + "dd": TAG_DEF_ITEM, + "blockquote": TAG_BLOCKQUOTE, + "hl": TAG_RULE, + "br": TAG_BREAK, + "abbr": TAG_ABBR, + "pre": TAG_PRE, + "tt": TAG_MONOSPACE, + "code": TAG_CODE, + "span": TAG_SPAN, + "div": TAG_DIV, + "font": TAG_FONT, + "small": TAG_SMALL, + "big": TAG_BIG, + "center": TAG_CENTER, + "ref": TAG_REF, + "gallery": TAG_GALLERY, + "math": TAG_MATH, + "nowiki": TAG_NOWIKI, + "noinclude": TAG_NOINCLUDE, + "includeonly": TAG_INCLUDEONLY, + "onlyinclude": TAG_ONLYINCLUDE, + "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, + "source": TAG_SYNTAXHIGHLIGHT, + "poem": TAG_POEM, + } + + WIKICODE = { + TAG_ITALIC: ("''", "''"), + TAG_BOLD: ("'''", "'''"), + TAG_UNORDERED_LIST: ("*", ""), + TAG_ORDERED_LIST: ("#", ""), + TAG_DEF_TERM: (";", ""), + TAG_DEF_ITEM: (":", ""), + TAG_RULE: ("----", ""), + } From 252cc13a998d60d8a8daf89dc3aa53e5f9bdde27 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 02:01:23 -0500 Subject: [PATCH 07/77] Move repeated context checks into one block in Tokenizer._parse(). 
--- mwparserfromhell/parser/tokenizer.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9e9465d..99f5a7b 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -574,20 +574,18 @@ class Tokenizer(object): # self._handle_attribute_name() # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: # self._handle_quoted_attribute_close() - elif this == "\n" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - if self._context & contexts.TAG_CLOSE: - self._pop() - self._fail_route() - elif this == ">" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - self._handle_tag_close_open() - elif this == "/" and next == ">" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - return self._handle_tag_selfclose() + elif self._context & contexts.TAG_OPEN and ( + not self._context & contexts.TAG_ATTR_BODY_QUOTED): + if this == "\n": + if self._context & contexts.TAG_CLOSE: + self._pop() + self._fail_route() + elif this == ">": + self._handle_tag_close_open() + elif this == "/": + return self._handle_tag_selfclose() + else: + self._write_text(this) elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): self._handle_tag_open_close() From d9f23b8faaedb94d667372fb2a892307cf15a38a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 22 Dec 2012 21:58:21 -0500 Subject: [PATCH 08/77] Really basic, messy, and fragile tag attribute support. --- mwparserfromhell/parser/contexts.py | 73 +++++++++++++++++++----------------- mwparserfromhell/parser/tokenizer.py | 65 ++++++++++++++++++-------------- 2 files changed, 75 insertions(+), 63 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index a67bd76..053c930 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -65,11 +65,13 @@ Local (stack-specific) contexts: * :py:const:`TAG` * :py:const:`TAG_OPEN` - * :py:const:`TAG_ATTR` - * :py:const:`TAG_ATTR_NAME` - * :py:const:`TAG_ATTR_BODY` - * :py:const:`TAG_ATTR_BODY_QUOTED` + * :py:const:`TAG_OPEN_NAME` + * :py:const:`TAG_OPEN_ATTR` + + * :py:const:`TAG_OPEN_ATTR_NAME` + * :py:const:`TAG_OPEN_ATTR_BODY` + * :py:const:`TAG_OPEN_ATTR_BODY_QUOTED` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -81,37 +83,38 @@ Global contexts: # Local contexts: -TEMPLATE = 0b00000000000000000111 -TEMPLATE_NAME = 0b00000000000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000000000100 - -ARGUMENT = 0b00000000000000011000 -ARGUMENT_NAME = 0b00000000000000001000 -ARGUMENT_DEFAULT = 0b00000000000000010000 - -WIKILINK = 0b00000000000001100000 -WIKILINK_TITLE = 0b00000000000000100000 -WIKILINK_TEXT = 0b00000000000001000000 - -HEADING = 0b00000001111110000000 -HEADING_LEVEL_1 = 0b00000000000010000000 -HEADING_LEVEL_2 = 0b00000000000100000000 -HEADING_LEVEL_3 = 0b00000000001000000000 -HEADING_LEVEL_4 = 0b00000000010000000000 -HEADING_LEVEL_5 = 0b00000000100000000000 -HEADING_LEVEL_6 = 0b00000001000000000000 - -COMMENT = 0b00000010000000000000 - -TAG = 0b11111100000000000000 -TAG_OPEN = 0b00000100000000000000 -TAG_ATTR = 0b00111000000000000000 -TAG_ATTR_NAME = 0b00001000000000000000 -TAG_ATTR_BODY = 0b00010000000000000000 -TAG_ATTR_BODY_QUOTED = 0b00100000000000000000 
-TAG_BODY = 0b01000000000000000000 -TAG_CLOSE = 0b10000000000000000000 +TEMPLATE = 0b00000000000000000111 +TEMPLATE_NAME = 0b00000000000000000001 +TEMPLATE_PARAM_KEY = 0b00000000000000000010 +TEMPLATE_PARAM_VALUE = 0b00000000000000000100 + +ARGUMENT = 0b00000000000000011000 +ARGUMENT_NAME = 0b00000000000000001000 +ARGUMENT_DEFAULT = 0b00000000000000010000 + +WIKILINK = 0b00000000000001100000 +WIKILINK_TITLE = 0b00000000000000100000 +WIKILINK_TEXT = 0b00000000000001000000 + +HEADING = 0b00000001111110000000 +HEADING_LEVEL_1 = 0b00000000000010000000 +HEADING_LEVEL_2 = 0b00000000000100000000 +HEADING_LEVEL_3 = 0b00000000001000000000 +HEADING_LEVEL_4 = 0b00000000010000000000 +HEADING_LEVEL_5 = 0b00000000100000000000 +HEADING_LEVEL_6 = 0b00000001000000000000 + +COMMENT = 0b00000010000000000000 + +TAG = 0b11111100000000000000 +TAG_OPEN = 0b00111100000000000000 +TAG_OPEN_NAME = 0b00000100000000000000 +TAG_OPEN_ATTR = 0b00111000000000000000 +TAG_OPEN_ATTR_NAME = 0b00001000000000000000 +TAG_OPEN_ATTR_BODY = 0b00010000000000000000 +TAG_OPEN_ATTR_BODY_QUOTED = 0b00100000000000000000 +TAG_BODY = 0b01000000000000000000 +TAG_CLOSE = 0b10000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 99f5a7b..f65cbc1 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -426,7 +426,7 @@ class Tokenizer(object): reset = self._head self._head += 1 try: - tokens = self._parse(contexts.TAG_OPEN) + tokens = self._parse(contexts.TAG_OPEN_NAME) except BadRoute: self._head = reset self._write_text("<") @@ -438,34 +438,48 @@ class Tokenizer(object): if not self._stack: return None # Tag has an empty name? text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] - text = "".join([token.text for token in text]).strip().lower() + text = "".join([token.text for token in text]).rstrip().lower() try: return Tag.TRANSLATIONS[text] except KeyError: return Tag.TAG_UNKNOWN def _actually_close_tag_opening(self): - if self._context & contexts.TAG_ATTR: - if self._context & contexts.TAG_ATTR_BODY: - self._context ^= contexts.TAG_ATTR_BODY - if self._context & contexts.TAG_ATTR_BODY_QUOTED: - self._context ^= contexts.TAG_ATTR_BODY_QUOTED - else: - self._context ^= contexts.TAG_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR: + if self._context & contexts.TAG_OPEN_ATTR_NAME: + self._context ^= contexts.TAG_OPEN_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR_BODY: + self._context ^= contexts.TAG_OPEN_ATTR_BODY + if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED else: tag = self._get_tag_type_from_stack() - if tag is None: + if not tag: self._fail_route() self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) - - self._context ^= contexts.TAG_OPEN + self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY padding = "" # TODO return padding - # def _handle_attribute(self): - # if not self._context & contexts.TAG_ATTR: - # self._handle_tag_close_name() + def _handle_tag_chunk(self, text): + if " " not in text: + self._write_text(text) + return + chunks = text.split(" ") + if self._context & contexts.TAG_OPEN_NAME: + self._write_text(chunks.pop(0)) + tag = self._get_tag_type_from_stack() + if not tag: + self._fail_route() + self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._context ^= contexts.TAG_OPEN_NAME + self._context |= contexts.TAG_OPEN_ATTR_NAME + self._write(tokens.TagAttrStart()) + for i, 
chunk in enumerate(chunks): + if i > 0: + self._write(tokens.TagAttrStart()) + self._write_text(chunk) # def _handle_attribute_name(self): # ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED @@ -505,7 +519,10 @@ class Tokenizer(object): while True: this = self._read() if this not in self.MARKERS: - self._write_text(this) + if self._context & contexts.TAG_OPEN: + self._handle_tag_chunk(this) + else: + self._write_text(this) self._head += 1 continue if this is self.END: @@ -567,25 +584,17 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - # elif this == " " and (self._context & contexts.TAG_OPEN and not - # self._context & contexts.TAG_ATTR_BODY_QUOTED): - # self._handle_attribute() - # elif this == "=" and self._context & contexts.TAG_ATTR_NAME: - # self._handle_attribute_name() - # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: - # self._handle_quoted_attribute_close() - elif self._context & contexts.TAG_OPEN and ( - not self._context & contexts.TAG_ATTR_BODY_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_BODY_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() self._fail_route() elif this == ">": self._handle_tag_close_open() - elif this == "/": + elif this == "/" and next == ">": return self._handle_tag_selfclose() - else: - self._write_text(this) + # elif this == "=": + # self._handle_tag_attr_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): self._handle_tag_open_close() From d459899649362773ca0db16da37bebfc1f3ce180 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 18:38:31 -0500 Subject: [PATCH 09/77] More attribute stuff. 
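
One detail worth noting: splitting on single spaces keeps the padding
information, because every leading space becomes an empty-string chunk. A
small sketch of that mechanism in isolation (not the tokenizer itself):

    def take_padding(chunks):
        # "  a=b".split(" ") -> ["", "", "a=b"]; each empty string is one
        # space of padding in front of the attribute.
        padding = 0
        while chunks and chunks[0] == "":
            padding += 1
            chunks.pop(0)
        return " " * padding

    chunks = "  name=foo".split(" ")
    assert take_padding(chunks) == "  "
    assert chunks == ["name=foo"]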
--- mwparserfromhell/parser/builder.py | 10 +++--- mwparserfromhell/parser/tokenizer.py | 65 +++++++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 90274fa..cb5499f 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -180,9 +180,9 @@ class Builder(object): else: self._write(self._handle_token(token)) - def _handle_attribute(self): + def _handle_attribute(self, token): """Handle a case where a tag attribute is at the head of the tokens.""" - name, quoted = None, False + name, quoted, padding = None, False, token.padding self._push() while self._tokens: token = self._tokens.pop() @@ -195,8 +195,8 @@ class Builder(object): tokens.TagCloseOpen)): self._tokens.append(token) if name is not None: - return Attribute(name, self._pop(), quoted) - return Attribute(self._pop(), quoted=quoted) + return Attribute(name, self._pop(), quoted, padding) + return Attribute(self._pop(), quoted=quoted, padding=padding) else: self._write(self._handle_token(token)) @@ -208,7 +208,7 @@ class Builder(object): while self._tokens: token = self._tokens.pop() if isinstance(token, tokens.TagAttrStart): - attrs.append(self._handle_attribute()) + attrs.append(self._handle_attribute(token)) elif isinstance(token, tokens.TagCloseOpen): open_pad = token.padding tag = self._pop() diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index f65cbc1..d3cb40f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -450,8 +450,6 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_ATTR_NAME if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY - if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: - self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED else: tag = self._get_tag_type_from_stack() if not tag: @@ -462,6 +460,20 @@ class Tokenizer(object): padding = "" # TODO return padding + def _actually_handle_chunk(self, chunks, is_new): + if is_new: + padding = 0 + while chunks: + if chunks[0] == "": + padding += 1 + chunks.pop(0) + else: + break + self._write(tokens.TagAttrStart(padding=" " * padding)) + if chunks: + chunk = chunks.pop(0) + self._write_text(chunk) + def _handle_tag_chunk(self, text): if " " not in text: self._write_text(text) @@ -475,18 +487,29 @@ class Tokenizer(object): self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME - self._write(tokens.TagAttrStart()) - for i, chunk in enumerate(chunks): - if i > 0: - self._write(tokens.TagAttrStart()) - self._write_text(chunk) - - # def _handle_attribute_name(self): - # ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED - # pass - - # def _handle_quoted_attribute_close(self): - # pass + self._actually_handle_chunk(chunks, True) + is_new = False + while chunks: + self._actually_handle_chunk(chunks, is_new) + is_new = True + + def _handle_tag_attribute_body(self): + self._context ^= contexts.TAG_OPEN_ATTR_NAME + self._context |= contexts.TAG_OPEN_ATTR_BODY + self._write(TagAttrEquals()) + next = self._read(1) + if next not in self.MARKERS and next.startswith('"'): + if re.search(r'[^\\]"$', next[1:]): + if not re.search(r'[^\\]"', next[1:-1]): + self._write(TagAttrQuote()) + self._write_text(next[1:-1]) + self._head += 1 + else: + if not re.search(r'[^\\]"', next[1:]): + 
self._push(contexts.TAG_OPEN_ATTR_BODY_QUOTED) + self._write(TagAttrQuote()) + self._write_text(next[1:]) + self._head += 1 def _handle_tag_close_open(self): padding = self._actually_close_tag_opening() @@ -526,10 +549,12 @@ class Tokenizer(object): self._head += 1 continue if this is self.END: - fail = (contexts.TEMPLATE | contexts.ARGUMENT | - contexts.WIKILINK | contexts.HEADING | - contexts.COMMENT | contexts.TAG) - double_fail = contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE + fail = ( + contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | + contexts.HEADING | contexts.COMMENT | contexts.TAG) + double_fail = ( + contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | + contexts.TAG_OPEN_ATTR_BODY_QUOTED) if self._context & double_fail: self._pop() if self._context & fail: @@ -593,8 +618,8 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - # elif this == "=": - # self._handle_tag_attr_body() + elif this == "=": + self._handle_tag_attribute_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): self._handle_tag_open_close() From 26d30f3d1a8c0caca854f7040d07555c6f794b0f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 19:18:09 -0500 Subject: [PATCH 10/77] Seems to be working for quoted attributes now. --- mwparserfromhell/parser/tokenizer.py | 40 ++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index d3cb40f..920d1cf 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -461,7 +461,7 @@ class Tokenizer(object): return padding def _actually_handle_chunk(self, chunks, is_new): - if is_new: + if is_new and not self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: padding = 0 while chunks: if chunks[0] == "": @@ -472,6 +472,15 @@ class Tokenizer(object): self._write(tokens.TagAttrStart(padding=" " * padding)) if chunks: chunk = chunks.pop(0) + if self._context & contexts.TAG_OPEN_ATTR_BODY: + self._context ^= contexts.TAG_OPEN_ATTR_BODY + self._context |= contexts.TAG_OPEN_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if re.search(r'[^\\]"', chunk[:-1]): + self._fail_route() + if re.search(r'[^\\]"$', chunk): + self._write_text(chunk[:-1]) + return self._pop() # Back to _handle_tag_attribute_body() self._write_text(chunk) def _handle_tag_chunk(self, text): @@ -490,26 +499,35 @@ class Tokenizer(object): self._actually_handle_chunk(chunks, True) is_new = False while chunks: - self._actually_handle_chunk(chunks, is_new) + should_exit = self._actually_handle_chunk(chunks, is_new) + if should_exit: + return should_exit is_new = True def _handle_tag_attribute_body(self): self._context ^= contexts.TAG_OPEN_ATTR_NAME self._context |= contexts.TAG_OPEN_ATTR_BODY - self._write(TagAttrEquals()) + self._write(tokens.TagAttrEquals()) next = self._read(1) if next not in self.MARKERS and next.startswith('"'): if re.search(r'[^\\]"$', next[1:]): if not re.search(r'[^\\]"', next[1:-1]): - self._write(TagAttrQuote()) + self._write(tokens.TagAttrQuote()) self._write_text(next[1:-1]) self._head += 1 else: if not re.search(r'[^\\]"', next[1:]): - self._push(contexts.TAG_OPEN_ATTR_BODY_QUOTED) - self._write(TagAttrQuote()) - self._write_text(next[1:]) self._head += 1 + reset = self._head + try: + attr = self._parse(contexts.TAG_OPEN_ATTR_BODY_QUOTED) + except BadRoute: + self._head = reset + 
self._write_text(next) + else: + self._write(tokens.TagAttrQuote()) + self._write_text(next[1:]) + self._write_all(attr) def _handle_tag_close_open(self): padding = self._actually_close_tag_opening() @@ -543,7 +561,9 @@ class Tokenizer(object): this = self._read() if this not in self.MARKERS: if self._context & contexts.TAG_OPEN: - self._handle_tag_chunk(this) + should_exit = self._handle_tag_chunk(this) + if should_exit: + return should_exit else: self._write_text(this) self._head += 1 @@ -593,6 +613,8 @@ class Tokenizer(object): elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() + elif self._context & contexts.TAG_OPEN_ATTR_NAME: + self._handle_tag_attribute_body() else: self._write_text("=") elif this == "=" and self._context & contexts.HEADING: @@ -618,7 +640,7 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - elif this == "=": + elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: self._handle_tag_attribute_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): From ca47305074aa04585d29dd91f346079e57156f53 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 21:35:48 -0500 Subject: [PATCH 11/77] Fix attribute behavior under certain strange circumstances. --- mwparserfromhell/parser/tokenizer.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 920d1cf..46c4399 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -470,6 +470,7 @@ class Tokenizer(object): else: break self._write(tokens.TagAttrStart(padding=" " * padding)) + if chunks: chunk = chunks.pop(0) if self._context & contexts.TAG_OPEN_ATTR_BODY: @@ -480,7 +481,9 @@ class Tokenizer(object): self._fail_route() if re.search(r'[^\\]"$', chunk): self._write_text(chunk[:-1]) - return self._pop() # Back to _handle_tag_attribute_body() + self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED + self._context |= contexts.TAG_OPEN_ATTR_NAME + return True # Back to _handle_tag_attribute_body() self._write_text(chunk) def _handle_tag_chunk(self, text): @@ -497,12 +500,15 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) + is_new = False + is_quoted = False while chunks: - should_exit = self._actually_handle_chunk(chunks, is_new) - if should_exit: - return should_exit + result = self._actually_handle_chunk(chunks, is_new) + is_quoted = result or is_quoted is_new = True + if is_quoted: + return self._pop() def _handle_tag_attribute_body(self): self._context ^= contexts.TAG_OPEN_ATTR_NAME @@ -510,6 +516,10 @@ class Tokenizer(object): self._write(tokens.TagAttrEquals()) next = self._read(1) if next not in self.MARKERS and next.startswith('"'): + chunks = None + if " " in next: + chunks = next.split(" ") + next = chunks.pop(0) if re.search(r'[^\\]"$', next[1:]): if not re.search(r'[^\\]"', next[1:-1]): self._write(tokens.TagAttrQuote()) @@ -528,6 +538,10 @@ class Tokenizer(object): self._write(tokens.TagAttrQuote()) self._write_text(next[1:]) self._write_all(attr) + self._context ^= contexts.TAG_OPEN_ATTR_BODY + self._context |= contexts.TAG_OPEN_ATTR_NAME + while chunks: + self._actually_handle_chunk(chunks, True) def _handle_tag_close_open(self): padding = 
self._actually_close_tag_opening()

From 146d1fd006c32b4a71312cd966c3e124592bce92 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 23 Dec 2012 21:44:56 -0500
Subject: [PATCH 12/77] Fix a bug in rendering Tags; attrs->attributes; update documentation.

---
 docs/api/mwparserfromhell.nodes.rst | 1 +
 docs/api/mwparserfromhell.rst       | 6 ++++++
 mwparserfromhell/nodes/tag.py       | 4 ++--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/docs/api/mwparserfromhell.nodes.rst b/docs/api/mwparserfromhell.nodes.rst
index d1016f9..a093c17 100644
--- a/docs/api/mwparserfromhell.nodes.rst
+++ b/docs/api/mwparserfromhell.nodes.rst
@@ -46,6 +46,7 @@ nodes Package
 
 .. automodule:: mwparserfromhell.nodes.tag
     :members:
+    :undoc-members:
     :show-inheritance:
 
 :mod:`template` Module

diff --git a/docs/api/mwparserfromhell.rst b/docs/api/mwparserfromhell.rst
index 3ca09c9..b682139 100644
--- a/docs/api/mwparserfromhell.rst
+++ b/docs/api/mwparserfromhell.rst
@@ -30,6 +30,12 @@ mwparserfromhell Package
     :members:
     :undoc-members:
 
+:mod:`tag_defs` Module
+----------------------
+
+.. automodule:: mwparserfromhell.tag_defs
+    :members:
+
 :mod:`utils` Module
 -------------------

diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index ea98bb6..833b597 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -65,7 +65,7 @@ class Tag(TagDefinitions, Node):
             result += self.open_padding + "/>"
         else:
             result += self.open_padding + ">" + str(self.contents)
-            result += "</" + str(self.tag) + ">"
+            result += "</" + str(self.closing_tag) + ">"
         return result
@@ -126,7 +126,7 @@ class Tag(TagDefinitions, Node):
         return self._contents
 
     @property
-    def attrs(self):
+    def attributes(self):
         """The list of attributes affecting the tag.
 
         Each attribute is an instance of :py:class:`~.Attribute`.

From a58c480639119b2cd3c78eee8dfe0893fa6360fc Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sun, 23 Dec 2012 22:23:31 -0500
Subject: [PATCH 13/77] Fix some usage of attrs; shorten a context, fix some behavior I broke. 
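
The renamed masks keep the superset convention, so a check like "in a tag
opening, but not in a quoted attribute body" can XOR the aggregate mask
against one of its members. A tiny sketch with made-up bit values:

    TAG_OPEN_NAME = 1 << 0
    TAG_OPEN_ATTR_QUOTED = 1 << 1
    TAG_OPEN = TAG_OPEN_NAME | TAG_OPEN_ATTR_QUOTED  # aggregate mask

    # XOR strips the quoted bit from the aggregate, leaving "any other
    # TAG_OPEN sub-context":
    unquoted = TAG_OPEN ^ TAG_OPEN_ATTR_QUOTED

    assert TAG_OPEN_NAME & unquoted
    assert not TAG_OPEN_ATTR_QUOTED & unquoted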
--- mwparserfromhell/nodes/tag.py | 11 +++--- mwparserfromhell/parser/contexts.py | 68 +++++++++++++++++++----------------- mwparserfromhell/parser/tokenizer.py | 29 +++++++++------ 3 files changed, 60 insertions(+), 48 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 833b597..94f92c5 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -59,8 +59,8 @@ class Tag(TagDefinitions, Node): return open_ + str(self.contents) + close result = "<" + str(self.tag) - if self.attrs: - result += " " + " ".join([str(attr) for attr in self.attrs]) + if self.attributes: + result += " " + " ".join([str(attr) for attr in self.attributes]) if self.self_closing: result += self.open_padding + "/>" else: @@ -73,7 +73,7 @@ class Tag(TagDefinitions, Node): if self.showtag: for child in getter(self.tag): yield self.tag, child - for attr in self.attrs: + for attr in self.attributes: for child in getter(attr.name): yield attr.name, child if attr.value: @@ -89,12 +89,13 @@ class Tag(TagDefinitions, Node): def __showtree__(self, write, get, mark): tagnodes = self.tag.nodes - if (not self.attrs and len(tagnodes) == 1 and isinstance(tagnodes[0], Text)): + if not self.attributes and (len(tagnodes) == 1 and + isinstance(tagnodes[0], Text)): write("<" + str(tagnodes[0]) + ">") else: write("<") get(self.tag) - for attr in self.attrs: + for attr in self.attributes: get(attr.name) if not attr.value: continue diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 053c930..d87da9a 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -71,7 +71,8 @@ Local (stack-specific) contexts: * :py:const:`TAG_OPEN_ATTR_NAME` * :py:const:`TAG_OPEN_ATTR_BODY` - * :py:const:`TAG_OPEN_ATTR_BODY_QUOTED` + * :py:const:`TAG_OPEN_ATTR_QUOTED` + * :py:const:`TAG_OPEN_ATTR_IGNORE` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -83,38 +84,39 @@ Global contexts: # Local contexts: -TEMPLATE = 0b00000000000000000111 -TEMPLATE_NAME = 0b00000000000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000000000100 - -ARGUMENT = 0b00000000000000011000 -ARGUMENT_NAME = 0b00000000000000001000 -ARGUMENT_DEFAULT = 0b00000000000000010000 - -WIKILINK = 0b00000000000001100000 -WIKILINK_TITLE = 0b00000000000000100000 -WIKILINK_TEXT = 0b00000000000001000000 - -HEADING = 0b00000001111110000000 -HEADING_LEVEL_1 = 0b00000000000010000000 -HEADING_LEVEL_2 = 0b00000000000100000000 -HEADING_LEVEL_3 = 0b00000000001000000000 -HEADING_LEVEL_4 = 0b00000000010000000000 -HEADING_LEVEL_5 = 0b00000000100000000000 -HEADING_LEVEL_6 = 0b00000001000000000000 - -COMMENT = 0b00000010000000000000 - -TAG = 0b11111100000000000000 -TAG_OPEN = 0b00111100000000000000 -TAG_OPEN_NAME = 0b00000100000000000000 -TAG_OPEN_ATTR = 0b00111000000000000000 -TAG_OPEN_ATTR_NAME = 0b00001000000000000000 -TAG_OPEN_ATTR_BODY = 0b00010000000000000000 -TAG_OPEN_ATTR_BODY_QUOTED = 0b00100000000000000000 -TAG_BODY = 0b01000000000000000000 -TAG_CLOSE = 0b10000000000000000000 +TEMPLATE = 0b000000000000000000111 +TEMPLATE_NAME = 0b000000000000000000001 +TEMPLATE_PARAM_KEY = 0b000000000000000000010 +TEMPLATE_PARAM_VALUE = 0b000000000000000000100 + +ARGUMENT = 0b000000000000000011000 +ARGUMENT_NAME = 0b000000000000000001000 +ARGUMENT_DEFAULT = 0b000000000000000010000 + +WIKILINK = 0b000000000000001100000 +WIKILINK_TITLE = 0b000000000000000100000 +WIKILINK_TEXT = 0b000000000000001000000 + +HEADING = 0b000000001111110000000 
+HEADING_LEVEL_1 = 0b000000000000010000000 +HEADING_LEVEL_2 = 0b000000000000100000000 +HEADING_LEVEL_3 = 0b000000000001000000000 +HEADING_LEVEL_4 = 0b000000000010000000000 +HEADING_LEVEL_5 = 0b000000000100000000000 +HEADING_LEVEL_6 = 0b000000001000000000000 + +COMMENT = 0b000000010000000000000 + +TAG = 0b111111100000000000000 +TAG_OPEN = 0b001111100000000000000 +TAG_OPEN_NAME = 0b000000100000000000000 +TAG_OPEN_ATTR = 0b001111000000000000000 +TAG_OPEN_ATTR_NAME = 0b000001000000000000000 +TAG_OPEN_ATTR_BODY = 0b000010000000000000000 +TAG_OPEN_ATTR_QUOTED = 0b000100000000000000000 +TAG_OPEN_ATTR_IGNORE = 0b001000000000000000000 +TAG_BODY = 0b010000000000000000000 +TAG_CLOSE = 0b100000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 46c4399..1d31fa4 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -457,11 +457,13 @@ class Tokenizer(object): self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY - padding = "" # TODO + + ## If the last element was TagAttrStart, remove it, add " " to its padding, then return that + padding = "" return padding def _actually_handle_chunk(self, chunks, is_new): - if is_new and not self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: padding = 0 while chunks: if chunks[0] == "": @@ -470,18 +472,24 @@ class Tokenizer(object): else: break self._write(tokens.TagAttrStart(padding=" " * padding)) + elif self._context & contexts.TAG_OPEN_ATTR_IGNORE: + self._context ^= contexts.TAG_OPEN_ATTR_IGNORE + chunks.pop(0) + return + elif self._context & contexts.TAG_OPEN_ATTR_QUOTED: + self._write_text(" ") # Quoted chunks don't lose their spaces if chunks: chunk = chunks.pop(0) if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY self._context |= contexts.TAG_OPEN_ATTR_NAME - if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if self._context & contexts.TAG_OPEN_ATTR_QUOTED: if re.search(r'[^\\]"', chunk[:-1]): self._fail_route() if re.search(r'[^\\]"$', chunk): self._write_text(chunk[:-1]) - self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED + self._context ^= contexts.TAG_OPEN_ATTR_QUOTED self._context |= contexts.TAG_OPEN_ATTR_NAME return True # Back to _handle_tag_attribute_body() self._write_text(chunk) @@ -491,6 +499,8 @@ class Tokenizer(object): self._write_text(text) return chunks = text.split(" ") + is_new = False + is_quoted = False if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) tag = self._get_tag_type_from_stack() @@ -500,9 +510,7 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) - - is_new = False - is_quoted = False + is_new = True while chunks: result = self._actually_handle_chunk(chunks, is_new) is_quoted = result or is_quoted @@ -530,7 +538,7 @@ class Tokenizer(object): self._head += 1 reset = self._head try: - attr = self._parse(contexts.TAG_OPEN_ATTR_BODY_QUOTED) + attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | contexts.TAG_OPEN_ATTR_IGNORE) except BadRoute: self._head = reset self._write_text(next) @@ -538,6 +546,7 @@ class Tokenizer(object): self._write(tokens.TagAttrQuote()) self._write_text(next[1:]) self._write_all(attr) + return self._context ^= contexts.TAG_OPEN_ATTR_BODY self._context |= 
contexts.TAG_OPEN_ATTR_NAME while chunks: @@ -588,7 +597,7 @@ class Tokenizer(object): contexts.HEADING | contexts.COMMENT | contexts.TAG) double_fail = ( contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | - contexts.TAG_OPEN_ATTR_BODY_QUOTED) + contexts.TAG_OPEN_ATTR_QUOTED) if self._context & double_fail: self._pop() if self._context & fail: @@ -645,7 +654,7 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_BODY_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() From eed7c918bfb0741fefd0473f61bbc1e9343ad033 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 22:41:32 -0500 Subject: [PATCH 14/77] Implement padding support for Tags completely; open_padding->padding. --- mwparserfromhell/nodes/tag.py | 18 +++++++++--------- mwparserfromhell/parser/builder.py | 6 +++--- mwparserfromhell/parser/tokenizer.py | 15 +++++++++------ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 94f92c5..ecf6f2b 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -33,7 +33,7 @@ class Tag(TagDefinitions, Node): """Represents an HTML-style tag in wikicode, like ````.""" def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, - self_closing=False, open_padding="", closing_tag=None): + self_closing=False, padding="", closing_tag=None): super(Tag, self).__init__() self._type = type_ self._tag = tag @@ -44,7 +44,7 @@ class Tag(TagDefinitions, Node): self._attrs = [] self._showtag = showtag self._self_closing = self_closing - self._open_padding = open_padding + self._padding = padding if closing_tag: self._closing_tag = closing_tag else: @@ -62,9 +62,9 @@ class Tag(TagDefinitions, Node): if self.attributes: result += " " + " ".join([str(attr) for attr in self.attributes]) if self.self_closing: - result += self.open_padding + "/>" + result += self.padding + "/>" else: - result += self.open_padding + ">" + str(self.contents) + result += self.padding + ">" + str(self.contents) result += "" return result @@ -145,9 +145,9 @@ class Tag(TagDefinitions, Node): return self._self_closing @property - def open_padding(self): + def padding(self): """Spacing to insert before the first closing ``>``.""" - return self._open_padding + return self._padding @property def closing_tag(self): @@ -188,9 +188,9 @@ class Tag(TagDefinitions, Node): def self_closing(self, value): self._self_closing = bool(value) - @open_padding.setter - def open_padding(self, value): - self._open_padding = str(value) + @padding.setter + def padding(self, value): + self._padding = str(value) @closing_tag.setter def closing_tag(self, value): diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index cb5499f..2d9ea55 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -210,19 +210,19 @@ class Builder(object): if isinstance(token, tokens.TagAttrStart): attrs.append(self._handle_attribute(token)) elif isinstance(token, tokens.TagCloseOpen): - open_pad = token.padding + padding = token.padding tag = self._pop() self._push() elif isinstance(token, tokens.TagCloseSelfclose): tag = self._pop() return Tag(type_, tag, attrs=attrs, showtag=showtag, - self_closing=True, open_padding=token.padding) + 
self_closing=True, padding=token.padding) elif isinstance(token, tokens.TagOpenClose): contents = self._pop() self._push() elif isinstance(token, tokens.TagCloseClose): return Tag(type_, tag, contents, attrs, showtag, False, - open_pad, self._pop()) + padding, self._pop()) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 1d31fa4..901e731 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -458,9 +458,9 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY - ## If the last element was TagAttrStart, remove it, add " " to its padding, then return that - padding = "" - return padding + if isinstance(self._stack[-1], tokens.TagAttrStart): + return self._stack.pop().padding + return "" def _actually_handle_chunk(self, chunks, is_new): if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: @@ -538,7 +538,8 @@ class Tokenizer(object): self._head += 1 reset = self._head try: - attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | contexts.TAG_OPEN_ATTR_IGNORE) + attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | + contexts.TAG_OPEN_ATTR_IGNORE) except BadRoute: self._head = reset self._write_text(next) @@ -654,7 +655,8 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): + elif self._context & ( + contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() @@ -663,7 +665,8 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: + elif this == "=" and ( + self._context & contexts.TAG_OPEN_ATTR_NAME): self._handle_tag_attribute_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): From 6ea618460fc122dcd60ebebd0ecf02a36f82d8cf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 31 Dec 2012 03:19:22 -0500 Subject: [PATCH 15/77] _get_tag_type_from_stack() makes more sense now --- mwparserfromhell/parser/tokenizer.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 901e731..e83ec5d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -433,16 +433,18 @@ class Tokenizer(object): else: self._write_all(tokens) - def _get_tag_type_from_stack(self): - self._push_textbuffer() - if not self._stack: - return None # Tag has an empty name? - text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] + def _get_tag_type_from_stack(self, stack=None): + if stack is None: + stack = self._stack + self._push_textbuffer() + if not stack: + self._fail_route() # Tag has an empty name? 
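
The lookup performed by _get_tag_type_from_stack() above is compact enough to restate on its own. The sketch below is illustrative, with TRANSLATIONS abbreviated from the table on the Tag node (where TAG_BOLD is 2 and TAG_REF is 101) and a stand-in Text token:

    from collections import namedtuple

    Text = namedtuple("Text", "text")
    TRANSLATIONS = {"b": 2, "ref": 101}  # abbreviated from Tag.TRANSLATIONS

    def tag_type_from(stack):
        # Join the stack's raw Text tokens, normalize the result, and map
        # it to a numeric tag type; a KeyError here becomes _fail_route().
        text = "".join(tok.text for tok in stack if isinstance(tok, Text))
        return TRANSLATIONS[text.rstrip().lower()]

    assert tag_type_from([Text("re"), Text("f ")]) == 101
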
+ text = [tok for tok in stack if isinstance(tok, tokens.Text)] text = "".join([token.text for token in text]).rstrip().lower() try: return Tag.TRANSLATIONS[text] except KeyError: - return Tag.TAG_UNKNOWN + self._fail_route() def _actually_close_tag_opening(self): if self._context & contexts.TAG_OPEN_ATTR: @@ -452,8 +454,6 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_ATTR_BODY else: tag = self._get_tag_type_from_stack() - if not tag: - self._fail_route() self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY @@ -504,8 +504,6 @@ class Tokenizer(object): if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) tag = self._get_tag_type_from_stack() - if not tag: - self._fail_route() self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME @@ -569,8 +567,8 @@ class Tokenizer(object): self._head += 1 def _handle_tag_close_close(self): - tag = self._get_tag_type_from_stack() closing = self._pop() + tag = self._get_tag_type_from_stack(closing) if tag != self._stack[0].type: # Closing and opening tags are not the same, so fail this route: self._fail_route() From 0ee505b5a506cfc1c0530935bb01933b94aa14dc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 24 Jan 2013 01:24:06 -0500 Subject: [PATCH 16/77] Docstrings for new tokenizer methods. --- mwparserfromhell/parser/tokenizer.py | 41 ++++++++++++++++++++++++++++++------ mwparserfromhell/tag_defs.py | 2 +- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e83ec5d..8ec3355 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -434,6 +434,10 @@ class Tokenizer(object): self._write_all(tokens) def _get_tag_type_from_stack(self, stack=None): + """Return the tag type based on the text in *stack*. + + If *stack* is ``None``, we will use the current, topmost one. + """ if stack is None: stack = self._stack self._push_textbuffer() @@ -447,6 +451,13 @@ class Tokenizer(object): self._fail_route() def _actually_close_tag_opening(self): + """Handle cleanup at the end of an opening tag. + + The current context will be updated and the + :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the + opening tag's padding to be used in the + :py:class:`~.tokens.TagOpenClose` token. + """ if self._context & contexts.TAG_OPEN_ATTR: if self._context & contexts.TAG_OPEN_ATTR_NAME: self._context ^= contexts.TAG_OPEN_ATTR_NAME @@ -463,6 +474,11 @@ class Tokenizer(object): return "" def _actually_handle_chunk(self, chunks, is_new): + """Actually handle a chunk of code within a tag's attributes. + + Called by :py:meth:`_handle_tag_chunk` and + :py:meth:`_handle_tag_attribute_body`. + """ if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: padding = 0 while chunks: if chunks[0] == "": @@ -495,6 +511,12 @@ class Tokenizer(object): self._write_text(chunk) def _handle_tag_chunk(self, text): + """Handle a chunk of code within a tag's attributes. + + This is called by :py:meth:`_parse`, which intercepts parsing of + wikicode when we're inside of an opening tag and no :py:attr:`MARKERS` + are present. + """ if " " not in text: self._write_text(text) return @@ -517,6 +539,12 @@ class Tokenizer(object): return self._pop() def _handle_tag_attribute_body(self): + """Handle the body, or value, of a tag attribute.
+ + Attribute bodies can usually be handled at once, but sometimes a new + stack must be created to keep track of "rich" attribute values that + contain, for example, templates. + """ self._context ^= contexts.TAG_OPEN_ATTR_NAME self._context |= contexts.TAG_OPEN_ATTR_BODY self._write(tokens.TagAttrEquals()) @@ -552,21 +580,25 @@ class Tokenizer(object): self._actually_handle_chunk(chunks, True) def _handle_tag_close_open(self): + """Handle the ending of an open tag (````).""" padding = self._actually_close_tag_opening() self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): + """Handle the ending of a tag that closes itself (````).""" padding = self._actually_close_tag_opening() self._write(tokens.TagCloseSelfclose(padding=padding)) self._head += 1 return self._pop() def _handle_tag_open_close(self): + """Handle the opening of a closing tag (````).""" self._write(tokens.TagOpenClose()) self._push(contexts.TAG_CLOSE) self._head += 1 def _handle_tag_close_close(self): + """Handle the ending of a closing tag (````).""" closing = self._pop() tag = self._get_tag_type_from_stack(closing) if tag != self._stack[0].type: @@ -653,8 +685,7 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & ( - contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() @@ -663,11 +694,9 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - elif this == "=" and ( - self._context & contexts.TAG_OPEN_ATTR_NAME): + elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: self._handle_tag_attribute_body() - elif this == "<" and next == "/" and ( - self._context & contexts.TAG_BODY): + elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py index 74d3a81..b2ee90d 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/tag_defs.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012 Ben Kurtovic +# Copyright (C) 2012-2013 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From d8814968b71fdd9ceea22085c19d43b69101ba38 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 14 Mar 2013 11:02:10 -0400 Subject: [PATCH 17/77] Applying latest commit from develop --- mwparserfromhell/parser/__init__.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 5baa687..fd8a314 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -26,16 +26,16 @@ modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. This module joins them together under one interface.
""" +from .builder import Builder +from .tokenizer import Tokenizer try: - from ._builder import CBuilder as Builder + from ._tokenizer import CTokenizer + use_c = True except ImportError: - from .builder import Builder -try: - from ._tokenizer import CTokenizer as Tokenizer -except ImportError: - from .tokenizer import Tokenizer + CTokenizer = None + use_c = False -__all__ = ["Parser"] +__all__ = ["use_c", "Parser"] class Parser(object): """Represents a parser for wikicode. @@ -48,7 +48,10 @@ class Parser(object): def __init__(self, text): self.text = text - self._tokenizer = Tokenizer() + if use_c and CTokenizer: + self._tokenizer = CTokenizer() + else: + self._tokenizer = Tokenizer() self._builder = Builder() def parse(self): From 61fc5b5eab7dbe9c0466fd07a656c8490d8d04ad Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 May 2013 14:41:48 -0400 Subject: [PATCH 18/77] Fix handling of self-closing tags (closes #31) --- mwparserfromhell/nodes/tag.py | 5 +++-- mwparserfromhell/parser/builder.py | 4 ++-- mwparserfromhell/parser/tokenizer.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index eb5d1ee..d301d85 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -79,8 +79,9 @@ class Tag(TagDefinitions, Node): if attr.value: for child in getter(attr.value): yield attr.value, child - for child in getter(self.contents): - yield self.contents, child + if self.contents: + for child in getter(self.contents): + yield self.contents, child def __strip__(self, normalize, collapse): if self.type in self.TAGS_VISIBLE: diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 60bfaa9..4b468b7 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -191,8 +191,8 @@ class Builder(object): self._push() elif isinstance(token, tokens.TagAttrQuote): quoted = True - elif isinstance(token, (tokens.TagAttrStart, - tokens.TagCloseOpen)): + elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen, + tokens.TagCloseSelfclose)): self._tokens.append(token) if name is not None: return Attribute(name, self._pop(), quoted, padding) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 82f748c..b466de5 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,8 +26,8 @@ import re from . import contexts from . import tokens -from ..nodes.tag import Tag from ..compat import htmlentities +from ..nodes.tag import Tag __all__ = ["Tokenizer"] @@ -431,7 +431,7 @@ class Tokenizer(object): try: return Tag.TRANSLATIONS[text] except KeyError: - self._fail_route() + return Tag.TAG_UNKNOWN def _actually_close_tag_opening(self): """Handle cleanup at the end of a opening tag. From 1b4c01b4c00d014499d9f5e5ad8ecc01bb20a2b7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 20 May 2013 03:05:11 -0400 Subject: [PATCH 19/77] Implement assertTagNodeEqual(), start test_tag(), add to tags.mwtest. 
--- mwparserfromhell/parser/builder.py | 2 +- tests/_test_tree_equality.py | 19 +++++++- tests/test_attribute.py | 0 tests/test_builder.py | 12 +++++- tests/test_tag.py | 0 tests/tokenizer/tags.mwtest | 88 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 117 insertions(+), 4 deletions(-) create mode 100644 tests/test_attribute.py create mode 100644 tests/test_tag.py create mode 100644 tests/tokenizer/tags.mwtest diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 4b468b7..5ec0780 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -170,7 +170,7 @@ class Builder(object): self._write(self._handle_token(token)) def _handle_comment(self): - """Handle a case where a hidden comment is at the head of the tokens.""" + """Handle a case where an HTML comment is at the head of the tokens.""" self._push() while self._tokens: token = self._tokens.pop() diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py index 52130ed..2828147 100644 --- a/tests/_test_tree_equality.py +++ b/tests/_test_tree_equality.py @@ -91,7 +91,24 @@ class TreeEqualityTestCase(TestCase): def assertTagNodeEqual(self, expected, actual): """Assert that two Tag nodes have the same data.""" - self.fail("Holding this until feature/html_tags is ready.") + self.assertEqual(expected.type, actual.type) + self.assertWikicodeEqual(expected.tag, actual.tag) + if expected.contents is not None: + self.assertWikicodeEqual(expected.contents, actual.contents) + length = len(expected.attributes) + self.assertEqual(length, len(actual.attributes)) + for i in range(length): + exp_attr = expected.attributes[i] + act_attr = actual.attributes[i] + self.assertWikicodeEqual(exp_attr.name, act_attr.name) + if exp_attr.value is not None: + self.assertWikicodeEqual(exp_attr.value, act_attr.value) + self.assertIs(exp_attr.quoted, act_attr.quoted) + self.assertEqual(exp_attr.padding, act_attr.padding) + self.assertIs(expected.showtag, actual.showtag) + self.assertIs(expected.self_closing, actual.self_closing) + self.assertEqual(expected.padding, actual.padding) + self.assertWikicodeEqual(expected.closing_tag, actual.closing_tag) def assertTemplateNodeEqual(self, expected, actual): """Assert that two Template nodes have the same data.""" diff --git a/tests/test_attribute.py b/tests/test_attribute.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_builder.py b/tests/test_builder.py index 903d144..85a8c60 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -190,10 +190,18 @@ class TestBuilder(TreeEqualityTestCase): for test, valid in tests: self.assertWikicodeEqual(valid, self.builder.build(test)) - @unittest.skip("holding this until feature/html_tags is ready") def test_tag(self): """tests for building Tag nodes""" - pass + tests = [ + ([tokens.TagOpenOpen(showtag=True, type=101), + tokens.Text(text="ref"), tokens.TagCloseOpen(padding=""), + tokens.TagOpenClose(), tokens.Text(text="ref"), + tokens.TagCloseClose()], + wrap([Tag(101, wraptext("ref"), wrap([]), [], True, False, "", + wraptext("ref"))])), + ] + for test, valid in tests: + self.assertWikicodeEqual(valid, self.builder.build(test)) def test_integration(self): """a test for building a combination of templates together""" diff --git a/tests/test_tag.py b/tests/test_tag.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest new file mode 100644 index 0000000..9a6ce30 --- /dev/null +++
b/tests/tokenizer/tags.mwtest @@ -0,0 +1,88 @@ +name: basic +label: a basic tag with an open and close +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: basic_selfclosing +label: a basic self-closing tag +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding="")] + +--- + +name: content +label: a tag with some content in the middle +input: "this is a reference" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: padded_open +label: a tag with some padding in the open tag +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: padded_close +label: a tag with some padding in the close tag +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] + +--- + +name: padded_selfclosing +label: a self-closing tag with padding +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding=" ")] + +--- + +name: attribute +label: a tag with a single attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_value +label: a tag with a single attribute with a value +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_quoted +label: a tag with a single quoted attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_hyphen +label: a tag with a single attribute, containing a hyphen +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_quoted_hyphen +label: a tag with a single quoted attribute, containing a hyphen +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_selfclosing +label: a self-closing tag with a single attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] + +--- + +name: attribute_selfclosing_value +label: a self-closing tag with a single attribute with a value +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] From 9ea06c283081771833729ec579b9aaee94599fe1 Mon Sep 17 00:00:00 2001 
From: Ben Kurtovic Date: Tue, 28 May 2013 10:58:45 -0400 Subject: [PATCH 20/77] Push the textbuffer to fix a couple broken tests. --- mwparserfromhell/parser/tokenizer.py | 1 + tests/tokenizer/tags.mwtest | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b466de5..b8450fd 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -452,6 +452,7 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY + self._push_textbuffer() if isinstance(self._stack[-1], tokens.TagAttrStart): return self._stack.pop().padding return "" diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 9a6ce30..8716e78 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -86,3 +86,10 @@ name: attribute_selfclosing_value label: a self-closing tag with a single attribute with a value input: "" output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] + +--- + +name: attribute_selfclosing_value_quoted +label: a self-closing tag with a single quoted attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] From d2b39546691eda327979b12dbe44c0090868c790 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 1 Jun 2013 17:30:34 -0400 Subject: [PATCH 21/77] Fix remaining broken tests; some refactoring. --- mwparserfromhell/parser/tokenizer.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b8450fd..67a652a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -476,7 +476,7 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_ATTR_IGNORE chunks.pop(0) return - elif self._context & contexts.TAG_OPEN_ATTR_QUOTED: + elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED: self._write_text(" ") # Quoted chunks don't lose their spaces if chunks: @@ -501,7 +501,7 @@ class Tokenizer(object): wikicode when we're inside of an opening tag and no :py:attr:`MARKERS` are present. 
""" - if " " not in text: + if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: self._write_text(text) return chunks = text.split(" ") @@ -603,7 +603,7 @@ class Tokenizer(object): elif this == "\n" or this == "[" or this == "}": return False return True - if context & contexts.TEMPLATE_NAME: + elif context & contexts.TEMPLATE_NAME: if this == "{" or this == "}" or this == "[": self._context |= contexts.FAIL_NEXT return True @@ -621,6 +621,8 @@ class Tokenizer(object): elif this is self.END or not this.isspace(): self._context |= contexts.HAS_TEXT return True + elif context & contexts.TAG_CLOSE: + return this != "<" and this != "\n" else: if context & contexts.FAIL_ON_EQUALS: if this == "=": @@ -653,10 +655,12 @@ class Tokenizer(object): while True: this = self._read() unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | - contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME) + contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | + contexts.TAG_CLOSE) if self._context & unsafe: if not self._verify_safe(this): - if self._context & contexts.TEMPLATE_PARAM_KEY: + double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) + if self._context & double: self._pop() self._fail_route() if this not in self.MARKERS: @@ -672,12 +676,12 @@ class Tokenizer(object): fail = ( contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | contexts.HEADING | contexts.COMMENT | contexts.TAG) - double_fail = ( - contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | - contexts.TAG_OPEN_ATTR_QUOTED) - if self._context & double_fail: - self._pop() if self._context & fail: + double_fail = ( + contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | + contexts.TAG_OPEN_ATTR_QUOTED) + if self._context & double_fail: + self._pop() self._fail_route() return self._pop() next = self._read(1) @@ -738,10 +742,10 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): - if this == "\n": - if self._context & contexts.TAG_CLOSE: - self._pop() + elif self._context & contexts.TAG_OPEN: + if self._context & contexts.TAG_OPEN_ATTR_QUOTED: + self._handle_tag_chunk(this) + elif this == "\n": self._fail_route() elif this == ">": self._handle_tag_close_open() @@ -749,6 +753,8 @@ class Tokenizer(object): return self._handle_tag_selfclose() elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: self._handle_tag_attribute_body() + else: + self._handle_tag_chunk(this) elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: From 03e41286c6caf940d9f14ae1bdbd03df4e112493 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 12 Jun 2013 18:29:22 -0400 Subject: [PATCH 22/77] Add a number of tag tests. A couple of these are failing. 
--- tests/tokenizer/integration.mwtest | 7 ++ tests/tokenizer/tags.mwtest | 140 +++++++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index d3cb419..ba01c8c 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -33,6 +33,13 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t --- +name: rich_tags +label: an HTML tag with tons of other things in it +input: "{{dubious claim}}[[Source]]" +output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(padding=" "), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(padding=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(padding=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(padding=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagOpenClose(), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + name: wildcard label: a wildcard assortment of various things input: "{{{{{{{{foo}}bar|baz=biz}}buzz}}usr|{{bin}}}}" diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 8716e78..5af2074 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -93,3 +93,143 @@ name: attribute_selfclosing_value_quoted label: a self-closing tag with a single quoted attribute input: "" output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] + +--- + +name: incomplete_lbracket +label: incomplete tags: just a left bracket +input: "<" +output: [Text(text="<")] + +--- + +name: incomplete_lbracket_junk +label: incomplete tags: just a left bracket, surrounded by stuff +input: "foo" +output: [Text(text="junk ")] + +--- + +name: incomplete_open_unnamed_attr +label: incomplete tags: an open tag, unnamed attribute +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_attr_equals +label: incomplete tags: an open tag, attribute, equal sign +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_attr +label: incomplete tags: an open tag, attribute with a key/value +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_attr_quoted +label: incomplete tags: an open tag, attribute with a key/value, quoted +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_text +label: incomplete tags: an open tag, text +input: "junk foo" +output: [Text(text="junk foo")] + +--- + +name: incomplete_open_attr_text +label: incomplete tags: an open tag, attribute with a key/value, text +input: "junk bar" +output: [Text(text="junk bar")] + +--- + +name: incomplete_open_text_lbracket +label: incomplete tags: an open tag, text, left open bracket +input: "junk bar<" +output:
[Text(text="junk bar<")] + +--- + +name: incomplete_open_text_lbracket_slash +label: incomplete tags: an open tag, text, left bracket, slash +input: "junk barbarbarbar" +output: [Text(text="junk bar")] From 6450814729c4725760386ae9e8a24a30c46b7033 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 28 Jun 2013 23:34:24 -0400 Subject: [PATCH 23/77] Remove 'type' attribute from tags; rework tag definitions. --- mwparserfromhell/nodes/tag.py | 30 ++------- mwparserfromhell/parser/builder.py | 8 +-- mwparserfromhell/parser/tokenizer.py | 21 ++---- mwparserfromhell/tag_defs.py | 123 ++++++++++------------------------- mwparserfromhell/utils.py | 2 + tests/test_builder.py | 9 ++- tests/tokenizer/tags.mwtest | 28 ++++---- 7 files changed, 72 insertions(+), 149 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index d301d85..cd5d0a2 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -24,18 +24,17 @@ from __future__ import unicode_literals from . import Node, Text from ..compat import str -from ..tag_defs import TagDefinitions +from ..tag_defs import get_wikicode, is_visible from ..utils import parse_anything __all__ = ["Tag"] -class Tag(TagDefinitions, Node): +class Tag(Node): """Represents an HTML-style tag in wikicode, like ````.""" - def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, + def __init__(self, tag, contents=None, attrs=None, showtag=True, self_closing=False, padding="", closing_tag=None): super(Tag, self).__init__() - self._type = type_ self._tag = tag self._contents = contents if attrs: @@ -52,7 +51,7 @@ class Tag(TagDefinitions, Node): def __unicode__(self): if not self.showtag: - open_, close = self.WIKICODE[self.type] + open_, close = get_wikicode[self.tag] if self.self_closing: return open_ else: @@ -84,7 +83,7 @@ class Tag(TagDefinitions, Node): yield self.contents, child def __strip__(self, normalize, collapse): - if self.type in self.TAGS_VISIBLE: + if is_visible(self.tag): return self.contents.strip_code(normalize, collapse) return None @@ -113,11 +112,6 @@ class Tag(TagDefinitions, Node): write(">") @property - def type(self): - """The tag type.""" - return self._type - - @property def tag(self): """The tag itself, as a :py:class:`~.Wikicode` object.""" return self._tag @@ -159,23 +153,9 @@ class Tag(TagDefinitions, Node): """ return self._closing_tag - @type.setter - def type(self, value): - value = int(value) - if value not in self.TAGS_ALL: - raise ValueError(value) - self._type = value - for key in self.TRANSLATIONS: - if self.TRANSLATIONS[key] == value: - self._tag = self._closing_tag = parse_anything(key) - @tag.setter def tag(self, value): self._tag = self._closing_tag = parse_anything(value) - try: - self._type = self.TRANSLATIONS[text] - except KeyError: - self._type = self.TAG_UNKNOWN @contents.setter def contents(self, value): diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 5ec0780..53abe91 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -202,7 +202,7 @@ class Builder(object): def _handle_tag(self, token): """Handle a case where a tag is at the head of the tokens.""" - type_, showtag = token.type, token.showtag + showtag = token.showtag attrs = [] self._push() while self._tokens: @@ -215,14 +215,14 @@ class Builder(object): self._push() elif isinstance(token, tokens.TagCloseSelfclose): tag = self._pop() - return Tag(type_, tag, attrs=attrs, showtag=showtag, + return Tag(tag, 
attrs=attrs, showtag=showtag, self_closing=True, padding=token.padding) elif isinstance(token, tokens.TagOpenClose): contents = self._pop() self._push() elif isinstance(token, tokens.TagCloseClose): - return Tag(type_, tag, contents, attrs, showtag, False, - padding, self._pop()) + return Tag(tag, contents, attrs, showtag, False, padding, + self._pop()) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 67a652a..e7fdb0e 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -27,7 +27,7 @@ import re from . import contexts from . import tokens from ..compat import htmlentities -from ..nodes.tag import Tag +from ..tag_defs import is_parsable __all__ = ["Tokenizer"] @@ -416,8 +416,8 @@ class Tokenizer(object): else: self._write_all(tokens) - def _get_tag_type_from_stack(self, stack=None): - """Return the tag type based on the text in *stack*. + def _get_tag_from_stack(self, stack=None): + """Return the tag based on the text in *stack*. If *stack* is ``None``, we will use the current, topmost one. """ @@ -427,11 +427,7 @@ class Tokenizer(object): if not stack: self._fail_route() # Tag has an empty name? text = [tok for tok in stack if isinstance(tok, tokens.Text)] - text = "".join([token.text for token in text]).rstrip().lower() - try: - return Tag.TRANSLATIONS[text] - except KeyError: - return Tag.TAG_UNKNOWN + return "".join([token.text for token in text]).rstrip().lower() def _actually_close_tag_opening(self): """Handle cleanup at the end of a opening tag. @@ -447,8 +443,7 @@ class Tokenizer(object): if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY else: - tag = self._get_tag_type_from_stack() - self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._write_first(tokens.TagOpenOpen(showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY @@ -509,8 +504,7 @@ class Tokenizer(object): is_quoted = False if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) - tag = self._get_tag_type_from_stack() - self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._write_first(tokens.TagOpenOpen(showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) @@ -584,8 +578,7 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" closing = self._pop() - tag = self._get_tag_type_from_stack(closing) - if tag != self._stack[0].type: + if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): # Closing and opening tags are not the same, so fail this route: self._fail_route() self._write_all(closing) diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py index b2ee90d..369692b 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/tag_defs.py @@ -20,99 +20,48 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +"""Contains data regarding certain HTML tags.""" -class TagDefinitions(object): - """Contains numerical definitions for valid HTML (and wikicode) tags. +from __future__ import unicode_literals - Base class for :py:class:`~.Tag` objects. 
- """ +__all__ = ["get_wikicode", "is_parsable", "is_visible"] - TAG_UNKNOWN = 0 +PARSER_BLACKLIST = [ + # enwiki extensions @ 2013-06-28 + "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math", + "nowiki", "pre", "score", "section", "source", "syntaxhighlight", + "templatedata", "timeline" +] - # Basic HTML: - TAG_ITALIC = 1 - TAG_BOLD = 2 - TAG_UNDERLINE = 3 - TAG_STRIKETHROUGH = 4 - TAG_UNORDERED_LIST = 5 - TAG_ORDERED_LIST = 6 - TAG_DEF_TERM = 7 - TAG_DEF_ITEM = 8 - TAG_BLOCKQUOTE = 9 - TAG_RULE = 10 - TAG_BREAK = 11 - TAG_ABBR = 12 - TAG_PRE = 13 - TAG_MONOSPACE = 14 - TAG_CODE = 15 - TAG_SPAN = 16 - TAG_DIV = 17 - TAG_FONT = 18 - TAG_SMALL = 19 - TAG_BIG = 20 - TAG_CENTER = 21 +INVISIBLE_TAGS = [ + # enwiki extensions @ 2013-06-28 + "categorytree", "gallery", "imagemap", "inputbox", "math", "score", + "section", "templatedata", "timeline" +] - # MediaWiki parser hooks: - TAG_REF = 101 - TAG_GALLERY = 102 - TAG_MATH = 103 - TAG_NOWIKI = 104 - TAG_NOINCLUDE = 105 - TAG_INCLUDEONLY = 106 - TAG_ONLYINCLUDE = 107 +# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 +SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] +SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] - # Additional parser hooks: - TAG_SYNTAXHIGHLIGHT = 201 - TAG_POEM = 202 +WIKICODE = { + "i": {"open": "''", "close": "''"}, + "b": {"open": "'''", "close": "'''"}, + "ul": {"open": "*"}, + "ol": {"open": "#"}, + "dt": {"open": ";"}, + "dd": {"open": ":"}, + "hr": {"open": "----"}, +} - # Lists of tags: - TAGS_ALL = set(range(300)) - TAGS_INVISIBLE = {TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE} - TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE +def get_wikicode(tag): + """Return the appropriate wikicode before and after the given *tag*.""" + data = WIKICODE[tag.lower()] + return (data.get("open"), data.get("close")) - TRANSLATIONS = { - "i": TAG_ITALIC, - "em": TAG_ITALIC, - "b": TAG_BOLD, - "strong": TAG_BOLD, - "u": TAG_UNDERLINE, - "s": TAG_STRIKETHROUGH, - "ul": TAG_UNORDERED_LIST, - "ol": TAG_ORDERED_LIST, - "dt": TAG_DEF_TERM, - "dd": TAG_DEF_ITEM, - "blockquote": TAG_BLOCKQUOTE, - "hl": TAG_RULE, - "br": TAG_BREAK, - "abbr": TAG_ABBR, - "pre": TAG_PRE, - "tt": TAG_MONOSPACE, - "code": TAG_CODE, - "span": TAG_SPAN, - "div": TAG_DIV, - "font": TAG_FONT, - "small": TAG_SMALL, - "big": TAG_BIG, - "center": TAG_CENTER, - "ref": TAG_REF, - "gallery": TAG_GALLERY, - "math": TAG_MATH, - "nowiki": TAG_NOWIKI, - "noinclude": TAG_NOINCLUDE, - "includeonly": TAG_INCLUDEONLY, - "onlyinclude": TAG_ONLYINCLUDE, - "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, - "source": TAG_SYNTAXHIGHLIGHT, - "poem": TAG_POEM, - } +def is_parsable(tag): + """Return if the given *tag*'s contents should be passed to the parser.""" + return tag.lower() not in PARSER_BLACKLIST - WIKICODE = { - TAG_ITALIC: ("''", "''"), - TAG_BOLD: ("'''", "'''"), - TAG_UNORDERED_LIST: ("*", ""), - TAG_ORDERED_LIST: ("#", ""), - TAG_DEF_TERM: (";", ""), - TAG_DEF_ITEM: (":", ""), - TAG_RULE: ("----", ""), - } +def is_visible(tag): + """Return whether or not the given *tag* contains visible text.""" + return tag.lower() not in INVISIBLE_TAGS diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index b797419..31e5ba0 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -31,6 +31,8 @@ from .compat import bytes, str from .nodes import Node from .smart_list import SmartList +__all__ = ["parse_anything"] + def parse_anything(value): """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. 
diff --git a/tests/test_builder.py b/tests/test_builder.py index 85a8c60..0c635ce 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -193,11 +193,10 @@ class TestBuilder(TreeEqualityTestCase): def test_tag(self): """tests for building Tag nodes""" tests = [ - ([tokens.TagOpenOpen(showtag=True, type=101), - tokens.Text(text="ref"), tokens.TagCloseOpen(padding=""), - tokens.TagOpenClose(), tokens.Text(text="ref"), - tokens.TagCloseClose()], - wrap([Tag(101, wraptext("ref"), wrap([]), [], True, False, "", + ([tokens.TagOpenOpen(showtag=True), tokens.Text(text="ref"), + tokens.TagCloseOpen(padding=""), tokens.TagOpenClose(), + tokens.Text(text="ref"), tokens.TagCloseClose()], + wrap([Tag(wraptext("ref"), wrap([]), [], True, False, "", wraptext("ref"))])), ] for test, valid in tests: diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 5af2074..a76d6b6 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -1,98 +1,98 @@ name: basic label: a basic tag with an open and close input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: basic_selfclosing label: a basic self-closing tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding="")] --- name: content label: a tag with some content in the middle input: "this is a reference" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: padded_open label: a tag with some padding in the open tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: padded_close label: a tag with some padding in the close tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] --- name: padded_selfclosing label: a self-closing tag with padding input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding=" ")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding=" ")] --- name: attribute label: a tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_value label: a tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True, type=101), 
Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted label: a tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_hyphen label: a tag with a single attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted_hyphen label: a tag with a single quoted attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_selfclosing label: a self-closing tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value label: a self-closing tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value_quoted label: a self-closing tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] --- From ce27d5d385a4adc14e136b33471216038dfc70a1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 29 Jun 2013 00:33:41 -0400 Subject: [PATCH 24/77] Fix six failing tests; add three more (all 
passing). --- mwparserfromhell/parser/tokenizer.py | 33 ++++++++++++++++++--------------- tests/tokenizer/tags.mwtest | 21 +++++++++++++++++++++ 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e7fdb0e..93e9a8d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,6 +21,7 @@ # SOFTWARE. from __future__ import unicode_literals +from itertools import takewhile from math import log import re @@ -416,19 +417,6 @@ class Tokenizer(object): else: self._write_all(tokens) - def _get_tag_from_stack(self, stack=None): - """Return the tag based on the text in *stack*. - - If *stack* is ``None``, we will use the current, topmost one. - """ - if stack is None: - stack = self._stack - self._push_textbuffer() - if not stack: - self._fail_route() # Tag has an empty name? - text = [tok for tok in stack if isinstance(tok, tokens.Text)] - return "".join([token.text for token in text]).rstrip().lower() - def _actually_close_tag_opening(self): """Handle cleanup at the end of an opening tag. @@ -557,14 +545,27 @@ class Tokenizer(object): while chunks: self._actually_handle_chunk(chunks, True) + def _get_tag_from_stack(self, stack=None): + """Return the tag based on the text in *stack*.""" + if not stack: + sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen) + func = lambda tok: not isinstance(tok, sentinels) + stack = takewhile(func, self._stack) + text = [tok.text for tok in stack if isinstance(tok, tokens.Text)] + return "".join(text).rstrip().lower() + def _handle_tag_close_open(self): """Handle the ending of an open tag (````).""" padding = self._actually_close_tag_opening() + if not self._get_tag_from_stack(): # Tags cannot be blank + self._fail_route() self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): """Handle the ending of a tag that closes itself (````).""" padding = self._actually_close_tag_opening() + if not self._get_tag_from_stack(): # Tags cannot be blank + self._fail_route() self._write(tokens.TagCloseSelfclose(padding=padding)) self._head += 1 return self._pop() @@ -578,8 +579,10 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" closing = self._pop() - if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): - # Closing and opening tags are not the same, so fail this route: + close_tag = self._get_tag_from_stack(closing) + open_tag = self._get_tag_from_stack() + if not close_tag or close_tag != open_tag: + # Closing and opening tags are empty or unequal, so fail this tag: self._fail_route() self._write_all(closing) self._write(tokens.TagCloseClose()) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index a76d6b6..849a4fd 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -233,3 +233,24 @@ name: incomplete_open_text_wrong_close label: incomplete tags: an open tag, text, wrong close input: "junk bar" output: [Text(text="junk bar")] + +--- + +name: incomplete_no_tag_name_open +label: incomplete tags: no tag name within brackets; just an open +input: "junk <>" +output: [Text(text="junk <>")] + +--- + +name: incomplete_no_tag_name_selfclosing +label: incomplete tags: no tag name within brackets; self-closing +input: "junk < />" +output: [Text(text="junk < />")] + +--- + +name: incomplete_no_tag_name_open_close +label: incomplete tags: no tag name within brackets; open and close +input: "junk <>" +output:
[Text(text="junk <>")] From c241bff9f50896d83294ed12c72b8d59dc932b2b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 29 Jun 2013 00:37:29 -0400 Subject: [PATCH 25/77] Remove .type check from assertTagNodeEqual() --- tests/_test_tree_equality.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py index 2828147..6976a13 100644 --- a/tests/_test_tree_equality.py +++ b/tests/_test_tree_equality.py @@ -91,7 +91,6 @@ class TreeEqualityTestCase(TestCase): def assertTagNodeEqual(self, expected, actual): """Assert that two Tag nodes have the same data.""" - self.assertEqual(expected.type, actual.type) self.assertWikicodeEqual(expected.tag, actual.tag) if expected.contents is not None: self.assertWikicodeEqual(expected.contents, actual.contents) From 81e8fdd6829c12468f0f12c71d707c452eb9e2bb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 20:57:54 -0400 Subject: [PATCH 26/77] Give Attributes more attributes for padding data. --- mwparserfromhell/nodes/extras/attribute.py | 41 ++++++++++++++++++++++-------- mwparserfromhell/nodes/tag.py | 2 +- mwparserfromhell/parser/builder.py | 13 ++++++---- tests/tokenizer/tags.mwtest | 16 ++++++------ 4 files changed, 48 insertions(+), 24 deletions(-) diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 33ad851..5888dba 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -36,19 +36,22 @@ class Attribute(StringMixIn): whose value is ``"foo"``. """ - def __init__(self, name, value=None, quoted=True, padding=""): + def __init__(self, name, value=None, quoted=True, pad_first="", + pad_before_eq="", pad_after_eq=""): super(Attribute, self).__init__() self._name = name self._value = value self._quoted = quoted - self._padding = padding + self._pad_first = pad_first + self._pad_before_eq = pad_before_eq + self._pad_after_eq = pad_after_eq def __unicode__(self): - base = self.padding + str(self.name) + base = self.pad_first + str(self.name) + self.pad_before_eq if self.value: if self.quoted: - return base + '="' + str(self.value) + '"' - return base + "=" + str(self.value) + return base + '="' + self.pad_after_eq + str(self.value) + '"' + return base + "=" + self.pad_after_eq + str(self.value) return base @property @@ -67,9 +70,19 @@ class Attribute(StringMixIn): return self._quoted @property - def padding(self): + def pad_first(self): """Spacing to insert right before the attribute.""" - return self._padding + return self._pad_first + + @property + def pad_before_eq(self): + """Spacing to insert right before the equal sign.""" + return self._pad_before_eq + + @property + def pad_after_eq(self): + """Spacing to insert right after the equal sign.""" + return self._pad_after_eq @name.setter def name(self, value): @@ -83,6 +96,14 @@ class Attribute(StringMixIn): def quoted(self, value): self._quoted = bool(value) - @padding.setter - def padding(self, value): - self._padding = str(value) + @pad_first.setter + def pad_first(self, value): + self._pad_first = str(value) + + @pad_before_eq.setter + def pad_before_eq(self, value): + self._pad_before_eq = str(value) + + @pad_after_eq.setter + def pad_after_eq(self, value): + self._pad_after_eq = str(value) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index cd5d0a2..76b412c 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -59,7 +59,7 @@ class Tag(Node): result = "<" + str(self.tag) 
if self.attributes: - result += " " + " ".join([str(attr) for attr in self.attributes]) + result += "".join([str(attr) for attr in self.attributes]) if self.self_closing: result += self.padding + "/>" else: diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 53abe91..d92b845 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -180,9 +180,9 @@ class Builder(object): else: self._write(self._handle_token(token)) - def _handle_attribute(self, token): + def _handle_attribute(self, start): """Handle a case where a tag attribute is at the head of the tokens.""" - name, quoted, padding = None, False, token.padding + name, quoted = None, False self._push() while self._tokens: token = self._tokens.pop() @@ -194,9 +194,12 @@ class Builder(object): elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen, tokens.TagCloseSelfclose)): self._tokens.append(token) - if name is not None: - return Attribute(name, self._pop(), quoted, padding) - return Attribute(self._pop(), quoted=quoted, padding=padding) + if name: + value = self._pop() + else: + name, value = self._pop(), None + return Attribute(name, value, quoted, start.pad_first, + start.pad_before_eq, start.pad_after_eq) else: self._write(self._handle_token(token)) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 849a4fd..1dfc1b1 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -43,56 +43,56 @@ output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding= name: attribute label: a tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_value label: a tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted label: a tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_hyphen label: a tag with a single attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", 
pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted_hyphen label: a tag with a single quoted attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_selfclosing label: a self-closing tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value label: a self-closing tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value_quoted label: a self-closing tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] --- From 5f5a081d9148c584511bffb3d6d3b8f63ea24d43 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 21:02:11 -0400 Subject: [PATCH 27/77] Rewrite tag parser to be cleaner and safer. All tag tests passing. Still need to finish backslash support and support for templates and tags within tags. 
--- mwparserfromhell/parser/contexts.py | 87 ++++----- mwparserfromhell/parser/tokenizer.py | 339 ++++++++++++++++------------------- 2 files changed, 194 insertions(+), 232 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 3c9c798..9e5e568 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -65,15 +65,7 @@ Local (stack-specific) contexts: * :py:const:`TAG` * :py:const:`TAG_OPEN` - - * :py:const:`TAG_OPEN_NAME` - * :py:const:`TAG_OPEN_ATTR` - - * :py:const:`TAG_OPEN_ATTR_NAME` - * :py:const:`TAG_OPEN_ATTR_BODY` - * :py:const:`TAG_OPEN_ATTR_QUOTED` - * :py:const:`TAG_OPEN_ATTR_IGNORE` - + * :py:const:`TAG_ATTR` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -93,47 +85,42 @@ Global contexts: # Local contexts: -TEMPLATE = 0b000000000000000000000000111 -TEMPLATE_NAME = 0b000000000000000000000000001 -TEMPLATE_PARAM_KEY = 0b000000000000000000000000010 -TEMPLATE_PARAM_VALUE = 0b000000000000000000000000100 - -ARGUMENT = 0b000000000000000000000011000 -ARGUMENT_NAME = 0b000000000000000000000001000 -ARGUMENT_DEFAULT = 0b000000000000000000000010000 - -WIKILINK = 0b000000000000000000001100000 -WIKILINK_TITLE = 0b000000000000000000000100000 -WIKILINK_TEXT = 0b000000000000000000001000000 - -HEADING = 0b000000000000001111110000000 -HEADING_LEVEL_1 = 0b000000000000000000010000000 -HEADING_LEVEL_2 = 0b000000000000000000100000000 -HEADING_LEVEL_3 = 0b000000000000000001000000000 -HEADING_LEVEL_4 = 0b000000000000000010000000000 -HEADING_LEVEL_5 = 0b000000000000000100000000000 -HEADING_LEVEL_6 = 0b000000000000001000000000000 - -COMMENT = 0b000000000000010000000000000 - -TAG = 0b000000111111100000000000000 -TAG_OPEN = 0b000000001111100000000000000 -TAG_OPEN_NAME = 0b000000000000100000000000000 -TAG_OPEN_ATTR = 0b000000001111000000000000000 -TAG_OPEN_ATTR_NAME = 0b000000000001000000000000000 -TAG_OPEN_ATTR_BODY = 0b000000000010000000000000000 -TAG_OPEN_ATTR_QUOTED = 0b000000000100000000000000000 -TAG_OPEN_ATTR_IGNORE = 0b000000001000000000000000000 -TAG_BODY = 0b000000010000000000000000000 -TAG_CLOSE = 0b000000100000000000000000000 - -SAFETY_CHECK = 0b111111000000000000000000000 -HAS_TEXT = 0b000001000000000000000000000 -FAIL_ON_TEXT = 0b000010000000000000000000000 -FAIL_NEXT = 0b000100000000000000000000000 -FAIL_ON_LBRACE = 0b001000000000000000000000000 -FAIL_ON_RBRACE = 0b010000000000000000000000000 -FAIL_ON_EQUALS = 0b100000000000000000000000000 +TEMPLATE = 0b000000000000000000000111 +TEMPLATE_NAME = 0b000000000000000000000001 +TEMPLATE_PARAM_KEY = 0b000000000000000000000010 +TEMPLATE_PARAM_VALUE = 0b000000000000000000000100 + +ARGUMENT = 0b000000000000000000011000 +ARGUMENT_NAME = 0b000000000000000000001000 +ARGUMENT_DEFAULT = 0b000000000000000000010000 + +WIKILINK = 0b000000000000000001100000 +WIKILINK_TITLE = 0b000000000000000000100000 +WIKILINK_TEXT = 0b000000000000000001000000 + +HEADING = 0b000000000001111110000000 +HEADING_LEVEL_1 = 0b000000000000000010000000 +HEADING_LEVEL_2 = 0b000000000000000100000000 +HEADING_LEVEL_3 = 0b000000000000001000000000 +HEADING_LEVEL_4 = 0b000000000000010000000000 +HEADING_LEVEL_5 = 0b000000000000100000000000 +HEADING_LEVEL_6 = 0b000000000001000000000000 + +COMMENT = 0b000000000010000000000000 + +TAG = 0b000000111100000000000000 +TAG_OPEN = 0b000000000100000000000000 +TAG_ATTR = 0b000000001000000000000000 +TAG_BODY = 0b000000010000000000000000 +TAG_CLOSE = 0b000000100000000000000000 + +SAFETY_CHECK = 0b111111000000000000000000 +HAS_TEXT = 0b000001000000000000000000 
+FAIL_ON_TEXT = 0b000010000000000000000000 +FAIL_NEXT = 0b000100000000000000000000 +FAIL_ON_LBRACE = 0b001000000000000000000000 +FAIL_ON_RBRACE = 0b010000000000000000000000 +FAIL_ON_EQUALS = 0b100000000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 93e9a8d..a7b9e16 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -37,6 +37,26 @@ class BadRoute(Exception): pass +class _TagOpenData(object): + """Stores data about an HTML open tag, like ````.""" + CX_NAME = 1 << 0 + CX_ATTR_READY = 1 << 1 + CX_ATTR_NAME = 1 << 2 + CX_ATTR_VALUE = 1 << 3 + CX_NEED_SPACE = 1 << 4 + CX_NEED_EQUALS = 1 << 5 + CX_NEED_QUOTE = 1 << 6 + CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE + + def __init__(self): + self.context = self.CX_NAME + self.literal = True + self.padding_buffer = [] + self.quote_buffer = [] + self.reset = 0 + self.ignore_quote = False + + class Tokenizer(object): """Creates a list of tokens from a string of wikicode.""" USES_C = False @@ -47,6 +67,7 @@ class Tokenizer(object): MAX_DEPTH = 40 MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE) + tag_splitter = re.compile(r"([\s\"\\])") def __init__(self): self._text = None @@ -410,165 +431,145 @@ class Tokenizer(object): reset = self._head self._head += 1 try: - tokens = self._parse(contexts.TAG_OPEN_NAME) + tokens = self._really_parse_tag() except BadRoute: self._head = reset self._write_text("<") else: self._write_all(tokens) - def _actually_close_tag_opening(self): - """Handle cleanup at the end of a opening tag. - - The current context will be updated and the - :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the - opening tag's padding to be used in the - :py:class:`~.tokens.TagOpenClose` token. - """ - if self._context & contexts.TAG_OPEN_ATTR: - if self._context & contexts.TAG_OPEN_ATTR_NAME: - self._context ^= contexts.TAG_OPEN_ATTR_NAME - if self._context & contexts.TAG_OPEN_ATTR_BODY: - self._context ^= contexts.TAG_OPEN_ATTR_BODY - else: - self._write_first(tokens.TagOpenOpen(showtag=True)) - self._context ^= contexts.TAG_OPEN_NAME - self._context |= contexts.TAG_BODY - - self._push_textbuffer() - if isinstance(self._stack[-1], tokens.TagAttrStart): - return self._stack.pop().padding - return "" - - def _actually_handle_chunk(self, chunks, is_new): - """Actually handle a chunk of code within a tag's attributes. 
+ def _really_parse_tag(self): + """Actually parse an HTML tag, starting with the open (````).""" + data = _TagOpenData() + self._push(contexts.TAG_OPEN) + self._write(tokens.TagOpenOpen(showtag=True)) + while True: + this, next = self._read(), self._read(1) + if this not in self.MARKERS: + for chunk in self.tag_splitter.split(this): + if self._handle_tag_chunk(data, chunk): + continue + elif this is self.END: + if self._context & contexts.TAG_ATTR: + self._pop() + self._fail_route() + elif this == ">" and data.literal: + if data.context & data.CX_ATTR: + self._push_tag_buffer(data) + padding = data.padding_buffer[0] if data.padding_buffer else "" + self._write(tokens.TagCloseOpen(padding=padding)) + self._context = contexts.TAG_BODY + self._head += 1 + return self._parse(push=False) + elif this == "/" and next == ">" and data.literal: + if data.context & data.CX_ATTR: + self._push_tag_buffer(data) + padding = data.padding_buffer[0] if data.padding_buffer else "" + self._write(tokens.TagCloseSelfclose(padding=padding)) + self._head += 1 + return self._pop() + else: + for chunk in self.tag_splitter.split(this): + if self._handle_tag_chunk(data, chunk): + continue + self._head += 1 - Called by :py:meth:`_handle_tag_chunk` and - :py:meth:`_handle_tag_attribute_body`. - """ - if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: - padding = 0 - while chunks: - if chunks[0] == "": - padding += 1 - chunks.pop(0) - else: - break - self._write(tokens.TagAttrStart(padding=" " * padding)) - elif self._context & contexts.TAG_OPEN_ATTR_IGNORE: - self._context ^= contexts.TAG_OPEN_ATTR_IGNORE - chunks.pop(0) + def _handle_tag_chunk(self, data, chunk): + if not chunk: return - elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED: - self._write_text(" ") # Quoted chunks don't lose their spaces - - if chunks: - chunk = chunks.pop(0) - if self._context & contexts.TAG_OPEN_ATTR_BODY: - self._context ^= contexts.TAG_OPEN_ATTR_BODY - self._context |= contexts.TAG_OPEN_ATTR_NAME - if self._context & contexts.TAG_OPEN_ATTR_QUOTED: - if re.search(r'[^\\]"', chunk[:-1]): - self._fail_route() - if re.search(r'[^\\]"$', chunk): - self._write_text(chunk[:-1]) - self._context ^= contexts.TAG_OPEN_ATTR_QUOTED - self._context |= contexts.TAG_OPEN_ATTR_NAME - return True # Back to _handle_tag_attribute_body() + if data.context & data.CX_NAME: + if chunk != chunk.lstrip(): # Tags cannot start with whitespace + self._fail_route() self._write_text(chunk) - - def _handle_tag_chunk(self, text): - """Handle a chunk of code within a tag's attributes. - - This is called by :py:meth:`_parse`, which intercepts parsing of - wikicode when we're inside of an opening tag and no :py:attr:`MARKERS` - are present. - """ - if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: - self._write_text(text) - return - chunks = text.split(" ") - is_new = False - is_quoted = False - if self._context & contexts.TAG_OPEN_NAME: - self._write_text(chunks.pop(0)) - self._write_first(tokens.TagOpenOpen(showtag=True)) - self._context ^= contexts.TAG_OPEN_NAME - self._context |= contexts.TAG_OPEN_ATTR_NAME - self._actually_handle_chunk(chunks, True) - is_new = True - while chunks: - result = self._actually_handle_chunk(chunks, is_new) - is_quoted = result or is_quoted - is_new = True - if is_quoted: - return self._pop() - - def _handle_tag_attribute_body(self): - """Handle the body, or value, of a tag attribute. 
- - Attribute bodies can usually be handled at once, but sometimes a new - stack must be created to keep track of "rich" attribute values that - contain, for example, templates. - """ - self._context ^= contexts.TAG_OPEN_ATTR_NAME - self._context |= contexts.TAG_OPEN_ATTR_BODY - self._write(tokens.TagAttrEquals()) - next = self._read(1) - if next not in self.MARKERS and next.startswith('"'): - chunks = None - if " " in next: - chunks = next.split(" ") - next = chunks.pop(0) - if re.search(r'[^\\]"$', next[1:]): - if not re.search(r'[^\\]"', next[1:-1]): - self._write(tokens.TagAttrQuote()) - self._write_text(next[1:-1]) - self._head += 1 + data.context = data.CX_NEED_SPACE + elif data.context & data.CX_NEED_SPACE: + if chunk.isspace(): + if data.context & data.CX_ATTR_VALUE: + self._push_tag_buffer(data) + data.padding_buffer.append(chunk) + data.context = data.CX_ATTR_READY else: - if not re.search(r'[^\\]"', next[1:]): - self._head += 1 - reset = self._head - try: - attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | - contexts.TAG_OPEN_ATTR_IGNORE) - except BadRoute: - self._head = reset - self._write_text(next) - else: - self._write(tokens.TagAttrQuote()) - self._write_text(next[1:]) - self._write_all(attr) - return - self._context ^= contexts.TAG_OPEN_ATTR_BODY - self._context |= contexts.TAG_OPEN_ATTR_NAME - while chunks: - self._actually_handle_chunk(chunks, True) + if data.context & data.CX_ATTR_VALUE: + data.context ^= data.CX_NEED_SPACE + data.quote_buffer = [] + data.ignore_quote = True + self._head = data.reset + return True # Break out of chunk processing early + else: + self._fail_route() + elif data.context & data.CX_ATTR_READY: + if chunk.isspace(): + data.padding_buffer.append(chunk) + else: + data.context = data.CX_ATTR_NAME + self._push(contexts.TAG_ATTR) + self._write_text(chunk) ### hook on here for {, <, etc + elif data.context & data.CX_ATTR_NAME: + if chunk.isspace(): + data.padding_buffer.append(chunk) + data.context |= data.CX_NEED_EQUALS + elif chunk == "=": + if not data.context & data.CX_NEED_EQUALS: + data.padding_buffer.append("") # No padding before equals + data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE + self._write(tokens.TagAttrEquals()) + else: + if data.context & data.CX_NEED_EQUALS: + self._push_tag_buffer(data) + data.padding_buffer.append("") # No padding before tag + data.context = data.CX_ATTR_NAME + self._push(contexts.TAG_ATTR) + self._write_text(chunk) ### hook on here for {, <, etc + elif data.context & data.CX_ATTR_VALUE: + ### handle backslashes here + if data.context & data.CX_NEED_QUOTE: + if chunk == '"' and not data.ignore_quote: + data.context ^= data.CX_NEED_QUOTE + data.literal = False + data.reset = self._head + elif chunk.isspace(): + data.padding_buffer.append(chunk) + else: + data.context ^= data.CX_NEED_QUOTE + self._write_text(chunk) ### hook on here for {, <, etc + elif not data.literal: + if chunk == '"': + data.context |= data.CX_NEED_SPACE + data.literal = True + else: + data.quote_buffer.append(chunk) + elif chunk.isspace(): + self._push_tag_buffer(data) + data.padding_buffer.append(chunk) + data.context = data.CX_ATTR_READY + else: + self._write_text(chunk) ### hook on here for {, <, etc + + def _push_tag_buffer(self, data): + buf = data.padding_buffer + while len(buf) < 3: + buf.append("") + self._write_first(tokens.TagAttrStart( + pad_after_eq=buf.pop(), pad_before_eq=buf.pop(), + pad_first=buf.pop())) + if data.quote_buffer: + self._write(tokens.TagAttrQuote()) + self._write_text("".join(data.quote_buffer)) + 
self._write_all(self._pop()) + data.padding_buffer, data.quote_buffer = [], [] + data.ignore_quote = False def _get_tag_from_stack(self, stack=None): """Return the tag based on the text in *stack*.""" if not stack: sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen) - func = lambda tok: not isinstance(tok, sentinels) - stack = takewhile(func, self._stack) + pred = lambda tok: not isinstance(tok, sentinels) + stack = takewhile(pred, self._stack) text = [tok.text for tok in stack if isinstance(tok, tokens.Text)] - return "".join(text).rstrip().lower() - - def _handle_tag_close_open(self): - """Handle the ending of an open tag (````).""" - padding = self._actually_close_tag_opening() - if not self._get_tag_from_stack(): # Tags cannot be blank - self._fail_route() - self._write(tokens.TagCloseOpen(padding=padding)) - - def _handle_tag_selfclose(self): - """Handle the ending of an tag that closes itself (````).""" - padding = self._actually_close_tag_opening() - if not self._get_tag_from_stack(): # Tags cannot be blank + try: + return "".join(text).rstrip().lower().split()[0] + except IndexError: self._fail_route() - self._write(tokens.TagCloseSelfclose(padding=padding)) - self._head += 1 - return self._pop() def _handle_tag_open_close(self): """Handle the opening of a closing tag (````).""" @@ -579,10 +580,7 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" closing = self._pop() - close_tag = self._get_tag_from_stack(closing) - open_tag = self._get_tag_from_stack() - if not close_tag or close_tag != open_tag: - # Closing and opening tags are empty or unequal, so fail this tag: + if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): self._fail_route() self._write_all(closing) self._write(tokens.TagCloseClose()) @@ -645,37 +643,30 @@ class Tokenizer(object): self._context |= contexts.FAIL_ON_RBRACE return True - def _parse(self, context=0): + def _parse(self, context=0, push=True): """Parse the wikicode string, using *context* for when to stop.""" - self._push(context) + unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | + contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | + contexts.TAG_CLOSE) + fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | + contexts.HEADING | contexts.COMMENT | contexts.TAG) + double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) + + if push: + self._push(context) while True: this = self._read() - unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | - contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | - contexts.TAG_CLOSE) if self._context & unsafe: if not self._verify_safe(this): - double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) - if self._context & double: + if self._context & double_fail: self._pop() self._fail_route() if this not in self.MARKERS: - if self._context & contexts.TAG_OPEN: - should_exit = self._handle_tag_chunk(this) - if should_exit: - return should_exit - else: - self._write_text(this) + self._write_text(this) self._head += 1 continue if this is self.END: - fail = ( - contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | - contexts.HEADING | contexts.COMMENT | contexts.TAG) if self._context & fail: - double_fail = ( - contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | - contexts.TAG_OPEN_ATTR_QUOTED) if self._context & double_fail: self._pop() self._fail_route() @@ -720,8 +711,6 @@ class Tokenizer(object): elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", 
self.START): self._parse_heading() - elif self._context & contexts.TAG_OPEN_ATTR_NAME: - self._handle_tag_attribute_body() else: self._write_text("=") elif this == "=" and self._context & contexts.HEADING: @@ -735,22 +724,8 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) - elif this == "<" and next != "/" and ( - not self._context & (contexts.TAG ^ contexts.TAG_BODY)): + elif this == "<" and next != "/" and not self._context & contexts.TAG_CLOSE: self._parse_tag() - elif self._context & contexts.TAG_OPEN: - if self._context & contexts.TAG_OPEN_ATTR_QUOTED: - self._handle_tag_chunk(this) - elif this == "\n": - self._fail_route() - elif this == ">": - self._handle_tag_close_open() - elif this == "/" and next == ">": - return self._handle_tag_selfclose() - elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: - self._handle_tag_attribute_body() - else: - self._handle_tag_chunk(this) elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: From 962adcd62c48a426750fd637cfa27a2d74943474 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 22:27:44 -0400 Subject: [PATCH 28/77] Add docstrings for a couple new methods in the tokenizer. --- mwparserfromhell/parser/tokenizer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index a7b9e16..9817bd9 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -475,6 +475,11 @@ class Tokenizer(object): self._head += 1 def _handle_tag_chunk(self, data, chunk): + """Handle a *chunk* of text inside a HTML open tag. + + A "chunk" is either a marker, whitespace, or text containing no markers + or whitespace. *data* is a :py:class:`_TagOpenData` object. + """ if not chunk: return if data.context & data.CX_NAME: @@ -546,6 +551,10 @@ class Tokenizer(object): self._write_text(chunk) ### hook on here for {, <, etc def _push_tag_buffer(self, data): + """Write a pending tag attribute from *data* to the stack. + + *data* is a :py:class:`_TagOpenData` object. + """ buf = data.padding_buffer while len(buf) < 3: buf.append("") From 43e717cca927009c840ddabb3ebabad834d14adf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 22:41:19 -0400 Subject: [PATCH 29/77] Add a number of new tag tests. 
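The "chunk" vocabulary that patch 28 documents comes from tag_splitter, whose pattern wraps its character class in a capturing group. With a capturing group, re.split() returns the delimiters themselves (whitespace, quotes, backslashes) interleaved with the surrounding text instead of discarding them. A quick illustration; the pattern is the one from the diff, while the sample string is arbitrary:

    import re

    # The capturing parentheses make split() keep each matched delimiter
    # as its own list element.
    tag_splitter = re.compile(r"([\s\"\\])")
    print(tag_splitter.split('name="foo bar"'))
    # ['name=', '"', 'foo', ' ', 'bar', '"', '']

Adjacent delimiters (and delimiters at the ends of the string) also yield empty strings in the result, which is why _handle_tag_chunk begins by returning early on an empty chunk.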
--- tests/tokenizer/tags.mwtest | 70 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 1dfc1b1..7d5f338 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -96,6 +96,76 @@ output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" " --- +name: invalid_space_begin_open +label: invalid tag: a space at the beginning of the open tag +input: "< ref>test" +output: [Text(text="< ref>test")] + +--- + +name: invalid_space_begin_close +label: invalid tag: a space at the beginning of the close tag +input: "test" +output: [Text(text="test")] + +--- + +name: valid_space_end +label: valid tag: spaces at the ends of both the open and close tags +input: "test" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=" "), Text(text="test"), TagOpenClose(), Text(text="ref "), TagCloseClose()] + +--- + +name: invalid_template_ends +label: invalid tag: a template at the ends of both the open and close tags +input: "test" +output: [Text(text="test" +output: [Text(text="test" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding=""), Text(text="test"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: valid_template_end_open_space_end_close +label: valid tag: a template at the end of the open tag; whitespace at the end of the close tag +input: "test" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding=""), Text(text="test"), TagOpenClose(), Text(text="ref\n"), TagCloseClose()] + +--- + +name: invalid_template_end_open_nospace +label: invalid tag: a template at the end of the open tag, without spacing +input: "test" +output: [Text(text="test" +output: [Text(text="test")] + +--- + +name: invalid_template_start_open +label: invalid tag: a template at the beginning of the open tag +input: "<{{foo}}ref>test" +output: [Text(text="<"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text="ref>test")] + +--- + name: incomplete_lbracket label: incomplete tags: just a left bracket input: "<" From 82edc93bbbd1786015a8c61521fd4f698b19724a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 22:42:26 -0400 Subject: [PATCH 30/77] Pass some tests by simplifying the way tags are read from the stack. Two still fail because templates aren't implemented yet, but those are otherwise handled correctly. 
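The simplification works because, under the rewritten tokenizer, the opening tag's name always sits at index 1 of the stack, directly after the TagOpenOpen token, so a closing tag can be validated with a direct text comparison instead of the removed _get_tag_from_stack() walk. Both sides are normalized with rstrip().lower(), so only trailing whitespace and letter case are forgiven. Pulled out of the tokenizer for illustration:

    def names_match(open_name, close_name):
        """Compare tag names the way the new _handle_tag_close_close does."""
        normalize = lambda text: text.rstrip().lower()
        return normalize(open_name) == normalize(close_name)

    assert names_match("ref", "ref ")      # trailing space is tolerated
    assert names_match("REF", "ref")       # matching is case-insensitive
    assert not names_match("ref", "span")  # mismatched names fail the route

The rstrip-only normalization is what lets the valid_space_end test pass while the invalid_space_begin_* cases are still rejected elsewhere, at the point where the tag name is first read.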
--- mwparserfromhell/parser/tokenizer.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9817bd9..8c91e4f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -568,18 +568,6 @@ class Tokenizer(object): data.padding_buffer, data.quote_buffer = [], [] data.ignore_quote = False - def _get_tag_from_stack(self, stack=None): - """Return the tag based on the text in *stack*.""" - if not stack: - sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen) - pred = lambda tok: not isinstance(tok, sentinels) - stack = takewhile(pred, self._stack) - text = [tok.text for tok in stack if isinstance(tok, tokens.Text)] - try: - return "".join(text).rstrip().lower().split()[0] - except IndexError: - self._fail_route() - def _handle_tag_open_close(self): """Handle the opening of a closing tag (````).""" self._write(tokens.TagOpenClose()) @@ -588,8 +576,10 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" strip = lambda tok: tok.text.rstrip().lower() closing = self._pop() - if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): + if len(closing) != 1 or (not isinstance(closing[0], tokens.Text) or + strip(closing[0]) != strip(self._stack[1])): self._fail_route() self._write_all(closing) self._write(tokens.TagCloseClose()) @@ -625,7 +615,7 @@ class Tokenizer(object): self._context |= contexts.HAS_TEXT return True elif context & contexts.TAG_CLOSE: - return this != "<" and this != "\n" + return this != "<" else: if context & contexts.FAIL_ON_EQUALS: if this == "=": From f63480bcf3a21b8eb61c944f30b79d04a04efe40 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 23:48:58 -0400 Subject: [PATCH 31/77] Update the integration.rich_tags test to use the new tag tokens. Remove a now-unused import in the tokenizer. --- mwparserfromhell/parser/tokenizer.py | 1 - tests/tokenizer/integration.mwtest | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 8c91e4f..9207440 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,7 +21,6 @@ # SOFTWARE. 

from __future__ import unicode_literals -from itertools import takewhile from math import log import re diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index ba01c8c..736ecb1 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -36,7 +36,7 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t name: rich_tags label: a HTML tag with tons of other things in it input: "{{dubious claim}}[[Source]]" -output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(padding=" "), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(padding=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(padding=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(padding=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagOpenClose(), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- From dfe100ceb7eecec82d6a3af98d016dfd95d3f9ea Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 1 Jul 2013 20:44:56 -0400 Subject: [PATCH 32/77] Support templates and wikilinks inside tags (part 1) --- mwparserfromhell/parser/tokenizer.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9207440..21d0f2a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -221,6 +221,8 @@ class Tokenizer(object): self._head += 1 self._write_all(self._pop()) + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT def 
_parse_template(self): """Parse a template at the head of the wikicode string.""" @@ -293,6 +295,8 @@ class Tokenizer(object): self._head = reset self._write_text("[[") else: + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT self._write(tokens.WikilinkOpen()) self._write_all(wikilink) self._write(tokens.WikilinkClose()) @@ -507,7 +511,7 @@ class Tokenizer(object): else: data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) elif data.context & data.CX_ATTR_NAME: if chunk.isspace(): data.padding_buffer.append(chunk) @@ -523,7 +527,7 @@ class Tokenizer(object): data.padding_buffer.append("") # No padding before tag data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) elif data.context & data.CX_ATTR_VALUE: ### handle backslashes here if data.context & data.CX_NEED_QUOTE: @@ -535,7 +539,7 @@ class Tokenizer(object): data.padding_buffer.append(chunk) else: data.context ^= data.CX_NEED_QUOTE - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) elif not data.literal: if chunk == '"': data.context |= data.CX_NEED_SPACE @@ -547,7 +551,18 @@ class Tokenizer(object): data.padding_buffer.append(chunk) data.context = data.CX_ATTR_READY else: - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) + + def _parse_tag_chunk(self, chunk): + next = self._read(1) + if not self._can_recurse() or chunk not in self.MARKERS: + self._write_text(chunk) + elif chunk == next == "{": + self._parse_template_or_argument() + elif chunk == next == "[": + self._parse_wikilink() + else: + self._write_text(chunk) def _push_tag_buffer(self, data): """Write a pending tag attribute from *data* to the stack. 
@@ -678,8 +693,6 @@ class Tokenizer(object): elif this == next == "{": if self._can_recurse(): self._parse_template_or_argument() - if self._context & contexts.FAIL_NEXT: - self._context ^= contexts.FAIL_NEXT else: self._write_text("{") elif this == "|" and self._context & contexts.TEMPLATE: @@ -698,8 +711,6 @@ class Tokenizer(object): elif this == next == "[": if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): self._parse_wikilink() - if self._context & contexts.FAIL_NEXT: - self._context ^= contexts.FAIL_NEXT else: self._write_text("[") elif this == "|" and self._context & contexts.WIKILINK_TITLE: From e34026dabe359ffd16567c8c5002d76f4981fe57 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 1 Jul 2013 22:14:57 -0400 Subject: [PATCH 33/77] Support templates and wikilinks inside tags (part 2) --- mwparserfromhell/parser/tokenizer.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 21d0f2a..29c2772 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -51,7 +51,7 @@ class _TagOpenData(object): self.context = self.CX_NAME self.literal = True self.padding_buffer = [] - self.quote_buffer = [] + self.quoted = False self.reset = 0 self.ignore_quote = False @@ -454,6 +454,8 @@ class Tokenizer(object): continue elif this is self.END: if self._context & contexts.TAG_ATTR: + if data.quoted: + self._pop() self._pop() self._fail_route() elif this == ">" and data.literal: @@ -499,8 +501,9 @@ class Tokenizer(object): else: if data.context & data.CX_ATTR_VALUE: data.context ^= data.CX_NEED_SPACE - data.quote_buffer = [] + data.quoted = False data.ignore_quote = True + self._pop() self._head = data.reset return True # Break out of chunk processing early else: @@ -534,6 +537,8 @@ class Tokenizer(object): if chunk == '"' and not data.ignore_quote: data.context ^= data.CX_NEED_QUOTE data.literal = False + data.quoted = True + self._push(self._context) data.reset = self._head elif chunk.isspace(): data.padding_buffer.append(chunk) @@ -545,7 +550,7 @@ class Tokenizer(object): data.context |= data.CX_NEED_SPACE data.literal = True else: - data.quote_buffer.append(chunk) + self._parse_tag_chunk(chunk) elif chunk.isspace(): self._push_tag_buffer(data) data.padding_buffer.append(chunk) @@ -572,14 +577,15 @@ class Tokenizer(object): buf = data.padding_buffer while len(buf) < 3: buf.append("") + if data.quoted: + data.quoted = False + self._write_first(tokens.TagAttrQuote()) + self._write_all(self._pop()) self._write_first(tokens.TagAttrStart( pad_after_eq=buf.pop(), pad_before_eq=buf.pop(), pad_first=buf.pop())) - if data.quote_buffer: - self._write(tokens.TagAttrQuote()) - self._write_text("".join(data.quote_buffer)) self._write_all(self._pop()) - data.padding_buffer, data.quote_buffer = [], [] + data.padding_buffer = [] data.ignore_quote = False def _handle_tag_open_close(self): From 9693b6d5e61571dfd1e0ea3a65fb95a46dcad1c7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 2 Jul 2013 00:48:20 -0400 Subject: [PATCH 34/77] Replace data.literal and data.quoted with a data.CX_QUOTED context --- mwparserfromhell/parser/tokenizer.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 29c2772..129c19a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ 
-42,16 +42,15 @@ class _TagOpenData(object): CX_ATTR_READY = 1 << 1 CX_ATTR_NAME = 1 << 2 CX_ATTR_VALUE = 1 << 3 - CX_NEED_SPACE = 1 << 4 - CX_NEED_EQUALS = 1 << 5 - CX_NEED_QUOTE = 1 << 6 + CX_QUOTED = 1 << 4 + CX_NEED_SPACE = 1 << 5 + CX_NEED_EQUALS = 1 << 6 + CX_NEED_QUOTE = 1 << 7 CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE def __init__(self): self.context = self.CX_NAME - self.literal = True self.padding_buffer = [] - self.quoted = False self.reset = 0 self.ignore_quote = False @@ -448,17 +447,18 @@ class Tokenizer(object): self._write(tokens.TagOpenOpen(showtag=True)) while True: this, next = self._read(), self._read(1) + can_exit = not data.context & data.CX_QUOTED or data.context & data.CX_NEED_SPACE if this not in self.MARKERS: for chunk in self.tag_splitter.split(this): if self._handle_tag_chunk(data, chunk): continue elif this is self.END: if self._context & contexts.TAG_ATTR: - if data.quoted: + if data.context & data.CX_QUOTED: self._pop() self._pop() self._fail_route() - elif this == ">" and data.literal: + elif this == ">" and can_exit: if data.context & data.CX_ATTR: self._push_tag_buffer(data) padding = data.padding_buffer[0] if data.padding_buffer else "" @@ -466,7 +466,7 @@ class Tokenizer(object): self._context = contexts.TAG_BODY self._head += 1 return self._parse(push=False) - elif this == "/" and next == ">" and data.literal: + elif this == "/" and next == ">" and can_exit: if data.context & data.CX_ATTR: self._push_tag_buffer(data) padding = data.padding_buffer[0] if data.padding_buffer else "" @@ -499,9 +499,8 @@ class Tokenizer(object): data.padding_buffer.append(chunk) data.context = data.CX_ATTR_READY else: - if data.context & data.CX_ATTR_VALUE: - data.context ^= data.CX_NEED_SPACE - data.quoted = False + if data.context & data.CX_QUOTED: + data.context ^= data.CX_NEED_SPACE | data.CX_QUOTED data.ignore_quote = True self._pop() self._head = data.reset @@ -536,8 +535,7 @@ class Tokenizer(object): if data.context & data.CX_NEED_QUOTE: if chunk == '"' and not data.ignore_quote: data.context ^= data.CX_NEED_QUOTE - data.literal = False - data.quoted = True + data.context |= data.CX_QUOTED self._push(self._context) data.reset = self._head elif chunk.isspace(): @@ -545,10 +543,9 @@ class Tokenizer(object): else: data.context ^= data.CX_NEED_QUOTE self._parse_tag_chunk(chunk) - elif not data.literal: + elif data.context & data.CX_QUOTED: if chunk == '"': data.context |= data.CX_NEED_SPACE - data.literal = True else: self._parse_tag_chunk(chunk) elif chunk.isspace(): @@ -574,13 +571,12 @@ class Tokenizer(object): *data* is a :py:class:`_TagOpenData` object. """ + if data.context & data.CX_QUOTED: + self._write_first(tokens.TagAttrQuote()) + self._write_all(self._pop()) buf = data.padding_buffer while len(buf) < 3: buf.append("") - if data.quoted: - data.quoted = False - self._write_first(tokens.TagAttrQuote()) - self._write_all(self._pop()) self._write_first(tokens.TagAttrStart( pad_after_eq=buf.pop(), pad_before_eq=buf.pop(), pad_first=buf.pop())) From dd6bb1637d26fb26085143dd6c13be310d1b04bc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 2 Jul 2013 01:31:28 -0400 Subject: [PATCH 35/77] Support tag nesting properly; unit tests; recursion checks for tags. 
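The recursion checks mentioned in the subject reuse the tokenizer's existing _can_recurse() gate, the same one that already guards templates and wikilinks, so a pathological page cannot nest tags without bound. The method's body does not appear in these hunks; judging from the MAX_DEPTH and MAX_CYCLES class attributes shown earlier in the series, a guard of roughly this shape is assumed (an illustration, not the library's verbatim code):

    MAX_DEPTH = 40       # deepest allowed stack of nested parses
    MAX_CYCLES = 100000  # cap on total parser work (assumed meaning)

    def can_recurse(depth, cycles):
        """Return whether opening another nested parse is still safe."""
        return depth < MAX_DEPTH and cycles < MAX_CYCLES

    assert can_recurse(0, 0)
    assert not can_recurse(40, 0)      # too deeply nested
    assert not can_recurse(5, 100000)  # too much total work

When the guard says no, the "<" is written out as plain text instead, exactly as the new else branch in _parse() does below.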
--- mwparserfromhell/parser/tokenizer.py | 16 +++++++++++----- tests/tokenizer/tags.mwtest | 28 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 129c19a..2d1245f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -447,7 +447,8 @@ class Tokenizer(object): self._write(tokens.TagOpenOpen(showtag=True)) while True: this, next = self._read(), self._read(1) - can_exit = not data.context & data.CX_QUOTED or data.context & data.CX_NEED_SPACE + can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or + data.context & data.CX_NEED_SPACE) if this not in self.MARKERS: for chunk in self.tag_splitter.split(this): if self._handle_tag_chunk(data, chunk): @@ -488,8 +489,8 @@ class Tokenizer(object): if not chunk: return if data.context & data.CX_NAME: - if chunk != chunk.lstrip(): # Tags cannot start with whitespace - self._fail_route() + if chunk in self.MARKERS or chunk.isspace(): + self._fail_route() # Tags must start with text (not a space) self._write_text(chunk) data.context = data.CX_NEED_SPACE elif data.context & data.CX_NEED_SPACE: @@ -563,6 +564,8 @@ class Tokenizer(object): self._parse_template_or_argument() elif chunk == next == "[": self._parse_wikilink() + elif chunk == "<": + self._parse_tag() else: self._write_text(chunk) @@ -735,10 +738,13 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) - elif this == "<" and next != "/" and not self._context & contexts.TAG_CLOSE: - self._parse_tag() elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() + elif this == "<": + if not self._context & contexts.TAG_CLOSE and self._can_recurse(): + self._parse_tag() + else: + self._write_text("<") elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() else: diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 7d5f338..17010e9 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -96,6 +96,34 @@ output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" " --- +name: nested_tag +label: a tag nested within the attributes of another +input: "foo>citation" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: nested_tag_quoted +label: a tag nested within the attributes of another, quoted +input: "foo">citation" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + 
+name: nested_troll_tag +label: a bogus tag that appears to be nested within the attributes of another +input: ">citation" +output: [Text(text=">citation")] + +--- + +name: nested_troll_tag_quoted +label: a bogus tag that appears to be nested within the attributes of another, quoted +input: "citation" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text=""), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + name: invalid_space_begin_open label: invalid tag: a space at the beginning of the open tag input: "< ref>test" From 5e8794da5eff96fc649956283e5e115582ade86d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 2 Jul 2013 20:04:28 -0400 Subject: [PATCH 36/77] Refactor more of the tag tokenization process. --- mwparserfromhell/parser/tokenizer.py | 39 +++++++++++++++++------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2d1245f..084d94b 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -449,30 +449,18 @@ class Tokenizer(object): this, next = self._read(), self._read(1) can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or data.context & data.CX_NEED_SPACE) - if this not in self.MARKERS: - for chunk in self.tag_splitter.split(this): - if self._handle_tag_chunk(data, chunk): - continue - elif this is self.END: + if this is self.END: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: self._pop() self._pop() self._fail_route() elif this == ">" and can_exit: - if data.context & data.CX_ATTR: - self._push_tag_buffer(data) - padding = data.padding_buffer[0] if data.padding_buffer else "" - self._write(tokens.TagCloseOpen(padding=padding)) + self._handle_tag_close_open(data, tokens.TagCloseOpen) self._context = contexts.TAG_BODY - self._head += 1 return self._parse(push=False) elif this == "/" and next == ">" and can_exit: - if data.context & data.CX_ATTR: - self._push_tag_buffer(data) - padding = data.padding_buffer[0] if data.padding_buffer else "" - self._write(tokens.TagCloseSelfclose(padding=padding)) - self._head += 1 + self._handle_tag_close_open(data, tokens.TagCloseSelfclose) return self._pop() else: for chunk in self.tag_splitter.split(this): @@ -514,7 +502,7 @@ class Tokenizer(object): else: data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif data.context & data.CX_ATTR_NAME: if chunk.isspace(): data.padding_buffer.append(chunk) @@ -530,7 +518,7 @@ class Tokenizer(object): data.padding_buffer.append("") # No padding before tag data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif data.context & data.CX_ATTR_VALUE: ### handle backslashes here if data.context & data.CX_NEED_QUOTE: @@ -543,20 +531,21 @@ class Tokenizer(object): data.padding_buffer.append(chunk) else: data.context ^= data.CX_NEED_QUOTE - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif data.context & data.CX_QUOTED: if chunk == '"': data.context |= data.CX_NEED_SPACE else: - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif chunk.isspace(): self._push_tag_buffer(data) data.padding_buffer.append(chunk) data.context = data.CX_ATTR_READY else: - 
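One piece of the refactor below is worth calling out: the ">" and "/>" endings previously had separate handlers, and patch 36 merges them into a single _handle_tag_close_open(data, token) that receives the token class to emit. Python classes are first-class objects, so the shared handler simply instantiates whichever one it was handed. The pattern in miniature, with stand-in classes rather than the real mwparserfromhell tokens:

    class TagCloseOpen(object):
        def __init__(self, padding):
            self.padding = padding

    class TagCloseSelfclose(TagCloseOpen):
        pass

    def handle_close(token_class, padding):
        # Emit whichever close token the caller selected.
        return token_class(padding=padding)

    assert type(handle_close(TagCloseOpen, "")) is TagCloseOpen
    assert type(handle_close(TagCloseSelfclose, " ")) is TagCloseSelfclose

This keeps the padding bookkeeping in one place while each call site stays a single line.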
self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) - def _parse_tag_chunk(self, chunk): + def _parse_text_in_tag(self, chunk): + """Parse a chunk of text in a tag that has no special significance.""" next = self._read(1) if not self._can_recurse() or chunk not in self.MARKERS: self._write_text(chunk) @@ -587,6 +576,14 @@ class Tokenizer(object): data.padding_buffer = [] data.ignore_quote = False + def _handle_tag_close_open(self, data, token): + """Handle the closing of a open tag (````).""" + if data.context & data.CX_ATTR: + self._push_tag_buffer(data) + padding = data.padding_buffer[0] if data.padding_buffer else "" + self._write(token(padding=padding)) + self._head += 1 + def _handle_tag_open_close(self): """Handle the opening of a closing tag (````).""" self._write(tokens.TagOpenClose()) From e99c9d3038a64c71981fcd9783e2ab3a21f846c6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 3 Jul 2013 18:29:07 -0400 Subject: [PATCH 37/77] More tag refactoring; fix some bugs. --- mwparserfromhell/parser/tokenizer.py | 176 ++++++++++++++++------------------- 1 file changed, 80 insertions(+), 96 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 084d94b..5bb7059 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -46,13 +46,11 @@ class _TagOpenData(object): CX_NEED_SPACE = 1 << 5 CX_NEED_EQUALS = 1 << 6 CX_NEED_QUOTE = 1 << 7 - CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE def __init__(self): self.context = self.CX_NAME self.padding_buffer = [] self.reset = 0 - self.ignore_quote = False class Tokenizer(object): @@ -452,7 +450,11 @@ class Tokenizer(object): if this is self.END: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: + # Unclosed attribute quote: reset, don't die + data.context = data.CX_ATTR_VALUE self._pop() + self._head = data.reset + continue self._pop() self._fail_route() elif this == ">" and can_exit: @@ -463,122 +465,104 @@ class Tokenizer(object): self._handle_tag_close_open(data, tokens.TagCloseSelfclose) return self._pop() else: - for chunk in self.tag_splitter.split(this): - if self._handle_tag_chunk(data, chunk): - continue + self._handle_tag_data(data, this) self._head += 1 - def _handle_tag_chunk(self, data, chunk): - """Handle a *chunk* of text inside a HTML open tag. + def _push_tag_buffer(self, data): + """Write a pending tag attribute from *data* to the stack.""" + if data.context & data.CX_QUOTED: + self._write_first(tokens.TagAttrQuote()) + self._write_all(self._pop()) + buf = data.padding_buffer + while len(buf) < 3: + buf.append("") + self._write_first(tokens.TagAttrStart(pad_after_eq=buf.pop(), + pad_before_eq=buf.pop(), pad_first=buf.pop())) + self._write_all(self._pop()) + data.padding_buffer = [] - A "chunk" is either a marker, whitespace, or text containing no markers - or whitespace. *data* is a :py:class:`_TagOpenData` object. 
- """ - if not chunk: - return - if data.context & data.CX_NAME: - if chunk in self.MARKERS or chunk.isspace(): - self._fail_route() # Tags must start with text (not a space) - self._write_text(chunk) - data.context = data.CX_NEED_SPACE - elif data.context & data.CX_NEED_SPACE: - if chunk.isspace(): - if data.context & data.CX_ATTR_VALUE: - self._push_tag_buffer(data) - data.padding_buffer.append(chunk) - data.context = data.CX_ATTR_READY - else: + def _handle_tag_data(self, data, text): + """Handle all sorts of *text* data inside of an HTML open tag.""" + for chunk in self.tag_splitter.split(text): + if not chunk: + continue + if data.context & data.CX_NAME: + if chunk in self.MARKERS or chunk.isspace(): + self._fail_route() # Tags must start with text, not spaces + data.context = data.CX_NEED_SPACE + elif chunk.isspace(): + self._handle_tag_space(data, chunk) + continue + elif data.context & data.CX_NEED_SPACE: if data.context & data.CX_QUOTED: - data.context ^= data.CX_NEED_SPACE | data.CX_QUOTED - data.ignore_quote = True + data.context = data.CX_ATTR_VALUE self._pop() - self._head = data.reset - return True # Break out of chunk processing early - else: - self._fail_route() - elif data.context & data.CX_ATTR_READY: - if chunk.isspace(): - data.padding_buffer.append(chunk) - else: + self._head = data.reset - 1 # Will be auto-incremented + return # Break early + self._fail_route() + elif data.context & data.CX_ATTR_READY: data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_text_in_tag(chunk) - elif data.context & data.CX_ATTR_NAME: - if chunk.isspace(): - data.padding_buffer.append(chunk) - data.context |= data.CX_NEED_EQUALS - elif chunk == "=": - if not data.context & data.CX_NEED_EQUALS: - data.padding_buffer.append("") # No padding before equals - data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE - self._write(tokens.TagAttrEquals()) - else: + elif data.context & data.CX_ATTR_NAME: + if chunk == "=": + if not data.context & data.CX_NEED_EQUALS: + data.padding_buffer.append("") # No padding before '=' + data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE + self._write(tokens.TagAttrEquals()) + continue if data.context & data.CX_NEED_EQUALS: self._push_tag_buffer(data) data.padding_buffer.append("") # No padding before tag data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_text_in_tag(chunk) - elif data.context & data.CX_ATTR_VALUE: - ### handle backslashes here - if data.context & data.CX_NEED_QUOTE: - if chunk == '"' and not data.ignore_quote: + elif data.context & data.CX_ATTR_VALUE: + ### handle backslashes here + if data.context & data.CX_NEED_QUOTE: data.context ^= data.CX_NEED_QUOTE - data.context |= data.CX_QUOTED - self._push(self._context) - data.reset = self._head - elif chunk.isspace(): - data.padding_buffer.append(chunk) - else: - data.context ^= data.CX_NEED_QUOTE - self._parse_text_in_tag(chunk) - elif data.context & data.CX_QUOTED: - if chunk == '"': - data.context |= data.CX_NEED_SPACE - else: - self._parse_text_in_tag(chunk) - elif chunk.isspace(): - self._push_tag_buffer(data) - data.padding_buffer.append(chunk) - data.context = data.CX_ATTR_READY - else: - self._parse_text_in_tag(chunk) + if chunk == '"': + data.context |= data.CX_QUOTED + self._push(self._context) + data.reset = self._head + continue + elif data.context & data.CX_QUOTED: + if chunk == '"': + data.context |= data.CX_NEED_SPACE + continue + self._handle_tag_text(chunk) - def _parse_text_in_tag(self, chunk): - """Parse a chunk of text in a tag 
that has no special significance."""
+    def _handle_tag_space(self, data, text):
+        """Handle whitespace (*text*) inside of an HTML open tag."""
+        ctx = data.context
+        end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (data.CX_QUOTED | data.CX_NEED_QUOTE)
+        if end_of_value or (ctx & data.CX_QUOTED and ctx & data.CX_NEED_SPACE):
+            self._push_tag_buffer(data)
+            data.context = data.CX_ATTR_READY
+        elif ctx & data.CX_NEED_SPACE:
+            data.context = data.CX_ATTR_READY
+        elif ctx & data.CX_ATTR_NAME:
+            data.context |= data.CX_NEED_EQUALS
+        if ctx & data.CX_QUOTED and not ctx & data.CX_NEED_SPACE:
+            self._write_text(text)
+        else:
+            data.padding_buffer.append(text)
+
+    def _handle_tag_text(self, text):
+        """Handle regular *text* inside of an HTML open tag."""
         next = self._read(1)
-        if not self._can_recurse() or chunk not in self.MARKERS:
-            self._write_text(chunk)
-        elif chunk == next == "{":
+        if not self._can_recurse() or text not in self.MARKERS:
+            self._write_text(text)
+        elif text == next == "{":
             self._parse_template_or_argument()
-        elif chunk == next == "[":
+        elif text == next == "[":
             self._parse_wikilink()
-        elif chunk == "<":
+        elif text == "<":
             self._parse_tag()
         else:
-            self._write_text(chunk)
-
-    def _push_tag_buffer(self, data):
-        """Write a pending tag attribute from *data* to the stack.
-
-        *data* is a :py:class:`_TagOpenData` object.
-        """
-        if data.context & data.CX_QUOTED:
-            self._write_first(tokens.TagAttrQuote())
-            self._write_all(self._pop())
-        buf = data.padding_buffer
-        while len(buf) < 3:
-            buf.append("")
-        self._write_first(tokens.TagAttrStart(
-            pad_after_eq=buf.pop(), pad_before_eq=buf.pop(),
-            pad_first=buf.pop()))
-        self._write_all(self._pop())
-        data.padding_buffer = []
-        data.ignore_quote = False
+            self._write_text(text)
 
     def _handle_tag_close_open(self, data, token):
         """Handle the closing of an open tag (``>``)."""
-        if data.context & data.CX_ATTR:
+        if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
             self._push_tag_buffer(data)
         padding = data.padding_buffer[0] if data.padding_buffer else ""
         self._write(token(padding=padding))

From 17c71e335f35b3c10e572daeaf2cb2c6707ea000 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 3 Jul 2013 18:30:02 -0400
Subject: [PATCH 38/77] Add three tests for invalid attribute quote usage.
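
The three cases cover quote characters in attribute values that should
not be treated as quoting at all: a quote that is never closed
(unclosed_quote), a closing quote followed by extra text (fake_quote),
and the same situation with templates, wikilinks, and whitespace mixed
in (fake_quote_complex). In each case the tag must still parse, with
the quote characters surviving as literal text in the attribute value
instead of producing a TagAttrQuote token.

As a rough illustration of the intended behavior (a sketch, not part
of the test suite; assumes the high-level parse API, while the mwtest
cases in the diff below are what the suite actually runs):

    import mwparserfromhell

    # The quote after style= is never closed, so it should be demoted
    # to literal text and the tag should still parse as a Tag node.
    code = mwparserfromhell.parse('<span style="foo>stuff</span>')
    assert code.filter_tags()  # one <span> tag, quote kept literally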
---
 tests/tokenizer/tags.mwtest | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest
index 17010e9..89b2b2e 100644
--- a/tests/tokenizer/tags.mwtest
+++ b/tests/tokenizer/tags.mwtest
@@ -194,6 +194,27 @@ output: [Text(text="<"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text
 
 ---
 
+name: unclosed_quote
+label: a quoted attribute that is never closed
+input: "<span style=\"foo>stuff</span>"
+output: [TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]
+
+---
+
+name: fake_quote
+label: a fake quoted attribute
+input: "<span style=\"foo\"bar>stuff</span>"
+output: [TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo\"bar"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]
+
+---
+
+name: fake_quote_complex
+label: a fake quoted attribute, with spaces and templates and links
+input: "<span style=\"foo {{bar}}\n[[baz]]\"buzz>stuff</span>"
+output: [TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo"), TagAttrStart(pad_first=" ", pad_before_eq="\n", pad_after_eq=""), TemplateOpen(), Text(text="bar"), TemplateClose(), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), WikilinkOpen(), Text(text="baz"), WikilinkClose(), Text(text="\"buzz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]
+
+---
+
 name: incomplete_lbracket
 label: incomplete tags: just a left bracket
 input: "<"

From 591a0f5ed57f3ccad221a2870749031064003c5c Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 3 Jul 2013 18:46:41 -0400
Subject: [PATCH 39/77] Change 'write' to 'emit'; adjust some other names for PEP8.

---
 mwparserfromhell/parser/tokenizer.py | 149 +++++++++++++++++------------------
 1 file changed, 74 insertions(+), 75 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 5bb7059..515a7a2 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -24,8 +24,7 @@ from __future__ import unicode_literals
 from math import log
 import re
 
-from . import contexts
-from . import tokens
+from . 
import contexts, tokens from ..compat import htmlentities from ..tag_defs import is_parsable @@ -136,33 +135,33 @@ class Tokenizer(object): self._pop() raise BadRoute() - def _write(self, token): + def _emit(self, token): """Write a token to the end of the current token stack.""" self._push_textbuffer() self._stack.append(token) - def _write_first(self, token): + def _emit_first(self, token): """Write a token to the beginning of the current token stack.""" self._push_textbuffer() self._stack.insert(0, token) - def _write_text(self, text): + def _emit_text(self, text): """Write text to the current textbuffer.""" self._textbuffer.append(text) - def _write_all(self, tokenlist): + def _emit_all(self, tokenlist): """Write a series of tokens to the current stack at once.""" if tokenlist and isinstance(tokenlist[0], tokens.Text): - self._write_text(tokenlist.pop(0).text) + self._emit_text(tokenlist.pop(0).text) self._push_textbuffer() self._stack.extend(tokenlist) - def _write_text_then_stack(self, text): + def _emit_text_then_stack(self, text): """Pop the current stack, write *text*, and then write the stack.""" stack = self._pop() - self._write_text(text) + self._emit_text(text) if stack: - self._write_all(stack) + self._emit_all(stack) self._head -= 1 def _read(self, delta=0, wrap=False, strict=False): @@ -198,12 +197,12 @@ class Tokenizer(object): while braces: if braces == 1: - return self._write_text_then_stack("{") + return self._emit_text_then_stack("{") if braces == 2: try: self._parse_template() except BadRoute: - return self._write_text_then_stack("{{") + return self._emit_text_then_stack("{{") break try: self._parse_argument() @@ -213,11 +212,11 @@ class Tokenizer(object): self._parse_template() braces -= 2 except BadRoute: - return self._write_text_then_stack("{" * braces) + return self._emit_text_then_stack("{" * braces) if braces: self._head += 1 - self._write_all(self._pop()) + self._emit_all(self._pop()) if self._context & contexts.FAIL_NEXT: self._context ^= contexts.FAIL_NEXT @@ -229,9 +228,9 @@ class Tokenizer(object): except BadRoute: self._head = reset raise - self._write_first(tokens.TemplateOpen()) - self._write_all(template) - self._write(tokens.TemplateClose()) + self._emit_first(tokens.TemplateOpen()) + self._emit_all(template) + self._emit(tokens.TemplateClose()) def _parse_argument(self): """Parse an argument at the head of the wikicode string.""" @@ -241,9 +240,9 @@ class Tokenizer(object): except BadRoute: self._head = reset raise - self._write_first(tokens.ArgumentOpen()) - self._write_all(argument) - self._write(tokens.ArgumentClose()) + self._emit_first(tokens.ArgumentOpen()) + self._emit_all(argument) + self._emit(tokens.ArgumentClose()) def _handle_template_param(self): """Handle a template parameter at the head of the string.""" @@ -252,22 +251,22 @@ class Tokenizer(object): elif self._context & contexts.TEMPLATE_PARAM_VALUE: self._context ^= contexts.TEMPLATE_PARAM_VALUE elif self._context & contexts.TEMPLATE_PARAM_KEY: - self._write_all(self._pop(keep_context=True)) + self._emit_all(self._pop(keep_context=True)) self._context |= contexts.TEMPLATE_PARAM_KEY - self._write(tokens.TemplateParamSeparator()) + self._emit(tokens.TemplateParamSeparator()) self._push(self._context) def _handle_template_param_value(self): """Handle a template parameter's value at the head of the string.""" - self._write_all(self._pop(keep_context=True)) + self._emit_all(self._pop(keep_context=True)) self._context ^= contexts.TEMPLATE_PARAM_KEY self._context |= 
contexts.TEMPLATE_PARAM_VALUE - self._write(tokens.TemplateParamEquals()) + self._emit(tokens.TemplateParamEquals()) def _handle_template_end(self): """Handle the end of a template at the head of the string.""" if self._context & contexts.TEMPLATE_PARAM_KEY: - self._write_all(self._pop(keep_context=True)) + self._emit_all(self._pop(keep_context=True)) self._head += 1 return self._pop() @@ -275,7 +274,7 @@ class Tokenizer(object): """Handle the separator between an argument's name and default.""" self._context ^= contexts.ARGUMENT_NAME self._context |= contexts.ARGUMENT_DEFAULT - self._write(tokens.ArgumentSeparator()) + self._emit(tokens.ArgumentSeparator()) def _handle_argument_end(self): """Handle the end of an argument at the head of the string.""" @@ -290,19 +289,19 @@ class Tokenizer(object): wikilink = self._parse(contexts.WIKILINK_TITLE) except BadRoute: self._head = reset - self._write_text("[[") + self._emit_text("[[") else: if self._context & contexts.FAIL_NEXT: self._context ^= contexts.FAIL_NEXT - self._write(tokens.WikilinkOpen()) - self._write_all(wikilink) - self._write(tokens.WikilinkClose()) + self._emit(tokens.WikilinkOpen()) + self._emit_all(wikilink) + self._emit(tokens.WikilinkClose()) def _handle_wikilink_separator(self): """Handle the separator between a wikilink's title and its text.""" self._context ^= contexts.WIKILINK_TITLE self._context |= contexts.WIKILINK_TEXT - self._write(tokens.WikilinkSeparator()) + self._emit(tokens.WikilinkSeparator()) def _handle_wikilink_end(self): """Handle the end of a wikilink at the head of the string.""" @@ -324,13 +323,13 @@ class Tokenizer(object): title, level = self._parse(context) except BadRoute: self._head = reset + best - 1 - self._write_text("=" * best) + self._emit_text("=" * best) else: - self._write(tokens.HeadingStart(level=level)) + self._emit(tokens.HeadingStart(level=level)) if level < best: - self._write_text("=" * (best - level)) - self._write_all(title) - self._write(tokens.HeadingEnd()) + self._emit_text("=" * (best - level)) + self._emit_all(title) + self._emit(tokens.HeadingEnd()) finally: self._global ^= contexts.GL_HEADING @@ -349,28 +348,28 @@ class Tokenizer(object): after, after_level = self._parse(self._context) except BadRoute: if level < best: - self._write_text("=" * (best - level)) + self._emit_text("=" * (best - level)) self._head = reset + best - 1 return self._pop(), level else: # Found another closure - self._write_text("=" * best) - self._write_all(after) + self._emit_text("=" * best) + self._emit_all(after) return self._pop(), after_level def _really_parse_entity(self): """Actually parse an HTML entity and ensure that it is valid.""" - self._write(tokens.HTMLEntityStart()) + self._emit(tokens.HTMLEntityStart()) self._head += 1 this = self._read(strict=True) if this == "#": numeric = True - self._write(tokens.HTMLEntityNumeric()) + self._emit(tokens.HTMLEntityNumeric()) self._head += 1 this = self._read(strict=True) if this[0].lower() == "x": hexadecimal = True - self._write(tokens.HTMLEntityHex(char=this[0])) + self._emit(tokens.HTMLEntityHex(char=this[0])) this = this[1:] if not this: self._fail_route() @@ -396,8 +395,8 @@ class Tokenizer(object): if this not in htmlentities.entitydefs: self._fail_route() - self._write(tokens.Text(text=this)) - self._write(tokens.HTMLEntityEnd()) + self._emit(tokens.Text(text=this)) + self._emit(tokens.HTMLEntityEnd()) def _parse_entity(self): """Parse an HTML entity at the head of the wikicode string.""" @@ -407,9 +406,9 @@ class Tokenizer(object): 
self._really_parse_entity() except BadRoute: self._head = reset - self._write_text(self._read()) + self._emit_text(self._read()) else: - self._write_all(self._pop()) + self._emit_all(self._pop()) def _parse_comment(self): """Parse an HTML comment at the head of the wikicode string.""" @@ -419,11 +418,11 @@ class Tokenizer(object): comment = self._parse(contexts.COMMENT) except BadRoute: self._head = reset - self._write_text("