From d1a9ba9a34f544d241b7595655e74a68c5b3f60b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 1 Dec 2012 13:42:08 -0500 Subject: [PATCH 001/189] Starting tag work. - Translation dict, contexts, parse_* and handle_* hooks in tokenizer. --- mwparserfromhell/nodes/tag.py | 36 +++++++++++++++++ mwparserfromhell/parser/contexts.py | 65 +++++++++++++++++++----------- mwparserfromhell/parser/tokenizer.c | 1 - mwparserfromhell/parser/tokenizer.py | 77 +++++++++++++++++++++++++++++++++++- 4 files changed, 155 insertions(+), 24 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 5873a49..c32f398 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -73,6 +73,42 @@ class Tag(Node): TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE)) TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE + TRANSLATIONS = { + "i": TAG_ITALIC, + "em": TAG_ITALIC, + "b": TAG_BOLD, + "strong": TAG_BOLD, + "u": TAG_UNDERLINE, + "s": TAG_STRIKETHROUGH, + "ul": TAG_UNORDERED_LIST, + "ol": TAG_ORDERED_LIST, + "dt": TAG_DEF_TERM, + "dd": TAG_DEF_ITEM, + "blockquote": TAG_BLOCKQUOTE, + "hl": TAG_RULE, + "br": TAG_BREAK, + "abbr": TAG_ABBR, + "pre": TAG_PRE, + "tt": TAG_MONOSPACE, + "code": TAG_CODE, + "span": TAG_SPAN, + "div": TAG_DIV, + "font": TAG_FONT, + "small": TAG_SMALL, + "big": TAG_BIG, + "center": TAG_CENTER, + "ref": TAG_REF, + "gallery": TAG_GALLERY, + "math": TAG_MATH, + "nowiki": TAG_NOWIKI, + "noinclude": TAG_NOINCLUDE, + "includeonly": TAG_INCLUDEONLY, + "onlyinclude": TAG_ONLYINCLUDE, + "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, + "source": TAG_SYNTAXHIGHLIGHT, + "poem": TAG_POEM, + } + def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, self_closing=False, open_padding=0, close_padding=0): super(Tag, self).__init__() diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 9d41870..a67bd76 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -62,35 +62,56 @@ Local (stack-specific) contexts: * :py:const:`COMMENT` -Global contexts: +* :py:const:`TAG` -* :py:const:`GL_HEADING` -""" + * :py:const:`TAG_OPEN` + * :py:const:`TAG_ATTR` -# Local contexts: + * :py:const:`TAG_ATTR_NAME` + * :py:const:`TAG_ATTR_BODY` + * :py:const:`TAG_ATTR_BODY_QUOTED` -TEMPLATE = 0b00000000000111 -TEMPLATE_NAME = 0b00000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000100 + * :py:const:`TAG_BODY` + * :py:const:`TAG_CLOSE` -ARGUMENT = 0b00000000011000 -ARGUMENT_NAME = 0b00000000001000 -ARGUMENT_DEFAULT = 0b00000000010000 +Global contexts: -WIKILINK = 0b00000001100000 -WIKILINK_TITLE = 0b00000000100000 -WIKILINK_TEXT = 0b00000001000000 +* :py:const:`GL_HEADING` +""" -HEADING = 0b01111110000000 -HEADING_LEVEL_1 = 0b00000010000000 -HEADING_LEVEL_2 = 0b00000100000000 -HEADING_LEVEL_3 = 0b00001000000000 -HEADING_LEVEL_4 = 0b00010000000000 -HEADING_LEVEL_5 = 0b00100000000000 -HEADING_LEVEL_6 = 0b01000000000000 +# Local contexts: -COMMENT = 0b10000000000000 +TEMPLATE = 0b00000000000000000111 +TEMPLATE_NAME = 0b00000000000000000001 +TEMPLATE_PARAM_KEY = 0b00000000000000000010 +TEMPLATE_PARAM_VALUE = 0b00000000000000000100 + +ARGUMENT = 0b00000000000000011000 +ARGUMENT_NAME = 0b00000000000000001000 +ARGUMENT_DEFAULT = 0b00000000000000010000 + +WIKILINK = 0b00000000000001100000 +WIKILINK_TITLE = 0b00000000000000100000 +WIKILINK_TEXT = 0b00000000000001000000 + +HEADING = 0b00000001111110000000 +HEADING_LEVEL_1 = 
0b00000000000010000000 +HEADING_LEVEL_2 = 0b00000000000100000000 +HEADING_LEVEL_3 = 0b00000000001000000000 +HEADING_LEVEL_4 = 0b00000000010000000000 +HEADING_LEVEL_5 = 0b00000000100000000000 +HEADING_LEVEL_6 = 0b00000001000000000000 + +COMMENT = 0b00000010000000000000 + +TAG = 0b11111100000000000000 +TAG_OPEN = 0b00000100000000000000 +TAG_ATTR = 0b00111000000000000000 +TAG_ATTR_NAME = 0b00001000000000000000 +TAG_ATTR_BODY = 0b00010000000000000000 +TAG_ATTR_BODY_QUOTED = 0b00100000000000000000 +TAG_BODY = 0b01000000000000000000 +TAG_CLOSE = 0b10000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index cc1b4dd..71b6cc3 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -767,7 +767,6 @@ Tokenizer_parse_heading(Tokenizer* self) self->global ^= GL_HEADING; return 0; } - level = PyInt_FromSsize_t(heading->level); if (!level) { Py_DECREF(heading->title); diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 5b0e976..f640aa2 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -27,6 +27,7 @@ import string from . import contexts from . import tokens +from ..nodes.tag import Tag from ..compat import htmlentities __all__ = ["Tokenizer"] @@ -420,6 +421,57 @@ class Tokenizer(object): self._write(tokens.CommentEnd()) self._head += 2 + def _parse_tag(self): + """Parse an HTML tag at the head of the wikicode string.""" + self._head += 1 + reset = self._head + self._push() + try: + t_open, type_, self_close, o_pad = self._parse(contexts.TAG_OPEN) + if not self_close: + t_body = self._parse(contexts.TAG_BODY) + t_close, c_pad = self._parse(contexts.TAG_CLOSE) + except BadRoute: + self._head = reset + self._pop() + self._write_text("<") + else: + self._pop() + self._write(tokens.TagOpenOpen(type=type_, showtag=False)) + self._write_all(t_open) + if self_close: + self._write(tokens.TagCloseSelfclose(padding=o_pad)) + else: + self._write(tokens.TagCloseOpen(padding=o_pad)) + self._write_all(t_body) + self._write(tokens.TagOpenClose()) + self._write_all(t_close) + self._write(tokens.TagCloseClose(padding=c_pad)) + + def _handle_attribute(self): + if not self._context & contexts.TAG_ATTR: + ## check name is valid + + def _handle_attribute_name(self): + ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED + pass + + def _handle_quoted_attribute_close(self): + pass + + def _handle_tag_close_open(self): + pass ## .padding + + def _handle_tag_selfclose(self): + pass ## .padding + + def _handle_tag_close_open(self): + pass + + def _handle_tag_close_close(self): + ## check that the closing name is the same as the opening name + pass ## .padding + def _parse(self, context=0): """Parse the wikicode string, using *context* for when to stop.""" self._push(context) @@ -432,7 +484,7 @@ class Tokenizer(object): if this is self.END: fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | contexts.HEADING | - contexts.COMMENT) + contexts.COMMENT | contexts.TAG) if self._context & contexts.TEMPLATE_PARAM_KEY: self._pop() if self._context & fail: @@ -484,6 +536,29 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) + elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY): + self._parse_tag() + elif this == " " and (self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + self._handle_attribute() + elif this == "=" and 
self._context & contexts.TAG_ATTR_NAME: + self._handle_attribute_name() + elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: + self._handle_quoted_attribute_close() + elif this == "\n" and (self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + self._fail_route() + elif this == ">" and (self._context & contexts.TAG_ATTR_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + return self._handle_tag_close_open() + elif this == "/" and next == ">" and ( + self._context & contexts.TAG_ATTR_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): + return self._handle_tag_selfclose() + elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: + self._handle_tag_close_open() + elif this == ">" and self._context & contexts.TAG_CLOSE: + self._handle_tag_close_close() else: self._write_text(this) self._head += 1 From 05ec7a1a92fdf2549e8722aabd4a36a4825f3227 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 8 Dec 2012 22:04:03 -0500 Subject: [PATCH 002/189] Improve padding support for Tags; more code for tags in tokenizer. --- mwparserfromhell/nodes/extras/attribute.py | 27 +++++-- mwparserfromhell/nodes/tag.py | 18 ++--- mwparserfromhell/parser/tokenizer.py | 116 ++++++++++++++++++----------- 3 files changed, 100 insertions(+), 61 deletions(-) diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 648bca0..58a99a8 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -36,18 +36,20 @@ class Attribute(StringMixIn): whose value is ``"foo"``. """ - def __init__(self, name, value=None, quoted=True): + def __init__(self, name, value=None, quoted=True, padding=""): super(Attribute, self).__init__() self._name = name self._value = value self._quoted = quoted + self._padding = padding def __unicode__(self): + base = self.padding + str(self.name) if self.value: if self.quoted: - return str(self.name) + '="' + str(self.value) + '"' - return str(self.name) + "=" + str(self.value) - return str(self.name) + return base + '="' + str(self.value) + '"' + return base + "=" + str(self.value) + return base @property def name(self): @@ -64,14 +66,23 @@ class Attribute(StringMixIn): """Whether the attribute's value is quoted with double quotes.""" return self._quoted + @property + def padding(self): + """Spacing to insert right before the attribute.""" + return self._padding + @name.setter - def name(self, newval): - self._name = parse_anything(newval) + def name(self, value): + self._name = parse_anything(value) @value.setter def value(self, newval): self._value = parse_anything(newval) @quoted.setter - def quoted(self, newval): - self._quoted = bool(newval) + def quoted(self, value): + self._quoted = bool(value) + + @padding.setter + def padding(self, value): + self._padding = str(value) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index c32f398..681a17a 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -110,7 +110,7 @@ class Tag(Node): } def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, - self_closing=False, open_padding=0, close_padding=0): + self_closing=False, open_padding="", close_padding=""): super(Tag, self).__init__() self._type = type_ self._tag = tag @@ -136,10 +136,10 @@ class Tag(Node): if self.attrs: result += " " + " ".join([str(attr) for attr in self.attrs]) if self.self_closing: - result += " " * self.open_padding + "/>" + 
result += self.open_padding + "/>" else: - result += " " * self.open_padding + ">" + str(self.contents) - result += "" + result += self.open_padding + ">" + str(self.contents) + result += "" return result def __iternodes__(self, getter): @@ -232,17 +232,17 @@ class Tag(Node): @property def self_closing(self): - """Whether the tag is self-closing with no content.""" + """Whether the tag is self-closing with no content (like ``
``).""" return self._self_closing @property def open_padding(self): - """How much spacing to insert before the first closing >.""" + """Spacing to insert before the first closing >.""" return self._open_padding @property def close_padding(self): - """How much spacing to insert before the last closing >.""" + """Spacing to insert before the last closing > (excl. self-closing).""" return self._close_padding @type.setter @@ -270,8 +270,8 @@ class Tag(Node): @open_padding.setter def open_padding(self, value): - self._open_padding = int(value) + self._open_padding = str(value) @close_padding.setter def close_padding(self, value): - self._close_padding = int(value) + self._close_padding = str(value) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index f640aa2..80d7610 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -425,52 +425,77 @@ class Tokenizer(object): """Parse an HTML tag at the head of the wikicode string.""" self._head += 1 reset = self._head - self._push() try: - t_open, type_, self_close, o_pad = self._parse(contexts.TAG_OPEN) - if not self_close: - t_body = self._parse(contexts.TAG_BODY) - t_close, c_pad = self._parse(contexts.TAG_CLOSE) + tokens = self._parse(contexts.TAG_OPEN) except BadRoute: self._head = reset - self._pop() self._write_text("<") else: - self._pop() - self._write(tokens.TagOpenOpen(type=type_, showtag=False)) - self._write_all(t_open) - if self_close: - self._write(tokens.TagCloseSelfclose(padding=o_pad)) - else: - self._write(tokens.TagCloseOpen(padding=o_pad)) - self._write_all(t_body) - self._write(tokens.TagOpenClose()) - self._write_all(t_close) - self._write(tokens.TagCloseClose(padding=c_pad)) + self._write_all(tokens) - def _handle_attribute(self): - if not self._context & contexts.TAG_ATTR: - ## check name is valid + def _get_tag_type_from_stack(self): + self._push_textbuffer() + if not self._stack: + return None # Tag has an empty name? 
+ text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] + text = "".join([token.text for token in text]).strip().lower() + try: + return Tag.TRANSLATIONS[text] + except KeyError: + return Tag.TAG_UNKNOWN + + def _handle_tag_close_name(self): + tag = self._get_tag_type_from_stack() + if tag is None: + self._fail_route() + self._write(tokens.TagOpenOpen(type=tag, showtag=False)) - def _handle_attribute_name(self): - ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED - pass + # def _handle_attribute(self): + # if not self._context & contexts.TAG_ATTR: + # self._handle_tag_close_name() - def _handle_quoted_attribute_close(self): - pass + # def _handle_attribute_name(self): + # ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED + # pass + + # def _handle_quoted_attribute_close(self): + # pass def _handle_tag_close_open(self): - pass ## .padding + if not self._context & contexts.TAG_ATTR: + self._handle_tag_close_name() + + self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* + self._context |= contexts.TAG_BODY + + padding = "" # TODO + self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): - pass ## .padding + self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* + self._context |= contexts.TAG_BODY - def _handle_tag_close_open(self): - pass + padding = "" # TODO + self._write(tokens.TagCloseSelfclose(padding=padding)) + self._pop() + + def _handle_tag_open_close(self): + self._context ^= contexts.TAG_BODY + self._context |= contexts.TAG_CLOSE + self._write(tokens.TagOpenClose()) + self._push() + self._head += 1 def _handle_tag_close_close(self): - ## check that the closing name is the same as the opening name - pass ## .padding + tag = self._get_tag_type_from_stack() + closing = self._pop() + if tag != self._stack[0].type: + # Closing and opening tags are not the same, so fail this route: + self._fail_route() + self._write_all(closing) + padding = "" # TODO + self._write(tokens.TagCloseClose(padding=padding)) + return self._pop() def _parse(self, context=0): """Parse the wikicode string, using *context* for when to stop.""" @@ -485,7 +510,8 @@ class Tokenizer(object): fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | contexts.HEADING | contexts.COMMENT | contexts.TAG) - if self._context & contexts.TEMPLATE_PARAM_KEY: + double_fail = contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE + if self._context & double_fail: self._pop() if self._context & fail: self._fail_route() @@ -538,27 +564,29 @@ class Tokenizer(object): self._write_text(this) elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY): self._parse_tag() - elif this == " " and (self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - self._handle_attribute() - elif this == "=" and self._context & contexts.TAG_ATTR_NAME: - self._handle_attribute_name() - elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: - self._handle_quoted_attribute_close() + # elif this == " " and (self._context & contexts.TAG_OPEN and not + # self._context & contexts.TAG_ATTR_BODY_QUOTED): + # self._handle_attribute() + # elif this == "=" and self._context & contexts.TAG_ATTR_NAME: + # self._handle_attribute_name() + # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: + # self._handle_quoted_attribute_close() elif this == "\n" and (self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): + if self._context & contexts.TAG_CLOSE: + self._pop() 
self._fail_route() - elif this == ">" and (self._context & contexts.TAG_ATTR_OPEN and not + elif this == ">" and (self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): - return self._handle_tag_close_open() + self._handle_tag_close_open() elif this == "/" and next == ">" and ( - self._context & contexts.TAG_ATTR_OPEN and not + self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): return self._handle_tag_selfclose() elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: - self._handle_tag_close_open() + self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: - self._handle_tag_close_close() + return self._handle_tag_close_close() else: self._write_text(this) self._head += 1 From 7e46601b1d358a09dfa8641b03d6bb2a5eeb63c3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 00:20:21 -0500 Subject: [PATCH 003/189] Tags should fully work now in tokenizer and builder. Still need to do attributes. --- mwparserfromhell/nodes/tag.py | 5 +-- mwparserfromhell/parser/builder.py | 2 ++ mwparserfromhell/parser/tokenizer.py | 62 ++++++++++++++++++++---------------- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 681a17a..48effa1 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -70,8 +70,9 @@ class Tag(Node): TAG_POEM = 202 # Lists of tags: + TAGS_ALL = set(range(300)) TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE)) - TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE + TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE TRANSLATIONS = { "i": TAG_ITALIC, @@ -248,7 +249,7 @@ class Tag(Node): @type.setter def type(self, value): value = int(value) - if value not in self.TAGS_INVISIBLE | self.TAGS_VISIBLE: + if value not in self.TAGS_ALL: raise ValueError(value) self._type = value diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 61a8209..648842c 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -219,7 +219,9 @@ class Builder(object): self_closing=True, open_padding=token.padding) elif isinstance(token, tokens.TagOpenClose): contents = self._pop() + self._push() elif isinstance(token, tokens.TagCloseClose): + self._pop() return Tag(type_, tag, contents, attrs, showtag, False, open_pad, token.padding) else: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 80d7610..2e72951 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -423,8 +423,8 @@ class Tokenizer(object): def _parse_tag(self): """Parse an HTML tag at the head of the wikicode string.""" - self._head += 1 reset = self._head + self._head += 1 try: tokens = self._parse(contexts.TAG_OPEN) except BadRoute: @@ -444,11 +444,24 @@ class Tokenizer(object): except KeyError: return Tag.TAG_UNKNOWN - def _handle_tag_close_name(self): - tag = self._get_tag_type_from_stack() - if tag is None: - self._fail_route() - self._write(tokens.TagOpenOpen(type=tag, showtag=False)) + def _actually_close_tag_opening(self): + if self._context & contexts.TAG_ATTR: + if self._context & contexts.TAG_ATTR_BODY: + self._context ^= contexts.TAG_ATTR_BODY + if self._context & contexts.TAG_ATTR_BODY_QUOTED: + self._context ^= contexts.TAG_ATTR_BODY_QUOTED + else: + self._context ^= contexts.TAG_ATTR_NAME + else: + tag = self._get_tag_type_from_stack() + if tag is 
None: + self._fail_route() + self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + + self._context ^= contexts.TAG_OPEN + self._context |= contexts.TAG_BODY + padding = "" # TODO + return padding # def _handle_attribute(self): # if not self._context & contexts.TAG_ATTR: @@ -462,28 +475,18 @@ class Tokenizer(object): # pass def _handle_tag_close_open(self): - if not self._context & contexts.TAG_ATTR: - self._handle_tag_close_name() - - self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* - self._context |= contexts.TAG_BODY - - padding = "" # TODO + padding = self._actually_close_tag_opening() self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): - self._context ^= contexts.TAG_OPEN # also TAG_ATTR_* - self._context |= contexts.TAG_BODY - - padding = "" # TODO + padding = self._actually_close_tag_opening() self._write(tokens.TagCloseSelfclose(padding=padding)) - self._pop() + self._head += 1 + return self._pop() def _handle_tag_open_close(self): - self._context ^= contexts.TAG_BODY - self._context |= contexts.TAG_CLOSE self._write(tokens.TagOpenClose()) - self._push() + self._push(contexts.TAG_CLOSE) self._head += 1 def _handle_tag_close_close(self): @@ -562,7 +565,8 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) - elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY): + elif this == "<" and next != "/" and ( + not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() # elif this == " " and (self._context & contexts.TAG_OPEN and not # self._context & contexts.TAG_ATTR_BODY_QUOTED): @@ -571,17 +575,19 @@ class Tokenizer(object): # self._handle_attribute_name() # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: # self._handle_quoted_attribute_close() - elif this == "\n" and (self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): + elif this == "\n" and ( + self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): if self._context & contexts.TAG_CLOSE: self._pop() self._fail_route() - elif this == ">" and (self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): + elif this == ">" and ( + self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): self._handle_tag_close_open() elif this == "/" and next == ">" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): + self._context & contexts.TAG_OPEN and not + self._context & contexts.TAG_ATTR_BODY_QUOTED): return self._handle_tag_selfclose() elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() From f78bcf832a08b81d7a9a03f344d2bd82bf97b6c0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 00:29:37 -0500 Subject: [PATCH 004/189] Keep .type and .tag synchronized in Tags when using their setters. 
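A minimal sketch of the round trip this is meant to enable, using the Tag constructor as it stands at this point in the series (illustrative usage only, not part of the diff below; the literal values are hypothetical):

    # Setting .type re-derives .tag by scanning TRANSLATIONS for a name that
    # maps to the new type value.
    from mwparserfromhell.nodes import Tag
    from mwparserfromhell.utils import parse_anything

    node = Tag(Tag.TAG_ITALIC, parse_anything("i"), parse_anything("foo"))
    node.type = Tag.TAG_BOLD
    assert Tag.TRANSLATIONS[str(node.tag)] == Tag.TAG_BOLD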
--- mwparserfromhell/nodes/tag.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 48effa1..b1eb133 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -252,10 +252,17 @@ class Tag(Node): if value not in self.TAGS_ALL: raise ValueError(value) self._type = value + for key in self.TRANSLATIONS: + if self.TRANSLATIONS[key] == value: + self._tag = parse_anything(key) @tag.setter def tag(self, value): self._tag = parse_anything(value) + try: + self._type = self.TRANSLATIONS[text] + except KeyError: + self._type = self.TAG_UNKNOWN @contents.setter def contents(self, value): From 827c544721e223c2f9a5eaf90d5742b2d45de449 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 01:38:45 -0500 Subject: [PATCH 005/189] Should correctly handle closing tags with strange spacing. --- mwparserfromhell/nodes/tag.py | 29 ++++++++++++++++++----------- mwparserfromhell/parser/builder.py | 3 +-- mwparserfromhell/parser/tokenizer.py | 6 +++--- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index b1eb133..1f3bdf9 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -111,7 +111,7 @@ class Tag(Node): } def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, - self_closing=False, open_padding="", close_padding=""): + self_closing=False, open_padding="", closing_tag=None): super(Tag, self).__init__() self._type = type_ self._tag = tag @@ -123,7 +123,10 @@ class Tag(Node): self._showtag = showtag self._self_closing = self_closing self._open_padding = open_padding - self._close_padding = close_padding + if closing_tag: + self._closing_tag = closing_tag + else: + self._closing_tag = tag def __unicode__(self): if not self.showtag: @@ -140,7 +143,7 @@ class Tag(Node): result += self.open_padding + "/>" else: result += self.open_padding + ">" + str(self.contents) - result += "" + result += "" return result def __iternodes__(self, getter): @@ -242,9 +245,13 @@ class Tag(Node): return self._open_padding @property - def close_padding(self): - """Spacing to insert before the last closing > (excl. self-closing).""" - return self._close_padding + def closing_tag(self): + """The closing tag, as a :py:class:`~.Wikicode` object. + + This will usually equal :py:attr:`tag`, unless there is additional + spacing, comments, or the like. 
+ """ + return self._closing_tag @type.setter def type(self, value): @@ -254,11 +261,11 @@ class Tag(Node): self._type = value for key in self.TRANSLATIONS: if self.TRANSLATIONS[key] == value: - self._tag = parse_anything(key) + self._tag = self._closing_tag = parse_anything(key) @tag.setter def tag(self, value): - self._tag = parse_anything(value) + self._tag = self._closing_tag = parse_anything(value) try: self._type = self.TRANSLATIONS[text] except KeyError: @@ -280,6 +287,6 @@ class Tag(Node): def open_padding(self, value): self._open_padding = str(value) - @close_padding.setter - def close_padding(self, value): - self._close_padding = str(value) + @closing_tag.setter + def closing_tag(self, value): + self._closing_tag = parse_anything(value) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 648842c..90274fa 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -221,9 +221,8 @@ class Builder(object): contents = self._pop() self._push() elif isinstance(token, tokens.TagCloseClose): - self._pop() return Tag(type_, tag, contents, attrs, showtag, False, - open_pad, token.padding) + open_pad, self._pop()) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2e72951..9e9465d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -496,8 +496,7 @@ class Tokenizer(object): # Closing and opening tags are not the same, so fail this route: self._fail_route() self._write_all(closing) - padding = "" # TODO - self._write(tokens.TagCloseClose(padding=padding)) + self._write(tokens.TagCloseClose()) return self._pop() def _parse(self, context=0): @@ -589,7 +588,8 @@ class Tokenizer(object): self._context & contexts.TAG_OPEN and not self._context & contexts.TAG_ATTR_BODY_QUOTED): return self._handle_tag_selfclose() - elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: + elif this == "<" and next == "/" and ( + self._context & contexts.TAG_BODY): self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() From a21c69fa1e0fc6111b98a5028e8c214f21139dd0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 01:47:41 -0500 Subject: [PATCH 006/189] Split off tag definitions into a new file. --- mwparserfromhell/nodes/tag.py | 104 ++----------------------------------- mwparserfromhell/tag_defs.py | 118 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+), 100 deletions(-) create mode 100644 mwparserfromhell/tag_defs.py diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 1f3bdf9..ea98bb6 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -24,92 +24,14 @@ from __future__ import unicode_literals from . 
import Node, Text from ..compat import str +from ..tag_defs import TagDefinitions from ..utils import parse_anything __all__ = ["Tag"] -class Tag(Node): +class Tag(TagDefinitions, Node): """Represents an HTML-style tag in wikicode, like ````.""" - TAG_UNKNOWN = 0 - - # Basic HTML: - TAG_ITALIC = 1 - TAG_BOLD = 2 - TAG_UNDERLINE = 3 - TAG_STRIKETHROUGH = 4 - TAG_UNORDERED_LIST = 5 - TAG_ORDERED_LIST = 6 - TAG_DEF_TERM = 7 - TAG_DEF_ITEM = 8 - TAG_BLOCKQUOTE = 9 - TAG_RULE = 10 - TAG_BREAK = 11 - TAG_ABBR = 12 - TAG_PRE = 13 - TAG_MONOSPACE = 14 - TAG_CODE = 15 - TAG_SPAN = 16 - TAG_DIV = 17 - TAG_FONT = 18 - TAG_SMALL = 19 - TAG_BIG = 20 - TAG_CENTER = 21 - - # MediaWiki parser hooks: - TAG_REF = 101 - TAG_GALLERY = 102 - TAG_MATH = 103 - TAG_NOWIKI = 104 - TAG_NOINCLUDE = 105 - TAG_INCLUDEONLY = 106 - TAG_ONLYINCLUDE = 107 - - # Additional parser hooks: - TAG_SYNTAXHIGHLIGHT = 201 - TAG_POEM = 202 - - # Lists of tags: - TAGS_ALL = set(range(300)) - TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE)) - TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE - - TRANSLATIONS = { - "i": TAG_ITALIC, - "em": TAG_ITALIC, - "b": TAG_BOLD, - "strong": TAG_BOLD, - "u": TAG_UNDERLINE, - "s": TAG_STRIKETHROUGH, - "ul": TAG_UNORDERED_LIST, - "ol": TAG_ORDERED_LIST, - "dt": TAG_DEF_TERM, - "dd": TAG_DEF_ITEM, - "blockquote": TAG_BLOCKQUOTE, - "hl": TAG_RULE, - "br": TAG_BREAK, - "abbr": TAG_ABBR, - "pre": TAG_PRE, - "tt": TAG_MONOSPACE, - "code": TAG_CODE, - "span": TAG_SPAN, - "div": TAG_DIV, - "font": TAG_FONT, - "small": TAG_SMALL, - "big": TAG_BIG, - "center": TAG_CENTER, - "ref": TAG_REF, - "gallery": TAG_GALLERY, - "math": TAG_MATH, - "nowiki": TAG_NOWIKI, - "noinclude": TAG_NOINCLUDE, - "includeonly": TAG_INCLUDEONLY, - "onlyinclude": TAG_ONLYINCLUDE, - "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, - "source": TAG_SYNTAXHIGHLIGHT, - "poem": TAG_POEM, - } - def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, self_closing=False, open_padding="", closing_tag=None): super(Tag, self).__init__() @@ -130,7 +52,7 @@ class Tag(Node): def __unicode__(self): if not self.showtag: - open_, close = self._translate() + open_, close = self.WIKICODE[self.type] if self.self_closing: return open_ else: @@ -188,24 +110,6 @@ class Tag(Node): get(self.tag) write(">") - def _translate(self): - """If the HTML-style tag has a wikicode representation, return that. - - For example, ``Foo`` can be represented as ``'''Foo'''``. This - returns a tuple of the character starting the sequence and the - character ending it. 
- """ - translations = { - self.TAG_ITALIC: ("''", "''"), - self.TAG_BOLD: ("'''", "'''"), - self.TAG_UNORDERED_LIST: ("*", ""), - self.TAG_ORDERED_LIST: ("#", ""), - self.TAG_DEF_TERM: (";", ""), - self.TAG_DEF_ITEM: (":", ""), - self.TAG_RULE: ("----", ""), - } - return translations[self.type] - @property def type(self): """The tag type.""" @@ -241,7 +145,7 @@ class Tag(Node): @property def open_padding(self): - """Spacing to insert before the first closing >.""" + """Spacing to insert before the first closing ``>``.""" return self._open_padding @property diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py new file mode 100644 index 0000000..74d3a81 --- /dev/null +++ b/mwparserfromhell/tag_defs.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +class TagDefinitions(object): + """Contains numerical definitions for valid HTML (and wikicode) tags. + + Base class for :py:class:`~.Tag` objects. 
+ """ + + TAG_UNKNOWN = 0 + + # Basic HTML: + TAG_ITALIC = 1 + TAG_BOLD = 2 + TAG_UNDERLINE = 3 + TAG_STRIKETHROUGH = 4 + TAG_UNORDERED_LIST = 5 + TAG_ORDERED_LIST = 6 + TAG_DEF_TERM = 7 + TAG_DEF_ITEM = 8 + TAG_BLOCKQUOTE = 9 + TAG_RULE = 10 + TAG_BREAK = 11 + TAG_ABBR = 12 + TAG_PRE = 13 + TAG_MONOSPACE = 14 + TAG_CODE = 15 + TAG_SPAN = 16 + TAG_DIV = 17 + TAG_FONT = 18 + TAG_SMALL = 19 + TAG_BIG = 20 + TAG_CENTER = 21 + + # MediaWiki parser hooks: + TAG_REF = 101 + TAG_GALLERY = 102 + TAG_MATH = 103 + TAG_NOWIKI = 104 + TAG_NOINCLUDE = 105 + TAG_INCLUDEONLY = 106 + TAG_ONLYINCLUDE = 107 + + # Additional parser hooks: + TAG_SYNTAXHIGHLIGHT = 201 + TAG_POEM = 202 + + # Lists of tags: + TAGS_ALL = set(range(300)) + TAGS_INVISIBLE = {TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE} + TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE + + TRANSLATIONS = { + "i": TAG_ITALIC, + "em": TAG_ITALIC, + "b": TAG_BOLD, + "strong": TAG_BOLD, + "u": TAG_UNDERLINE, + "s": TAG_STRIKETHROUGH, + "ul": TAG_UNORDERED_LIST, + "ol": TAG_ORDERED_LIST, + "dt": TAG_DEF_TERM, + "dd": TAG_DEF_ITEM, + "blockquote": TAG_BLOCKQUOTE, + "hl": TAG_RULE, + "br": TAG_BREAK, + "abbr": TAG_ABBR, + "pre": TAG_PRE, + "tt": TAG_MONOSPACE, + "code": TAG_CODE, + "span": TAG_SPAN, + "div": TAG_DIV, + "font": TAG_FONT, + "small": TAG_SMALL, + "big": TAG_BIG, + "center": TAG_CENTER, + "ref": TAG_REF, + "gallery": TAG_GALLERY, + "math": TAG_MATH, + "nowiki": TAG_NOWIKI, + "noinclude": TAG_NOINCLUDE, + "includeonly": TAG_INCLUDEONLY, + "onlyinclude": TAG_ONLYINCLUDE, + "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, + "source": TAG_SYNTAXHIGHLIGHT, + "poem": TAG_POEM, + } + + WIKICODE = { + TAG_ITALIC: ("''", "''"), + TAG_BOLD: ("'''", "'''"), + TAG_UNORDERED_LIST: ("*", ""), + TAG_ORDERED_LIST: ("#", ""), + TAG_DEF_TERM: (";", ""), + TAG_DEF_ITEM: (":", ""), + TAG_RULE: ("----", ""), + } From 252cc13a998d60d8a8daf89dc3aa53e5f9bdde27 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 9 Dec 2012 02:01:23 -0500 Subject: [PATCH 007/189] Move repeated context checks into one block in Tokenizer._parse(). 
--- mwparserfromhell/parser/tokenizer.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9e9465d..99f5a7b 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -574,20 +574,18 @@ class Tokenizer(object): # self._handle_attribute_name() # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: # self._handle_quoted_attribute_close() - elif this == "\n" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - if self._context & contexts.TAG_CLOSE: - self._pop() - self._fail_route() - elif this == ">" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - self._handle_tag_close_open() - elif this == "/" and next == ">" and ( - self._context & contexts.TAG_OPEN and not - self._context & contexts.TAG_ATTR_BODY_QUOTED): - return self._handle_tag_selfclose() + elif self._context & contexts.TAG_OPEN and ( + not self._context & contexts.TAG_ATTR_BODY_QUOTED): + if this == "\n": + if self._context & contexts.TAG_CLOSE: + self._pop() + self._fail_route() + elif this == ">": + self._handle_tag_close_open() + elif this == "/": + return self._handle_tag_selfclose() + else: + self._write_text(this) elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): self._handle_tag_open_close() From d9f23b8faaedb94d667372fb2a892307cf15a38a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 22 Dec 2012 21:58:21 -0500 Subject: [PATCH 008/189] Really basic, messy, and fragile tag attribute support. --- mwparserfromhell/parser/contexts.py | 73 +++++++++++++++++++----------------- mwparserfromhell/parser/tokenizer.py | 65 ++++++++++++++++++-------------- 2 files changed, 75 insertions(+), 63 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index a67bd76..053c930 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -65,11 +65,13 @@ Local (stack-specific) contexts: * :py:const:`TAG` * :py:const:`TAG_OPEN` - * :py:const:`TAG_ATTR` - * :py:const:`TAG_ATTR_NAME` - * :py:const:`TAG_ATTR_BODY` - * :py:const:`TAG_ATTR_BODY_QUOTED` + * :py:const:`TAG_OPEN_NAME` + * :py:const:`TAG_OPEN_ATTR` + + * :py:const:`TAG_OPEN_ATTR_NAME` + * :py:const:`TAG_OPEN_ATTR_BODY` + * :py:const:`TAG_OPEN_ATTR_BODY_QUOTED` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -81,37 +83,38 @@ Global contexts: # Local contexts: -TEMPLATE = 0b00000000000000000111 -TEMPLATE_NAME = 0b00000000000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000000000100 - -ARGUMENT = 0b00000000000000011000 -ARGUMENT_NAME = 0b00000000000000001000 -ARGUMENT_DEFAULT = 0b00000000000000010000 - -WIKILINK = 0b00000000000001100000 -WIKILINK_TITLE = 0b00000000000000100000 -WIKILINK_TEXT = 0b00000000000001000000 - -HEADING = 0b00000001111110000000 -HEADING_LEVEL_1 = 0b00000000000010000000 -HEADING_LEVEL_2 = 0b00000000000100000000 -HEADING_LEVEL_3 = 0b00000000001000000000 -HEADING_LEVEL_4 = 0b00000000010000000000 -HEADING_LEVEL_5 = 0b00000000100000000000 -HEADING_LEVEL_6 = 0b00000001000000000000 - -COMMENT = 0b00000010000000000000 - -TAG = 0b11111100000000000000 -TAG_OPEN = 0b00000100000000000000 -TAG_ATTR = 0b00111000000000000000 -TAG_ATTR_NAME = 0b00001000000000000000 -TAG_ATTR_BODY = 0b00010000000000000000 -TAG_ATTR_BODY_QUOTED = 0b00100000000000000000 
-TAG_BODY = 0b01000000000000000000 -TAG_CLOSE = 0b10000000000000000000 +TEMPLATE = 0b00000000000000000111 +TEMPLATE_NAME = 0b00000000000000000001 +TEMPLATE_PARAM_KEY = 0b00000000000000000010 +TEMPLATE_PARAM_VALUE = 0b00000000000000000100 + +ARGUMENT = 0b00000000000000011000 +ARGUMENT_NAME = 0b00000000000000001000 +ARGUMENT_DEFAULT = 0b00000000000000010000 + +WIKILINK = 0b00000000000001100000 +WIKILINK_TITLE = 0b00000000000000100000 +WIKILINK_TEXT = 0b00000000000001000000 + +HEADING = 0b00000001111110000000 +HEADING_LEVEL_1 = 0b00000000000010000000 +HEADING_LEVEL_2 = 0b00000000000100000000 +HEADING_LEVEL_3 = 0b00000000001000000000 +HEADING_LEVEL_4 = 0b00000000010000000000 +HEADING_LEVEL_5 = 0b00000000100000000000 +HEADING_LEVEL_6 = 0b00000001000000000000 + +COMMENT = 0b00000010000000000000 + +TAG = 0b11111100000000000000 +TAG_OPEN = 0b00111100000000000000 +TAG_OPEN_NAME = 0b00000100000000000000 +TAG_OPEN_ATTR = 0b00111000000000000000 +TAG_OPEN_ATTR_NAME = 0b00001000000000000000 +TAG_OPEN_ATTR_BODY = 0b00010000000000000000 +TAG_OPEN_ATTR_BODY_QUOTED = 0b00100000000000000000 +TAG_BODY = 0b01000000000000000000 +TAG_CLOSE = 0b10000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 99f5a7b..f65cbc1 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -426,7 +426,7 @@ class Tokenizer(object): reset = self._head self._head += 1 try: - tokens = self._parse(contexts.TAG_OPEN) + tokens = self._parse(contexts.TAG_OPEN_NAME) except BadRoute: self._head = reset self._write_text("<") @@ -438,34 +438,48 @@ class Tokenizer(object): if not self._stack: return None # Tag has an empty name? text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] - text = "".join([token.text for token in text]).strip().lower() + text = "".join([token.text for token in text]).rstrip().lower() try: return Tag.TRANSLATIONS[text] except KeyError: return Tag.TAG_UNKNOWN def _actually_close_tag_opening(self): - if self._context & contexts.TAG_ATTR: - if self._context & contexts.TAG_ATTR_BODY: - self._context ^= contexts.TAG_ATTR_BODY - if self._context & contexts.TAG_ATTR_BODY_QUOTED: - self._context ^= contexts.TAG_ATTR_BODY_QUOTED - else: - self._context ^= contexts.TAG_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR: + if self._context & contexts.TAG_OPEN_ATTR_NAME: + self._context ^= contexts.TAG_OPEN_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR_BODY: + self._context ^= contexts.TAG_OPEN_ATTR_BODY + if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED else: tag = self._get_tag_type_from_stack() - if tag is None: + if not tag: self._fail_route() self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) - - self._context ^= contexts.TAG_OPEN + self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY padding = "" # TODO return padding - # def _handle_attribute(self): - # if not self._context & contexts.TAG_ATTR: - # self._handle_tag_close_name() + def _handle_tag_chunk(self, text): + if " " not in text: + self._write_text(text) + return + chunks = text.split(" ") + if self._context & contexts.TAG_OPEN_NAME: + self._write_text(chunks.pop(0)) + tag = self._get_tag_type_from_stack() + if not tag: + self._fail_route() + self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._context ^= contexts.TAG_OPEN_NAME + self._context |= contexts.TAG_OPEN_ATTR_NAME + self._write(tokens.TagAttrStart()) + for i, 
chunk in enumerate(chunks): + if i > 0: + self._write(tokens.TagAttrStart()) + self._write_text(chunk) # def _handle_attribute_name(self): # ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED @@ -505,7 +519,10 @@ class Tokenizer(object): while True: this = self._read() if this not in self.MARKERS: - self._write_text(this) + if self._context & contexts.TAG_OPEN: + self._handle_tag_chunk(this) + else: + self._write_text(this) self._head += 1 continue if this is self.END: @@ -567,25 +584,17 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - # elif this == " " and (self._context & contexts.TAG_OPEN and not - # self._context & contexts.TAG_ATTR_BODY_QUOTED): - # self._handle_attribute() - # elif this == "=" and self._context & contexts.TAG_ATTR_NAME: - # self._handle_attribute_name() - # elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED: - # self._handle_quoted_attribute_close() - elif self._context & contexts.TAG_OPEN and ( - not self._context & contexts.TAG_ATTR_BODY_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_BODY_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() self._fail_route() elif this == ">": self._handle_tag_close_open() - elif this == "/": + elif this == "/" and next == ">": return self._handle_tag_selfclose() - else: - self._write_text(this) + # elif this == "=": + # self._handle_tag_attr_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): self._handle_tag_open_close() From d459899649362773ca0db16da37bebfc1f3ce180 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 18:38:31 -0500 Subject: [PATCH 009/189] More attribute stuff. 
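The builder now threads each attribute's leading whitespace through as padding; a rough sketch of how an Attribute renders once padding is set (hypothetical values, not taken from the diff):

    from mwparserfromhell.nodes.extras import Attribute
    from mwparserfromhell.utils import parse_anything

    attr = Attribute(parse_anything("class"), parse_anything("foo"),
                     quoted=True, padding=" ")
    assert str(attr) == ' class="foo"'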
--- mwparserfromhell/parser/builder.py | 10 +++--- mwparserfromhell/parser/tokenizer.py | 65 +++++++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 90274fa..cb5499f 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -180,9 +180,9 @@ class Builder(object): else: self._write(self._handle_token(token)) - def _handle_attribute(self): + def _handle_attribute(self, token): """Handle a case where a tag attribute is at the head of the tokens.""" - name, quoted = None, False + name, quoted, padding = None, False, token.padding self._push() while self._tokens: token = self._tokens.pop() @@ -195,8 +195,8 @@ class Builder(object): tokens.TagCloseOpen)): self._tokens.append(token) if name is not None: - return Attribute(name, self._pop(), quoted) - return Attribute(self._pop(), quoted=quoted) + return Attribute(name, self._pop(), quoted, padding) + return Attribute(self._pop(), quoted=quoted, padding=padding) else: self._write(self._handle_token(token)) @@ -208,7 +208,7 @@ class Builder(object): while self._tokens: token = self._tokens.pop() if isinstance(token, tokens.TagAttrStart): - attrs.append(self._handle_attribute()) + attrs.append(self._handle_attribute(token)) elif isinstance(token, tokens.TagCloseOpen): open_pad = token.padding tag = self._pop() diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index f65cbc1..d3cb40f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -450,8 +450,6 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_ATTR_NAME if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY - if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: - self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED else: tag = self._get_tag_type_from_stack() if not tag: @@ -462,6 +460,20 @@ class Tokenizer(object): padding = "" # TODO return padding + def _actually_handle_chunk(self, chunks, is_new): + if is_new: + padding = 0 + while chunks: + if chunks[0] == "": + padding += 1 + chunks.pop(0) + else: + break + self._write(tokens.TagAttrStart(padding=" " * padding)) + if chunks: + chunk = chunks.pop(0) + self._write_text(chunk) + def _handle_tag_chunk(self, text): if " " not in text: self._write_text(text) @@ -475,18 +487,29 @@ class Tokenizer(object): self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME - self._write(tokens.TagAttrStart()) - for i, chunk in enumerate(chunks): - if i > 0: - self._write(tokens.TagAttrStart()) - self._write_text(chunk) - - # def _handle_attribute_name(self): - # ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED - # pass - - # def _handle_quoted_attribute_close(self): - # pass + self._actually_handle_chunk(chunks, True) + is_new = False + while chunks: + self._actually_handle_chunk(chunks, is_new) + is_new = True + + def _handle_tag_attribute_body(self): + self._context ^= contexts.TAG_OPEN_ATTR_NAME + self._context |= contexts.TAG_OPEN_ATTR_BODY + self._write(TagAttrEquals()) + next = self._read(1) + if next not in self.MARKERS and next.startswith('"'): + if re.search(r'[^\\]"$', next[1:]): + if not re.search(r'[^\\]"', next[1:-1]): + self._write(TagAttrQuote()) + self._write_text(next[1:-1]) + self._head += 1 + else: + if not re.search(r'[^\\]"', next[1:]): + 
self._push(contexts.TAG_OPEN_ATTR_BODY_QUOTED) + self._write(TagAttrQuote()) + self._write_text(next[1:]) + self._head += 1 def _handle_tag_close_open(self): padding = self._actually_close_tag_opening() @@ -526,10 +549,12 @@ class Tokenizer(object): self._head += 1 continue if this is self.END: - fail = (contexts.TEMPLATE | contexts.ARGUMENT | - contexts.WIKILINK | contexts.HEADING | - contexts.COMMENT | contexts.TAG) - double_fail = contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE + fail = ( + contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | + contexts.HEADING | contexts.COMMENT | contexts.TAG) + double_fail = ( + contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | + contexts.TAG_OPEN_ATTR_BODY_QUOTED) if self._context & double_fail: self._pop() if self._context & fail: @@ -593,8 +618,8 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - # elif this == "=": - # self._handle_tag_attr_body() + elif this == "=": + self._handle_tag_attribute_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): self._handle_tag_open_close() From 26d30f3d1a8c0caca854f7040d07555c6f794b0f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 19:18:09 -0500 Subject: [PATCH 010/189] Seems to be working for quoted attributes now. --- mwparserfromhell/parser/tokenizer.py | 40 ++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index d3cb40f..920d1cf 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -461,7 +461,7 @@ class Tokenizer(object): return padding def _actually_handle_chunk(self, chunks, is_new): - if is_new: + if is_new and not self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: padding = 0 while chunks: if chunks[0] == "": @@ -472,6 +472,15 @@ class Tokenizer(object): self._write(tokens.TagAttrStart(padding=" " * padding)) if chunks: chunk = chunks.pop(0) + if self._context & contexts.TAG_OPEN_ATTR_BODY: + self._context ^= contexts.TAG_OPEN_ATTR_BODY + self._context |= contexts.TAG_OPEN_ATTR_NAME + if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if re.search(r'[^\\]"', chunk[:-1]): + self._fail_route() + if re.search(r'[^\\]"$', chunk): + self._write_text(chunk[:-1]) + return self._pop() # Back to _handle_tag_attribute_body() self._write_text(chunk) def _handle_tag_chunk(self, text): @@ -490,26 +499,35 @@ class Tokenizer(object): self._actually_handle_chunk(chunks, True) is_new = False while chunks: - self._actually_handle_chunk(chunks, is_new) + should_exit = self._actually_handle_chunk(chunks, is_new) + if should_exit: + return should_exit is_new = True def _handle_tag_attribute_body(self): self._context ^= contexts.TAG_OPEN_ATTR_NAME self._context |= contexts.TAG_OPEN_ATTR_BODY - self._write(TagAttrEquals()) + self._write(tokens.TagAttrEquals()) next = self._read(1) if next not in self.MARKERS and next.startswith('"'): if re.search(r'[^\\]"$', next[1:]): if not re.search(r'[^\\]"', next[1:-1]): - self._write(TagAttrQuote()) + self._write(tokens.TagAttrQuote()) self._write_text(next[1:-1]) self._head += 1 else: if not re.search(r'[^\\]"', next[1:]): - self._push(contexts.TAG_OPEN_ATTR_BODY_QUOTED) - self._write(TagAttrQuote()) - self._write_text(next[1:]) self._head += 1 + reset = self._head + try: + attr = self._parse(contexts.TAG_OPEN_ATTR_BODY_QUOTED) + except BadRoute: + self._head = reset + 
self._write_text(next) + else: + self._write(tokens.TagAttrQuote()) + self._write_text(next[1:]) + self._write_all(attr) def _handle_tag_close_open(self): padding = self._actually_close_tag_opening() @@ -543,7 +561,9 @@ class Tokenizer(object): this = self._read() if this not in self.MARKERS: if self._context & contexts.TAG_OPEN: - self._handle_tag_chunk(this) + should_exit = self._handle_tag_chunk(this) + if should_exit: + return should_exit else: self._write_text(this) self._head += 1 @@ -593,6 +613,8 @@ class Tokenizer(object): elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() + elif self._context & contexts.TAG_OPEN_ATTR_NAME: + self._handle_tag_attribute_body() else: self._write_text("=") elif this == "=" and self._context & contexts.HEADING: @@ -618,7 +640,7 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - elif this == "=": + elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: self._handle_tag_attribute_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): From ca47305074aa04585d29dd91f346079e57156f53 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 21:35:48 -0500 Subject: [PATCH 011/189] Fix attribute behavior under certain strange circumstances. --- mwparserfromhell/parser/tokenizer.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 920d1cf..46c4399 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -470,6 +470,7 @@ class Tokenizer(object): else: break self._write(tokens.TagAttrStart(padding=" " * padding)) + if chunks: chunk = chunks.pop(0) if self._context & contexts.TAG_OPEN_ATTR_BODY: @@ -480,7 +481,9 @@ class Tokenizer(object): self._fail_route() if re.search(r'[^\\]"$', chunk): self._write_text(chunk[:-1]) - return self._pop() # Back to _handle_tag_attribute_body() + self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED + self._context |= contexts.TAG_OPEN_ATTR_NAME + return True # Back to _handle_tag_attribute_body() self._write_text(chunk) def _handle_tag_chunk(self, text): @@ -497,12 +500,15 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) + is_new = False + is_quoted = False while chunks: - should_exit = self._actually_handle_chunk(chunks, is_new) - if should_exit: - return should_exit + result = self._actually_handle_chunk(chunks, is_new) + is_quoted = result or is_quoted is_new = True + if is_quoted: + return self._pop() def _handle_tag_attribute_body(self): self._context ^= contexts.TAG_OPEN_ATTR_NAME @@ -510,6 +516,10 @@ class Tokenizer(object): self._write(tokens.TagAttrEquals()) next = self._read(1) if next not in self.MARKERS and next.startswith('"'): + chunks = None + if " " in next: + chunks = next.split(" ") + next = chunks.pop(0) if re.search(r'[^\\]"$', next[1:]): if not re.search(r'[^\\]"', next[1:-1]): self._write(tokens.TagAttrQuote()) @@ -528,6 +538,10 @@ class Tokenizer(object): self._write(tokens.TagAttrQuote()) self._write_text(next[1:]) self._write_all(attr) + self._context ^= contexts.TAG_OPEN_ATTR_BODY + self._context |= contexts.TAG_OPEN_ATTR_NAME + while chunks: + self._actually_handle_chunk(chunks, True) def _handle_tag_close_open(self): padding = 
self._actually_close_tag_opening() From 146d1fd006c32b4a71312cd966c3e124592bce92 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 21:44:56 -0500 Subject: [PATCH 012/189] Fix a bug in rendering Tags; attrs->attributes; update documentation. --- docs/api/mwparserfromhell.nodes.rst | 1 + docs/api/mwparserfromhell.rst | 6 ++++++ mwparserfromhell/nodes/tag.py | 4 ++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/api/mwparserfromhell.nodes.rst b/docs/api/mwparserfromhell.nodes.rst index d1016f9..a093c17 100644 --- a/docs/api/mwparserfromhell.nodes.rst +++ b/docs/api/mwparserfromhell.nodes.rst @@ -46,6 +46,7 @@ nodes Package .. automodule:: mwparserfromhell.nodes.tag :members: + :undoc-members: :show-inheritance: :mod:`template` Module diff --git a/docs/api/mwparserfromhell.rst b/docs/api/mwparserfromhell.rst index 3ca09c9..b682139 100644 --- a/docs/api/mwparserfromhell.rst +++ b/docs/api/mwparserfromhell.rst @@ -30,6 +30,12 @@ mwparserfromhell Package :members: :undoc-members: +:mod:`tag_defs` Module +---------------------- + +.. automodule:: mwparserfromhell.tag_defs + :members: + :mod:`utils` Module ------------------- diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index ea98bb6..833b597 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -65,7 +65,7 @@ class Tag(TagDefinitions, Node): result += self.open_padding + "/>" else: result += self.open_padding + ">" + str(self.contents) - result += "" + result += "" return result def __iternodes__(self, getter): @@ -126,7 +126,7 @@ class Tag(TagDefinitions, Node): return self._contents @property - def attrs(self): + def attributes(self): """The list of attributes affecting the tag. Each attribute is an instance of :py:class:`~.Attribute`. From a58c480639119b2cd3c78eee8dfe0893fa6360fc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 22:23:31 -0500 Subject: [PATCH 013/189] Fix some usage of attrs; shorten a context, fix some behavior I broke. 
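With the rename, the public accessor is .attributes rather than .attrs; a short usage sketch (hypothetical tag and values, illustrative only):

    from mwparserfromhell.nodes import Tag
    from mwparserfromhell.nodes.extras import Attribute
    from mwparserfromhell.utils import parse_anything

    attr = Attribute(parse_anything("class"), parse_anything("foo"))
    node = Tag(Tag.TAG_SPAN, parse_anything("span"), parse_anything("x"),
               attrs=[attr])
    assert [str(a.name) for a in node.attributes] == ["class"]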
--- mwparserfromhell/nodes/tag.py | 11 +++--- mwparserfromhell/parser/contexts.py | 68 +++++++++++++++++++----------------- mwparserfromhell/parser/tokenizer.py | 29 +++++++++------ 3 files changed, 60 insertions(+), 48 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 833b597..94f92c5 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -59,8 +59,8 @@ class Tag(TagDefinitions, Node): return open_ + str(self.contents) + close result = "<" + str(self.tag) - if self.attrs: - result += " " + " ".join([str(attr) for attr in self.attrs]) + if self.attributes: + result += " " + " ".join([str(attr) for attr in self.attributes]) if self.self_closing: result += self.open_padding + "/>" else: @@ -73,7 +73,7 @@ class Tag(TagDefinitions, Node): if self.showtag: for child in getter(self.tag): yield self.tag, child - for attr in self.attrs: + for attr in self.attributes: for child in getter(attr.name): yield attr.name, child if attr.value: @@ -89,12 +89,13 @@ class Tag(TagDefinitions, Node): def __showtree__(self, write, get, mark): tagnodes = self.tag.nodes - if (not self.attrs and len(tagnodes) == 1 and isinstance(tagnodes[0], Text)): + if not self.attributes and (len(tagnodes) == 1 and + isinstance(tagnodes[0], Text)): write("<" + str(tagnodes[0]) + ">") else: write("<") get(self.tag) - for attr in self.attrs: + for attr in self.attributes: get(attr.name) if not attr.value: continue diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 053c930..d87da9a 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -71,7 +71,8 @@ Local (stack-specific) contexts: * :py:const:`TAG_OPEN_ATTR_NAME` * :py:const:`TAG_OPEN_ATTR_BODY` - * :py:const:`TAG_OPEN_ATTR_BODY_QUOTED` + * :py:const:`TAG_OPEN_ATTR_QUOTED` + * :py:const:`TAG_OPEN_ATTR_IGNORE` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -83,38 +84,39 @@ Global contexts: # Local contexts: -TEMPLATE = 0b00000000000000000111 -TEMPLATE_NAME = 0b00000000000000000001 -TEMPLATE_PARAM_KEY = 0b00000000000000000010 -TEMPLATE_PARAM_VALUE = 0b00000000000000000100 - -ARGUMENT = 0b00000000000000011000 -ARGUMENT_NAME = 0b00000000000000001000 -ARGUMENT_DEFAULT = 0b00000000000000010000 - -WIKILINK = 0b00000000000001100000 -WIKILINK_TITLE = 0b00000000000000100000 -WIKILINK_TEXT = 0b00000000000001000000 - -HEADING = 0b00000001111110000000 -HEADING_LEVEL_1 = 0b00000000000010000000 -HEADING_LEVEL_2 = 0b00000000000100000000 -HEADING_LEVEL_3 = 0b00000000001000000000 -HEADING_LEVEL_4 = 0b00000000010000000000 -HEADING_LEVEL_5 = 0b00000000100000000000 -HEADING_LEVEL_6 = 0b00000001000000000000 - -COMMENT = 0b00000010000000000000 - -TAG = 0b11111100000000000000 -TAG_OPEN = 0b00111100000000000000 -TAG_OPEN_NAME = 0b00000100000000000000 -TAG_OPEN_ATTR = 0b00111000000000000000 -TAG_OPEN_ATTR_NAME = 0b00001000000000000000 -TAG_OPEN_ATTR_BODY = 0b00010000000000000000 -TAG_OPEN_ATTR_BODY_QUOTED = 0b00100000000000000000 -TAG_BODY = 0b01000000000000000000 -TAG_CLOSE = 0b10000000000000000000 +TEMPLATE = 0b000000000000000000111 +TEMPLATE_NAME = 0b000000000000000000001 +TEMPLATE_PARAM_KEY = 0b000000000000000000010 +TEMPLATE_PARAM_VALUE = 0b000000000000000000100 + +ARGUMENT = 0b000000000000000011000 +ARGUMENT_NAME = 0b000000000000000001000 +ARGUMENT_DEFAULT = 0b000000000000000010000 + +WIKILINK = 0b000000000000001100000 +WIKILINK_TITLE = 0b000000000000000100000 +WIKILINK_TEXT = 0b000000000000001000000 + +HEADING = 0b000000001111110000000 
+HEADING_LEVEL_1 = 0b000000000000010000000 +HEADING_LEVEL_2 = 0b000000000000100000000 +HEADING_LEVEL_3 = 0b000000000001000000000 +HEADING_LEVEL_4 = 0b000000000010000000000 +HEADING_LEVEL_5 = 0b000000000100000000000 +HEADING_LEVEL_6 = 0b000000001000000000000 + +COMMENT = 0b000000010000000000000 + +TAG = 0b111111100000000000000 +TAG_OPEN = 0b001111100000000000000 +TAG_OPEN_NAME = 0b000000100000000000000 +TAG_OPEN_ATTR = 0b001111000000000000000 +TAG_OPEN_ATTR_NAME = 0b000001000000000000000 +TAG_OPEN_ATTR_BODY = 0b000010000000000000000 +TAG_OPEN_ATTR_QUOTED = 0b000100000000000000000 +TAG_OPEN_ATTR_IGNORE = 0b001000000000000000000 +TAG_BODY = 0b010000000000000000000 +TAG_CLOSE = 0b100000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 46c4399..1d31fa4 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -457,11 +457,13 @@ class Tokenizer(object): self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY - padding = "" # TODO + + ## If the last element was TagAttrStart, remove it, add " " to its padding, then return that + padding = "" return padding def _actually_handle_chunk(self, chunks, is_new): - if is_new and not self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: padding = 0 while chunks: if chunks[0] == "": @@ -470,18 +472,24 @@ class Tokenizer(object): else: break self._write(tokens.TagAttrStart(padding=" " * padding)) + elif self._context & contexts.TAG_OPEN_ATTR_IGNORE: + self._context ^= contexts.TAG_OPEN_ATTR_IGNORE + chunks.pop(0) + return + elif self._context & contexts.TAG_OPEN_ATTR_QUOTED: + self._write_text(" ") # Quoted chunks don't lose their spaces if chunks: chunk = chunks.pop(0) if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY self._context |= contexts.TAG_OPEN_ATTR_NAME - if self._context & contexts.TAG_OPEN_ATTR_BODY_QUOTED: + if self._context & contexts.TAG_OPEN_ATTR_QUOTED: if re.search(r'[^\\]"', chunk[:-1]): self._fail_route() if re.search(r'[^\\]"$', chunk): self._write_text(chunk[:-1]) - self._context ^= contexts.TAG_OPEN_ATTR_BODY_QUOTED + self._context ^= contexts.TAG_OPEN_ATTR_QUOTED self._context |= contexts.TAG_OPEN_ATTR_NAME return True # Back to _handle_tag_attribute_body() self._write_text(chunk) @@ -491,6 +499,8 @@ class Tokenizer(object): self._write_text(text) return chunks = text.split(" ") + is_new = False + is_quoted = False if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) tag = self._get_tag_type_from_stack() @@ -500,9 +510,7 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) - - is_new = False - is_quoted = False + is_new = True while chunks: result = self._actually_handle_chunk(chunks, is_new) is_quoted = result or is_quoted @@ -530,7 +538,7 @@ class Tokenizer(object): self._head += 1 reset = self._head try: - attr = self._parse(contexts.TAG_OPEN_ATTR_BODY_QUOTED) + attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | contexts.TAG_OPEN_ATTR_IGNORE) except BadRoute: self._head = reset self._write_text(next) @@ -538,6 +546,7 @@ class Tokenizer(object): self._write(tokens.TagAttrQuote()) self._write_text(next[1:]) self._write_all(attr) + return self._context ^= contexts.TAG_OPEN_ATTR_BODY self._context |= 
contexts.TAG_OPEN_ATTR_NAME while chunks: @@ -588,7 +597,7 @@ class Tokenizer(object): contexts.HEADING | contexts.COMMENT | contexts.TAG) double_fail = ( contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | - contexts.TAG_OPEN_ATTR_BODY_QUOTED) + contexts.TAG_OPEN_ATTR_QUOTED) if self._context & double_fail: self._pop() if self._context & fail: @@ -645,7 +654,7 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_BODY_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() From eed7c918bfb0741fefd0473f61bbc1e9343ad033 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 23 Dec 2012 22:41:32 -0500 Subject: [PATCH 014/189] Implement padding support for Tags completely; open_padding->padding. --- mwparserfromhell/nodes/tag.py | 18 +++++++++--------- mwparserfromhell/parser/builder.py | 6 +++--- mwparserfromhell/parser/tokenizer.py | 15 +++++++++------ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 94f92c5..ecf6f2b 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -33,7 +33,7 @@ class Tag(TagDefinitions, Node): """Represents an HTML-style tag in wikicode, like ````.""" def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, - self_closing=False, open_padding="", closing_tag=None): + self_closing=False, padding="", closing_tag=None): super(Tag, self).__init__() self._type = type_ self._tag = tag @@ -44,7 +44,7 @@ class Tag(TagDefinitions, Node): self._attrs = [] self._showtag = showtag self._self_closing = self_closing - self._open_padding = open_padding + self._padding = padding if closing_tag: self._closing_tag = closing_tag else: @@ -62,9 +62,9 @@ class Tag(TagDefinitions, Node): if self.attributes: result += " " + " ".join([str(attr) for attr in self.attributes]) if self.self_closing: - result += self.open_padding + "/>" + result += self.padding + "/>" else: - result += self.open_padding + ">" + str(self.contents) + result += self.padding + ">" + str(self.contents) result += "" return result @@ -145,9 +145,9 @@ class Tag(TagDefinitions, Node): return self._self_closing @property - def open_padding(self): + def padding(self): """Spacing to insert before the first closing ``>``.""" - return self._open_padding + return self._padding @property def closing_tag(self): @@ -188,9 +188,9 @@ class Tag(TagDefinitions, Node): def self_closing(self, value): self._self_closing = bool(value) - @open_padding.setter - def open_padding(self, value): - self._open_padding = str(value) + @padding.setter + def padding(self, value): + self._padding = str(value) @closing_tag.setter def closing_tag(self, value): diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index cb5499f..2d9ea55 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -210,19 +210,19 @@ class Builder(object): if isinstance(token, tokens.TagAttrStart): attrs.append(self._handle_attribute(token)) elif isinstance(token, tokens.TagCloseOpen): - open_pad = token.padding + padding = token.padding tag = self._pop() self._push() elif isinstance(token, tokens.TagCloseSelfclose): tag = self._pop() return Tag(type_, tag, attrs=attrs, showtag=showtag, - self_closing=True, open_padding=token.padding) + 
self_closing=True, padding=token.padding) elif isinstance(token, tokens.TagOpenClose): contents = self._pop() self._push() elif isinstance(token, tokens.TagCloseClose): return Tag(type_, tag, contents, attrs, showtag, False, - open_pad, self._pop()) + padding, self._pop()) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 1d31fa4..901e731 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -458,9 +458,9 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY - ## If the last element was TagAttrStart, remove it, add " " to its padding, then return that - padding = "" - return padding + if isinstance(self._stack[-1], tokens.TagAttrStart): + return self._stack.pop().padding + return "" def _actually_handle_chunk(self, chunks, is_new): if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: @@ -538,7 +538,8 @@ class Tokenizer(object): self._head += 1 reset = self._head try: - attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | contexts.TAG_OPEN_ATTR_IGNORE) + attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | + contexts.TAG_OPEN_ATTR_IGNORE) except BadRoute: self._head = reset self._write_text(next) @@ -654,7 +655,8 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): + elif self._context & ( + contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() @@ -663,7 +665,8 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: + elif this == "=" and ( + self._context & contexts.TAG_OPEN_ATTR_NAME): self._handle_tag_attribute_body() elif this == "<" and next == "/" and ( self._context & contexts.TAG_BODY): From 6ea618460fc122dcd60ebebd0ecf02a36f82d8cf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 31 Dec 2012 03:19:22 -0500 Subject: [PATCH 015/189] _get_tag_type_from_stack() makes more sense now --- mwparserfromhell/parser/tokenizer.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 901e731..e83ec5d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -433,16 +433,18 @@ class Tokenizer(object): else: self._write_all(tokens) - def _get_tag_type_from_stack(self): - self._push_textbuffer() - if not self._stack: - return None # Tag has an empty name? - text = [tok for tok in self._stack if isinstance(tok, tokens.Text)] + def _get_tag_type_from_stack(self, stack=None): + if stack is None: + stack = self._stack + self._push_textbuffer() + if not stack: + self._fail_route() # Tag has an empty name? 
+ text = [tok for tok in stack if isinstance(tok, tokens.Text)] text = "".join([token.text for token in text]).rstrip().lower() try: return Tag.TRANSLATIONS[text] except KeyError: - return Tag.TAG_UNKNOWN + self._fail_route() def _actually_close_tag_opening(self): if self._context & contexts.TAG_OPEN_ATTR: @@ -452,8 +454,6 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_ATTR_BODY else: tag = self._get_tag_type_from_stack() - if not tag: - self._fail_route() self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY @@ -504,8 +504,6 @@ class Tokenizer(object): if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) tag = self._get_tag_type_from_stack() - if not tag: - self._fail_route() self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME @@ -569,8 +567,8 @@ class Tokenizer(object): self._head += 1 def _handle_tag_close_close(self): - tag = self._get_tag_type_from_stack() closing = self._pop() + tag = self._get_tag_type_from_stack(closing) if tag != self._stack[0].type: # Closing and opening tags are not the same, so fail this route: self._fail_route() From 0ee505b5a506cfc1c0530935bb01933b94aa14dc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 24 Jan 2013 01:24:06 -0500 Subject: [PATCH 016/189] Docstrings for new tokenizer methods. --- mwparserfromhell/parser/tokenizer.py | 41 ++++++++++++++++++++++++++++++------ mwparserfromhell/tag_defs.py | 2 +- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e83ec5d..8ec3355 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -434,6 +434,10 @@ class Tokenizer(object): self._write_all(tokens) def _get_tag_type_from_stack(self, stack=None): + """Return the tag type based on the text in *stack*. + + If *stack* is ``None``, we will use the current, topmost one. + """ if stack is None: stack = self._stack self._push_textbuffer() @@ -447,6 +451,13 @@ class Tokenizer(object): self._fail_route() def _actually_close_tag_opening(self): + """Handle cleanup at the end of a opening tag. + + The current context will be updated and the + :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the + opening tag's padding to be used in the + :py:class:`~.tokens.TagOpenClose` token. + """ if self._context & contexts.TAG_OPEN_ATTR: if self._context & contexts.TAG_OPEN_ATTR_NAME: self._context ^= contexts.TAG_OPEN_ATTR_NAME @@ -463,6 +474,11 @@ class Tokenizer(object): return "" def _actually_handle_chunk(self, chunks, is_new): + """Actually handle a chunk of code within a tag's attributes. + + Called by :py:meth:`_handle_tag_chunk` and + :py:meth:`_handle_tag_attribute_body`. + """ if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: padding = 0 while chunks: @@ -495,6 +511,12 @@ class Tokenizer(object): self._write_text(chunk) def _handle_tag_chunk(self, text): + """Handle a chunk of code within a tag's attributes. + + This is called by :py:meth:`_parse`, which intercepts parsing of + wikicode when we're inside of an opening tag and no :py:attr:`MARKERS` + are present. + """ if " " not in text: self._write_text(text) return @@ -517,6 +539,12 @@ class Tokenizer(object): return self._pop() def _handle_tag_attribute_body(self): + """Handle the body, or value, of a tag attribute. 
+ + Attribute bodies can usually be handled at once, but sometimes a new + stack must be created to keep track of "rich" attribute values that + contain, for example, templates. + """ self._context ^= contexts.TAG_OPEN_ATTR_NAME self._context |= contexts.TAG_OPEN_ATTR_BODY self._write(tokens.TagAttrEquals()) @@ -552,21 +580,25 @@ class Tokenizer(object): self._actually_handle_chunk(chunks, True) def _handle_tag_close_open(self): + """Handle the ending of an open tag (````).""" padding = self._actually_close_tag_opening() self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): + """Handle the ending of an tag that closes itself (````).""" padding = self._actually_close_tag_opening() self._write(tokens.TagCloseSelfclose(padding=padding)) self._head += 1 return self._pop() def _handle_tag_open_close(self): + """Handle the opening of a closing tag (````).""" self._write(tokens.TagOpenClose()) self._push(contexts.TAG_CLOSE) self._head += 1 def _handle_tag_close_close(self): + """Handle the ending of a closing tag (````).""" closing = self._pop() tag = self._get_tag_type_from_stack(closing) if tag != self._stack[0].type: @@ -653,8 +685,7 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & ( - contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): + elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): if this == "\n": if self._context & contexts.TAG_CLOSE: self._pop() @@ -663,11 +694,9 @@ class Tokenizer(object): self._handle_tag_close_open() elif this == "/" and next == ">": return self._handle_tag_selfclose() - elif this == "=" and ( - self._context & contexts.TAG_OPEN_ATTR_NAME): + elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: self._handle_tag_attribute_body() - elif this == "<" and next == "/" and ( - self._context & contexts.TAG_BODY): + elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py index 74d3a81..b2ee90d 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/tag_defs.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012 Ben Kurtovic +# Copyright (C) 2012-2013 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From d8814968b71fdd9ceea22085c19d43b69101ba38 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 14 Mar 2013 11:02:10 -0400 Subject: [PATCH 017/189] Applying latest commit from develop --- mwparserfromhell/parser/__init__.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 5baa687..fd8a314 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -26,16 +26,16 @@ modules: the :py:mod:`~.tokenizer` and the :py:mod:`~.builder`. This module joins them together under one interface. 
""" +from .builder import Builder +from .tokenizer import Tokenizer try: - from ._builder import CBuilder as Builder + from ._tokenizer import CTokenizer + use_c = True except ImportError: - from .builder import Builder -try: - from ._tokenizer import CTokenizer as Tokenizer -except ImportError: - from .tokenizer import Tokenizer + CTokenizer = None + use_c = False -__all__ = ["Parser"] +__all__ = ["use_c", "Parser"] class Parser(object): """Represents a parser for wikicode. @@ -48,7 +48,10 @@ class Parser(object): def __init__(self, text): self.text = text - self._tokenizer = Tokenizer() + if use_c and CTokenizer: + self._tokenizer = CTokenizer() + else: + self._tokenizer = Tokenizer() self._builder = Builder() def parse(self): From 61fc5b5eab7dbe9c0466fd07a656c8490d8d04ad Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 19 May 2013 14:41:48 -0400 Subject: [PATCH 018/189] Fix handling of self-closing tags (closes #31) --- mwparserfromhell/nodes/tag.py | 5 +++-- mwparserfromhell/parser/builder.py | 4 ++-- mwparserfromhell/parser/tokenizer.py | 4 ++-- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index eb5d1ee..d301d85 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -79,8 +79,9 @@ class Tag(TagDefinitions, Node): if attr.value: for child in getter(attr.value): yield attr.value, child - for child in getter(self.contents): - yield self.contents, child + if self.contents: + for child in getter(self.contents): + yield self.contents, child def __strip__(self, normalize, collapse): if self.type in self.TAGS_VISIBLE: diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 60bfaa9..4b468b7 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -191,8 +191,8 @@ class Builder(object): self._push() elif isinstance(token, tokens.TagAttrQuote): quoted = True - elif isinstance(token, (tokens.TagAttrStart, - tokens.TagCloseOpen)): + elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen, + tokens.TagCloseSelfclose)): self._tokens.append(token) if name is not None: return Attribute(name, self._pop(), quoted, padding) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 82f748c..b466de5 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,8 +26,8 @@ import re from . import contexts from . import tokens -from ..nodes.tag import Tag from ..compat import htmlentities +from ..nodes.tag import Tag __all__ = ["Tokenizer"] @@ -431,7 +431,7 @@ class Tokenizer(object): try: return Tag.TRANSLATIONS[text] except KeyError: - self._fail_route() + return Tag.TAG_UNKNOWN def _actually_close_tag_opening(self): """Handle cleanup at the end of a opening tag. From 1b4c01b4c00d014499d9f5e5ad8ecc01bb20a2b7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 20 May 2013 03:05:11 -0400 Subject: [PATCH 019/189] Implement assertTagNodeEqual(), start test_tag(), add to tags.mwtest. 
--- mwparserfromhell/parser/builder.py | 2 +- tests/_test_tree_equality.py | 19 +++++++- tests/test_attribute.py | 0 tests/test_builder.py | 12 +++++- tests/test_tag.py | 0 tests/tokenizer/tags.mwtest | 88 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 117 insertions(+), 4 deletions(-) create mode 100644 tests/test_attribute.py create mode 100644 tests/test_tag.py create mode 100644 tests/tokenizer/tags.mwtest diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 4b468b7..5ec0780 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -170,7 +170,7 @@ class Builder(object): self._write(self._handle_token(token)) def _handle_comment(self): - """Handle a case where a hidden comment is at the head of the tokens.""" + """Handle a case where an HTML comment is at the head of the tokens.""" self._push() while self._tokens: token = self._tokens.pop() diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py index 52130ed..2828147 100644 --- a/tests/_test_tree_equality.py +++ b/tests/_test_tree_equality.py @@ -91,7 +91,24 @@ class TreeEqualityTestCase(TestCase): def assertTagNodeEqual(self, expected, actual): """Assert that two Tag nodes have the same data.""" - self.fail("Holding this until feature/html_tags is ready.") + self.assertEqual(expected.type, actual.type) + self.assertWikicodeEqual(expected.tag, actual.tag) + if expected.contents is not None: + self.assertWikicodeEqual(expected.contents, actual.contents) + length = len(expected.attributes) + self.assertEqual(length, len(actual.attributes)) + for i in range(length): + exp_attr = expected.attributes[i] + act_attr = actual.attributes[i] + self.assertWikicodeEqual(exp_attr.name, act_attr.name) + if exp_attr.value is not None: + self.assertWikicodeEqual(exp_attr.value, act_attr.value) + self.assertIs(exp_attr.quoted, act_attr.quoted) + self.assertEqual(exp.attr.padding, act_attr.padding) + self.assertIs(expected.showtag, actual.showtag) + self.assertIs(expected.self_closing, actual.self_closing) + self.assertEqual(expected.padding, actual.padding) + self.assertWikicodeEqual(expected.closing_tag, actual.closing_tag) def assertTemplateNodeEqual(self, expected, actual): """Assert that two Template nodes have the same data.""" diff --git a/tests/test_attribute.py b/tests/test_attribute.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_builder.py b/tests/test_builder.py index 903d144..85a8c60 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -190,10 +190,18 @@ class TestBuilder(TreeEqualityTestCase): for test, valid in tests: self.assertWikicodeEqual(valid, self.builder.build(test)) - @unittest.skip("holding this until feature/html_tags is ready") def test_tag(self): """tests for building Tag nodes""" - pass + tests = [ + ([tokens.TagOpenOpen(showtag=True, type=101), + tokens.Text(text="ref"), tokens.TagCloseOpen(padding=""), + tokens.TagOpenClose(), tokens.Text(text="ref"), + tokens.TagCloseClose()], + wrap([Tag(101, wraptext("ref"), wrap([]), [], True, False, "", + wraptext("ref"))])), + ] + for test, valid in tests: + self.assertWikicodeEqual(valid, self.builder.build(test)) def test_integration(self): """a test for building a combination of templates together""" diff --git a/tests/test_tag.py b/tests/test_tag.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest new file mode 100644 index 0000000..9a6ce30 --- /dev/null +++ 
b/tests/tokenizer/tags.mwtest @@ -0,0 +1,88 @@ +name: basic +label: a basic tag with an open and close +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: basic_selfclosing +label: a basic self-closing tag +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding="")] + +--- + +name: content +label: a tag with some content in the middle +input: "this is a reference" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: padded_open +label: a tag with some padding in the open tag +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: padded_close +label: a tag with some padding in the close tag +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] + +--- + +name: padded_selfclosing +label: a self-closing tag with padding +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding=" ")] + +--- + +name: attribute +label: a tag with a single attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_value +label: a tag with a single attribute with a value +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_quoted +label: a tag with a single quoted attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_hyphen +label: a tag with a single attribute, containing a hyphen +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_quoted_hyphen +label: a tag with a single quoted attribute, containing a hyphen +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: attribute_selfclosing +label: a self-closing tag with a single attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] + +--- + +name: attribute_selfclosing_value +label: a self-closing tag with a single attribute with a value +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] From 9ea06c283081771833729ec579b9aaee94599fe1 Mon Sep 17 00:00:00 2001 
From: Ben Kurtovic Date: Tue, 28 May 2013 10:58:45 -0400 Subject: [PATCH 020/189] Push the textbuffer to fix a couple broken tests. --- mwparserfromhell/parser/tokenizer.py | 1 + tests/tokenizer/tags.mwtest | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b466de5..b8450fd 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -452,6 +452,7 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY + self._push_textbuffer() if isinstance(self._stack[-1], tokens.TagAttrStart): return self._stack.pop().padding return "" diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 9a6ce30..8716e78 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -86,3 +86,10 @@ name: attribute_selfclosing_value label: a self-closing tag with a single attribute with a value input: "" output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] + +--- + +name: attribute_selfclosing_value_quoted +label: a self-closing tag with a single quoted attribute +input: "" +output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] From d2b39546691eda327979b12dbe44c0090868c790 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 1 Jun 2013 17:30:34 -0400 Subject: [PATCH 021/189] Fix remaining broken tests; some refactoring. --- mwparserfromhell/parser/tokenizer.py | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b8450fd..67a652a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -476,7 +476,7 @@ class Tokenizer(object): self._context ^= contexts.TAG_OPEN_ATTR_IGNORE chunks.pop(0) return - elif self._context & contexts.TAG_OPEN_ATTR_QUOTED: + elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED: self._write_text(" ") # Quoted chunks don't lose their spaces if chunks: @@ -501,7 +501,7 @@ class Tokenizer(object): wikicode when we're inside of an opening tag and no :py:attr:`MARKERS` are present. 
""" - if " " not in text: + if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: self._write_text(text) return chunks = text.split(" ") @@ -603,7 +603,7 @@ class Tokenizer(object): elif this == "\n" or this == "[" or this == "}": return False return True - if context & contexts.TEMPLATE_NAME: + elif context & contexts.TEMPLATE_NAME: if this == "{" or this == "}" or this == "[": self._context |= contexts.FAIL_NEXT return True @@ -621,6 +621,8 @@ class Tokenizer(object): elif this is self.END or not this.isspace(): self._context |= contexts.HAS_TEXT return True + elif context & contexts.TAG_CLOSE: + return this != "<" and this != "\n" else: if context & contexts.FAIL_ON_EQUALS: if this == "=": @@ -653,10 +655,12 @@ class Tokenizer(object): while True: this = self._read() unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | - contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME) + contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | + contexts.TAG_CLOSE) if self._context & unsafe: if not self._verify_safe(this): - if self._context & contexts.TEMPLATE_PARAM_KEY: + double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) + if self._context & double: self._pop() self._fail_route() if this not in self.MARKERS: @@ -672,12 +676,12 @@ class Tokenizer(object): fail = ( contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | contexts.HEADING | contexts.COMMENT | contexts.TAG) - double_fail = ( - contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | - contexts.TAG_OPEN_ATTR_QUOTED) - if self._context & double_fail: - self._pop() if self._context & fail: + double_fail = ( + contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | + contexts.TAG_OPEN_ATTR_QUOTED) + if self._context & double_fail: + self._pop() self._fail_route() return self._pop() next = self._read(1) @@ -738,10 +742,10 @@ class Tokenizer(object): elif this == "<" and next != "/" and ( not self._context & (contexts.TAG ^ contexts.TAG_BODY)): self._parse_tag() - elif self._context & (contexts.TAG_OPEN ^ contexts.TAG_OPEN_ATTR_QUOTED): - if this == "\n": - if self._context & contexts.TAG_CLOSE: - self._pop() + elif self._context & contexts.TAG_OPEN: + if self._context & contexts.TAG_OPEN_ATTR_QUOTED: + self._handle_tag_chunk(this) + elif this == "\n": self._fail_route() elif this == ">": self._handle_tag_close_open() @@ -749,6 +753,8 @@ class Tokenizer(object): return self._handle_tag_selfclose() elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: self._handle_tag_attribute_body() + else: + self._handle_tag_chunk(this) elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: From 03e41286c6caf940d9f14ae1bdbd03df4e112493 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 12 Jun 2013 18:29:22 -0400 Subject: [PATCH 022/189] Add a number of tag tests. A couple of these are failing. 
--- tests/tokenizer/integration.mwtest | 7 ++ tests/tokenizer/tags.mwtest | 140 +++++++++++++++++++++++++++++++++++++ 2 files changed, 147 insertions(+) diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index d3cb419..ba01c8c 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -33,6 +33,13 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t --- +name: rich_tags +label: a HTML tag with tons of other things in it +input: "{{dubious claim}}[[Source]]" +output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(padding=" "), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(padding=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(padding=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(padding=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagOpenClose(), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + name: wildcard label: a wildcard assortment of various things input: "{{{{{{{{foo}}bar|baz=biz}}buzz}}usr|{{bin}}}}" diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 8716e78..5af2074 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -93,3 +93,143 @@ name: attribute_selfclosing_value_quoted label: a self-closing tag with a single quoted attribute input: "" output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] + +--- + +name: incomplete_lbracket +label: incomplete tags: just a left bracket +input: "<" +output: [Text(text="<")] + +--- + +name: incomplete_lbracket_junk +label: incomplete tags: just a left bracket, surrounded by stuff +input: "foo" +output: [Text(text="junk ")] + +--- + +name: incomplete_open_unnamed_attr +label: incomplete tags: an open tag, unnamed attribute +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_attr_equals +label: incomplete tags: an open tag, attribute, equal sign +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_attr +label: incomplete tags: an open tag, attribute with a key/value +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_attr_quoted +label: incomplete tags: an open tag, attribute with a key/value, quoted +input: "junk " +output: [Text(text="junk ")] + +--- + +name: incomplete_open_text +label: incomplete tags: an open tag, text +input: "junk foo" +output: [Text(text="junk foo")] + +--- + +name: incomplete_open_attr_text +label: incomplete tags: an open tag, attribute with a key/value, text +input: "junk bar" +output: [Text(text="junk bar")] + +--- + +name: incomplete_open_text_lbracket +label: incomplete tags: an open tag, text, left open bracket +input: "junk bar<" +output: 
[Text(text="junk bar<")] + +--- + +name: incomplete_open_text_lbracket_slash +label: incomplete tags: an open tag, text, left bracket, slash +input: "junk barbarbarbar" +output: [Text(text="junk bar")] From 58d9194a2c4620e948024bdb819bd1f484071227 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 21 Jun 2013 00:32:26 -0400 Subject: [PATCH 023/189] Version bump for v0.3; fix permissions on compat.py. --- CHANGELOG | 2 +- docs/changelog.rst | 4 ++-- docs/index.rst | 2 +- mwparserfromhell/__init__.py | 2 +- mwparserfromhell/compat.py | 0 5 files changed, 5 insertions(+), 5 deletions(-) mode change 100755 => 100644 mwparserfromhell/compat.py diff --git a/CHANGELOG b/CHANGELOG index 9772f8b..961d33d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -v0.1.1 (19da4d2144) to v0.2: +v0.1.1 (19da4d2144) to v0.2 (edf6a3a8a6): - The parser now fully supports Python 3 in addition to Python 2.7. - Added a C tokenizer extension that is significantly faster than its Python diff --git a/docs/changelog.rst b/docs/changelog.rst index 0e8bbef..0f7347a 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -4,7 +4,7 @@ Changelog v0.2 ---- -19da4d2144_ to master_ (released June 20, 2013) +19da4d2144_ to edf6a3a8a6_ (released June 20, 2013) - The parser now fully supports Python 3 in addition to Python 2.7. - Added a C tokenizer extension that is significantly faster than its Python @@ -53,6 +53,6 @@ v0.1 ba94938fe8_ (released August 23, 2012) -.. _master: https://github.com/earwig/mwparserfromhell/tree/v0.2 +.. _edf6a3a8a6: https://github.com/earwig/mwparserfromhell/tree/v0.2 .. _19da4d2144: https://github.com/earwig/mwparserfromhell/tree/v0.1.1 .. _ba94938fe8: https://github.com/earwig/mwparserfromhell/tree/v0.1 diff --git a/docs/index.rst b/docs/index.rst index 4355b61..f2e3345 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,4 +1,4 @@ -MWParserFromHell v0.2 Documentation +MWParserFromHell v0.3 Documentation =================================== :py:mod:`mwparserfromhell` (the *MediaWiki Parser from Hell*) is a Python diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 5db2d4c..738d4c2 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -31,7 +31,7 @@ from __future__ import unicode_literals __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2012, 2013 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.2" +__version__ = "0.3.dev" __email__ = "ben.kurtovic@verizon.net" from . import compat, nodes, parser, smart_list, string_mixin, utils, wikicode diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py old mode 100755 new mode 100644 From 7b6b46da953948165072832d1979e0377ddece4a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 22 Jun 2013 22:24:36 -0400 Subject: [PATCH 024/189] Some documentation cleanup. --- CHANGELOG | 12 ++++++++++-- README.rst | 4 +++- docs/changelog.rst | 20 ++++++++++++++------ docs/index.rst | 4 ++-- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 961d33d..cbe2933 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,8 @@ -v0.1.1 (19da4d2144) to v0.2 (edf6a3a8a6): +v0.3 (unreleased): + +- Various fixes and cleanup. + +v0.2 (released June 20, 2013): - The parser now fully supports Python 3 in addition to Python 2.7. - Added a C tokenizer extension that is significantly faster than its Python @@ -24,10 +28,14 @@ v0.1.1 (19da4d2144) to v0.2 (edf6a3a8a6): - Fixed some broken example code in the README; other copyedits. 
- Other bugfixes and code cleanup. -v0.1 (ba94938fe8) to v0.1.1 (19da4d2144): +v0.1.1 (released September 21, 2012): - Added support for Comments () and Wikilinks ([[foo]]). - Added corresponding ifilter_links() and filter_links() methods to Wikicode. - Fixed a bug when parsing incomplete templates. - Fixed strip_code() to affect the contents of headings. - Various copyedits in documentation and comments. + +v0.1 (released August 23, 2012): + +- Initial release. diff --git a/README.rst b/README.rst index 77c01eb..df4d732 100644 --- a/README.rst +++ b/README.rst @@ -9,7 +9,8 @@ mwparserfromhell that provides an easy-to-use and outrageously powerful parser for MediaWiki_ wikicode. It supports Python 2 and Python 3. -Developed by Earwig_ with help from `Σ`_. +Developed by Earwig_ with help from `Σ`_. Full documentation is available on +ReadTheDocs_. Installation ------------ @@ -142,6 +143,7 @@ following code (via the API_):: return mwparserfromhell.parse(text) .. _MediaWiki: http://mediawiki.org +.. _ReadTheDocs: http://mwparserfromhell.readthedocs.org .. _Earwig: http://en.wikipedia.org/wiki/User:The_Earwig .. _Σ: http://en.wikipedia.org/wiki/User:%CE%A3 .. _Python Package Index: http://pypi.python.org diff --git a/docs/changelog.rst b/docs/changelog.rst index 0f7347a..4bf86b7 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,10 +1,19 @@ Changelog ========= +v0.3 +---- + +Unreleased +(`changes `__): + +- Various fixes and cleanup. + v0.2 ---- -19da4d2144_ to edf6a3a8a6_ (released June 20, 2013) +`Released June 20, 2013 `_ +(`changes `__): - The parser now fully supports Python 3 in addition to Python 2.7. - Added a C tokenizer extension that is significantly faster than its Python @@ -38,7 +47,8 @@ v0.2 v0.1.1 ------ -ba94938fe8_ to 19da4d2144_ (released September 21, 2012) +`Released September 21, 2012 `_ +(`changes `__): - Added support for :py:class:`Comments <.Comment>` (````) and :py:class:`Wikilinks <.Wikilink>` (``[[foo]]``). @@ -51,8 +61,6 @@ ba94938fe8_ to 19da4d2144_ (released September 21, 2012) v0.1 ---- -ba94938fe8_ (released August 23, 2012) +`Released August 23, 2012 `_: -.. _edf6a3a8a6: https://github.com/earwig/mwparserfromhell/tree/v0.2 -.. _19da4d2144: https://github.com/earwig/mwparserfromhell/tree/v0.1.1 -.. _ba94938fe8: https://github.com/earwig/mwparserfromhell/tree/v0.1 +- Initial release. diff --git a/docs/index.rst b/docs/index.rst index f2e3345..0603daf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,5 @@ -MWParserFromHell v0.3 Documentation -=================================== +MWParserFromHell v\ |version| Documentation +=========================================== :py:mod:`mwparserfromhell` (the *MediaWiki Parser from Hell*) is a Python package that provides an easy-to-use and outrageously powerful parser for From 2596e697aebc04e25a80e60c0abd7bcd5384be0f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 26 Jun 2013 16:40:19 -0400 Subject: [PATCH 025/189] Fix a possible compiler warning on some build systems. 
--- mwparserfromhell/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index df65d0e..86f2884 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1021,7 +1021,7 @@ Tokenizer_really_parse_entity(Tokenizer* self) break; j++; } - text[i] = this; + text[i] = (char) this; self->head++; i++; } From 6450814729c4725760386ae9e8a24a30c46b7033 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 28 Jun 2013 23:34:24 -0400 Subject: [PATCH 026/189] Remove 'type' attribute from tags; rework tag definitions. --- mwparserfromhell/nodes/tag.py | 30 ++------- mwparserfromhell/parser/builder.py | 8 +-- mwparserfromhell/parser/tokenizer.py | 21 ++---- mwparserfromhell/tag_defs.py | 123 ++++++++++------------------------- mwparserfromhell/utils.py | 2 + tests/test_builder.py | 9 ++- tests/tokenizer/tags.mwtest | 28 ++++---- 7 files changed, 72 insertions(+), 149 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index d301d85..cd5d0a2 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -24,18 +24,17 @@ from __future__ import unicode_literals from . import Node, Text from ..compat import str -from ..tag_defs import TagDefinitions +from ..tag_defs import get_wikicode, is_visible from ..utils import parse_anything __all__ = ["Tag"] -class Tag(TagDefinitions, Node): +class Tag(Node): """Represents an HTML-style tag in wikicode, like ````.""" - def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, + def __init__(self, tag, contents=None, attrs=None, showtag=True, self_closing=False, padding="", closing_tag=None): super(Tag, self).__init__() - self._type = type_ self._tag = tag self._contents = contents if attrs: @@ -52,7 +51,7 @@ class Tag(TagDefinitions, Node): def __unicode__(self): if not self.showtag: - open_, close = self.WIKICODE[self.type] + open_, close = get_wikicode[self.tag] if self.self_closing: return open_ else: @@ -84,7 +83,7 @@ class Tag(TagDefinitions, Node): yield self.contents, child def __strip__(self, normalize, collapse): - if self.type in self.TAGS_VISIBLE: + if is_visible(self.tag): return self.contents.strip_code(normalize, collapse) return None @@ -113,11 +112,6 @@ class Tag(TagDefinitions, Node): write(">") @property - def type(self): - """The tag type.""" - return self._type - - @property def tag(self): """The tag itself, as a :py:class:`~.Wikicode` object.""" return self._tag @@ -159,23 +153,9 @@ class Tag(TagDefinitions, Node): """ return self._closing_tag - @type.setter - def type(self, value): - value = int(value) - if value not in self.TAGS_ALL: - raise ValueError(value) - self._type = value - for key in self.TRANSLATIONS: - if self.TRANSLATIONS[key] == value: - self._tag = self._closing_tag = parse_anything(key) - @tag.setter def tag(self, value): self._tag = self._closing_tag = parse_anything(value) - try: - self._type = self.TRANSLATIONS[text] - except KeyError: - self._type = self.TAG_UNKNOWN @contents.setter def contents(self, value): diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 5ec0780..53abe91 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -202,7 +202,7 @@ class Builder(object): def _handle_tag(self, token): """Handle a case where a tag is at the head of the tokens.""" - type_, showtag = token.type, token.showtag + showtag = token.showtag attrs = [] 
self._push() while self._tokens: @@ -215,14 +215,14 @@ class Builder(object): self._push() elif isinstance(token, tokens.TagCloseSelfclose): tag = self._pop() - return Tag(type_, tag, attrs=attrs, showtag=showtag, + return Tag(tag, attrs=attrs, showtag=showtag, self_closing=True, padding=token.padding) elif isinstance(token, tokens.TagOpenClose): contents = self._pop() self._push() elif isinstance(token, tokens.TagCloseClose): - return Tag(type_, tag, contents, attrs, showtag, False, - padding, self._pop()) + return Tag(tag, contents, attrs, showtag, False, padding, + self._pop()) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 67a652a..e7fdb0e 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -27,7 +27,7 @@ import re from . import contexts from . import tokens from ..compat import htmlentities -from ..nodes.tag import Tag +from ..tag_defs import is_parsable __all__ = ["Tokenizer"] @@ -416,8 +416,8 @@ class Tokenizer(object): else: self._write_all(tokens) - def _get_tag_type_from_stack(self, stack=None): - """Return the tag type based on the text in *stack*. + def _get_tag_from_stack(self, stack=None): + """Return the tag based on the text in *stack*. If *stack* is ``None``, we will use the current, topmost one. """ @@ -427,11 +427,7 @@ class Tokenizer(object): if not stack: self._fail_route() # Tag has an empty name? text = [tok for tok in stack if isinstance(tok, tokens.Text)] - text = "".join([token.text for token in text]).rstrip().lower() - try: - return Tag.TRANSLATIONS[text] - except KeyError: - return Tag.TAG_UNKNOWN + return "".join([token.text for token in text]).rstrip().lower() def _actually_close_tag_opening(self): """Handle cleanup at the end of a opening tag. @@ -447,8 +443,7 @@ class Tokenizer(object): if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY else: - tag = self._get_tag_type_from_stack() - self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._write_first(tokens.TagOpenOpen(showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY @@ -509,8 +504,7 @@ class Tokenizer(object): is_quoted = False if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) - tag = self._get_tag_type_from_stack() - self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._write_first(tokens.TagOpenOpen(showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) @@ -584,8 +578,7 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" closing = self._pop() - tag = self._get_tag_type_from_stack(closing) - if tag != self._stack[0].type: + if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): # Closing and opening tags are not the same, so fail this route: self._fail_route() self._write_all(closing) diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py index b2ee90d..369692b 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/tag_defs.py @@ -20,99 +20,48 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +"""Contains data regarding certain HTML tags.""" -class TagDefinitions(object): - """Contains numerical definitions for valid HTML (and wikicode) tags. 
+from __future__ import unicode_literals - Base class for :py:class:`~.Tag` objects. - """ +__all__ = ["get_wikicode", "is_parsable", "is_visible"] - TAG_UNKNOWN = 0 +PARSER_BLACKLIST = [ + # enwiki extensions @ 2013-06-28 + "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math", + "nowiki", "pre", "score", "section", "source", "syntaxhighlight", + "templatedata", "timeline" +] - # Basic HTML: - TAG_ITALIC = 1 - TAG_BOLD = 2 - TAG_UNDERLINE = 3 - TAG_STRIKETHROUGH = 4 - TAG_UNORDERED_LIST = 5 - TAG_ORDERED_LIST = 6 - TAG_DEF_TERM = 7 - TAG_DEF_ITEM = 8 - TAG_BLOCKQUOTE = 9 - TAG_RULE = 10 - TAG_BREAK = 11 - TAG_ABBR = 12 - TAG_PRE = 13 - TAG_MONOSPACE = 14 - TAG_CODE = 15 - TAG_SPAN = 16 - TAG_DIV = 17 - TAG_FONT = 18 - TAG_SMALL = 19 - TAG_BIG = 20 - TAG_CENTER = 21 +INVISIBLE_TAGS = [ + # enwiki extensions @ 2013-06-28 + "categorytree", "gallery", "imagemap", "inputbox", "math", "score", + "section", "templatedata", "timeline" +] - # MediaWiki parser hooks: - TAG_REF = 101 - TAG_GALLERY = 102 - TAG_MATH = 103 - TAG_NOWIKI = 104 - TAG_NOINCLUDE = 105 - TAG_INCLUDEONLY = 106 - TAG_ONLYINCLUDE = 107 +# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 +SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] +SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] - # Additional parser hooks: - TAG_SYNTAXHIGHLIGHT = 201 - TAG_POEM = 202 +WIKICODE = { + "i": {"open": "''", "close": "''"}, + "b": {"open": "'''", "close": "'''"}, + "ul": {"open": "*"}, + "ol": {"open": "#"}, + "dt": {"open": ";"}, + "dd": {"open": ":"}, + "hr": {"open": "----"}, +} - # Lists of tags: - TAGS_ALL = set(range(300)) - TAGS_INVISIBLE = {TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE} - TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE +def get_wikicode(tag): + """Return the appropriate wikicode before and after the given *tag*.""" + data = WIKICODE[tag.lower()] + return (data.get("open"), data.get("close")) - TRANSLATIONS = { - "i": TAG_ITALIC, - "em": TAG_ITALIC, - "b": TAG_BOLD, - "strong": TAG_BOLD, - "u": TAG_UNDERLINE, - "s": TAG_STRIKETHROUGH, - "ul": TAG_UNORDERED_LIST, - "ol": TAG_ORDERED_LIST, - "dt": TAG_DEF_TERM, - "dd": TAG_DEF_ITEM, - "blockquote": TAG_BLOCKQUOTE, - "hl": TAG_RULE, - "br": TAG_BREAK, - "abbr": TAG_ABBR, - "pre": TAG_PRE, - "tt": TAG_MONOSPACE, - "code": TAG_CODE, - "span": TAG_SPAN, - "div": TAG_DIV, - "font": TAG_FONT, - "small": TAG_SMALL, - "big": TAG_BIG, - "center": TAG_CENTER, - "ref": TAG_REF, - "gallery": TAG_GALLERY, - "math": TAG_MATH, - "nowiki": TAG_NOWIKI, - "noinclude": TAG_NOINCLUDE, - "includeonly": TAG_INCLUDEONLY, - "onlyinclude": TAG_ONLYINCLUDE, - "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, - "source": TAG_SYNTAXHIGHLIGHT, - "poem": TAG_POEM, - } +def is_parsable(tag): + """Return if the given *tag*'s contents should be passed to the parser.""" + return tag.lower() not in PARSER_BLACKLIST - WIKICODE = { - TAG_ITALIC: ("''", "''"), - TAG_BOLD: ("'''", "'''"), - TAG_UNORDERED_LIST: ("*", ""), - TAG_ORDERED_LIST: ("#", ""), - TAG_DEF_TERM: (";", ""), - TAG_DEF_ITEM: (":", ""), - TAG_RULE: ("----", ""), - } +def is_visible(tag): + """Return whether or not the given *tag* contains visible text.""" + return tag.lower() not in INVISIBLE_TAGS diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index b797419..31e5ba0 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -31,6 +31,8 @@ from .compat import bytes, str from .nodes import Node from .smart_list import SmartList +__all__ = ["parse_anything"] + def parse_anything(value): """Return a 
:py:class:`~.Wikicode` for *value*, allowing multiple types. diff --git a/tests/test_builder.py b/tests/test_builder.py index 85a8c60..0c635ce 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -193,11 +193,10 @@ class TestBuilder(TreeEqualityTestCase): def test_tag(self): """tests for building Tag nodes""" tests = [ - ([tokens.TagOpenOpen(showtag=True, type=101), - tokens.Text(text="ref"), tokens.TagCloseOpen(padding=""), - tokens.TagOpenClose(), tokens.Text(text="ref"), - tokens.TagCloseClose()], - wrap([Tag(101, wraptext("ref"), wrap([]), [], True, False, "", + ([tokens.TagOpenOpen(showtag=True), tokens.Text(text="ref"), + tokens.TagCloseOpen(padding=""), tokens.TagOpenClose(), + tokens.Text(text="ref"), tokens.TagCloseClose()], + wrap([Tag(wraptext("ref"), wrap([]), [], True, False, "", wraptext("ref"))])), ] for test, valid in tests: diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 5af2074..a76d6b6 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -1,98 +1,98 @@ name: basic label: a basic tag with an open and close input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: basic_selfclosing label: a basic self-closing tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding="")] --- name: content label: a tag with some content in the middle input: "this is a reference" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: padded_open label: a tag with some padding in the open tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: padded_close label: a tag with some padding in the close tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] --- name: padded_selfclosing label: a self-closing tag with padding input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding=" ")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding=" ")] --- name: attribute label: a tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_value label: a tag with a single attribute with a 
value input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted label: a tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_hyphen label: a tag with a single attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted_hyphen label: a tag with a single quoted attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_selfclosing label: a self-closing tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value label: a self-closing tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value_quoted label: a self-closing tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] --- From ce27d5d385a4adc14e136b33471216038dfc70a1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 29 Jun 2013 00:33:41 -0400 
Subject: [PATCH 027/189] Fix six failing tests; add three more (all passing). --- mwparserfromhell/parser/tokenizer.py | 33 ++++++++++++++++++--------------- tests/tokenizer/tags.mwtest | 21 +++++++++++++++++++++ 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e7fdb0e..93e9a8d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,6 +21,7 @@ # SOFTWARE. from __future__ import unicode_literals +from itertools import takewhile from math import log import re @@ -416,19 +417,6 @@ class Tokenizer(object): else: self._write_all(tokens) - def _get_tag_from_stack(self, stack=None): - """Return the tag based on the text in *stack*. - - If *stack* is ``None``, we will use the current, topmost one. - """ - if stack is None: - stack = self._stack - self._push_textbuffer() - if not stack: - self._fail_route() # Tag has an empty name? - text = [tok for tok in stack if isinstance(tok, tokens.Text)] - return "".join([token.text for token in text]).rstrip().lower() - def _actually_close_tag_opening(self): """Handle cleanup at the end of a opening tag. @@ -557,14 +545,27 @@ class Tokenizer(object): while chunks: self._actually_handle_chunk(chunks, True) + def _get_tag_from_stack(self, stack=None): + """Return the tag based on the text in *stack*.""" + if not stack: + sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen) + func = lambda tok: not isinstance(tok, sentinels) + stack = takewhile(func, self._stack) + text = [tok.text for tok in stack if isinstance(tok, tokens.Text)] + return "".join(text).rstrip().lower() + def _handle_tag_close_open(self): """Handle the ending of an open tag (````).""" padding = self._actually_close_tag_opening() + if not self._get_tag_from_stack(): # Tags cannot be blank + self._fail_route() self._write(tokens.TagCloseOpen(padding=padding)) def _handle_tag_selfclose(self): """Handle the ending of an tag that closes itself (````).""" padding = self._actually_close_tag_opening() + if not self._get_tag_from_stack(): # Tags cannot be blank + self._fail_route() self._write(tokens.TagCloseSelfclose(padding=padding)) self._head += 1 return self._pop() @@ -578,8 +579,10 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" closing = self._pop() - if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): - # Closing and opening tags are not the same, so fail this route: + close_tag = self._get_tag_from_stack(closing) + open_tag = self._get_tag_from_stack() + if not close_tag or close_tag != open_tag: + # Closing and opening tags are empty or unequal, so fail this tag: self._fail_route() self._write_all(closing) self._write(tokens.TagCloseClose()) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index a76d6b6..849a4fd 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -233,3 +233,24 @@ name: incomplete_open_text_wrong_close label: incomplete tags: an open tag, text, wrong close input: "junk bar" output: [Text(text="junk bar")] + +--- + +name: incomplete_no_tag_name_open +label: incomplete tags: no tag name within brackets; just an open +input: "junk <>" +output: [Text(text="junk <>")] + +--- + +name: incomplete_no_tag_name_selfclosing +label: incomplete tags: no tag name within brackets; self-closing +input: "junk < />" +output: [Text(text="junk < />")] + +--- + +name: incomplete_no_tag_name_open_close +label: incomplete tags: no 
tag name within brackets; open and close +input: "junk <>" +output: [Text(text="junk <>")] From c241bff9f50896d83294ed12c72b8d59dc932b2b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 29 Jun 2013 00:37:29 -0400 Subject: [PATCH 028/189] Remove .type check from assertTagNodeEqual() --- tests/_test_tree_equality.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/_test_tree_equality.py b/tests/_test_tree_equality.py index 2828147..6976a13 100644 --- a/tests/_test_tree_equality.py +++ b/tests/_test_tree_equality.py @@ -91,7 +91,6 @@ class TreeEqualityTestCase(TestCase): def assertTagNodeEqual(self, expected, actual): """Assert that two Tag nodes have the same data.""" - self.assertEqual(expected.type, actual.type) self.assertWikicodeEqual(expected.tag, actual.tag) if expected.contents is not None: self.assertWikicodeEqual(expected.contents, actual.contents) From 81e8fdd6829c12468f0f12c71d707c452eb9e2bb Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 20:57:54 -0400 Subject: [PATCH 029/189] Give Attributes more attributes for padding data. --- mwparserfromhell/nodes/extras/attribute.py | 41 ++++++++++++++++++++++-------- mwparserfromhell/nodes/tag.py | 2 +- mwparserfromhell/parser/builder.py | 13 ++++++---- tests/tokenizer/tags.mwtest | 16 ++++++------ 4 files changed, 48 insertions(+), 24 deletions(-) diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 33ad851..5888dba 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -36,19 +36,22 @@ class Attribute(StringMixIn): whose value is ``"foo"``. """ - def __init__(self, name, value=None, quoted=True, padding=""): + def __init__(self, name, value=None, quoted=True, pad_first="", + pad_before_eq="", pad_after_eq=""): super(Attribute, self).__init__() self._name = name self._value = value self._quoted = quoted - self._padding = padding + self._pad_first = pad_first + self._pad_before_eq = pad_before_eq + self._pad_after_eq = pad_after_eq def __unicode__(self): - base = self.padding + str(self.name) + base = self.pad_first + str(self.name) + self.pad_before_eq if self.value: if self.quoted: - return base + '="' + str(self.value) + '"' - return base + "=" + str(self.value) + return base + '="' + self.pad_after_eq + str(self.value) + '"' + return base + "=" + self.pad_after_eq + str(self.value) return base @property @@ -67,9 +70,19 @@ class Attribute(StringMixIn): return self._quoted @property - def padding(self): + def pad_first(self): """Spacing to insert right before the attribute.""" - return self._padding + return self._pad_first + + @property + def pad_before_eq(self): + """Spacing to insert right before the equal sign.""" + return self._pad_before_eq + + @property + def pad_after_eq(self): + """Spacing to insert right after the equal sign.""" + return self._pad_after_eq @name.setter def name(self, value): @@ -83,6 +96,14 @@ class Attribute(StringMixIn): def quoted(self, value): self._quoted = bool(value) - @padding.setter - def padding(self, value): - self._padding = str(value) + @pad_first.setter + def pad_first(self, value): + self._pad_first = str(value) + + @pad_before_eq.setter + def pad_before_eq(self, value): + self._pad_before_eq = str(value) + + @pad_after_eq.setter + def pad_after_eq(self, value): + self._pad_after_eq = str(value) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index cd5d0a2..76b412c 100644 --- a/mwparserfromhell/nodes/tag.py +++ 
b/mwparserfromhell/nodes/tag.py @@ -59,7 +59,7 @@ class Tag(Node): result = "<" + str(self.tag) if self.attributes: - result += " " + " ".join([str(attr) for attr in self.attributes]) + result += "".join([str(attr) for attr in self.attributes]) if self.self_closing: result += self.padding + "/>" else: diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 53abe91..d92b845 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -180,9 +180,9 @@ class Builder(object): else: self._write(self._handle_token(token)) - def _handle_attribute(self, token): + def _handle_attribute(self, start): """Handle a case where a tag attribute is at the head of the tokens.""" - name, quoted, padding = None, False, token.padding + name, quoted = None, False self._push() while self._tokens: token = self._tokens.pop() @@ -194,9 +194,12 @@ class Builder(object): elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen, tokens.TagCloseSelfclose)): self._tokens.append(token) - if name is not None: - return Attribute(name, self._pop(), quoted, padding) - return Attribute(self._pop(), quoted=quoted, padding=padding) + if name: + value = self._pop() + else: + name, value = self._pop(), None + return Attribute(name, value, quoted, start.pad_first, + start.pad_before_eq, start.pad_after_eq) else: self._write(self._handle_token(token)) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 849a4fd..1dfc1b1 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -43,56 +43,56 @@ output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding= name: attribute label: a tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_value label: a tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted label: a tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_hyphen label: a tag with a single attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), 
TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted_hyphen label: a tag with a single quoted attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_selfclosing label: a self-closing tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value label: a self-closing tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value_quoted label: a self-closing tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] --- From 5f5a081d9148c584511bffb3d6d3b8f63ea24d43 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 21:02:11 -0400 Subject: [PATCH 030/189] Rewrite tag parser to be cleaner and safer. All tag tests passing. Still need to finish backslash support and support for templates and tags within tags. 
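The rewritten open-tag parser keeps its state in a _TagOpenData object whose CX_* constants are small bit flags, so several facts about the attribute currently being read can be tracked at once. A rough sketch of how those flags compose, using values that mirror the constants in the diff below (illustrative only, not part of the patch):

    CX_NAME        = 1 << 0   # reading the tag name
    CX_ATTR_VALUE  = 1 << 3   # reading an attribute value
    CX_NEED_QUOTE  = 1 << 6   # a quote may start the value

    context = CX_NAME                        # a new tag starts on its name
    context = CX_ATTR_VALUE | CX_NEED_QUOTE  # states combine with |
    if context & CX_ATTR_VALUE:              # ...are tested with &
        context ^= CX_NEED_QUOTE             # ...and are switched off with ^
    assert context == CX_ATTR_VALUE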
--- mwparserfromhell/parser/contexts.py | 87 ++++----- mwparserfromhell/parser/tokenizer.py | 339 ++++++++++++++++------------------- 2 files changed, 194 insertions(+), 232 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 3c9c798..9e5e568 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -65,15 +65,7 @@ Local (stack-specific) contexts: * :py:const:`TAG` * :py:const:`TAG_OPEN` - - * :py:const:`TAG_OPEN_NAME` - * :py:const:`TAG_OPEN_ATTR` - - * :py:const:`TAG_OPEN_ATTR_NAME` - * :py:const:`TAG_OPEN_ATTR_BODY` - * :py:const:`TAG_OPEN_ATTR_QUOTED` - * :py:const:`TAG_OPEN_ATTR_IGNORE` - + * :py:const:`TAG_ATTR` * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` @@ -93,47 +85,42 @@ Global contexts: # Local contexts: -TEMPLATE = 0b000000000000000000000000111 -TEMPLATE_NAME = 0b000000000000000000000000001 -TEMPLATE_PARAM_KEY = 0b000000000000000000000000010 -TEMPLATE_PARAM_VALUE = 0b000000000000000000000000100 - -ARGUMENT = 0b000000000000000000000011000 -ARGUMENT_NAME = 0b000000000000000000000001000 -ARGUMENT_DEFAULT = 0b000000000000000000000010000 - -WIKILINK = 0b000000000000000000001100000 -WIKILINK_TITLE = 0b000000000000000000000100000 -WIKILINK_TEXT = 0b000000000000000000001000000 - -HEADING = 0b000000000000001111110000000 -HEADING_LEVEL_1 = 0b000000000000000000010000000 -HEADING_LEVEL_2 = 0b000000000000000000100000000 -HEADING_LEVEL_3 = 0b000000000000000001000000000 -HEADING_LEVEL_4 = 0b000000000000000010000000000 -HEADING_LEVEL_5 = 0b000000000000000100000000000 -HEADING_LEVEL_6 = 0b000000000000001000000000000 - -COMMENT = 0b000000000000010000000000000 - -TAG = 0b000000111111100000000000000 -TAG_OPEN = 0b000000001111100000000000000 -TAG_OPEN_NAME = 0b000000000000100000000000000 -TAG_OPEN_ATTR = 0b000000001111000000000000000 -TAG_OPEN_ATTR_NAME = 0b000000000001000000000000000 -TAG_OPEN_ATTR_BODY = 0b000000000010000000000000000 -TAG_OPEN_ATTR_QUOTED = 0b000000000100000000000000000 -TAG_OPEN_ATTR_IGNORE = 0b000000001000000000000000000 -TAG_BODY = 0b000000010000000000000000000 -TAG_CLOSE = 0b000000100000000000000000000 - -SAFETY_CHECK = 0b111111000000000000000000000 -HAS_TEXT = 0b000001000000000000000000000 -FAIL_ON_TEXT = 0b000010000000000000000000000 -FAIL_NEXT = 0b000100000000000000000000000 -FAIL_ON_LBRACE = 0b001000000000000000000000000 -FAIL_ON_RBRACE = 0b010000000000000000000000000 -FAIL_ON_EQUALS = 0b100000000000000000000000000 +TEMPLATE = 0b000000000000000000000111 +TEMPLATE_NAME = 0b000000000000000000000001 +TEMPLATE_PARAM_KEY = 0b000000000000000000000010 +TEMPLATE_PARAM_VALUE = 0b000000000000000000000100 + +ARGUMENT = 0b000000000000000000011000 +ARGUMENT_NAME = 0b000000000000000000001000 +ARGUMENT_DEFAULT = 0b000000000000000000010000 + +WIKILINK = 0b000000000000000001100000 +WIKILINK_TITLE = 0b000000000000000000100000 +WIKILINK_TEXT = 0b000000000000000001000000 + +HEADING = 0b000000000001111110000000 +HEADING_LEVEL_1 = 0b000000000000000010000000 +HEADING_LEVEL_2 = 0b000000000000000100000000 +HEADING_LEVEL_3 = 0b000000000000001000000000 +HEADING_LEVEL_4 = 0b000000000000010000000000 +HEADING_LEVEL_5 = 0b000000000000100000000000 +HEADING_LEVEL_6 = 0b000000000001000000000000 + +COMMENT = 0b000000000010000000000000 + +TAG = 0b000000111100000000000000 +TAG_OPEN = 0b000000000100000000000000 +TAG_ATTR = 0b000000001000000000000000 +TAG_BODY = 0b000000010000000000000000 +TAG_CLOSE = 0b000000100000000000000000 + +SAFETY_CHECK = 0b111111000000000000000000 +HAS_TEXT = 0b000001000000000000000000 
+FAIL_ON_TEXT = 0b000010000000000000000000 +FAIL_NEXT = 0b000100000000000000000000 +FAIL_ON_LBRACE = 0b001000000000000000000000 +FAIL_ON_RBRACE = 0b010000000000000000000000 +FAIL_ON_EQUALS = 0b100000000000000000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 93e9a8d..a7b9e16 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -37,6 +37,26 @@ class BadRoute(Exception): pass +class _TagOpenData(object): + """Stores data about an HTML open tag, like ````.""" + CX_NAME = 1 << 0 + CX_ATTR_READY = 1 << 1 + CX_ATTR_NAME = 1 << 2 + CX_ATTR_VALUE = 1 << 3 + CX_NEED_SPACE = 1 << 4 + CX_NEED_EQUALS = 1 << 5 + CX_NEED_QUOTE = 1 << 6 + CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE + + def __init__(self): + self.context = self.CX_NAME + self.literal = True + self.padding_buffer = [] + self.quote_buffer = [] + self.reset = 0 + self.ignore_quote = False + + class Tokenizer(object): """Creates a list of tokens from a string of wikicode.""" USES_C = False @@ -47,6 +67,7 @@ class Tokenizer(object): MAX_DEPTH = 40 MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE) + tag_splitter = re.compile(r"([\s\"\\])") def __init__(self): self._text = None @@ -410,165 +431,145 @@ class Tokenizer(object): reset = self._head self._head += 1 try: - tokens = self._parse(contexts.TAG_OPEN_NAME) + tokens = self._really_parse_tag() except BadRoute: self._head = reset self._write_text("<") else: self._write_all(tokens) - def _actually_close_tag_opening(self): - """Handle cleanup at the end of a opening tag. - - The current context will be updated and the - :py:class:`~.tokens.TagOpenOpen` token will be written. Returns the - opening tag's padding to be used in the - :py:class:`~.tokens.TagOpenClose` token. - """ - if self._context & contexts.TAG_OPEN_ATTR: - if self._context & contexts.TAG_OPEN_ATTR_NAME: - self._context ^= contexts.TAG_OPEN_ATTR_NAME - if self._context & contexts.TAG_OPEN_ATTR_BODY: - self._context ^= contexts.TAG_OPEN_ATTR_BODY - else: - self._write_first(tokens.TagOpenOpen(showtag=True)) - self._context ^= contexts.TAG_OPEN_NAME - self._context |= contexts.TAG_BODY - - self._push_textbuffer() - if isinstance(self._stack[-1], tokens.TagAttrStart): - return self._stack.pop().padding - return "" - - def _actually_handle_chunk(self, chunks, is_new): - """Actually handle a chunk of code within a tag's attributes. 
+ def _really_parse_tag(self): + """Actually parse an HTML tag, starting with the open (````).""" + data = _TagOpenData() + self._push(contexts.TAG_OPEN) + self._write(tokens.TagOpenOpen(showtag=True)) + while True: + this, next = self._read(), self._read(1) + if this not in self.MARKERS: + for chunk in self.tag_splitter.split(this): + if self._handle_tag_chunk(data, chunk): + continue + elif this is self.END: + if self._context & contexts.TAG_ATTR: + self._pop() + self._fail_route() + elif this == ">" and data.literal: + if data.context & data.CX_ATTR: + self._push_tag_buffer(data) + padding = data.padding_buffer[0] if data.padding_buffer else "" + self._write(tokens.TagCloseOpen(padding=padding)) + self._context = contexts.TAG_BODY + self._head += 1 + return self._parse(push=False) + elif this == "/" and next == ">" and data.literal: + if data.context & data.CX_ATTR: + self._push_tag_buffer(data) + padding = data.padding_buffer[0] if data.padding_buffer else "" + self._write(tokens.TagCloseSelfclose(padding=padding)) + self._head += 1 + return self._pop() + else: + for chunk in self.tag_splitter.split(this): + if self._handle_tag_chunk(data, chunk): + continue + self._head += 1 - Called by :py:meth:`_handle_tag_chunk` and - :py:meth:`_handle_tag_attribute_body`. - """ - if is_new and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: - padding = 0 - while chunks: - if chunks[0] == "": - padding += 1 - chunks.pop(0) - else: - break - self._write(tokens.TagAttrStart(padding=" " * padding)) - elif self._context & contexts.TAG_OPEN_ATTR_IGNORE: - self._context ^= contexts.TAG_OPEN_ATTR_IGNORE - chunks.pop(0) + def _handle_tag_chunk(self, data, chunk): + if not chunk: return - elif is_new and self._context & contexts.TAG_OPEN_ATTR_QUOTED: - self._write_text(" ") # Quoted chunks don't lose their spaces - - if chunks: - chunk = chunks.pop(0) - if self._context & contexts.TAG_OPEN_ATTR_BODY: - self._context ^= contexts.TAG_OPEN_ATTR_BODY - self._context |= contexts.TAG_OPEN_ATTR_NAME - if self._context & contexts.TAG_OPEN_ATTR_QUOTED: - if re.search(r'[^\\]"', chunk[:-1]): - self._fail_route() - if re.search(r'[^\\]"$', chunk): - self._write_text(chunk[:-1]) - self._context ^= contexts.TAG_OPEN_ATTR_QUOTED - self._context |= contexts.TAG_OPEN_ATTR_NAME - return True # Back to _handle_tag_attribute_body() + if data.context & data.CX_NAME: + if chunk != chunk.lstrip(): # Tags cannot start with whitespace + self._fail_route() self._write_text(chunk) - - def _handle_tag_chunk(self, text): - """Handle a chunk of code within a tag's attributes. - - This is called by :py:meth:`_parse`, which intercepts parsing of - wikicode when we're inside of an opening tag and no :py:attr:`MARKERS` - are present. - """ - if " " not in text and not self._context & contexts.TAG_OPEN_ATTR_QUOTED: - self._write_text(text) - return - chunks = text.split(" ") - is_new = False - is_quoted = False - if self._context & contexts.TAG_OPEN_NAME: - self._write_text(chunks.pop(0)) - self._write_first(tokens.TagOpenOpen(showtag=True)) - self._context ^= contexts.TAG_OPEN_NAME - self._context |= contexts.TAG_OPEN_ATTR_NAME - self._actually_handle_chunk(chunks, True) - is_new = True - while chunks: - result = self._actually_handle_chunk(chunks, is_new) - is_quoted = result or is_quoted - is_new = True - if is_quoted: - return self._pop() - - def _handle_tag_attribute_body(self): - """Handle the body, or value, of a tag attribute. 
- - Attribute bodies can usually be handled at once, but sometimes a new - stack must be created to keep track of "rich" attribute values that - contain, for example, templates. - """ - self._context ^= contexts.TAG_OPEN_ATTR_NAME - self._context |= contexts.TAG_OPEN_ATTR_BODY - self._write(tokens.TagAttrEquals()) - next = self._read(1) - if next not in self.MARKERS and next.startswith('"'): - chunks = None - if " " in next: - chunks = next.split(" ") - next = chunks.pop(0) - if re.search(r'[^\\]"$', next[1:]): - if not re.search(r'[^\\]"', next[1:-1]): - self._write(tokens.TagAttrQuote()) - self._write_text(next[1:-1]) - self._head += 1 + data.context = data.CX_NEED_SPACE + elif data.context & data.CX_NEED_SPACE: + if chunk.isspace(): + if data.context & data.CX_ATTR_VALUE: + self._push_tag_buffer(data) + data.padding_buffer.append(chunk) + data.context = data.CX_ATTR_READY else: - if not re.search(r'[^\\]"', next[1:]): - self._head += 1 - reset = self._head - try: - attr = self._parse(contexts.TAG_OPEN_ATTR_QUOTED | - contexts.TAG_OPEN_ATTR_IGNORE) - except BadRoute: - self._head = reset - self._write_text(next) - else: - self._write(tokens.TagAttrQuote()) - self._write_text(next[1:]) - self._write_all(attr) - return - self._context ^= contexts.TAG_OPEN_ATTR_BODY - self._context |= contexts.TAG_OPEN_ATTR_NAME - while chunks: - self._actually_handle_chunk(chunks, True) + if data.context & data.CX_ATTR_VALUE: + data.context ^= data.CX_NEED_SPACE + data.quote_buffer = [] + data.ignore_quote = True + self._head = data.reset + return True # Break out of chunk processing early + else: + self._fail_route() + elif data.context & data.CX_ATTR_READY: + if chunk.isspace(): + data.padding_buffer.append(chunk) + else: + data.context = data.CX_ATTR_NAME + self._push(contexts.TAG_ATTR) + self._write_text(chunk) ### hook on here for {, <, etc + elif data.context & data.CX_ATTR_NAME: + if chunk.isspace(): + data.padding_buffer.append(chunk) + data.context |= data.CX_NEED_EQUALS + elif chunk == "=": + if not data.context & data.CX_NEED_EQUALS: + data.padding_buffer.append("") # No padding before equals + data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE + self._write(tokens.TagAttrEquals()) + else: + if data.context & data.CX_NEED_EQUALS: + self._push_tag_buffer(data) + data.padding_buffer.append("") # No padding before tag + data.context = data.CX_ATTR_NAME + self._push(contexts.TAG_ATTR) + self._write_text(chunk) ### hook on here for {, <, etc + elif data.context & data.CX_ATTR_VALUE: + ### handle backslashes here + if data.context & data.CX_NEED_QUOTE: + if chunk == '"' and not data.ignore_quote: + data.context ^= data.CX_NEED_QUOTE + data.literal = False + data.reset = self._head + elif chunk.isspace(): + data.padding_buffer.append(chunk) + else: + data.context ^= data.CX_NEED_QUOTE + self._write_text(chunk) ### hook on here for {, <, etc + elif not data.literal: + if chunk == '"': + data.context |= data.CX_NEED_SPACE + data.literal = True + else: + data.quote_buffer.append(chunk) + elif chunk.isspace(): + self._push_tag_buffer(data) + data.padding_buffer.append(chunk) + data.context = data.CX_ATTR_READY + else: + self._write_text(chunk) ### hook on here for {, <, etc + + def _push_tag_buffer(self, data): + buf = data.padding_buffer + while len(buf) < 3: + buf.append("") + self._write_first(tokens.TagAttrStart( + pad_after_eq=buf.pop(), pad_before_eq=buf.pop(), + pad_first=buf.pop())) + if data.quote_buffer: + self._write(tokens.TagAttrQuote()) + self._write_text("".join(data.quote_buffer)) + 
self._write_all(self._pop()) + data.padding_buffer, data.quote_buffer = [], [] + data.ignore_quote = False def _get_tag_from_stack(self, stack=None): """Return the tag based on the text in *stack*.""" if not stack: sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen) - func = lambda tok: not isinstance(tok, sentinels) - stack = takewhile(func, self._stack) + pred = lambda tok: not isinstance(tok, sentinels) + stack = takewhile(pred, self._stack) text = [tok.text for tok in stack if isinstance(tok, tokens.Text)] - return "".join(text).rstrip().lower() - - def _handle_tag_close_open(self): - """Handle the ending of an open tag (````).""" - padding = self._actually_close_tag_opening() - if not self._get_tag_from_stack(): # Tags cannot be blank - self._fail_route() - self._write(tokens.TagCloseOpen(padding=padding)) - - def _handle_tag_selfclose(self): - """Handle the ending of an tag that closes itself (````).""" - padding = self._actually_close_tag_opening() - if not self._get_tag_from_stack(): # Tags cannot be blank + try: + return "".join(text).rstrip().lower().split()[0] + except IndexError: self._fail_route() - self._write(tokens.TagCloseSelfclose(padding=padding)) - self._head += 1 - return self._pop() def _handle_tag_open_close(self): """Handle the opening of a closing tag (````).""" @@ -579,10 +580,7 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" closing = self._pop() - close_tag = self._get_tag_from_stack(closing) - open_tag = self._get_tag_from_stack() - if not close_tag or close_tag != open_tag: - # Closing and opening tags are empty or unequal, so fail this tag: + if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): self._fail_route() self._write_all(closing) self._write(tokens.TagCloseClose()) @@ -645,37 +643,30 @@ class Tokenizer(object): self._context |= contexts.FAIL_ON_RBRACE return True - def _parse(self, context=0): + def _parse(self, context=0, push=True): """Parse the wikicode string, using *context* for when to stop.""" - self._push(context) + unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | + contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | + contexts.TAG_CLOSE) + fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | + contexts.HEADING | contexts.COMMENT | contexts.TAG) + double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) + + if push: + self._push(context) while True: this = self._read() - unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | - contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | - contexts.TAG_CLOSE) if self._context & unsafe: if not self._verify_safe(this): - double = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) - if self._context & double: + if self._context & double_fail: self._pop() self._fail_route() if this not in self.MARKERS: - if self._context & contexts.TAG_OPEN: - should_exit = self._handle_tag_chunk(this) - if should_exit: - return should_exit - else: - self._write_text(this) + self._write_text(this) self._head += 1 continue if this is self.END: - fail = ( - contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | - contexts.HEADING | contexts.COMMENT | contexts.TAG) if self._context & fail: - double_fail = ( - contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE | - contexts.TAG_OPEN_ATTR_QUOTED) if self._context & double_fail: self._pop() self._fail_route() @@ -720,8 +711,6 @@ class Tokenizer(object): elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", 
self.START): self._parse_heading() - elif self._context & contexts.TAG_OPEN_ATTR_NAME: - self._handle_tag_attribute_body() else: self._write_text("=") elif this == "=" and self._context & contexts.HEADING: @@ -735,22 +724,8 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) - elif this == "<" and next != "/" and ( - not self._context & (contexts.TAG ^ contexts.TAG_BODY)): + elif this == "<" and next != "/" and not self._context & contexts.TAG_CLOSE: self._parse_tag() - elif self._context & contexts.TAG_OPEN: - if self._context & contexts.TAG_OPEN_ATTR_QUOTED: - self._handle_tag_chunk(this) - elif this == "\n": - self._fail_route() - elif this == ">": - self._handle_tag_close_open() - elif this == "/" and next == ">": - return self._handle_tag_selfclose() - elif this == "=" and self._context & contexts.TAG_OPEN_ATTR_NAME: - self._handle_tag_attribute_body() - else: - self._handle_tag_chunk(this) elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() elif this == ">" and self._context & contexts.TAG_CLOSE: From 962adcd62c48a426750fd637cfa27a2d74943474 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 22:27:44 -0400 Subject: [PATCH 031/189] Add docstrings for a couple new methods in the tokenizer. --- mwparserfromhell/parser/tokenizer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index a7b9e16..9817bd9 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -475,6 +475,11 @@ class Tokenizer(object): self._head += 1 def _handle_tag_chunk(self, data, chunk): + """Handle a *chunk* of text inside a HTML open tag. + + A "chunk" is either a marker, whitespace, or text containing no markers + or whitespace. *data* is a :py:class:`_TagOpenData` object. + """ if not chunk: return if data.context & data.CX_NAME: @@ -546,6 +551,10 @@ class Tokenizer(object): self._write_text(chunk) ### hook on here for {, <, etc def _push_tag_buffer(self, data): + """Write a pending tag attribute from *data* to the stack. + + *data* is a :py:class:`_TagOpenData` object. + """ buf = data.padding_buffer while len(buf) < 3: buf.append("") From 43e717cca927009c840ddabb3ebabad834d14adf Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 22:41:19 -0400 Subject: [PATCH 032/189] Add a number of new tag tests. 
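The cases added below follow the existing tags.mwtest layout: blocks separated by "---", each carrying name, label, input, and output fields. A rough sketch of how a block in that format could be pulled apart (illustrative only; the suite's real loader is not shown in this patch):

    def read_cases(text):
        """Yield one dict per test block in a *.mwtest file."""
        for block in text.split("\n---\n"):
            case = {}
            for line in block.strip().splitlines():
                key, _, value = line.partition(":")
                if key and value:
                    case[key.strip()] = value.strip()
            if case:
                yield case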
--- tests/tokenizer/tags.mwtest | 70 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 1dfc1b1..7d5f338 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -96,6 +96,76 @@ output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" " --- +name: invalid_space_begin_open +label: invalid tag: a space at the beginning of the open tag +input: "< ref>test" +output: [Text(text="< ref>test")] + +--- + +name: invalid_space_begin_close +label: invalid tag: a space at the beginning of the close tag +input: "test" +output: [Text(text="test")] + +--- + +name: valid_space_end +label: valid tag: spaces at the ends of both the open and close tags +input: "test" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=" "), Text(text="test"), TagOpenClose(), Text(text="ref "), TagCloseClose()] + +--- + +name: invalid_template_ends +label: invalid tag: a template at the ends of both the open and close tags +input: "test" +output: [Text(text="test" +output: [Text(text="test" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding=""), Text(text="test"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: valid_template_end_open_space_end_close +label: valid tag: a template at the end of the open tag; whitespace at the end of the close tag +input: "test" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding=""), Text(text="test"), TagOpenClose(), Text(text="ref\n"), TagCloseClose()] + +--- + +name: invalid_template_end_open_nospace +label: invalid tag: a template at the end of the open tag, without spacing +input: "test" +output: [Text(text="test" +output: [Text(text="test")] + +--- + +name: invalid_template_start_open +label: invalid tag: a template at the beginning of the open tag +input: "<{{foo}}ref>test" +output: [Text(text="<"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text="ref>test")] + +--- + name: incomplete_lbracket label: incomplete tags: just a left bracket input: "<" From 82edc93bbbd1786015a8c61521fd4f698b19724a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 22:42:26 -0400 Subject: [PATCH 033/189] Pass some tests by simplifying the way tags are read from the stack. Two still fail because templates aren't implemented yet, but those are otherwise handled correctly. 
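With _get_tag_from_stack gone, _handle_tag_close_close now compares the single Text token inside the closing tag directly against the name token on the stack, after trailing whitespace is dropped and case is normalized. A small illustration of that comparison, applied to the token text rather than real token objects (not code from the patch):

    strip = lambda text: text.rstrip().lower()

    assert strip("ref") == strip("REF ")    # a close tag of "REF " still matches "ref"
    assert strip("ref") != strip("span")    # a mismatched close tag fails the route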
--- mwparserfromhell/parser/tokenizer.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9817bd9..8c91e4f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -568,18 +568,6 @@ class Tokenizer(object): data.padding_buffer, data.quote_buffer = [], [] data.ignore_quote = False - def _get_tag_from_stack(self, stack=None): - """Return the tag based on the text in *stack*.""" - if not stack: - sentinels = (tokens.TagAttrStart, tokens.TagCloseOpen) - pred = lambda tok: not isinstance(tok, sentinels) - stack = takewhile(pred, self._stack) - text = [tok.text for tok in stack if isinstance(tok, tokens.Text)] - try: - return "".join(text).rstrip().lower().split()[0] - except IndexError: - self._fail_route() - def _handle_tag_open_close(self): """Handle the opening of a closing tag (````).""" self._write(tokens.TagOpenClose()) @@ -588,8 +576,10 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" + strip = lambda tok: tok.text.rstrip().lower() closing = self._pop() - if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): + if len(closing) != 1 or (not isinstance(closing[0], tokens.Text) or + strip(closing[0]) != strip(self._stack[1])): self._fail_route() self._write_all(closing) self._write(tokens.TagCloseClose()) @@ -625,7 +615,7 @@ class Tokenizer(object): self._context |= contexts.HAS_TEXT return True elif context & contexts.TAG_CLOSE: - return this != "<" and this != "\n" + return this != "<" else: if context & contexts.FAIL_ON_EQUALS: if this == "=": From f63480bcf3a21b8eb61c944f30b79d04a04efe40 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 30 Jun 2013 23:48:58 -0400 Subject: [PATCH 034/189] Update the integration.rich_tags test to use the new tag tokens. Remove an now-unused import in the tokenizer. --- mwparserfromhell/parser/tokenizer.py | 1 - tests/tokenizer/integration.mwtest | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 8c91e4f..9207440 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -21,7 +21,6 @@ # SOFTWARE. 
from __future__ import unicode_literals -from itertools import takewhile from math import log import re diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index ba01c8c..736ecb1 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -36,7 +36,7 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t name: rich_tags label: a HTML tag with tons of other things in it input: "{{dubious claim}}[[Source]]" -output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(padding=" "), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(padding=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(padding=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(padding=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagOpenClose(), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- From 6c2898d7bd65517d9aa57385841cdda001c1fc06 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 1 Jul 2013 02:10:02 -0400 Subject: [PATCH 035/189] Make {{|=}} build correctly; add a test for this. 
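The builder bug here is that an explicitly empty parameter key (what "{{|=}}" produces) and a missing key are both falsy, so "if not key" treated them the same and replaced the empty key with the default numeric one. A tiny illustration of the distinction the new "is None" check relies on, with an empty string standing in for an empty Wikicode (illustrative only):

    missing_key = None   # no key tokens were seen at all
    empty_key = ""       # a key was seen, but it is empty

    assert not missing_key and not empty_key   # truthiness cannot tell them apart
    assert missing_key is None                 # identity with None can
    assert empty_key is not None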
--- mwparserfromhell/parser/builder.py | 2 +- tests/test_builder.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 2cd7831..e89fb33 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -83,7 +83,7 @@ class Builder(object): tokens.TemplateClose)): self._tokens.append(token) value = self._pop() - if not key: + if key is None: key = self._wrap([Text(str(default))]) return Parameter(key, value, showkey) else: diff --git a/tests/test_builder.py b/tests/test_builder.py index 2d44b6c..6b46f71 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -72,6 +72,14 @@ class TestBuilder(TreeEqualityTestCase): wrap([Template(wraptext("foo"), params=[ Parameter(wraptext("bar"), wraptext("baz"))])])), + ([tokens.TemplateOpen(), tokens.TemplateParamSeparator(), + tokens.TemplateParamSeparator(), tokens.TemplateParamEquals(), + tokens.TemplateParamSeparator(), tokens.TemplateClose()], + wrap([Template(wrap([]), params=[ + Parameter(wraptext("1"), wrap([]), showkey=False), + Parameter(wrap([]), wrap([]), showkey=True), + Parameter(wraptext("2"), wrap([]), showkey=False)])])), + ([tokens.TemplateOpen(), tokens.Text(text="foo"), tokens.TemplateParamSeparator(), tokens.Text(text="bar"), tokens.TemplateParamEquals(), tokens.Text(text="baz"), From dfe100ceb7eecec82d6a3af98d016dfd95d3f9ea Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 1 Jul 2013 20:44:56 -0400 Subject: [PATCH 036/189] Support templates and wikilinks inside tags (part 1) --- mwparserfromhell/parser/tokenizer.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9207440..21d0f2a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -221,6 +221,8 @@ class Tokenizer(object): self._head += 1 self._write_all(self._pop()) + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT def _parse_template(self): """Parse a template at the head of the wikicode string.""" @@ -293,6 +295,8 @@ class Tokenizer(object): self._head = reset self._write_text("[[") else: + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT self._write(tokens.WikilinkOpen()) self._write_all(wikilink) self._write(tokens.WikilinkClose()) @@ -507,7 +511,7 @@ class Tokenizer(object): else: data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) elif data.context & data.CX_ATTR_NAME: if chunk.isspace(): data.padding_buffer.append(chunk) @@ -523,7 +527,7 @@ class Tokenizer(object): data.padding_buffer.append("") # No padding before tag data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) elif data.context & data.CX_ATTR_VALUE: ### handle backslashes here if data.context & data.CX_NEED_QUOTE: @@ -535,7 +539,7 @@ class Tokenizer(object): data.padding_buffer.append(chunk) else: data.context ^= data.CX_NEED_QUOTE - self._write_text(chunk) ### hook on here for {, <, etc + self._parse_tag_chunk(chunk) elif not data.literal: if chunk == '"': data.context |= data.CX_NEED_SPACE @@ -547,7 +551,18 @@ class Tokenizer(object): data.padding_buffer.append(chunk) data.context = data.CX_ATTR_READY else: - self._write_text(chunk) ### hook on here for {, 
<, etc + self._parse_tag_chunk(chunk) + + def _parse_tag_chunk(self, chunk): + next = self._read(1) + if not self._can_recurse() or chunk not in self.MARKERS: + self._write_text(chunk) + elif chunk == next == "{": + self._parse_template_or_argument() + elif chunk == next == "[": + self._parse_wikilink() + else: + self._write_text(chunk) def _push_tag_buffer(self, data): """Write a pending tag attribute from *data* to the stack. @@ -678,8 +693,6 @@ class Tokenizer(object): elif this == next == "{": if self._can_recurse(): self._parse_template_or_argument() - if self._context & contexts.FAIL_NEXT: - self._context ^= contexts.FAIL_NEXT else: self._write_text("{") elif this == "|" and self._context & contexts.TEMPLATE: @@ -698,8 +711,6 @@ class Tokenizer(object): elif this == next == "[": if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): self._parse_wikilink() - if self._context & contexts.FAIL_NEXT: - self._context ^= contexts.FAIL_NEXT else: self._write_text("[") elif this == "|" and self._context & contexts.WIKILINK_TITLE: From e34026dabe359ffd16567c8c5002d76f4981fe57 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 1 Jul 2013 22:14:57 -0400 Subject: [PATCH 037/189] Support templates and wikilinks inside tags (part 2) --- mwparserfromhell/parser/tokenizer.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 21d0f2a..29c2772 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -51,7 +51,7 @@ class _TagOpenData(object): self.context = self.CX_NAME self.literal = True self.padding_buffer = [] - self.quote_buffer = [] + self.quoted = False self.reset = 0 self.ignore_quote = False @@ -454,6 +454,8 @@ class Tokenizer(object): continue elif this is self.END: if self._context & contexts.TAG_ATTR: + if data.quoted: + self._pop() self._pop() self._fail_route() elif this == ">" and data.literal: @@ -499,8 +501,9 @@ class Tokenizer(object): else: if data.context & data.CX_ATTR_VALUE: data.context ^= data.CX_NEED_SPACE - data.quote_buffer = [] + data.quoted = False data.ignore_quote = True + self._pop() self._head = data.reset return True # Break out of chunk processing early else: @@ -534,6 +537,8 @@ class Tokenizer(object): if chunk == '"' and not data.ignore_quote: data.context ^= data.CX_NEED_QUOTE data.literal = False + data.quoted = True + self._push(self._context) data.reset = self._head elif chunk.isspace(): data.padding_buffer.append(chunk) @@ -545,7 +550,7 @@ class Tokenizer(object): data.context |= data.CX_NEED_SPACE data.literal = True else: - data.quote_buffer.append(chunk) + self._parse_tag_chunk(chunk) elif chunk.isspace(): self._push_tag_buffer(data) data.padding_buffer.append(chunk) @@ -572,14 +577,15 @@ class Tokenizer(object): buf = data.padding_buffer while len(buf) < 3: buf.append("") + if data.quoted: + data.quoted = False + self._write_first(tokens.TagAttrQuote()) + self._write_all(self._pop()) self._write_first(tokens.TagAttrStart( pad_after_eq=buf.pop(), pad_before_eq=buf.pop(), pad_first=buf.pop())) - if data.quote_buffer: - self._write(tokens.TagAttrQuote()) - self._write_text("".join(data.quote_buffer)) self._write_all(self._pop()) - data.padding_buffer, data.quote_buffer = [], [] + data.padding_buffer = [] data.ignore_quote = False def _handle_tag_open_close(self): From 9693b6d5e61571dfd1e0ea3a65fb95a46dcad1c7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 2 Jul 2013 00:48:20 
-0400 Subject: [PATCH 038/189] Replace data.literal and data.quoted with a data.CX_QUOTED context --- mwparserfromhell/parser/tokenizer.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 29c2772..129c19a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -42,16 +42,15 @@ class _TagOpenData(object): CX_ATTR_READY = 1 << 1 CX_ATTR_NAME = 1 << 2 CX_ATTR_VALUE = 1 << 3 - CX_NEED_SPACE = 1 << 4 - CX_NEED_EQUALS = 1 << 5 - CX_NEED_QUOTE = 1 << 6 + CX_QUOTED = 1 << 4 + CX_NEED_SPACE = 1 << 5 + CX_NEED_EQUALS = 1 << 6 + CX_NEED_QUOTE = 1 << 7 CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE def __init__(self): self.context = self.CX_NAME - self.literal = True self.padding_buffer = [] - self.quoted = False self.reset = 0 self.ignore_quote = False @@ -448,17 +447,18 @@ class Tokenizer(object): self._write(tokens.TagOpenOpen(showtag=True)) while True: this, next = self._read(), self._read(1) + can_exit = not data.context & data.CX_QUOTED or data.context & data.CX_NEED_SPACE if this not in self.MARKERS: for chunk in self.tag_splitter.split(this): if self._handle_tag_chunk(data, chunk): continue elif this is self.END: if self._context & contexts.TAG_ATTR: - if data.quoted: + if data.context & data.CX_QUOTED: self._pop() self._pop() self._fail_route() - elif this == ">" and data.literal: + elif this == ">" and can_exit: if data.context & data.CX_ATTR: self._push_tag_buffer(data) padding = data.padding_buffer[0] if data.padding_buffer else "" @@ -466,7 +466,7 @@ class Tokenizer(object): self._context = contexts.TAG_BODY self._head += 1 return self._parse(push=False) - elif this == "/" and next == ">" and data.literal: + elif this == "/" and next == ">" and can_exit: if data.context & data.CX_ATTR: self._push_tag_buffer(data) padding = data.padding_buffer[0] if data.padding_buffer else "" @@ -499,9 +499,8 @@ class Tokenizer(object): data.padding_buffer.append(chunk) data.context = data.CX_ATTR_READY else: - if data.context & data.CX_ATTR_VALUE: - data.context ^= data.CX_NEED_SPACE - data.quoted = False + if data.context & data.CX_QUOTED: + data.context ^= data.CX_NEED_SPACE | data.CX_QUOTED data.ignore_quote = True self._pop() self._head = data.reset @@ -536,8 +535,7 @@ class Tokenizer(object): if data.context & data.CX_NEED_QUOTE: if chunk == '"' and not data.ignore_quote: data.context ^= data.CX_NEED_QUOTE - data.literal = False - data.quoted = True + data.context |= data.CX_QUOTED self._push(self._context) data.reset = self._head elif chunk.isspace(): @@ -545,10 +543,9 @@ class Tokenizer(object): else: data.context ^= data.CX_NEED_QUOTE self._parse_tag_chunk(chunk) - elif not data.literal: + elif data.context & data.CX_QUOTED: if chunk == '"': data.context |= data.CX_NEED_SPACE - data.literal = True else: self._parse_tag_chunk(chunk) elif chunk.isspace(): @@ -574,13 +571,12 @@ class Tokenizer(object): *data* is a :py:class:`_TagOpenData` object. 
""" + if data.context & data.CX_QUOTED: + self._write_first(tokens.TagAttrQuote()) + self._write_all(self._pop()) buf = data.padding_buffer while len(buf) < 3: buf.append("") - if data.quoted: - data.quoted = False - self._write_first(tokens.TagAttrQuote()) - self._write_all(self._pop()) self._write_first(tokens.TagAttrStart( pad_after_eq=buf.pop(), pad_before_eq=buf.pop(), pad_first=buf.pop())) From dd6bb1637d26fb26085143dd6c13be310d1b04bc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 2 Jul 2013 01:31:28 -0400 Subject: [PATCH 039/189] Support tag nesting properly; unit tests; recursion checks for tags. --- mwparserfromhell/parser/tokenizer.py | 16 +++++++++++----- tests/tokenizer/tags.mwtest | 28 ++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 129c19a..2d1245f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -447,7 +447,8 @@ class Tokenizer(object): self._write(tokens.TagOpenOpen(showtag=True)) while True: this, next = self._read(), self._read(1) - can_exit = not data.context & data.CX_QUOTED or data.context & data.CX_NEED_SPACE + can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or + data.context & data.CX_NEED_SPACE) if this not in self.MARKERS: for chunk in self.tag_splitter.split(this): if self._handle_tag_chunk(data, chunk): @@ -488,8 +489,8 @@ class Tokenizer(object): if not chunk: return if data.context & data.CX_NAME: - if chunk != chunk.lstrip(): # Tags cannot start with whitespace - self._fail_route() + if chunk in self.MARKERS or chunk.isspace(): + self._fail_route() # Tags must start with text (not a space) self._write_text(chunk) data.context = data.CX_NEED_SPACE elif data.context & data.CX_NEED_SPACE: @@ -563,6 +564,8 @@ class Tokenizer(object): self._parse_template_or_argument() elif chunk == next == "[": self._parse_wikilink() + elif chunk == "<": + self._parse_tag() else: self._write_text(chunk) @@ -735,10 +738,13 @@ class Tokenizer(object): self._parse_comment() else: self._write_text(this) - elif this == "<" and next != "/" and not self._context & contexts.TAG_CLOSE: - self._parse_tag() elif this == "<" and next == "/" and self._context & contexts.TAG_BODY: self._handle_tag_open_close() + elif this == "<": + if not self._context & contexts.TAG_CLOSE and self._can_recurse(): + self._parse_tag() + else: + self._write_text("<") elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() else: diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 7d5f338..17010e9 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -96,6 +96,34 @@ output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" " --- +name: nested_tag +label: a tag nested within the attributes of another +input: "foo>citation" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: nested_tag_quoted +label: a tag nested within the 
attributes of another, quoted +input: "foo">citation" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + +name: nested_troll_tag +label: a bogus tag that appears to be nested within the attributes of another +input: ">citation" +output: [Text(text=">citation")] + +--- + +name: nested_troll_tag_quoted +label: a bogus tag that appears to be nested within the attributes of another, quoted +input: "citation" +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text=""), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] + +--- + name: invalid_space_begin_open label: invalid tag: a space at the beginning of the open tag input: "< ref>test" From 5e8794da5eff96fc649956283e5e115582ade86d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 2 Jul 2013 20:04:28 -0400 Subject: [PATCH 040/189] Refactor more of the tag tokenization process. --- mwparserfromhell/parser/tokenizer.py | 39 +++++++++++++++++------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2d1245f..084d94b 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -449,30 +449,18 @@ class Tokenizer(object): this, next = self._read(), self._read(1) can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or data.context & data.CX_NEED_SPACE) - if this not in self.MARKERS: - for chunk in self.tag_splitter.split(this): - if self._handle_tag_chunk(data, chunk): - continue - elif this is self.END: + if this is self.END: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: self._pop() self._pop() self._fail_route() elif this == ">" and can_exit: - if data.context & data.CX_ATTR: - self._push_tag_buffer(data) - padding = data.padding_buffer[0] if data.padding_buffer else "" - self._write(tokens.TagCloseOpen(padding=padding)) + self._handle_tag_close_open(data, tokens.TagCloseOpen) self._context = contexts.TAG_BODY - self._head += 1 return self._parse(push=False) elif this == "/" and next == ">" and can_exit: - if data.context & data.CX_ATTR: - self._push_tag_buffer(data) - padding = data.padding_buffer[0] if data.padding_buffer else "" - self._write(tokens.TagCloseSelfclose(padding=padding)) - self._head += 1 + self._handle_tag_close_open(data, tokens.TagCloseSelfclose) return self._pop() else: for chunk in self.tag_splitter.split(this): @@ -514,7 +502,7 @@ class Tokenizer(object): else: data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif data.context & data.CX_ATTR_NAME: if chunk.isspace(): data.padding_buffer.append(chunk) @@ -530,7 +518,7 @@ class Tokenizer(object): data.padding_buffer.append("") # No padding before tag data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_tag_chunk(chunk) + 
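One piece worth spelling out is the padding bookkeeping shared by these tag patches: _push_tag_buffer collects up to three whitespace runs per attribute in reading order (before the name, before the equals sign, after it), tops the list up with empty strings, and pops it from the end to build TagAttrStart. A small illustration of that mapping (not code from the patch):

    buf = [" "]                 # only the space before the attribute was seen
    while len(buf) < 3:         # missing slots become empty strings
        buf.append("")
    pad_after_eq = buf.pop()
    pad_before_eq = buf.pop()
    pad_first = buf.pop()
    assert (pad_first, pad_before_eq, pad_after_eq) == (" ", "", "")

This is why the expected outputs in tags.mwtest read TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq="") for a plain attribute preceded by a single space.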
self._parse_text_in_tag(chunk) elif data.context & data.CX_ATTR_VALUE: ### handle backslashes here if data.context & data.CX_NEED_QUOTE: @@ -543,20 +531,21 @@ class Tokenizer(object): data.padding_buffer.append(chunk) else: data.context ^= data.CX_NEED_QUOTE - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif data.context & data.CX_QUOTED: if chunk == '"': data.context |= data.CX_NEED_SPACE else: - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) elif chunk.isspace(): self._push_tag_buffer(data) data.padding_buffer.append(chunk) data.context = data.CX_ATTR_READY else: - self._parse_tag_chunk(chunk) + self._parse_text_in_tag(chunk) - def _parse_tag_chunk(self, chunk): + def _parse_text_in_tag(self, chunk): + """Parse a chunk of text in a tag that has no special significance.""" next = self._read(1) if not self._can_recurse() or chunk not in self.MARKERS: self._write_text(chunk) @@ -587,6 +576,14 @@ class Tokenizer(object): data.padding_buffer = [] data.ignore_quote = False + def _handle_tag_close_open(self, data, token): + """Handle the closing of a open tag (````).""" + if data.context & data.CX_ATTR: + self._push_tag_buffer(data) + padding = data.padding_buffer[0] if data.padding_buffer else "" + self._write(token(padding=padding)) + self._head += 1 + def _handle_tag_open_close(self): """Handle the opening of a closing tag (````).""" self._write(tokens.TagOpenClose()) From e99c9d3038a64c71981fcd9783e2ab3a21f846c6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 3 Jul 2013 18:29:07 -0400 Subject: [PATCH 041/189] More tag refactoring; fix some bugs. --- mwparserfromhell/parser/tokenizer.py | 176 ++++++++++++++++------------------- 1 file changed, 80 insertions(+), 96 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 084d94b..5bb7059 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -46,13 +46,11 @@ class _TagOpenData(object): CX_NEED_SPACE = 1 << 5 CX_NEED_EQUALS = 1 << 6 CX_NEED_QUOTE = 1 << 7 - CX_ATTR = CX_ATTR_NAME | CX_ATTR_VALUE def __init__(self): self.context = self.CX_NAME self.padding_buffer = [] self.reset = 0 - self.ignore_quote = False class Tokenizer(object): @@ -452,7 +450,11 @@ class Tokenizer(object): if this is self.END: if self._context & contexts.TAG_ATTR: if data.context & data.CX_QUOTED: + # Unclosed attribute quote: reset, don't die + data.context = data.CX_ATTR_VALUE self._pop() + self._head = data.reset + continue self._pop() self._fail_route() elif this == ">" and can_exit: @@ -463,122 +465,104 @@ class Tokenizer(object): self._handle_tag_close_open(data, tokens.TagCloseSelfclose) return self._pop() else: - for chunk in self.tag_splitter.split(this): - if self._handle_tag_chunk(data, chunk): - continue + self._handle_tag_data(data, this) self._head += 1 - def _handle_tag_chunk(self, data, chunk): - """Handle a *chunk* of text inside a HTML open tag. + def _push_tag_buffer(self, data): + """Write a pending tag attribute from *data* to the stack.""" + if data.context & data.CX_QUOTED: + self._write_first(tokens.TagAttrQuote()) + self._write_all(self._pop()) + buf = data.padding_buffer + while len(buf) < 3: + buf.append("") + self._write_first(tokens.TagAttrStart(pad_after_eq=buf.pop(), + pad_before_eq=buf.pop(), pad_first=buf.pop())) + self._write_all(self._pop()) + data.padding_buffer = [] - A "chunk" is either a marker, whitespace, or text containing no markers - or whitespace. *data* is a :py:class:`_TagOpenData` object. 
- """ - if not chunk: - return - if data.context & data.CX_NAME: - if chunk in self.MARKERS or chunk.isspace(): - self._fail_route() # Tags must start with text (not a space) - self._write_text(chunk) - data.context = data.CX_NEED_SPACE - elif data.context & data.CX_NEED_SPACE: - if chunk.isspace(): - if data.context & data.CX_ATTR_VALUE: - self._push_tag_buffer(data) - data.padding_buffer.append(chunk) - data.context = data.CX_ATTR_READY - else: + def _handle_tag_data(self, data, text): + """Handle all sorts of *text* data inside of an HTML open tag.""" + for chunk in self.tag_splitter.split(text): + if not chunk: + continue + if data.context & data.CX_NAME: + if chunk in self.MARKERS or chunk.isspace(): + self._fail_route() # Tags must start with text, not spaces + data.context = data.CX_NEED_SPACE + elif chunk.isspace(): + self._handle_tag_space(data, chunk) + continue + elif data.context & data.CX_NEED_SPACE: if data.context & data.CX_QUOTED: - data.context ^= data.CX_NEED_SPACE | data.CX_QUOTED - data.ignore_quote = True + data.context = data.CX_ATTR_VALUE self._pop() - self._head = data.reset - return True # Break out of chunk processing early - else: - self._fail_route() - elif data.context & data.CX_ATTR_READY: - if chunk.isspace(): - data.padding_buffer.append(chunk) - else: + self._head = data.reset - 1 # Will be auto-incremented + return # Break early + self._fail_route() + elif data.context & data.CX_ATTR_READY: data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_text_in_tag(chunk) - elif data.context & data.CX_ATTR_NAME: - if chunk.isspace(): - data.padding_buffer.append(chunk) - data.context |= data.CX_NEED_EQUALS - elif chunk == "=": - if not data.context & data.CX_NEED_EQUALS: - data.padding_buffer.append("") # No padding before equals - data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE - self._write(tokens.TagAttrEquals()) - else: + elif data.context & data.CX_ATTR_NAME: + if chunk == "=": + if not data.context & data.CX_NEED_EQUALS: + data.padding_buffer.append("") # No padding before '=' + data.context = data.CX_ATTR_VALUE | data.CX_NEED_QUOTE + self._write(tokens.TagAttrEquals()) + continue if data.context & data.CX_NEED_EQUALS: self._push_tag_buffer(data) data.padding_buffer.append("") # No padding before tag data.context = data.CX_ATTR_NAME self._push(contexts.TAG_ATTR) - self._parse_text_in_tag(chunk) - elif data.context & data.CX_ATTR_VALUE: - ### handle backslashes here - if data.context & data.CX_NEED_QUOTE: - if chunk == '"' and not data.ignore_quote: + elif data.context & data.CX_ATTR_VALUE: + ### handle backslashes here + if data.context & data.CX_NEED_QUOTE: data.context ^= data.CX_NEED_QUOTE - data.context |= data.CX_QUOTED - self._push(self._context) - data.reset = self._head - elif chunk.isspace(): - data.padding_buffer.append(chunk) - else: - data.context ^= data.CX_NEED_QUOTE - self._parse_text_in_tag(chunk) - elif data.context & data.CX_QUOTED: - if chunk == '"': - data.context |= data.CX_NEED_SPACE - else: - self._parse_text_in_tag(chunk) - elif chunk.isspace(): - self._push_tag_buffer(data) - data.padding_buffer.append(chunk) - data.context = data.CX_ATTR_READY - else: - self._parse_text_in_tag(chunk) + if chunk == '"': + data.context |= data.CX_QUOTED + self._push(self._context) + data.reset = self._head + continue + elif data.context & data.CX_QUOTED: + if chunk == '"': + data.context |= data.CX_NEED_SPACE + continue + self._handle_tag_text(chunk) - def _parse_text_in_tag(self, chunk): - """Parse a chunk of text in a tag 
that has no special significance.""" + def _handle_tag_space(self, data, text): + """Handle whitespace (*text*) inside of an HTML open tag.""" + ctx = data.context + end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (data.CX_QUOTED | data.CX_NEED_QUOTE) + if end_of_value or (ctx & data.CX_QUOTED and ctx & data.CX_NEED_SPACE): + self._push_tag_buffer(data) + data.context = data.CX_ATTR_READY + elif ctx & data.CX_NEED_SPACE: + data.context = data.CX_ATTR_READY + elif ctx & data.CX_ATTR_NAME: + data.context |= data.CX_NEED_EQUALS + if ctx & data.CX_QUOTED and not ctx & data.CX_NEED_SPACE: + self._write_text(text) + else: + data.padding_buffer.append(text) + + def _handle_tag_text(self, text): + """Handle regular *text* inside of an HTML open tag.""" next = self._read(1) - if not self._can_recurse() or chunk not in self.MARKERS: - self._write_text(chunk) - elif chunk == next == "{": + if not self._can_recurse() or text not in self.MARKERS: + self._write_text(text) + elif text == next == "{": self._parse_template_or_argument() - elif chunk == next == "[": + elif text == next == "[": self._parse_wikilink() - elif chunk == "<": + elif text == "<": self._parse_tag() else: - self._write_text(chunk) - - def _push_tag_buffer(self, data): - """Write a pending tag attribute from *data* to the stack. - - *data* is a :py:class:`_TagOpenData` object. - """ - if data.context & data.CX_QUOTED: - self._write_first(tokens.TagAttrQuote()) - self._write_all(self._pop()) - buf = data.padding_buffer - while len(buf) < 3: - buf.append("") - self._write_first(tokens.TagAttrStart( - pad_after_eq=buf.pop(), pad_before_eq=buf.pop(), - pad_first=buf.pop())) - self._write_all(self._pop()) - data.padding_buffer = [] - data.ignore_quote = False + self._write_text(text) def _handle_tag_close_open(self, data, token): """Handle the closing of a open tag (````).""" - if data.context & data.CX_ATTR: + if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): self._push_tag_buffer(data) padding = data.padding_buffer[0] if data.padding_buffer else "" self._write(token(padding=padding)) From 17c71e335f35b3c10e572daeaf2cb2c6707ea000 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 3 Jul 2013 18:30:02 -0400 Subject: [PATCH 042/189] Add three tests for invalid attribute quote usage. 
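Not part of the original commit, but a quick way to poke at these quote cases by hand: the sketch below assumes only the pure-Python Tokenizer.tokenize() entry point that appears in the diffs above, and the input string is an illustrative invalid-quote case rather than one copied from the test file.

    # Hand-check of the unclosed-quote handling: the stray quote should be
    # demoted to plain attribute text instead of failing the whole tag route.
    from mwparserfromhell.parser.tokenizer import Tokenizer

    wikitext = '<span style="foo bar>stuff</span>'
    for token in Tokenizer().tokenize(wikitext):
        print(token)
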
--- tests/tokenizer/tags.mwtest | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 17010e9..89b2b2e 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -194,6 +194,27 @@ output: [Text(text="<"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text --- +name: unclosed_quote +label: a quoted attribute that is never closed +input: "stuff" +output: [TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo\"bar"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()] + +--- + +name: fake_quote_complex +label: a fake quoted attribute, with spaces and templates and links +input: "stuff" +output: [TagOpenOpen(showtag=True), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo"), TagAttrStart(pad_first=" ", pad_before_eq="\n", pad_after_eq=""), TemplateOpen(), Text(text="bar"), TemplateClose(), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), WikilinkOpen(), Text(text="baz"), WikilinkClose(), Text(text="\"buzz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()] + +--- + name: incomplete_lbracket label: incomplete tags: just a left bracket input: "<" From 591a0f5ed57f3ccad221a2870749031064003c5c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 3 Jul 2013 18:46:41 -0400 Subject: [PATCH 043/189] Change 'write' to 'emit'; adjust some other names for PEP8. --- mwparserfromhell/parser/tokenizer.py | 149 +++++++++++++++++------------------ 1 file changed, 74 insertions(+), 75 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 5bb7059..515a7a2 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -24,8 +24,7 @@ from __future__ import unicode_literals from math import log import re -from . import contexts -from . import tokens +from . 
import contexts, tokens from ..compat import htmlentities from ..tag_defs import is_parsable @@ -136,33 +135,33 @@ class Tokenizer(object): self._pop() raise BadRoute() - def _write(self, token): + def _emit(self, token): """Write a token to the end of the current token stack.""" self._push_textbuffer() self._stack.append(token) - def _write_first(self, token): + def _emit_first(self, token): """Write a token to the beginning of the current token stack.""" self._push_textbuffer() self._stack.insert(0, token) - def _write_text(self, text): + def _emit_text(self, text): """Write text to the current textbuffer.""" self._textbuffer.append(text) - def _write_all(self, tokenlist): + def _emit_all(self, tokenlist): """Write a series of tokens to the current stack at once.""" if tokenlist and isinstance(tokenlist[0], tokens.Text): - self._write_text(tokenlist.pop(0).text) + self._emit_text(tokenlist.pop(0).text) self._push_textbuffer() self._stack.extend(tokenlist) - def _write_text_then_stack(self, text): + def _emit_text_then_stack(self, text): """Pop the current stack, write *text*, and then write the stack.""" stack = self._pop() - self._write_text(text) + self._emit_text(text) if stack: - self._write_all(stack) + self._emit_all(stack) self._head -= 1 def _read(self, delta=0, wrap=False, strict=False): @@ -198,12 +197,12 @@ class Tokenizer(object): while braces: if braces == 1: - return self._write_text_then_stack("{") + return self._emit_text_then_stack("{") if braces == 2: try: self._parse_template() except BadRoute: - return self._write_text_then_stack("{{") + return self._emit_text_then_stack("{{") break try: self._parse_argument() @@ -213,11 +212,11 @@ class Tokenizer(object): self._parse_template() braces -= 2 except BadRoute: - return self._write_text_then_stack("{" * braces) + return self._emit_text_then_stack("{" * braces) if braces: self._head += 1 - self._write_all(self._pop()) + self._emit_all(self._pop()) if self._context & contexts.FAIL_NEXT: self._context ^= contexts.FAIL_NEXT @@ -229,9 +228,9 @@ class Tokenizer(object): except BadRoute: self._head = reset raise - self._write_first(tokens.TemplateOpen()) - self._write_all(template) - self._write(tokens.TemplateClose()) + self._emit_first(tokens.TemplateOpen()) + self._emit_all(template) + self._emit(tokens.TemplateClose()) def _parse_argument(self): """Parse an argument at the head of the wikicode string.""" @@ -241,9 +240,9 @@ class Tokenizer(object): except BadRoute: self._head = reset raise - self._write_first(tokens.ArgumentOpen()) - self._write_all(argument) - self._write(tokens.ArgumentClose()) + self._emit_first(tokens.ArgumentOpen()) + self._emit_all(argument) + self._emit(tokens.ArgumentClose()) def _handle_template_param(self): """Handle a template parameter at the head of the string.""" @@ -252,22 +251,22 @@ class Tokenizer(object): elif self._context & contexts.TEMPLATE_PARAM_VALUE: self._context ^= contexts.TEMPLATE_PARAM_VALUE elif self._context & contexts.TEMPLATE_PARAM_KEY: - self._write_all(self._pop(keep_context=True)) + self._emit_all(self._pop(keep_context=True)) self._context |= contexts.TEMPLATE_PARAM_KEY - self._write(tokens.TemplateParamSeparator()) + self._emit(tokens.TemplateParamSeparator()) self._push(self._context) def _handle_template_param_value(self): """Handle a template parameter's value at the head of the string.""" - self._write_all(self._pop(keep_context=True)) + self._emit_all(self._pop(keep_context=True)) self._context ^= contexts.TEMPLATE_PARAM_KEY self._context |= 
contexts.TEMPLATE_PARAM_VALUE - self._write(tokens.TemplateParamEquals()) + self._emit(tokens.TemplateParamEquals()) def _handle_template_end(self): """Handle the end of a template at the head of the string.""" if self._context & contexts.TEMPLATE_PARAM_KEY: - self._write_all(self._pop(keep_context=True)) + self._emit_all(self._pop(keep_context=True)) self._head += 1 return self._pop() @@ -275,7 +274,7 @@ class Tokenizer(object): """Handle the separator between an argument's name and default.""" self._context ^= contexts.ARGUMENT_NAME self._context |= contexts.ARGUMENT_DEFAULT - self._write(tokens.ArgumentSeparator()) + self._emit(tokens.ArgumentSeparator()) def _handle_argument_end(self): """Handle the end of an argument at the head of the string.""" @@ -290,19 +289,19 @@ class Tokenizer(object): wikilink = self._parse(contexts.WIKILINK_TITLE) except BadRoute: self._head = reset - self._write_text("[[") + self._emit_text("[[") else: if self._context & contexts.FAIL_NEXT: self._context ^= contexts.FAIL_NEXT - self._write(tokens.WikilinkOpen()) - self._write_all(wikilink) - self._write(tokens.WikilinkClose()) + self._emit(tokens.WikilinkOpen()) + self._emit_all(wikilink) + self._emit(tokens.WikilinkClose()) def _handle_wikilink_separator(self): """Handle the separator between a wikilink's title and its text.""" self._context ^= contexts.WIKILINK_TITLE self._context |= contexts.WIKILINK_TEXT - self._write(tokens.WikilinkSeparator()) + self._emit(tokens.WikilinkSeparator()) def _handle_wikilink_end(self): """Handle the end of a wikilink at the head of the string.""" @@ -324,13 +323,13 @@ class Tokenizer(object): title, level = self._parse(context) except BadRoute: self._head = reset + best - 1 - self._write_text("=" * best) + self._emit_text("=" * best) else: - self._write(tokens.HeadingStart(level=level)) + self._emit(tokens.HeadingStart(level=level)) if level < best: - self._write_text("=" * (best - level)) - self._write_all(title) - self._write(tokens.HeadingEnd()) + self._emit_text("=" * (best - level)) + self._emit_all(title) + self._emit(tokens.HeadingEnd()) finally: self._global ^= contexts.GL_HEADING @@ -349,28 +348,28 @@ class Tokenizer(object): after, after_level = self._parse(self._context) except BadRoute: if level < best: - self._write_text("=" * (best - level)) + self._emit_text("=" * (best - level)) self._head = reset + best - 1 return self._pop(), level else: # Found another closure - self._write_text("=" * best) - self._write_all(after) + self._emit_text("=" * best) + self._emit_all(after) return self._pop(), after_level def _really_parse_entity(self): """Actually parse an HTML entity and ensure that it is valid.""" - self._write(tokens.HTMLEntityStart()) + self._emit(tokens.HTMLEntityStart()) self._head += 1 this = self._read(strict=True) if this == "#": numeric = True - self._write(tokens.HTMLEntityNumeric()) + self._emit(tokens.HTMLEntityNumeric()) self._head += 1 this = self._read(strict=True) if this[0].lower() == "x": hexadecimal = True - self._write(tokens.HTMLEntityHex(char=this[0])) + self._emit(tokens.HTMLEntityHex(char=this[0])) this = this[1:] if not this: self._fail_route() @@ -396,8 +395,8 @@ class Tokenizer(object): if this not in htmlentities.entitydefs: self._fail_route() - self._write(tokens.Text(text=this)) - self._write(tokens.HTMLEntityEnd()) + self._emit(tokens.Text(text=this)) + self._emit(tokens.HTMLEntityEnd()) def _parse_entity(self): """Parse an HTML entity at the head of the wikicode string.""" @@ -407,9 +406,9 @@ class Tokenizer(object): 
self._really_parse_entity() except BadRoute: self._head = reset - self._write_text(self._read()) + self._emit_text(self._read()) else: - self._write_all(self._pop()) + self._emit_all(self._pop()) def _parse_comment(self): """Parse an HTML comment at the head of the wikicode string.""" @@ -419,11 +418,11 @@ class Tokenizer(object): comment = self._parse(contexts.COMMENT) except BadRoute: self._head = reset - self._write_text("") + self.assertTrue(code1.matches("Cleanup")) + self.assertTrue(code1.matches("cleanup")) + self.assertTrue(code1.matches(" cleanup\n")) + self.assertFalse(code1.matches("CLEANup")) + self.assertFalse(code1.matches("Blah")) + self.assertTrue(code2.matches("stub")) + self.assertTrue(code2.matches("Stub")) + self.assertFalse(code2.matches("StuB")) + def test_filter_family(self): """test the Wikicode.i?filter() family of functions""" def genlist(gen): From 9f579ec29fc96af8f411d24ff6828159e1688d40 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 14 Aug 2013 22:08:36 -0400 Subject: [PATCH 128/189] Add a test for before a list. --- tests/tokenizer/tags_wikimarkup.mwtest | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index 632ba72..feff9c5 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -514,3 +514,10 @@ name: hr_interruption_long label: a hr that is interrupted, but the first part remains valid because it is long enough input: "----x--" output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="x--")] + +--- + +name: nowiki_cancel +label: a nowiki tag before a list causes it to not be parsed +input: "* Unordered list" +output: [TagOpenOpen(), Text(text="nowiki"), TagCloseSelfclose(padding=" "), Text(text="* Unordered list")] From be5d2cbb07da98f9babec7e1b799b40f374dfe52 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 14 Aug 2013 22:24:14 -0400 Subject: [PATCH 129/189] Support HTML entities inside parser-blacklisted tags (closes #36) --- mwparserfromhell/parser/tokenizer.c | 9 +++++++-- mwparserfromhell/parser/tokenizer.py | 5 ++++- tests/tokenizer/html_entities.mwtest | 14 ++++++++++++++ tests/tokenizer/tags.mwtest | 21 +++++++++++++++++++++ 4 files changed, 46 insertions(+), 3 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index eff000a..912cfd7 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1578,16 +1578,21 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self) while (1) { this = Tokenizer_READ(self, 0); next = Tokenizer_READ(self, 1); - self->head++; if (this == *"") return Tokenizer_fail_route(self); else if (this == *"<" && next == *"/") { if (Tokenizer_handle_tag_open_close(self)) return NULL; + self->head++; return Tokenizer_parse(self, 0, 0); } - if (Tokenizer_emit_char(self, this)) + else if (this == *"&") { + if (Tokenizer_parse_entity(self)) + return NULL; + } + else if (Tokenizer_emit_char(self, this)) return NULL; + self->head++; } } diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 38ffa80..583d2f8 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -552,14 +552,17 @@ class Tokenizer(object): """Handle the body of an HTML tag that is parser-blacklisted.""" while True: this, next = self._read(), self._read(1) - self._head += 1 if this is self.END: self._fail_route() elif this 
== "<" and next == "/": self._handle_tag_open_close() + self._head += 1 return self._parse(push=False) + elif this == "&": + self._parse_entity() else: self._emit_text(this) + self._head += 1 def _handle_single_only_tag_end(self): """Handle the end of an implicitly closing single-only HTML tag.""" diff --git a/tests/tokenizer/html_entities.mwtest b/tests/tokenizer/html_entities.mwtest index 625dd60..53bedbd 100644 --- a/tests/tokenizer/html_entities.mwtest +++ b/tests/tokenizer/html_entities.mwtest @@ -117,6 +117,20 @@ output: [Text(text="&;")] --- +name: invalid_partial_amp_pound +label: invalid entities: just an ampersand, pound sign +input: "&#" +output: [Text(text="&#")] + +--- + +name: invalid_partial_amp_pound_x +label: invalid entities: just an ampersand, pound sign, x +input: "&#x" +output: [Text(text="&#x")] + +--- + name: invalid_partial_amp_pound_semicolon label: invalid entities: an ampersand, pound sign, and semicolon input: "&#;" diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 50c844e..dc02a51 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -467,6 +467,27 @@ output: [TemplateOpen(), Text(text="t1"), TemplateClose(), Text(text="") --- +name: unparsable_entity +label: a HTML entity inside unparsable text is still parsed +input: "{{t1}}{{t2}} {{t3}}{{t4}}" +output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="{{t2}}"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="{{t3}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t4"), TemplateClose()] + +--- + +name: unparsable_entity_incomplete +label: an incomplete HTML entity inside unparsable text +input: "&" +output: [TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="&"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()] + +--- + +name: unparsable_entity_incomplete_2 +label: an incomplete HTML entity inside unparsable text +input: "&" +output: [Text(text="&")] + +--- + name: single_open_close label: a tag that supports being single; both an open and a close tag input: "foo
  • bar{{baz}}
  • " From 25d53cacf8abc76a55cbf1af1b77b4cb9b6b0f5c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 14 Aug 2013 23:54:06 -0400 Subject: [PATCH 130/189] Begin porting C tokenizer to Python 3. --- mwparserfromhell/parser/tokenizer.c | 70 ++++++++++++++++++++++++++----------- mwparserfromhell/parser/tokenizer.h | 27 +++++++------- setup.py | 5 +-- 3 files changed, 65 insertions(+), 37 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 4df61d8..60223e1 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -207,7 +207,7 @@ static void Tokenizer_dealloc(Tokenizer* self) free(this); this = next; } - self->ob_type->tp_free((PyObject*) self); + Py_TYPE(self)->tp_free((PyObject*) self); } static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) @@ -835,7 +835,11 @@ static int Tokenizer_parse_heading(Tokenizer* self) self->global ^= GL_HEADING; return 0; } +#ifdef IS_PY3K + level = PyLong_FromSsize_t(heading->level); +#else level = PyInt_FromSsize_t(heading->level); +#endif if (!level) { Py_DECREF(heading->title); free(heading); @@ -2299,30 +2303,40 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) return Tokenizer_parse(self, 0, 1); } -static void load_entitydefs(void) +static int load_entitydefs(void) { PyObject *tempmod, *defmap, *deflist; unsigned numdefs, i; +#ifdef IS_PY3K + tempmod = PyImport_ImportModule("html.entities"); +#else tempmod = PyImport_ImportModule("htmlentitydefs"); +#endif if (!tempmod) - return; + return -1; defmap = PyObject_GetAttrString(tempmod, "entitydefs"); if (!defmap) - return; + return -1; Py_DECREF(tempmod); deflist = PyDict_Keys(defmap); if (!deflist) - return; + return -1; Py_DECREF(defmap); numdefs = (unsigned) PyList_GET_SIZE(defmap); entitydefs = calloc(numdefs + 1, sizeof(char*)); - for (i = 0; i < numdefs; i++) + if (!entitydefs) + return -1; + for (i = 0; i < numdefs; i++) { entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i)); + if (!entitydefs[i]) + return -1; + } Py_DECREF(deflist); + return 0; } -static void load_tokens(void) +static int load_tokens(void) { PyObject *tempmod, *tokens, *globals = PyEval_GetGlobals(), @@ -2332,12 +2346,12 @@ static void load_tokens(void) char *name = "mwparserfromhell.parser"; if (!fromlist || !modname) - return; + return -1; PyList_SET_ITEM(fromlist, 0, modname); tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0); Py_DECREF(fromlist); if (!tempmod) - return; + return -1; tokens = PyObject_GetAttrString(tempmod, "tokens"); Py_DECREF(tempmod); @@ -2379,9 +2393,10 @@ static void load_tokens(void) TagCloseClose = PyObject_GetAttrString(tokens, "TagCloseClose"); Py_DECREF(tokens); + return 0; } -static void load_tag_defs(void) +static int load_tag_defs(void) { PyObject *tempmod, *globals = PyEval_GetGlobals(), @@ -2391,33 +2406,48 @@ static void load_tag_defs(void) char *name = "mwparserfromhell"; if (!fromlist || !modname) - return; + return -1; PyList_SET_ITEM(fromlist, 0, modname); tempmod = PyImport_ImportModuleLevel(name, globals, locals, fromlist, 0); Py_DECREF(fromlist); if (!tempmod) - return; + return -1; tag_defs = PyObject_GetAttrString(tempmod, "tag_defs"); Py_DECREF(tempmod); + return 0; } -PyMODINIT_FUNC init_tokenizer(void) +#ifdef IS_PY3K + #define INIT_ERROR return NULL + PyMODINIT_FUNC PyInit__tokenizer(void) +#else + #define INIT_ERROR return + PyMODINIT_FUNC init_tokenizer(void) +#endif { PyObject *module; TokenizerType.tp_new = 
PyType_GenericNew; if (PyType_Ready(&TokenizerType) < 0) - return; - module = Py_InitModule("_tokenizer", module_methods); + INIT_ERROR; +#ifdef IS_PY3K + module = PyModule_Create(&module_def); +#else + module = Py_InitModule("_tokenizer", NULL); +#endif + if (!module) + INIT_ERROR; Py_INCREF(&TokenizerType); PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType); Py_INCREF(Py_True); PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True); - EMPTY = PyUnicode_FromString(""); NOARGS = PyTuple_New(0); - - load_entitydefs(); - load_tokens(); - load_tag_defs(); + if (!EMPTY || !NOARGS) + INIT_ERROR; + if (load_entitydefs() || load_tokens() || load_tag_defs()) + INIT_ERROR; +#ifdef IS_PY3K + return module; +#endif } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index d5f755d..2bf6973 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -28,6 +28,7 @@ SOFTWARE. #include #include #include +#include #if PY_MAJOR_VERSION >= 3 #define IS_PY3K @@ -253,27 +254,18 @@ static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); /* More structs for creating the Tokenizer type: */ -static PyMethodDef -Tokenizer_methods[] = { +static PyMethodDef Tokenizer_methods[] = { {"tokenize", (PyCFunction) Tokenizer_tokenize, METH_VARARGS, "Build a list of tokens from a string of wikicode and return it."}, {NULL} }; -static PyMemberDef -Tokenizer_members[] = { +static PyMemberDef Tokenizer_members[] = { {NULL} }; -static PyMethodDef -module_methods[] = { - {NULL} -}; - -static PyTypeObject -TokenizerType = { - PyObject_HEAD_INIT(NULL) - 0, /* ob_size */ +static PyTypeObject TokenizerType = { + PyVarObject_HEAD_INIT(NULL, 0) "_tokenizer.CTokenizer", /* tp_name */ sizeof(Tokenizer), /* tp_basicsize */ 0, /* tp_itemsize */ @@ -312,3 +304,12 @@ TokenizerType = { 0, /* tp_alloc */ Tokenizer_new, /* tp_new */ }; + +#ifdef IS_PY3K +static PyModuleDef module_def = { + PyModuleDef_HEAD_INIT, + "_tokenizer", + "Creates a list of tokens from a string of wikicode.", + -1, NULL, NULL, NULL, NULL, NULL +}; +#endif diff --git a/setup.py b/setup.py index 8b4ae86..5e6d779 100644 --- a/setup.py +++ b/setup.py @@ -29,16 +29,13 @@ from mwparserfromhell.compat import py3k with open("README.rst") as fp: long_docs = fp.read() -# builder = Extension("mwparserfromhell.parser._builder", -# sources = ["mwparserfromhell/parser/builder.c"]) - tokenizer = Extension("mwparserfromhell.parser._tokenizer", sources = ["mwparserfromhell/parser/tokenizer.c"]) setup( name = "mwparserfromhell", packages = find_packages(exclude=("tests",)), - ext_modules = [] if py3k else [tokenizer], + ext_modules = [tokenizer], test_suite = "tests", version = __version__, author = "Ben Kurtovic", From e02ad8239f16d78e216e8f809ee162c8213e2287 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 15 Aug 2013 00:11:42 -0400 Subject: [PATCH 131/189] Make load_entitydefs() work on Python 3. 
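For context (not part of the original commit): the Python-level equivalent of the module switch this patch makes in C is the usual try/except import below. Only standard-library names are used; nothing project-specific is assumed.

    # htmlentitydefs became html.entities in Python 3, and entitydefs maps
    # entity names ("nbsp", "amp", ...) to their replacement text. On Python 3
    # the values are str, which is why the C code also grows a
    # PyUnicode_AsASCIIString() conversion in the diff below.
    try:
        from html.entities import entitydefs   # Python 3
    except ImportError:
        from htmlentitydefs import entitydefs  # Python 2

    print("nbsp" in entitydefs, len(entitydefs))
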
--- mwparserfromhell/parser/tokenizer.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 60223e1..b3ad3ec 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2307,6 +2307,9 @@ static int load_entitydefs(void) { PyObject *tempmod, *defmap, *deflist; unsigned numdefs, i; +#ifdef IS_PY3K + PyObject *string; +#endif #ifdef IS_PY3K tempmod = PyImport_ImportModule("html.entities"); @@ -2328,7 +2331,15 @@ static int load_entitydefs(void) if (!entitydefs) return -1; for (i = 0; i < numdefs; i++) { +#ifdef IS_PY3K + string = PyUnicode_AsASCIIString(PyList_GET_ITEM(deflist, i)); + if (!string) + return -1; + entitydefs[i] = PyBytes_AsString(string); + Py_DECREF(string); +#else entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i)); +#endif if (!entitydefs[i]) return -1; } From b5ec7f3bebc71ba161c7a1533032abb5cb6211f0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 15 Aug 2013 00:33:22 -0400 Subject: [PATCH 132/189] Fix py3k module importing; stick a bunch of macros in one place. --- mwparserfromhell/parser/tokenizer.c | 24 +++++------------------- mwparserfromhell/parser/tokenizer.h | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index b3ad3ec..f6eea84 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2311,11 +2311,7 @@ static int load_entitydefs(void) PyObject *string; #endif -#ifdef IS_PY3K - tempmod = PyImport_ImportModule("html.entities"); -#else - tempmod = PyImport_ImportModule("htmlentitydefs"); -#endif + tempmod = PyImport_ImportModule(ENTITYDEFS_MODULE); if (!tempmod) return -1; defmap = PyObject_GetAttrString(tempmod, "entitydefs"); @@ -2353,7 +2349,7 @@ static int load_tokens(void) *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(), *fromlist = PyList_New(1), - *modname = PyBytes_FromString("tokens"); + *modname = IMPORT_NAME_FUNC("tokens"); char *name = "mwparserfromhell.parser"; if (!fromlist || !modname) @@ -2413,7 +2409,7 @@ static int load_tag_defs(void) *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(), *fromlist = PyList_New(1), - *modname = PyBytes_FromString("tag_defs"); + *modname = IMPORT_NAME_FUNC("tag_defs"); char *name = "mwparserfromhell"; if (!fromlist || !modname) @@ -2428,24 +2424,14 @@ static int load_tag_defs(void) return 0; } -#ifdef IS_PY3K - #define INIT_ERROR return NULL - PyMODINIT_FUNC PyInit__tokenizer(void) -#else - #define INIT_ERROR return - PyMODINIT_FUNC init_tokenizer(void) -#endif +PyMODINIT_FUNC INIT_FUNC_NAME(void) { PyObject *module; TokenizerType.tp_new = PyType_GenericNew; if (PyType_Ready(&TokenizerType) < 0) INIT_ERROR; -#ifdef IS_PY3K - module = PyModule_Create(&module_def); -#else - module = Py_InitModule("_tokenizer", NULL); -#endif + module = CREATE_MODULE; if (!module) INIT_ERROR; Py_INCREF(&TokenizerType); diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 2bf6973..1229688 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -252,6 +252,23 @@ static PyObject* Tokenizer_parse(Tokenizer*, int, int); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); +/* Macros for Python 2/3 compatibility: */ + +#ifdef IS_PY3K + #define IMPORT_NAME_FUNC PyUnicode_FromString + #define CREATE_MODULE PyModule_Create(&module_def); + #define 
ENTITYDEFS_MODULE "html.entities" + #define INIT_FUNC_NAME PyInit__tokenizer + #define INIT_ERROR return NULL +#else + #define IMPORT_NAME_FUNC PyBytes_FromString + #define CREATE_MODULE Py_InitModule("_tokenizer", NULL); + #define ENTITYDEFS_MODULE "htmlentitydefs" + #define INIT_FUNC_NAME init_tokenizer + #define INIT_ERROR return +#endif + + /* More structs for creating the Tokenizer type: */ static PyMethodDef Tokenizer_methods[] = { From db86176c085063f1b5227dc3e99c003ef443f0fc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 15 Aug 2013 00:41:24 -0400 Subject: [PATCH 133/189] wiki_markup attr should be unicode, not bytes --- mwparserfromhell/parser/tokenizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index f6eea84..c08fb11 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1697,7 +1697,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag, { PyObject *markup, *kwargs; - markup = PyBytes_FromString(ticks); + markup = PyUnicode_FromString(ticks); if (!markup) return -1; kwargs = PyDict_New(); From 5e8e050ca3421e20c0462df071ff83e8c47ff703 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 15 Aug 2013 00:53:04 -0400 Subject: [PATCH 134/189] A few tweaks; py3k support now complete. --- mwparserfromhell/parser/tokenizer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c08fb11..88ca3f2 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1017,7 +1017,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) self->head++; continue; } - if (i >= 8) + if (i >= MAX_ENTITY_SIZE) FAIL_ROUTE_AND_EXIT() for (j = 0; j < NUM_MARKERS; j++) { if (this == *MARKERS[j]) @@ -2332,7 +2332,6 @@ static int load_entitydefs(void) if (!string) return -1; entitydefs[i] = PyBytes_AsString(string); - Py_DECREF(string); #else entitydefs[i] = PyBytes_AsString(PyList_GET_ITEM(deflist, i)); #endif From 8923d96a57b35d3b911733719099107671ff84d6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 15 Aug 2013 01:19:39 -0400 Subject: [PATCH 135/189] More unification. --- mwparserfromhell/parser/tokenizer.c | 16 ++++++---------- mwparserfromhell/parser/tokenizer.h | 2 ++ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 88ca3f2..8dace5a 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -835,11 +835,7 @@ static int Tokenizer_parse_heading(Tokenizer* self) self->global ^= GL_HEADING; return 0; } -#ifdef IS_PY3K - level = PyLong_FromSsize_t(heading->level); -#else - level = PyInt_FromSsize_t(heading->level); -#endif + level = NEW_INT_FUNC(heading->level); if (!level) { Py_DECREF(heading->title); free(heading); @@ -2279,7 +2275,11 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) { PyObject *text, *temp; - if (!PyArg_ParseTuple(args, "U", &text)) { + if (PyArg_ParseTuple(args, "U", &text)) { + Py_XDECREF(self->text); + self->text = PySequence_Fast(text, "expected a sequence"); + } + else { const char* encoded; Py_ssize_t size; /* Failed to parse a Unicode object; try a string instead. 
*/ @@ -2294,10 +2294,6 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) Py_XDECREF(temp); self->text = text; } - else { - Py_XDECREF(self->text); - self->text = PySequence_Fast(text, "expected a sequence"); - } self->head = self->global = self->depth = self->cycles = 0; self->length = PyList_GET_SIZE(self->text); return Tokenizer_parse(self, 0, 1); diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 1229688..264360e 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -255,12 +255,14 @@ static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); /* Macros for Python 2/3 compatibility: */ #ifdef IS_PY3K + #define NEW_INT_FUNC PyLong_FromSsize_t #define IMPORT_NAME_FUNC PyUnicode_FromString #define CREATE_MODULE PyModule_Create(&module_def); #define ENTITYDEFS_MODULE "html.entities" #define INIT_FUNC_NAME PyInit__tokenizer #define INIT_ERROR return NULL #else + #define NEW_INT_FUNC PyInt_FromSsize_t #define IMPORT_NAME_FUNC PyBytes_FromString #define CREATE_MODULE Py_InitModule("_tokenizer", NULL); #define ENTITYDEFS_MODULE "htmlentitydefs" From 3e5c41d4dd02f0ac16282945ee79f2a9caff64b3 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 15 Aug 2013 02:48:52 -0400 Subject: [PATCH 136/189] Add py3k tokenizer to changelog. --- CHANGELOG | 1 + docs/changelog.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index f7858d8..84193e1 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,7 @@ v0.3 (unreleased): of False. This is a breaking change if you rely on any filter() methods being non-recursive by default. - Added a matches() method to Wikicode for page/template name comparisons. +- The C tokenizer extension now works on Python 3 in addition to Python 2.7. - Various fixes and cleanup. v0.2 (released June 20, 2013): diff --git a/docs/changelog.rst b/docs/changelog.rst index 8cfa2ec..78a27ad 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -16,6 +16,7 @@ Unreleased if you rely on any filter() methods being non-recursive by default.** - Added a :py:meth:`.matches` method to :py:class:`~.Wikicode` for page/template name comparisons. +- The C tokenizer extension now works on Python 3 in addition to Python 2.7. - Various fixes and cleanup. v0.2 From 0d934f8ad11004bbc4e0230a051f1c14e4cd63ea Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 15 Aug 2013 18:52:17 -0400 Subject: [PATCH 137/189] Squash a couple memory leaks. 
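Not from the original commit, and only a rough sketch: one way to notice reference leaks like the ones fixed here is to run the parser over the same input in a tight loop and watch peak memory, which should flatten out rather than climb steadily. This assumes only the top-level mwparserfromhell.parse() call and the Unix-only standard-library resource module; the input string is illustrative.

    import resource
    import mwparserfromhell

    TEXT = "<ref name=foo>bar</ref> {{baz}} <br/>"

    def peak_rss():
        # ru_maxrss is kilobytes on Linux and bytes on macOS.
        return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss

    before = peak_rss()
    for _ in range(50000):
        mwparserfromhell.parse(TEXT)
    print("peak RSS grew by:", peak_rss() - before)
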
--- mwparserfromhell/parser/tokenizer.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 6600203..67a4ae6 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1646,11 +1646,13 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) } if (!IS_SINGLE_ONLY(name)) FAIL_ROUTE(0); + Py_DECREF(name); break; } Textbuffer_write(&buf, this); pos++; } + Textbuffer_dealloc(buf); if (!BAD_ROUTE) { tag = Tokenizer_really_parse_tag(self); if (!tag) @@ -1664,7 +1666,12 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) // Set invalid=True flag of TagOpenOpen if (PyObject_SetAttrString(PyList_GET_ITEM(tag, 0), "invalid", Py_True)) return -1; - return Tokenizer_emit_all(self, tag); + if (Tokenizer_emit_all(self, tag)) { + Py_DECREF(tag); + return -1; + } + Py_DECREF(tag); + return 0; } /* @@ -1685,7 +1692,10 @@ static int Tokenizer_parse_tag(Tokenizer* self) if (!tag) { return -1; } - Tokenizer_emit_all(self, tag); + if (Tokenizer_emit_all(self, tag)) { + Py_DECREF(tag); + return -1; + } Py_DECREF(tag); return 0; } From 4ef29c2611c1652410d8867ca62540b4866abd36 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 17 Aug 2013 19:55:52 -0400 Subject: [PATCH 138/189] Clean up Wikicode; unify tests for _do_search()-using methods. --- mwparserfromhell/wikicode.py | 9 +++-- tests/test_wikicode.py | 81 ++++++++++++++++---------------------------- 2 files changed, 34 insertions(+), 56 deletions(-) diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index a1921d7..b814ee5 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -108,8 +108,8 @@ class Wikicode(StringMixIn): def _do_search(self, obj, recursive, callback, context, *args, **kwargs): """Look within *context* for *obj*, executing *callback* if found. - If *recursive* is ``True``, we'll look within context and its - descendants, otherwise we'll just execute callback. We raise + If *recursive* is ``True``, we'll look within *context* and its + descendants, otherwise we'll just execute *callback*. We raise :py:exc:`ValueError` if *obj* isn't in our node list or context. If found, *callback* is passed the context, the index of the node within the context, and whatever were passed as ``*args`` and ``**kwargs``. 
@@ -375,9 +375,8 @@ class Wikicode(StringMixIn): """ if matches: matches = r"^(=+?)\s*" + matches + r"\s*\1$" - headings = self.filter_headings(recursive=True) - filtered = self.filter_headings(recursive=True, matches=matches, - flags=flags) + headings = self.filter_headings() + filtered = self.filter_headings(matches=matches, flags=flags) if levels: filtered = [head for head in filtered if head.level in levels] diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py index 5a28fb5..2ad22dd 100644 --- a/tests/test_wikicode.py +++ b/tests/test_wikicode.py @@ -122,66 +122,45 @@ class TestWikicode(TreeEqualityTestCase): code3.insert(-1000, "derp") self.assertEqual("derp{{foo}}bar[[baz]]", code3) - def test_insert_before(self): - """test Wikicode.insert_before()""" + def _test_search(self, meth, expected): + """Base test for insert_before(), insert_after(), and replace().""" code = parse("{{a}}{{b}}{{c}}{{d}}") - code.insert_before("{{b}}", "x", recursive=True) - code.insert_before("{{d}}", "[[y]]", recursive=False) - self.assertEqual("{{a}}x{{b}}{{c}}[[y]]{{d}}", code) - code.insert_before(code.get(2), "z") - self.assertEqual("{{a}}xz{{b}}{{c}}[[y]]{{d}}", code) - self.assertRaises(ValueError, code.insert_before, "{{r}}", "n", - recursive=True) - self.assertRaises(ValueError, code.insert_before, "{{r}}", "n", - recursive=False) + func = getattr(code, meth) + func("{{b}}", "x", recursive=True) + func("{{d}}", "[[y]]", recursive=False) + self.assertEqual(expected[0], code) + func(code.get(2), "z") + self.assertEqual(expected[1], code) + self.assertRaises(ValueError, func, "{{r}}", "n", recursive=True) + self.assertRaises(ValueError, func, "{{r}}", "n", recursive=False) code2 = parse("{{a|{{b}}|{{c|d={{f}}}}}}") - code2.insert_before(code2.get(0).params[0].value.get(0), "x", - recursive=True) - code2.insert_before("{{f}}", "y", recursive=True) - self.assertEqual("{{a|x{{b}}|{{c|d=y{{f}}}}}}", code2) - self.assertRaises(ValueError, code2.insert_before, "{{f}}", "y", - recursive=False) + func = getattr(code2, meth) + func(code2.get(0).params[0].value.get(0), "x", recursive=True) + func("{{f}}", "y", recursive=True) + self.assertEqual(expected[2], code2) + self.assertRaises(ValueError, func, "{{f}}", "y", recursive=False) + + def test_insert_before(self): + """test Wikicode.insert_before()""" + expected = [ + "{{a}}x{{b}}{{c}}[[y]]{{d}}", "{{a}}xz{{b}}{{c}}[[y]]{{d}}", + "{{a|x{{b}}|{{c|d=y{{f}}}}}}"] + self._test_search("insert_before", expected) def test_insert_after(self): """test Wikicode.insert_after()""" - code = parse("{{a}}{{b}}{{c}}{{d}}") - code.insert_after("{{b}}", "x", recursive=True) - code.insert_after("{{d}}", "[[y]]", recursive=False) - self.assertEqual("{{a}}{{b}}x{{c}}{{d}}[[y]]", code) - code.insert_after(code.get(2), "z") - self.assertEqual("{{a}}{{b}}xz{{c}}{{d}}[[y]]", code) - self.assertRaises(ValueError, code.insert_after, "{{r}}", "n", - recursive=True) - self.assertRaises(ValueError, code.insert_after, "{{r}}", "n", - recursive=False) - - code2 = parse("{{a|{{b}}|{{c|d={{f}}}}}}") - code2.insert_after(code2.get(0).params[0].value.get(0), "x", - recursive=True) - code2.insert_after("{{f}}", "y", recursive=True) - self.assertEqual("{{a|{{b}}x|{{c|d={{f}}y}}}}", code2) - self.assertRaises(ValueError, code2.insert_after, "{{f}}", "y", - recursive=False) + expected = [ + "{{a}}{{b}}x{{c}}{{d}}[[y]]", "{{a}}{{b}}xz{{c}}{{d}}[[y]]", + "{{a|{{b}}x|{{c|d={{f}}y}}}}"] + self._test_search("insert_after", expected) def test_replace(self): """test Wikicode.replace()""" - code = 
parse("{{a}}{{b}}{{c}}{{d}}") - code.replace("{{b}}", "x", recursive=True) - code.replace("{{d}}", "[[y]]", recursive=False) - self.assertEqual("{{a}}x{{c}}[[y]]", code) - code.replace(code.get(1), "z") - self.assertEqual("{{a}}z{{c}}[[y]]", code) - self.assertRaises(ValueError, code.replace, "{{r}}", "n", - recursive=True) - self.assertRaises(ValueError, code.replace, "{{r}}", "n", - recursive=False) - - code2 = parse("{{a|{{b}}|{{c|d={{f}}}}}}") - code2.replace(code2.get(0).params[0].value.get(0), "x", recursive=True) - code2.replace("{{f}}", "y", recursive=True) - self.assertEqual("{{a|x|{{c|d=y}}}}", code2) - self.assertRaises(ValueError, code2.replace, "y", "z", recursive=False) + expected = [ + "{{a}}x{{c}}[[y]]", "{{a}}xz[[y]]", "{{a|x|{{c|d=y}}}}" + ] + self._test_search("replace", expected) def test_append(self): """test Wikicode.append()""" From 1d9340b965e246bf42ec3e07227db30157bbbe54 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 02:11:21 -0400 Subject: [PATCH 139/189] Tests for the _do_search() wrapper funcs' newly expanded scopes. --- tests/test_wikicode.py | 120 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 107 insertions(+), 13 deletions(-) diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py index 2ad22dd..409f888 100644 --- a/tests/test_wikicode.py +++ b/tests/test_wikicode.py @@ -133,33 +133,84 @@ class TestWikicode(TreeEqualityTestCase): self.assertEqual(expected[1], code) self.assertRaises(ValueError, func, "{{r}}", "n", recursive=True) self.assertRaises(ValueError, func, "{{r}}", "n", recursive=False) + fake = parse("{{a}}").get(0) + self.assertRaises(ValueError, func, fake, "n", recursive=True) + self.assertRaises(ValueError, func, fake, "n", recursive=False) - code2 = parse("{{a|{{b}}|{{c|d={{f}}}}}}") + code2 = parse("{{a}}{{a}}{{a}}{{b}}{{b}}{{b}}") func = getattr(code2, meth) - func(code2.get(0).params[0].value.get(0), "x", recursive=True) - func("{{f}}", "y", recursive=True) + func(code2.get(1), "c", recursive=False) + func("{{a}}", "d", recursive=False) + func(code2.get(-1), "e", recursive=True) + func("{{b}}", "f", recursive=True) self.assertEqual(expected[2], code2) + + code3 = parse("{{a|{{b}}|{{c|d={{f}}}}}}") + func = getattr(code3, meth) + obj = code3.get(0).params[0].value.get(0) + self.assertRaises(ValueError, func, obj, "x", recursive=False) + func(obj, "x", recursive=True) self.assertRaises(ValueError, func, "{{f}}", "y", recursive=False) + func("{{f}}", "y", recursive=True) + self.assertEqual(expected[3], code3) + + code4 = parse("{{a}}{{b}}{{c}}{{d}}{{e}}{{f}}{{g}}") + func = getattr(code4, meth) + fake = parse("{{b}}{{c}}") + self.assertRaises(ValueError, func, fake, "q", recursive=False) + self.assertRaises(ValueError, func, fake, "q", recursive=True) + func("{{b}}{{c}}", "w", recursive=False) + func("{{d}}{{e}}", "x", recursive=True) + func(wrap(code4.nodes[-2:]), "y", recursive=False) + func(wrap(code4.nodes[-2:]), "z", recursive=True) + self.assertEqual(expected[4], code4) + self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=False) + self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=True) + + code5 = parse("{{a|{{b}}{{c}}|{{f|{{g}}={{h}}{{i}}}}}}") + func = getattr(code5, meth) + self.assertRaises(ValueError, func, "{{b}}{{c}}", "x", recursive=False) + func("{{b}}{{c}}", "x", recursive=True) + obj = code5.get(0).params[1].value.get(0).params[0].value + self.assertRaises(ValueError, func, obj, "y", recursive=False) + func(obj, "y", recursive=True) + 
self.assertEqual(expected[5], code5) + + code6 = parse("here is {{some text and a {{template}}}}") + func = getattr(code6, meth) + self.assertRaises(ValueError, func, "text and", "ab", recursive=False) + func("text and", "ab", recursive=True) + self.assertRaises(ValueError, func, "is {{some", "cd", recursive=False) + func("is {{some", "cd", recursive=True) + self.assertEqual(expected[6], code6) def test_insert_before(self): """test Wikicode.insert_before()""" expected = [ "{{a}}x{{b}}{{c}}[[y]]{{d}}", "{{a}}xz{{b}}{{c}}[[y]]{{d}}", - "{{a|x{{b}}|{{c|d=y{{f}}}}}}"] + "d{{a}}cd{{a}}d{{a}}f{{b}}f{{b}}ef{{b}}", + "{{a|x{{b}}|{{c|d=y{{f}}}}}}", + "{{a}}w{{b}}{{c}}x{{d}}{{e}}yz{{f}}{{g}}", + "{{a|x{{b}}{{c}}|{{f|{{g}}=y{{h}}{{i}}}}}}", + "here cdis {{some abtext and a {{template}}}}"] self._test_search("insert_before", expected) def test_insert_after(self): """test Wikicode.insert_after()""" expected = [ "{{a}}{{b}}x{{c}}{{d}}[[y]]", "{{a}}{{b}}xz{{c}}{{d}}[[y]]", - "{{a|{{b}}x|{{c|d={{f}}y}}}}"] + "{{a}}d{{a}}dc{{a}}d{{b}}f{{b}}f{{b}}fe", + "{{a|{{b}}x|{{c|d={{f}}y}}}}", + "{{a}}{{b}}{{c}}w{{d}}{{e}}x{{f}}{{g}}yz", + "{{a|{{b}}{{c}}x|{{f|{{g}}={{h}}{{i}}y}}}}", + "here is {{somecd text andab a {{template}}}}"] self._test_search("insert_after", expected) def test_replace(self): """test Wikicode.replace()""" expected = [ - "{{a}}x{{c}}[[y]]", "{{a}}xz[[y]]", "{{a|x|{{c|d=y}}}}" - ] + "{{a}}x{{c}}[[y]]", "{{a}}xz[[y]]", "dcdffe", "{{a|x|{{c|d=y}}}}", + "{{a}}wz", "{{a|x|{{f|{{g}}=y}}}}", "here cd ab a {{template}}}}"] self._test_search("replace", expected) def test_append(self): @@ -182,12 +233,55 @@ class TestWikicode(TreeEqualityTestCase): self.assertEqual("{{a}}{{d}}", code) self.assertRaises(ValueError, code.remove, "{{r}}", recursive=True) self.assertRaises(ValueError, code.remove, "{{r}}", recursive=False) - - code2 = parse("{{a|{{b}}|{{c|d={{f}}{{h}}}}}}") - code2.remove(code2.get(0).params[0].value.get(0), recursive=True) - code2.remove("{{f}}", recursive=True) - self.assertEqual("{{a||{{c|d={{h}}}}}}", code2) - self.assertRaises(ValueError, code2.remove, "{{h}}", recursive=False) + fake = parse("{{a}}").get(0) + self.assertRaises(ValueError, code.remove, fake, recursive=True) + self.assertRaises(ValueError, code.remove, fake, recursive=False) + + code2 = parse("{{a}}{{a}}{{a}}{{b}}{{b}}{{b}}") + code2.remove(code2.get(1), recursive=False) + self.assertEqual("{{a}}{{a}}{{b}}{{b}}{{b}}", code2) + code2.remove("{{a}}", recursive=False) + self.assertEqual("{{b}}{{b}}{{b}}", code2) + code2.remove(code2.get(-1), recursive=True) + self.assertEqual("{{b}}{{b}}", code2) + code2.remove("{{b}}", recursive=True) + self.assertEqual("", code2) + + code3 = parse("{{a|{{b}}|{{c|d={{f}}}}}}") + obj = code3.get(0).params[0].value.get(0) + self.assertRaises(ValueError, code3.remove, obj, recursive=False) + code3.remove(obj, recursive=True) + self.assertRaises(ValueError, code3.remove, "{{f}}", recursive=False) + code3.remove("{{f}}", recursive=True) + self.assertEqual("{{a||{{c|d=}}}}", code3) + + code4 = parse("{{a}}{{b}}{{c}}{{d}}{{e}}{{f}}{{g}}{{h}}{{i}}{{j}}") + fake = parse("{{b}}{{c}}") + self.assertRaises(ValueError, code4.remove, fake, recursive=False) + self.assertRaises(ValueError, code4.remove, fake, recursive=True) + code4.remove("{{b}}{{c}}", recursive=False) + code4.remove("{{d}}{{e}}", recursive=True) + code4.remove(wrap(code4.nodes[-2:]), recursive=False) + code4.remove(wrap(code4.nodes[-2:]), recursive=True) + self.assertEqual("{{a}}{{f}}", code4) + self.assertRaises(ValueError, 
code4.remove, "{{a}}{{b}}", False) + self.assertRaises(ValueError, code4.remove, "{{a}}{{b}}", True) + + code5 = parse("{{a|{{b}}{{c}}|{{f|{{g}}={{h}}{{i}}}}}}") + self.assertRaises(ValueError, code5.remove, "{{b}}{{c}}", False) + code5.remove("{{b}}{{c}}", recursive=True) + obj = code5.get(0).params[1].value.get(0).params[0].value + self.assertRaises(ValueError, code5.remove, obj, recursive=False) + code5.remove(obj, recursive=True) + self.assertEqual("{{a||{{f|{{g}}=}}}}", code5) + + code6 = parse("here is {{some text and a {{template}}}}") + func = code6.remove + self.assertRaises(ValueError, func, "text and", recursive=False) + func("text and", recursive=True) + self.assertRaises(ValueError, func, "is {{some", recursive=False) + func("is {{some", recursive=True) + self.assertEqual("here a {{template}}}}", code6) def test_matches(self): """test Wikicode.matches()""" From bda12c4c363318e043a078e4fcc60752228f3322 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 02:12:20 -0400 Subject: [PATCH 140/189] Reimplement Wikicode._do_search() to support more input types. --- mwparserfromhell/wikicode.py | 167 +++++++++++++++++++++++++++---------------- 1 file changed, 107 insertions(+), 60 deletions(-) diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index b814ee5..5c4d66a 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -60,19 +60,6 @@ class Wikicode(StringMixIn): for context, child in node.__iternodes__(self._get_all_nodes): yield child - def _get_context(self, node, obj): - """Return a ``Wikicode`` that contains *obj* in its descendants. - - The closest (shortest distance from *node*) suitable ``Wikicode`` will - be returned, or ``None`` if the *obj* is the *node* itself. - - Raises ``ValueError`` if *obj* is not within *node*. - """ - for context, child in node.__iternodes__(self._get_all_nodes): - if self._is_equivalent(obj, child): - return context - raise ValueError(obj) - def _get_all_nodes(self, code): """Iterate over all of our descendant nodes. @@ -105,26 +92,54 @@ class Wikicode(StringMixIn): return False return obj in nodes - def _do_search(self, obj, recursive, callback, context, *args, **kwargs): - """Look within *context* for *obj*, executing *callback* if found. + def _do_search(self, obj, recursive, context=None, literal=None): + """Return some info about the location of *obj* within *context*. - If *recursive* is ``True``, we'll look within *context* and its - descendants, otherwise we'll just execute *callback*. We raise - :py:exc:`ValueError` if *obj* isn't in our node list or context. If - found, *callback* is passed the context, the index of the node within - the context, and whatever were passed as ``*args`` and ``**kwargs``. + If *recursive* is ``True``, we'll look within *context* (``self`` by + default) and its descendants, otherwise just *context*. We raise + :py:exc:`ValueError` if *obj* isn't found. The return data is a list of + 3-tuples (*type*, *context*, *data*) where *type* is *obj*\ 's best + type resolution (either ``Node``, ``Wikicode``, or ``str``), *context* + is the closest ``Wikicode`` encompassing it, and *data* is either a + ``Node``, a list of ``Node``\ s, or ``None`` depending on *type*. 
""" - if recursive: - for i, node in enumerate(context.nodes): - if self._is_equivalent(obj, node): - return callback(context, i, *args, **kwargs) - if self._contains(self._get_children(node), obj): - context = self._get_context(node, obj) - return self._do_search(obj, recursive, callback, context, - *args, **kwargs) - raise ValueError(obj) + if not context: + context = self + literal = isinstance(obj, (Node, Wikicode)) + obj = parse_anything(obj) + if not obj or obj not in self: + raise ValueError(obj) + if len(obj.nodes) == 1: + obj = obj.get(0) + + compare = lambda a, b: (a is b) if literal else (a == b) + results = [] + i = 0 + while i < len(context.nodes): + node = context.get(i) + if isinstance(obj, Node) and compare(obj, node): + results.append((Node, context, node)) + elif isinstance(obj, Wikicode) and compare(obj.get(0), node): + for j in range(1, len(obj.nodes)): + if not compare(obj.get(j), context.get(i + j)): + break + else: + nodes = list(context.nodes[i:i + len(obj.nodes)]) + results.append((Wikicode, context, nodes)) + i += len(obj.nodes) - 1 + elif recursive: + contexts = node.__iternodes__(self._get_all_nodes) + for code in {ctx for ctx, child in contexts}: + if code and obj in code: + search = self._do_search(obj, recursive, code, literal) + results.extend(search) + i += 1 - callback(context, self.index(obj, recursive=False), *args, **kwargs) + if not results and not literal and recursive: + results.append((str, context, None)) + if not results and context is self: + raise ValueError(obj) + return results def _get_tree(self, code, lines, marker, indent): """Build a tree to illustrate the way the Wikicode object was parsed. @@ -253,41 +268,64 @@ class Wikicode(StringMixIn): def insert_before(self, obj, value, recursive=True): """Insert *value* immediately before *obj* in the list of nodes. - *obj* can be either a string or a :py:class:`~.Node`. *value* can be - anything parasable by :py:func:`.parse_anything`. If *recursive* is - ``True``, we will try to find *obj* within our child nodes even if it - is not a direct descendant of this :py:class:`~.Wikicode` object. If - *obj* is not in the node list, :py:exc:`ValueError` is raised. + *obj* can be either a string, a :py:class:`~.Node`, or other + :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, + for example). *value* can be anything parasable by + :py:func:`.parse_anything`. If *recursive* is ``True``, we will try to + find *obj* within our child nodes even if it is not a direct descendant + of this :py:class:`~.Wikicode` object. If *obj* is not found, + :py:exc:`ValueError` is raised. """ - callback = lambda self, i, value: self.insert(i, value) - self._do_search(obj, recursive, callback, self, value) + for restype, context, data in self._do_search(obj, recursive): + if restype in (Node, Wikicode): + i = context.index(data if restype is Node else data[0], False) + context.insert(i, value) + else: + obj = str(obj) + context.nodes = str(context).replace(obj, str(value) + obj) def insert_after(self, obj, value, recursive=True): """Insert *value* immediately after *obj* in the list of nodes. - *obj* can be either a string or a :py:class:`~.Node`. *value* can be - anything parasable by :py:func:`.parse_anything`. If *recursive* is - ``True``, we will try to find *obj* within our child nodes even if it - is not a direct descendant of this :py:class:`~.Wikicode` object. If - *obj* is not in the node list, :py:exc:`ValueError` is raised. 
+ *obj* can be either a string, a :py:class:`~.Node`, or other + :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, + for example). *value* can be anything parasable by + :py:func:`.parse_anything`. If *recursive* is ``True``, we will try to + find *obj* within our child nodes even if it is not a direct descendant + of this :py:class:`~.Wikicode` object. If *obj* is not found, + :py:exc:`ValueError` is raised. """ - callback = lambda self, i, value: self.insert(i + 1, value) - self._do_search(obj, recursive, callback, self, value) + for restype, context, data in self._do_search(obj, recursive): + if restype in (Node, Wikicode): + i = context.index(data if restype is Node else data[-1], False) + context.insert(i + 1, value) + else: + obj = str(obj) + context.nodes = str(context).replace(obj, obj + str(value)) def replace(self, obj, value, recursive=True): """Replace *obj* with *value* in the list of nodes. - *obj* can be either a string or a :py:class:`~.Node`. *value* can be - anything parasable by :py:func:`.parse_anything`. If *recursive* is - ``True``, we will try to find *obj* within our child nodes even if it - is not a direct descendant of this :py:class:`~.Wikicode` object. If - *obj* is not in the node list, :py:exc:`ValueError` is raised. + *obj* can be either a string, a :py:class:`~.Node`, or other + :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, + for example). *value* can be anything parasable by + :py:func:`.parse_anything`. If *recursive* is ``True``, we will try to + find *obj* within our child nodes even if it is not a direct descendant + of this :py:class:`~.Wikicode` object. If *obj* is not found, + :py:exc:`ValueError` is raised. """ - def callback(self, i, value): - self.nodes.pop(i) - self.insert(i, value) - - self._do_search(obj, recursive, callback, self, value) + for restype, context, data in self._do_search(obj, recursive): + if restype is Node: + i = context.index(data, False) + context.nodes.pop(i) + context.insert(i, value) + elif restype is Wikicode: + i = context.index(data[0], False) + for _ in data: + context.nodes.pop(i) + context.insert(i, value) + else: + context.nodes = str(context).replace(str(obj), str(value)) def append(self, value): """Insert *value* at the end of the list of nodes. @@ -301,13 +339,22 @@ class Wikicode(StringMixIn): def remove(self, obj, recursive=True): """Remove *obj* from the list of nodes. - *obj* can be either a string or a :py:class:`~.Node`. If *recursive* is - ``True``, we will try to find *obj* within our child nodes even if it - is not a direct descendant of this :py:class:`~.Wikicode` object. If - *obj* is not in the node list, :py:exc:`ValueError` is raised. + *obj* can be either a string, a :py:class:`~.Node`, or other + :py:class:`~.Wikicode` object (as created by :py:meth:`get_sections`, + for example). If *recursive* is ``True``, we will try to find *obj* + within our child nodes even if it is not a direct descendant of this + :py:class:`~.Wikicode` object. If *obj* is not found, + :py:exc:`ValueError` is raised. 
""" - callback = lambda self, i: self.nodes.pop(i) - self._do_search(obj, recursive, callback, self) + for restype, context, data in self._do_search(obj, recursive): + if restype is Node: + context.nodes.pop(context.index(data, False)) + elif restype is Wikicode: + i = context.index(data[0], False) + for _ in data: + context.nodes.pop(i) + else: + context.nodes = str(context).replace(str(obj), "") def matches(self, other): """Do a loose equivalency test suitable for comparing page names. From 06efcd0b01484572a0677918f837b6c7141862a0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 02:18:29 -0400 Subject: [PATCH 141/189] Update changelog re: expanded _do_search() methods (closes #34). --- CHANGELOG | 4 ++++ docs/changelog.rst | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 84193e1..3876562 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -7,6 +7,10 @@ v0.3 (unreleased): of False. This is a breaking change if you rely on any filter() methods being non-recursive by default. - Added a matches() method to Wikicode for page/template name comparisons. +- The 'obj' param of Wikicode.insert_before(), insert_after(), replace(), and + remove() now accepts other Wikicode objects and strings representing parts of + wikitext, instead of just nodes. These methods also make all possible + substitutions instead of just one. - The C tokenizer extension now works on Python 3 in addition to Python 2.7. - Various fixes and cleanup. diff --git a/docs/changelog.rst b/docs/changelog.rst index 78a27ad..c5e2516 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -16,6 +16,11 @@ Unreleased if you rely on any filter() methods being non-recursive by default.** - Added a :py:meth:`.matches` method to :py:class:`~.Wikicode` for page/template name comparisons. +- The *obj* param of :py:meth:`Wikicode.insert_before <.insert_before>`, + :py:meth:`~.insert_after`, :py:meth:`~.replace`, and :py:meth:`~.remove` now + accepts :py:class:`~.Wikicode` objects and strings representing parts of + wikitext, instead of just nodes. These methods also make all possible + substitutions instead of just one. - The C tokenizer extension now works on Python 3 in addition to Python 2.7. - Various fixes and cleanup. From 943caede7a1c2440ddd6d10abc2faa4ecf45641a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 02:34:30 -0400 Subject: [PATCH 142/189] Fix for Python 3. --- mwparserfromhell/wikicode.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index 5c4d66a..b5e854d 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -129,10 +129,12 @@ class Wikicode(StringMixIn): i += len(obj.nodes) - 1 elif recursive: contexts = node.__iternodes__(self._get_all_nodes) - for code in {ctx for ctx, child in contexts}: - if code and obj in code: + processed = [] + for code in (ctx for ctx, child in contexts): + if code and code not in processed and obj in code: search = self._do_search(obj, recursive, code, literal) results.extend(search) + processed.append(code) i += 1 if not results and not literal and recursive: From fdf4f6774d18209b2e4297c802623416b167d8ee Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 03:12:54 -0400 Subject: [PATCH 143/189] Make test_remove() use _test_search(). 
--- tests/test_wikicode.py | 113 +++++++++++++++---------------------------------- 1 file changed, 34 insertions(+), 79 deletions(-) diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py index 409f888..08cf93c 100644 --- a/tests/test_wikicode.py +++ b/tests/test_wikicode.py @@ -21,6 +21,7 @@ # SOFTWARE. from __future__ import unicode_literals +from functools import partial import re from types import GeneratorType import unittest @@ -124,13 +125,12 @@ class TestWikicode(TreeEqualityTestCase): def _test_search(self, meth, expected): """Base test for insert_before(), insert_after(), and replace().""" - code = parse("{{a}}{{b}}{{c}}{{d}}") - func = getattr(code, meth) + code = parse("{{a}}{{b}}{{c}}{{d}}{{e}}") + func = partial(meth, code) func("{{b}}", "x", recursive=True) func("{{d}}", "[[y]]", recursive=False) - self.assertEqual(expected[0], code) func(code.get(2), "z") - self.assertEqual(expected[1], code) + self.assertEqual(expected[0], code) self.assertRaises(ValueError, func, "{{r}}", "n", recursive=True) self.assertRaises(ValueError, func, "{{r}}", "n", recursive=False) fake = parse("{{a}}").get(0) @@ -138,24 +138,24 @@ class TestWikicode(TreeEqualityTestCase): self.assertRaises(ValueError, func, fake, "n", recursive=False) code2 = parse("{{a}}{{a}}{{a}}{{b}}{{b}}{{b}}") - func = getattr(code2, meth) + func = partial(meth, code2) func(code2.get(1), "c", recursive=False) func("{{a}}", "d", recursive=False) func(code2.get(-1), "e", recursive=True) func("{{b}}", "f", recursive=True) - self.assertEqual(expected[2], code2) + self.assertEqual(expected[1], code2) code3 = parse("{{a|{{b}}|{{c|d={{f}}}}}}") - func = getattr(code3, meth) + func = partial(meth, code3) obj = code3.get(0).params[0].value.get(0) self.assertRaises(ValueError, func, obj, "x", recursive=False) func(obj, "x", recursive=True) self.assertRaises(ValueError, func, "{{f}}", "y", recursive=False) func("{{f}}", "y", recursive=True) - self.assertEqual(expected[3], code3) + self.assertEqual(expected[2], code3) - code4 = parse("{{a}}{{b}}{{c}}{{d}}{{e}}{{f}}{{g}}") - func = getattr(code4, meth) + code4 = parse("{{a}}{{b}}{{c}}{{d}}{{e}}{{f}}{{g}}{{h}}{{i}}{{j}}") + func = partial(meth, code4) fake = parse("{{b}}{{c}}") self.assertRaises(ValueError, func, fake, "q", recursive=False) self.assertRaises(ValueError, func, fake, "q", recursive=True) @@ -163,55 +163,59 @@ class TestWikicode(TreeEqualityTestCase): func("{{d}}{{e}}", "x", recursive=True) func(wrap(code4.nodes[-2:]), "y", recursive=False) func(wrap(code4.nodes[-2:]), "z", recursive=True) - self.assertEqual(expected[4], code4) + self.assertEqual(expected[3], code4) self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=False) self.assertRaises(ValueError, func, "{{c}}{{d}}", "q", recursive=True) code5 = parse("{{a|{{b}}{{c}}|{{f|{{g}}={{h}}{{i}}}}}}") - func = getattr(code5, meth) + func = partial(meth, code5) self.assertRaises(ValueError, func, "{{b}}{{c}}", "x", recursive=False) func("{{b}}{{c}}", "x", recursive=True) obj = code5.get(0).params[1].value.get(0).params[0].value self.assertRaises(ValueError, func, obj, "y", recursive=False) func(obj, "y", recursive=True) - self.assertEqual(expected[5], code5) + self.assertEqual(expected[4], code5) code6 = parse("here is {{some text and a {{template}}}}") - func = getattr(code6, meth) + func = partial(meth, code6) self.assertRaises(ValueError, func, "text and", "ab", recursive=False) func("text and", "ab", recursive=True) self.assertRaises(ValueError, func, "is {{some", "cd", recursive=False) func("is 
{{some", "cd", recursive=True) - self.assertEqual(expected[6], code6) + self.assertEqual(expected[5], code6) def test_insert_before(self): """test Wikicode.insert_before()""" + meth = lambda code, *args, **kw: code.insert_before(*args, **kw) expected = [ - "{{a}}x{{b}}{{c}}[[y]]{{d}}", "{{a}}xz{{b}}{{c}}[[y]]{{d}}", + "{{a}}xz{{b}}{{c}}[[y]]{{d}}{{e}}", "d{{a}}cd{{a}}d{{a}}f{{b}}f{{b}}ef{{b}}", "{{a|x{{b}}|{{c|d=y{{f}}}}}}", - "{{a}}w{{b}}{{c}}x{{d}}{{e}}yz{{f}}{{g}}", + "{{a}}w{{b}}{{c}}x{{d}}{{e}}{{f}}{{g}}{{h}}yz{{i}}{{j}}", "{{a|x{{b}}{{c}}|{{f|{{g}}=y{{h}}{{i}}}}}}", "here cdis {{some abtext and a {{template}}}}"] - self._test_search("insert_before", expected) + self._test_search(meth, expected) def test_insert_after(self): """test Wikicode.insert_after()""" + meth = lambda code, *args, **kw: code.insert_after(*args, **kw) expected = [ - "{{a}}{{b}}x{{c}}{{d}}[[y]]", "{{a}}{{b}}xz{{c}}{{d}}[[y]]", + "{{a}}{{b}}xz{{c}}{{d}}[[y]]{{e}}", "{{a}}d{{a}}dc{{a}}d{{b}}f{{b}}f{{b}}fe", "{{a|{{b}}x|{{c|d={{f}}y}}}}", - "{{a}}{{b}}{{c}}w{{d}}{{e}}x{{f}}{{g}}yz", + "{{a}}{{b}}{{c}}w{{d}}{{e}}x{{f}}{{g}}{{h}}{{i}}{{j}}yz", "{{a|{{b}}{{c}}x|{{f|{{g}}={{h}}{{i}}y}}}}", "here is {{somecd text andab a {{template}}}}"] - self._test_search("insert_after", expected) + self._test_search(meth, expected) def test_replace(self): """test Wikicode.replace()""" + meth = lambda code, *args, **kw: code.replace(*args, **kw) expected = [ - "{{a}}x{{c}}[[y]]", "{{a}}xz[[y]]", "dcdffe", "{{a|x|{{c|d=y}}}}", - "{{a}}wz", "{{a|x|{{f|{{g}}=y}}}}", "here cd ab a {{template}}}}"] - self._test_search("replace", expected) + "{{a}}xz[[y]]{{e}}", "dcdffe", "{{a|x|{{c|d=y}}}}", + "{{a}}wx{{f}}{{g}}z", "{{a|x|{{f|{{g}}=y}}}}", + "here cd ab a {{template}}}}"] + self._test_search(meth, expected) def test_append(self): """test Wikicode.append()""" @@ -227,61 +231,12 @@ class TestWikicode(TreeEqualityTestCase): def test_remove(self): """test Wikicode.remove()""" - code = parse("{{a}}{{b}}{{c}}{{d}}") - code.remove("{{b}}", recursive=True) - code.remove(code.get(1), recursive=True) - self.assertEqual("{{a}}{{d}}", code) - self.assertRaises(ValueError, code.remove, "{{r}}", recursive=True) - self.assertRaises(ValueError, code.remove, "{{r}}", recursive=False) - fake = parse("{{a}}").get(0) - self.assertRaises(ValueError, code.remove, fake, recursive=True) - self.assertRaises(ValueError, code.remove, fake, recursive=False) - - code2 = parse("{{a}}{{a}}{{a}}{{b}}{{b}}{{b}}") - code2.remove(code2.get(1), recursive=False) - self.assertEqual("{{a}}{{a}}{{b}}{{b}}{{b}}", code2) - code2.remove("{{a}}", recursive=False) - self.assertEqual("{{b}}{{b}}{{b}}", code2) - code2.remove(code2.get(-1), recursive=True) - self.assertEqual("{{b}}{{b}}", code2) - code2.remove("{{b}}", recursive=True) - self.assertEqual("", code2) - - code3 = parse("{{a|{{b}}|{{c|d={{f}}}}}}") - obj = code3.get(0).params[0].value.get(0) - self.assertRaises(ValueError, code3.remove, obj, recursive=False) - code3.remove(obj, recursive=True) - self.assertRaises(ValueError, code3.remove, "{{f}}", recursive=False) - code3.remove("{{f}}", recursive=True) - self.assertEqual("{{a||{{c|d=}}}}", code3) - - code4 = parse("{{a}}{{b}}{{c}}{{d}}{{e}}{{f}}{{g}}{{h}}{{i}}{{j}}") - fake = parse("{{b}}{{c}}") - self.assertRaises(ValueError, code4.remove, fake, recursive=False) - self.assertRaises(ValueError, code4.remove, fake, recursive=True) - code4.remove("{{b}}{{c}}", recursive=False) - code4.remove("{{d}}{{e}}", recursive=True) - code4.remove(wrap(code4.nodes[-2:]), recursive=False) - 
code4.remove(wrap(code4.nodes[-2:]), recursive=True) - self.assertEqual("{{a}}{{f}}", code4) - self.assertRaises(ValueError, code4.remove, "{{a}}{{b}}", False) - self.assertRaises(ValueError, code4.remove, "{{a}}{{b}}", True) - - code5 = parse("{{a|{{b}}{{c}}|{{f|{{g}}={{h}}{{i}}}}}}") - self.assertRaises(ValueError, code5.remove, "{{b}}{{c}}", False) - code5.remove("{{b}}{{c}}", recursive=True) - obj = code5.get(0).params[1].value.get(0).params[0].value - self.assertRaises(ValueError, code5.remove, obj, recursive=False) - code5.remove(obj, recursive=True) - self.assertEqual("{{a||{{f|{{g}}=}}}}", code5) - - code6 = parse("here is {{some text and a {{template}}}}") - func = code6.remove - self.assertRaises(ValueError, func, "text and", recursive=False) - func("text and", recursive=True) - self.assertRaises(ValueError, func, "is {{some", recursive=False) - func("is {{some", recursive=True) - self.assertEqual("here a {{template}}}}", code6) + meth = lambda code, obj, value, **kw: code.remove(obj, **kw) + expected = [ + "{{a}}{{c}}", "", "{{a||{{c|d=}}}}", "{{a}}{{f}}", + "{{a||{{f|{{g}}=}}}}", "here a {{template}}}}" + ] + self._test_search(meth, expected) def test_matches(self): """test Wikicode.matches()""" From a7dda77474c7730e23e8c376e6d91cea4eab9235 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 13:11:56 -0400 Subject: [PATCH 144/189] Template.has_param() -> Template.has() --- CHANGELOG | 2 ++ README.rst | 2 +- docs/changelog.rst | 13 ++++++++----- docs/usage.rst | 2 +- mwparserfromhell/nodes/template.py | 7 +++++-- tests/test_docs.py | 2 +- tests/test_template.py | 22 +++++++++++----------- 7 files changed, 29 insertions(+), 21 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 3876562..8922738 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -11,6 +11,8 @@ v0.3 (unreleased): remove() now accepts other Wikicode objects and strings representing parts of wikitext, instead of just nodes. These methods also make all possible substitutions instead of just one. +- Renamed Template.has_param() to has() for consistency with Template's other + methods; has_param() is now an alias. - The C tokenizer extension now works on Python 3 in addition to Python 2.7. - Various fixes and cleanup. diff --git a/README.rst b/README.rst index f1092ee..b5fd912 100644 --- a/README.rst +++ b/README.rst @@ -90,7 +90,7 @@ whitespace:: >>> text = "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}" >>> code = mwparserfromhell.parse(text) >>> for template in code.filter_templates(): - ... if template.name.matches("Cleanup") and not template.has_param("date"): + ... if template.name.matches("Cleanup") and not template.has("date"): ... template.add("date", "July 2012") ... >>> print code diff --git a/docs/changelog.rst b/docs/changelog.rst index c5e2516..86dfd78 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -16,11 +16,14 @@ Unreleased if you rely on any filter() methods being non-recursive by default.** - Added a :py:meth:`.matches` method to :py:class:`~.Wikicode` for page/template name comparisons. -- The *obj* param of :py:meth:`Wikicode.insert_before <.insert_before>`, - :py:meth:`~.insert_after`, :py:meth:`~.replace`, and :py:meth:`~.remove` now - accepts :py:class:`~.Wikicode` objects and strings representing parts of - wikitext, instead of just nodes. These methods also make all possible - substitutions instead of just one. 
+- The *obj* param of :py:meth:`Wikicode.insert_before() <.insert_before>`, + :py:meth:`~.insert_after`, :py:meth:`~.Wikicode.replace`, and + :py:meth:`~.Wikicode.remove` now accepts :py:class:`~.Wikicode` objects and + strings representing parts of wikitext, instead of just nodes. These methods + also make all possible substitutions instead of just one. +- Renamed :py:meth:`Template.has_param() <.has_param>` to + :py:meth:`~.Template.has` for consistency with :py:class:`~.Template`\ 's + other methods; :py:meth:`~.has_param` is now an alias. - The C tokenizer extension now works on Python 3 in addition to Python 2.7. - Various fixes and cleanup. diff --git a/docs/usage.rst b/docs/usage.rst index 15b384b..974c670 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -58,7 +58,7 @@ names, which takes care of capitalization and whitespace:: >>> text = "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}" >>> code = mwparserfromhell.parse(text) >>> for template in code.filter_templates(): - ... if template.name.matches("Cleanup") and not template.has_param("date"): + ... if template.name.matches("Cleanup") and not template.has("date"): ... template.add("date", "July 2012") ... >>> print code diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 6dfc4f0..c326b65 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -164,7 +164,7 @@ class Template(Node): def name(self, value): self._name = parse_anything(value) - def has_param(self, name, ignore_empty=True): + def has(self, name, ignore_empty=True): """Return ``True`` if any parameter in the template is named *name*. With *ignore_empty*, ``False`` will be returned even if the template @@ -180,6 +180,9 @@ class Template(Node): return True return False + has_param = lambda self, *args, **kwargs: self.has(*args, **kwargs) + has_param.__doc__ = "Alias for :py:meth:`has`." + def get(self, name): """Get the parameter whose name is *name*. @@ -226,7 +229,7 @@ class Template(Node): name, value = parse_anything(name), parse_anything(value) self._surface_escape(value, "|") - if self.has_param(name): + if self.has(name): self.remove(name, keep_field=True) existing = self.get(name) if showkey is not None: diff --git a/tests/test_docs.py b/tests/test_docs.py index 53b3b76..6d066bd 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -90,7 +90,7 @@ class TestDocs(unittest.TestCase): text = "{{cleanup}} '''Foo''' is a [[bar]]. {{uncategorized}}" code = mwparserfromhell.parse(text) for template in code.filter_templates(): - if template.name.matches("Cleanup") and not template.has_param("date"): + if template.name.matches("Cleanup") and not template.has("date"): template.add("date", "July 2012") res = "{{cleanup|date=July 2012}} '''Foo''' is a [[bar]]. 
{{uncategorized}}" self.assertPrint(code, res) diff --git a/tests/test_template.py b/tests/test_template.py index 28592df..9ed099d 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -115,23 +115,23 @@ class TestTemplate(TreeEqualityTestCase): self.assertEqual([], node1.params) self.assertIs(plist, node2.params) - def test_has_param(self): - """test Template.has_param()""" + def test_has(self): + """test Template.has()""" node1 = Template(wraptext("foobar")) node2 = Template(wraptext("foo"), [pgenh("1", "bar"), pgens("\nabc ", "def")]) node3 = Template(wraptext("foo"), [pgenh("1", "a"), pgens("b", "c"), pgens("1", "d")]) node4 = Template(wraptext("foo"), [pgenh("1", "a"), pgens("b", " ")]) - self.assertFalse(node1.has_param("foobar")) - self.assertTrue(node2.has_param(1)) - self.assertTrue(node2.has_param("abc")) - self.assertFalse(node2.has_param("def")) - self.assertTrue(node3.has_param("1")) - self.assertTrue(node3.has_param(" b ")) - self.assertFalse(node4.has_param("b")) - self.assertTrue(node3.has_param("b", False)) - self.assertTrue(node4.has_param("b", False)) + self.assertFalse(node1.has("foobar")) + self.assertTrue(node2.has(1)) + self.assertTrue(node2.has("abc")) + self.assertFalse(node2.has("def")) + self.assertTrue(node3.has("1")) + self.assertTrue(node3.has(" b ")) + self.assertFalse(node4.has("b")) + self.assertTrue(node3.has("b", False)) + self.assertTrue(node4.has("b", False)) def test_get(self): """test Template.get()""" From 53c26589ee544a413506a319c1b7a3026a6a6ba0 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 14:45:24 -0400 Subject: [PATCH 145/189] Some node fixes; attributes with empty values now display correctly. --- mwparserfromhell/compat.py | 2 -- mwparserfromhell/nodes/extras/attribute.py | 4 ++-- mwparserfromhell/nodes/template.py | 12 ++++++------ mwparserfromhell/parser/tokens.py | 4 ++-- tests/test_attribute.py | 4 ++++ 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/mwparserfromhell/compat.py b/mwparserfromhell/compat.py index bb81513..864605c 100644 --- a/mwparserfromhell/compat.py +++ b/mwparserfromhell/compat.py @@ -15,14 +15,12 @@ py3k = sys.version_info[0] == 3 if py3k: bytes = bytes str = str - basestring = str maxsize = sys.maxsize import html.entities as htmlentities else: bytes = str str = unicode - basestring = basestring maxsize = sys.maxint import htmlentitydefs as htmlentities diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index ad282af..05860a0 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -48,7 +48,7 @@ class Attribute(StringMixIn): def __unicode__(self): base = self.pad_first + str(self.name) + self.pad_before_eq - if self.value: + if self.value is not None: if self.quoted: return base + '="' + self.pad_after_eq + str(self.value) + '"' return base + "=" + self.pad_after_eq + str(self.value) @@ -100,7 +100,7 @@ class Attribute(StringMixIn): @value.setter def value(self, newval): - self._value = parse_anything(newval) + self._value = None if newval is None else parse_anything(newval) @quoted.setter def quoted(self, value): diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index c326b65..a6b1665 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -26,7 +26,7 @@ import re from . 
import HTMLEntity, Node, Text from .extras import Parameter -from ..compat import basestring, str +from ..compat import str from ..utils import parse_anything __all__ = ["Template"] @@ -84,7 +84,7 @@ class Template(Node): replacement = str(HTMLEntity(value=ord(char))) for node in code.filter_text(recursive=False): if char in node: - code.replace(node, node.replace(char, replacement)) + code.replace(node, node.replace(char, replacement), False) def _blank_param_value(self, value): """Remove the content from *value* while keeping its whitespace. @@ -170,9 +170,9 @@ class Template(Node): With *ignore_empty*, ``False`` will be returned even if the template contains a parameter with the name *name*, if the parameter's value is empty. Note that a template may have multiple parameters with the - same name. + same name, but only the last one is read by the MediaWiki parser. """ - name = name.strip() if isinstance(name, basestring) else str(name) + name = str(name).strip() for param in self.params: if param.name.strip() == name: if ignore_empty and not param.value.strip(): @@ -191,7 +191,7 @@ class Template(Node): parameters can have the same name, we'll return the last match, since the last parameter is the only one read by the MediaWiki parser. """ - name = name.strip() if isinstance(name, basestring) else str(name) + name = str(name).strip() for param in reversed(self.params): if param.name.strip() == name: return param @@ -294,7 +294,7 @@ class Template(Node): the first instance if none have dependents, otherwise the one with dependents will be kept). """ - name = name.strip() if isinstance(name, basestring) else str(name) + name = str(name).strip() removed = False to_remove = [] for i, param in enumerate(self.params): diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 8c2ea87..0ffac86 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -30,7 +30,7 @@ into the :py:class`~.Wikicode` tree by the :py:class:`~.Builder`. from __future__ import unicode_literals -from ..compat import basestring, py3k +from ..compat import py3k, str __all__ = ["Token"] @@ -43,7 +43,7 @@ class Token(object): def __repr__(self): args = [] for key, value in self._kwargs.items(): - if isinstance(value, basestring) and len(value) > 100: + if isinstance(value, str) and len(value) > 100: args.append(key + "=" + repr(value[:97] + "...")) else: args.append(key + "=" + repr(value)) diff --git a/tests/test_attribute.py b/tests/test_attribute.py index 8dd84cb..dbf3145 100644 --- a/tests/test_attribute.py +++ b/tests/test_attribute.py @@ -40,6 +40,8 @@ class TestAttribute(TreeEqualityTestCase): self.assertEqual(' foo="bar"', str(node2)) node3 = Attribute(wraptext("a"), wraptext("b"), False, "", " ", " ") self.assertEqual("a = b", str(node3)) + node4 = Attribute(wraptext("a"), wrap([]), False, " ", "", " ") + self.assertEqual(" a= ", str(node4)) def test_name(self): """test getter/setter for the name attribute""" @@ -56,6 +58,8 @@ class TestAttribute(TreeEqualityTestCase): self.assertIs(value, node.value) node.value = "{{bar}}" self.assertWikicodeEqual(wrap([Template(wraptext("bar"))]), node.value) + node.value = None + self.assertIs(None, node.value) def test_quoted(self): """test getter/setter for the quoted attribute""" From 9280cb16feee70760561033c640f49fafaa062ee Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 22:21:45 -0400 Subject: [PATCH 146/189] Fix behavior with quoted attrs and spaces before the =. 
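
A rough sketch of the rendering this fix is meant to produce, mirroring the
new test case below; the positional (name, value, quoted, pad_first,
pad_before_eq, pad_after_eq) arguments follow the constructor as used in the
tests and are otherwise illustrative.

    >>> from mwparserfromhell.nodes.extras import Attribute
    >>> from mwparserfromhell.utils import parse_anything
    >>> attr = Attribute(parse_anything("a"), parse_anything("b"), True, "", " ", " ")
    >>> print attr    # the "=" and its padding now come before the opening quote
    a = "b"
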
--- mwparserfromhell/nodes/extras/attribute.py | 9 +++++---- tests/test_attribute.py | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/mwparserfromhell/nodes/extras/attribute.py b/mwparserfromhell/nodes/extras/attribute.py index 05860a0..8f7f453 100644 --- a/mwparserfromhell/nodes/extras/attribute.py +++ b/mwparserfromhell/nodes/extras/attribute.py @@ -47,12 +47,13 @@ class Attribute(StringMixIn): self._pad_after_eq = pad_after_eq def __unicode__(self): - base = self.pad_first + str(self.name) + self.pad_before_eq + result = self.pad_first + str(self.name) + self.pad_before_eq if self.value is not None: + result += "=" + self.pad_after_eq if self.quoted: - return base + '="' + self.pad_after_eq + str(self.value) + '"' - return base + "=" + self.pad_after_eq + str(self.value) - return base + return result + '"' + str(self.value) + '"' + return result + str(self.value) + return result def _set_padding(self, attr, value): """Setter for the value of a padding attribute.""" diff --git a/tests/test_attribute.py b/tests/test_attribute.py index dbf3145..f34c670 100644 --- a/tests/test_attribute.py +++ b/tests/test_attribute.py @@ -38,6 +38,8 @@ class TestAttribute(TreeEqualityTestCase): self.assertEqual(" foo", str(node)) node2 = Attribute(wraptext("foo"), wraptext("bar")) self.assertEqual(' foo="bar"', str(node2)) + node3 = Attribute(wraptext("a"), wraptext("b"), True, "", " ", " ") + self.assertEqual('a = "b"', str(node3)) node3 = Attribute(wraptext("a"), wraptext("b"), False, "", " ", " ") self.assertEqual("a = b", str(node3)) node4 = Attribute(wraptext("a"), wrap([]), False, " ", "", " ") From ccfc1cbe06809e8bc476b7ab956a4e0b2dfbac66 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 22:34:26 -0400 Subject: [PATCH 147/189] Add test cases for Tag's new has/get/add/remove methods. 
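
For context, a minimal sketch of the attribute API these tests cover. The
session is hypothetical: it assumes the Tag(tag, contents, attrs) construction
style used in the test file, and the rendered strings are assumed output based
on the assertions in the tests, not copied from an actual run.

    >>> from mwparserfromhell.nodes import Tag
    >>> from mwparserfromhell.utils import parse_anything
    >>> node = Tag(parse_anything("ref"), parse_anything("cite"))
    >>> attr = node.add("name", "foo")    # add() returns the new Attribute
    >>> node.has("name"), node.has("group")
    (True, False)
    >>> print node.get("name").value
    foo
    >>> print node
    <ref name="foo">cite</ref>
    >>> node.remove("name")
    >>> print node
    <ref>cite</ref>
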
--- tests/test_tag.py | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tests/test_tag.py b/tests/test_tag.py index 09eda9e..5ef92a5 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -30,6 +30,7 @@ from ._test_tree_equality import TreeEqualityTestCase, getnodes, wrap, wraptext agen = lambda name, value: Attribute(wraptext(name), wraptext(value)) agennq = lambda name, value: Attribute(wraptext(name), wraptext(value), False) +agenp = lambda name, v, a, b, c: Attribute(wraptext(name), v, True, a, b, c) agenpnv = lambda name, a, b, c: Attribute(wraptext(name), None, True, a, b, c) class TestTag(TreeEqualityTestCase): @@ -224,5 +225,91 @@ class TestTag(TreeEqualityTestCase): self.assertWikicodeEqual(parsed, node.closing_tag) self.assertEqual("foobar", node) + def test_has(self): + """test Tag.has()""" + node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")]) + self.assertTrue(node.has("name")) + self.assertTrue(node.has(" name ")) + self.assertTrue(node.has(wraptext("name"))) + self.assertFalse(node.has("Name")) + self.assertFalse(node.has("foo")) + + attrs = [agen("id", "foo"), agenp("class", "bar", " ", "\n", "\n"), + agen("foo", "bar"), agenpnv("foo", " ", " \n ", " \t")] + node2 = Tag(wraptext("div"), attrs=attrs, self_closing=True) + self.assertTrue(node2.has("id")) + self.assertTrue(node2.has("class")) + self.assertTrue(node2.has(attrs[1].pad_first + str(attrs[1].name) + + attrs[1].pad_before_eq)) + self.assertTrue(node2.has(attrs[3])) + self.assertTrue(node2.has(str(attrs[3]))) + self.assertFalse(node2.has("idclass")) + self.assertFalse(node2.has("id class")) + self.assertFalse(node2.has("id=foo")) + + def test_get(self): + """test Tag.get()""" + attrs = [agen("name", "foo")] + node = Tag(wraptext("ref"), wraptext("cite"), attrs) + self.assertIs(attrs[0], node.get("name")) + self.assertIs(attrs[0], node.get(" name ")) + self.assertIs(attrs[0], node.get(wraptext("name"))) + self.assertRaises(ValueError, node.get, "Name") + self.assertRaises(ValueError, node.get, "foo") + + attrs = [agen("id", "foo"), agenp("class", "bar", " ", "\n", "\n"), + agen("foo", "bar"), agenpnv("foo", " ", " \n ", " \t")] + node2 = Tag(wraptext("div"), attrs=attrs, self_closing=True) + self.assertIs(attrs[0], node2.get("id")) + self.assertIs(attrs[1], node2.get("class")) + self.assertIs(attrs[1], node2.get( + attrs[1].pad_first + str(attrs[1].name) + attrs[1].pad_before_eq)) + self.assertIs(attrs[3], node2.get(attrs[3])) + self.assertIs(attrs[3], node2.get(str(attrs[3]))) + self.assertIs(attrs[3], node2.get(" foo")) + self.assertRaises(ValueError, node2.get, "idclass") + self.assertRaises(ValueError, node2.get, "id class") + self.assertRaises(ValueError, node2.get, "id=foo") + + def test_add(self): + """test Tag.add()""" + node = Tag(wraptext("ref"), wraptext("cite")) + node.add("name", "value") + node.add("name", "value", quoted=False) + node.add("name") + node.add(1, False) + node.add("style", "{{foobar}}") + node.add("name", "value", True, "\n", " ", " ") + attr1 = ' name="value"' + attr2 = " name=value" + attr3 = " name" + attr4 = ' 1="False"' + attr5 = ' style="{{foobar}}"' + attr6 = '\nname = "value"' + self.assertEqual(attr1, node.attributes[0]) + self.assertEqual(attr2, node.attributes[1]) + self.assertEqual(attr3, node.attributes[2]) + self.assertEqual(attr4, node.attributes[3]) + self.assertEqual(attr5, node.attributes[4]) + self.assertEqual(attr6, node.attributes[5]) + self.assertEqual(attr6, node.get("name")) + 
self.assertWikicodeEqual(wrap([Template(wraptext("foobar"))]), + node.attributes[4].value) + self.assertEqual("".join(("cite
    ")), node) + + def test_remove(self): + """test Tag.remove()""" + attrs = [agen("id", "foo"), agenp("class", "bar", " ", "\n", "\n"), + agen("foo", "bar"), agenpnv("foo", " ", " \n ", " \t")] + node = Tag(wraptext("div"), attrs=attrs, self_closing=True) + node.remove("class") + self.assertEqual('
    ', node) + node.remove("foo") + self.assertEqual('
    ', node) + self.assertRaises(ValueError, node.remove, "foo") + node.remove("id") + self.assertEqual('
    ', node) + if __name__ == "__main__": unittest.main(verbosity=2) From 800bd20e3971a8942b3bb228fbe5a6dc28c20356 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 22:35:06 -0400 Subject: [PATCH 148/189] Implement Tag.has(), Tag.get(), Tag.add(), and Tag.remove() (closes #11). --- mwparserfromhell/nodes/tag.py | 54 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 1f2b048..b4aec3e 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -23,6 +23,7 @@ from __future__ import unicode_literals from . import Node, Text +from .extras import Attribute from ..compat import str from ..tag_defs import is_visible from ..utils import parse_anything @@ -216,3 +217,56 @@ class Tag(Node): @closing_tag.setter def closing_tag(self, value): self._closing_tag = parse_anything(value) + + def has(self, name): + """Return whether any attribute in the tag has the given *name*. + + Note that a tag may have multiple attributes with the same name, but + only the last one is read by the MediaWiki parser. + """ + for attr in self.attributes: + if attr.name == name.strip(): + return True + return False + + def get(self, name): + """Get the attribute with the given *name*. + + The returned object is a :py:class:`~.Attribute` instance. Raises + :py:exc:`ValueError` if no attribute has this name. Since multiple + attributes can have the same name, we'll return the last match, since + all but the last are ignored by the MediaWiki parser. + """ + for attr in reversed(self.attributes): + if attr.name == name.strip(): + return attr + raise ValueError(name) + + def add(self, name, value=None, quoted=True, pad_first=" ", + pad_before_eq="", pad_after_eq=""): + """Add an attribute with the given *name* and *value*. + + *name* and *value* can be anything parasable by + :py:func:`.utils.parse_anything`; *value* can be omitted if the + attribute is valueless. *quoted* is a bool telling whether to wrap the + *value* in double quotes (this is recommended). *pad_first*, + *pad_before_eq*, and *pad_after_eq* are whitespace used as padding + before the name, before the equal sign (or after the name if no value), + and after the equal sign (ignored if no value), respectively. + """ + if value is not None: + value = parse_anything(value) + attr = Attribute(parse_anything(name), value, quoted) + attr.pad_first = pad_first + attr.pad_before_eq = pad_before_eq + attr.pad_after_eq = pad_after_eq + self.attributes.append(attr) + return attr + + def remove(self, name): + """Remove all attributes with the given *name*.""" + attrs = [attr for attr in self.attributes if attr.name == name.strip()] + if not attrs: + raise ValueError(name) + for attr in attrs: + self.attributes.remove(attr) From fff93b77270209e01ff0d482d7e8c0f1824c556d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 23:04:44 -0400 Subject: [PATCH 149/189] Add changelog entries for ExternalLinks. --- CHANGELOG | 8 +++++--- docs/changelog.rst | 9 +++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 8922738..84edc60 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,8 +1,10 @@ v0.3 (unreleased): -- Added complete support for HTML Tags, along with appropriate unit tests. This - includes forms like foo, , and wiki-markup tags - like bold ('''), italics (''), and lists (*, #, ; and :). 
+- Added complete support for HTML Tags, including forms like foo, + , and wiki-markup tags like bold ('''), italics (''), and + lists (*, #, ; and :). +- Added support for ExternalLinks (http://example.com/ and + [http://example.com/ Example]). - Wikicode's filter methods are now passed 'recursive=True' by default instead of False. This is a breaking change if you rely on any filter() methods being non-recursive by default. diff --git a/docs/changelog.rst b/docs/changelog.rst index 86dfd78..810f594 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,10 +7,11 @@ v0.3 Unreleased (`changes `__): -- Added complete support for HTML :py:class:`Tags <.Tag>`, along with - appropriate unit tests. This includes forms like ``foo``, - ````, and wiki-markup tags like bold (``'''``), italics - (``''``), and lists (``*``, ``#``, ``;`` and ``:``). +- Added complete support for HTML :py:class:`Tags <.Tag>`, including forms like + ``foo``, ````, and wiki-markup tags like bold + (``'''``), italics (``''``), and lists (``*``, ``#``, ``;`` and ``:``). +- Added support for :py:class:`.ExternalLink`\ s (``http://example.com/`` and + ``[http://example.com/ Example]``). - :py:class:`Wikicode's <.Wikicode>` :py:meth:`.filter` methods are now passed *recursive=True* by default instead of *False*. **This is a breaking change if you rely on any filter() methods being non-recursive by default.** From 0886b6fbf6256f36a062448fda31fcd79da10d89 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 23:05:13 -0400 Subject: [PATCH 150/189] Add ExternalLink Node type. --- mwparserfromhell/nodes/external_link.py | 95 +++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 mwparserfromhell/nodes/external_link.py diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py new file mode 100644 index 0000000..a604f9a --- /dev/null +++ b/mwparserfromhell/nodes/external_link.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012-2013 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +from . 
import Node +from ..compat import str +from ..utils import parse_anything + +__all__ = ["ExternalLink"] + +class ExternalLink(Node): + """Represents an external link, like ``[http://example.com/ Example]``.""" + + def __init__(self, url, title=None, brackets=True): + super(ExternalLink, self).__init__() + self._url = url + self._title = title + self._brackets = brackets + + def __unicode__(self): + if self.brackets: + if self.title is not None: + return "[" + str(self.url) + " " + str(self.title) + "]" + return "[" + str(self.url) + "]" + return str(self.url) + + def __iternodes__(self, getter): + yield None, self + for child in getter(self.url): + yield self.url, child + if self.title is not None: + for child in getter(self.title): + yield self.title, child + + def __strip__(self, normalize, collapse): + if self.title.strip(): + return self.title.strip_code(normalize, collapse) + return None + + def __showtree__(self, write, get, mark): + write("[") + get(self.url) + if self.title is not None: + get(self.title) + write("]") + + @property + def url(self): + """The url of the link target, as a :py:class:`~.Wikicode` object.""" + return self._url + + @property + def title(self): + """The link title (if given), as a :py:class:`~.Wikicode` object.""" + return self._title + + @property + def brackets(self): + """Whether to enclose the URL in brackets or display it straight.""" + return self._brackets + + @url.setter + def url(self, value): + self._url = parse_anything(value) + + @title.setter + def title(self, value): + if value is None: + self._title = None + else: + self._title = parse_anything(value) + + @brackets.setter + def brackets(self, value): + self._brackets = bool(value) From 8fe8b1fef59446a24d1c66dc6b683dd5a3760a58 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 23:12:36 -0400 Subject: [PATCH 151/189] Implement ExternalLinks as tokens and in the builder. --- mwparserfromhell/nodes/__init__.py | 1 + mwparserfromhell/parser/builder.py | 22 ++++++++++++++++++++-- mwparserfromhell/parser/tokens.py | 4 ++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index faaa0b2..ba97b3f 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -69,6 +69,7 @@ from . import extras from .text import Text from .argument import Argument from .comment import Comment +from .external_link import ExternalLink from .heading import Heading from .html_entity import HTMLEntity from .tag import Tag diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 196ef14..ee914c3 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -24,8 +24,8 @@ from __future__ import unicode_literals from . 
import tokens from ..compat import str -from ..nodes import (Argument, Comment, Heading, HTMLEntity, Tag, Template, - Text, Wikilink) +from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, + Template, Text, Wikilink) from ..nodes.extras import Attribute, Parameter from ..smart_list import SmartList from ..wikicode import Wikicode @@ -234,6 +234,22 @@ class Builder(object): else: self._write(self._handle_token(token)) + def _handle_external_link(self, token): + """Handle when an external link is at the head of the tokens.""" + brackets, url = token.brackets, None + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.ExternalLinkSeparator): + url = self._pop() + self._push() + elif isinstance(token, tokens.ExternalLinkClose): + if url is not None: + return ExternalLink(url, self._pop(), brackets) + return ExternalLink(self._pop(), brackets=brackets) + else: + self._write(self._handle_token(token)) + def _handle_token(self, token): """Handle a single token.""" if isinstance(token, tokens.Text): @@ -252,6 +268,8 @@ class Builder(object): return self._handle_comment() elif isinstance(token, tokens.TagOpenOpen): return self._handle_tag(token) + elif isinstance(token, tokens.ExternalLinkOpen): + return self._handle_external_link(token) def build(self, tokenlist): """Build a Wikicode object from a list tokens and return it.""" diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 0ffac86..ae58ec8 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -104,4 +104,8 @@ TagCloseSelfclose = make("TagCloseSelfclose") # /> TagOpenClose = make("TagOpenClose") # +ExternalLinkOpen = make("ExternalLinkOpen") # [ +ExternalLinkSeparator = make("ExternalLinkSeparator") # +ExternalLinkClose = make("ExternalLinkClose") # ] + del make From 88f4fa7c37d321858ccb20bc74e3f4e9e9eaa50a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 23:21:15 -0400 Subject: [PATCH 152/189] Add external link contexts; reorder stuff for consistency. 
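
For reference, a small sketch of the node type this external-link work builds
(hypothetical session; the nodes are constructed directly via parse_anything()
rather than through the tokenizer, which does not emit them yet):

    >>> from mwparserfromhell.nodes import ExternalLink
    >>> from mwparserfromhell.utils import parse_anything
    >>> link = ExternalLink(parse_anything("http://example.com/"),
    ...                     parse_anything("Example"))
    >>> print link
    [http://example.com/ Example]
    >>> link.title = None
    >>> link.brackets = False
    >>> print link
    http://example.com/
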
--- mwparserfromhell/parser/builder.py | 36 ++++++++++++------------ mwparserfromhell/parser/contexts.py | 55 ++++++++++++++++++++++--------------- mwparserfromhell/parser/tokens.py | 8 +++--- 3 files changed, 55 insertions(+), 44 deletions(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index ee914c3..d31f450 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -142,6 +142,22 @@ class Builder(object): else: self._write(self._handle_token(token)) + def _handle_external_link(self, token): + """Handle when an external link is at the head of the tokens.""" + brackets, url = token.brackets, None + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.ExternalLinkSeparator): + url = self._pop() + self._push() + elif isinstance(token, tokens.ExternalLinkClose): + if url is not None: + return ExternalLink(url, self._pop(), brackets) + return ExternalLink(self._pop(), brackets=brackets) + else: + self._write(self._handle_token(token)) + def _handle_entity(self): """Handle a case where an HTML entity is at the head of the tokens.""" token = self._tokens.pop() @@ -234,22 +250,6 @@ class Builder(object): else: self._write(self._handle_token(token)) - def _handle_external_link(self, token): - """Handle when an external link is at the head of the tokens.""" - brackets, url = token.brackets, None - self._push() - while self._tokens: - token = self._tokens.pop() - if isinstance(token, tokens.ExternalLinkSeparator): - url = self._pop() - self._push() - elif isinstance(token, tokens.ExternalLinkClose): - if url is not None: - return ExternalLink(url, self._pop(), brackets) - return ExternalLink(self._pop(), brackets=brackets) - else: - self._write(self._handle_token(token)) - def _handle_token(self, token): """Handle a single token.""" if isinstance(token, tokens.Text): @@ -260,6 +260,8 @@ class Builder(object): return self._handle_argument() elif isinstance(token, tokens.WikilinkOpen): return self._handle_wikilink() + elif isinstance(token, tokens.ExternalLinkOpen): + return self._handle_external_link(token) elif isinstance(token, tokens.HTMLEntityStart): return self._handle_entity() elif isinstance(token, tokens.HeadingStart): @@ -268,8 +270,6 @@ class Builder(object): return self._handle_comment() elif isinstance(token, tokens.TagOpenOpen): return self._handle_tag(token) - elif isinstance(token, tokens.ExternalLinkOpen): - return self._handle_external_link(token) def build(self, tokenlist): """Build a Wikicode object from a list tokens and return it.""" diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index a1b67be..38154bb 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -51,6 +51,12 @@ Local (stack-specific) contexts: * :py:const:`WIKILINK_TITLE` * :py:const:`WIKILINK_TEXT` +* :py:const:`EXTERNAL_LINK` + + * :py:const:`EXTERNAL_LINK_URL` + * :py:const:`EXTERNAL_LINK_TITLE` + * :py:const:`EXTERNAL_LINK_BRACKETS` + * :py:const:`HEADING` * :py:const:`HEADING_LEVEL_1` @@ -112,35 +118,40 @@ WIKILINK_TITLE = 1 << 5 WIKILINK_TEXT = 1 << 6 WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT -HEADING_LEVEL_1 = 1 << 7 -HEADING_LEVEL_2 = 1 << 8 -HEADING_LEVEL_3 = 1 << 9 -HEADING_LEVEL_4 = 1 << 10 -HEADING_LEVEL_5 = 1 << 11 -HEADING_LEVEL_6 = 1 << 12 +EXTERNAL_LINK_URL = 1 << 7 +EXTERNAL_LINK_TITLE = 1 << 8 +EXTERNAL_LINK_BRACKETS = 1 << 9 +EXTERNAL_LINK = EXTERNAL_LINK_URL + EXTERNAL_LINK_TITLE + +HEADING_LEVEL_1 = 1 << 10 
+HEADING_LEVEL_2 = 1 << 11 +HEADING_LEVEL_3 = 1 << 12 +HEADING_LEVEL_4 = 1 << 13 +HEADING_LEVEL_5 = 1 << 14 +HEADING_LEVEL_6 = 1 << 15 HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) -TAG_OPEN = 1 << 13 -TAG_ATTR = 1 << 14 -TAG_BODY = 1 << 15 -TAG_CLOSE = 1 << 16 +TAG_OPEN = 1 << 16 +TAG_ATTR = 1 << 17 +TAG_BODY = 1 << 18 +TAG_CLOSE = 1 << 19 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE -STYLE_ITALICS = 1 << 17 -STYLE_BOLD = 1 << 18 -STYLE_PASS_AGAIN = 1 << 19 -STYLE_SECOND_PASS = 1 << 20 +STYLE_ITALICS = 1 << 20 +STYLE_BOLD = 1 << 21 +STYLE_PASS_AGAIN = 1 << 22 +STYLE_SECOND_PASS = 1 << 23 STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS -DL_TERM = 1 << 21 +DL_TERM = 1 << 24 -HAS_TEXT = 1 << 22 -FAIL_ON_TEXT = 1 << 23 -FAIL_NEXT = 1 << 24 -FAIL_ON_LBRACE = 1 << 25 -FAIL_ON_RBRACE = 1 << 26 -FAIL_ON_EQUALS = 1 << 27 +HAS_TEXT = 1 << 25 +FAIL_ON_TEXT = 1 << 26 +FAIL_NEXT = 1 << 27 +FAIL_ON_LBRACE = 1 << 28 +FAIL_ON_RBRACE = 1 << 29 +FAIL_ON_EQUALS = 1 << 30 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) @@ -150,7 +161,7 @@ GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + HEADING + TAG + STYLE +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXTERNAL_LINK + HEADING + TAG + STYLE UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index ae58ec8..57308ea 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -84,6 +84,10 @@ WikilinkOpen = make("WikilinkOpen") # [[ WikilinkSeparator = make("WikilinkSeparator") # | WikilinkClose = make("WikilinkClose") # ]] +ExternalLinkOpen = make("ExternalLinkOpen") # [ +ExternalLinkSeparator = make("ExternalLinkSeparator") # +ExternalLinkClose = make("ExternalLinkClose") # ] + HTMLEntityStart = make("HTMLEntityStart") # & HTMLEntityNumeric = make("HTMLEntityNumeric") # # HTMLEntityHex = make("HTMLEntityHex") # x @@ -104,8 +108,4 @@ TagCloseSelfclose = make("TagCloseSelfclose") # /> TagOpenClose = make("TagOpenClose") # -ExternalLinkOpen = make("ExternalLinkOpen") # [ -ExternalLinkSeparator = make("ExternalLinkSeparator") # -ExternalLinkClose = make("ExternalLinkClose") # ] - del make From cbf67c78424b5de14d0ad4b9023d81c61fcbe17d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 20 Aug 2013 02:07:38 -0400 Subject: [PATCH 153/189] Add hooks for some ext link stuff; add a INVALID_LINK aggregate context. 
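
Roughly, the new aggregate is meant to be consulted with a bitwise AND against
the current local context, e.g.:

    >>> from mwparserfromhell.parser import contexts
    >>> bool(contexts.TEMPLATE_NAME & contexts.INVALID_LINK)   # no links inside {{names}}
    True
    >>> bool(contexts.WIKILINK_TEXT & contexts.INVALID_LINK)   # allowed in [[link|text]]
    False

which is why the new link_in_template_name integration test expects the
wikilink to break the surrounding template.
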
--- mwparserfromhell/parser/contexts.py | 20 +++---- mwparserfromhell/parser/tokenizer.c | 15 +++--- mwparserfromhell/parser/tokenizer.h | 100 +++++++++++++++++++---------------- mwparserfromhell/parser/tokenizer.py | 18 +++++-- tests/tokenizer/integration.mwtest | 7 +++ 5 files changed, 96 insertions(+), 64 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 38154bb..c6d2941 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -51,11 +51,11 @@ Local (stack-specific) contexts: * :py:const:`WIKILINK_TITLE` * :py:const:`WIKILINK_TEXT` -* :py:const:`EXTERNAL_LINK` +* :py:const:`EXT_LINK` - * :py:const:`EXTERNAL_LINK_URL` - * :py:const:`EXTERNAL_LINK_TITLE` - * :py:const:`EXTERNAL_LINK_BRACKETS` + * :py:const:`EXT_LINK_URL` + * :py:const:`EXT_LINK_TITLE` + * :py:const:`EXT_LINK_BRACKETS` * :py:const:`HEADING` @@ -100,6 +100,7 @@ Aggregate contexts: * :py:const:`FAIL` * :py:const:`UNSAFE` * :py:const:`DOUBLE` +* :py:const:`INVALID_LINK` """ @@ -118,10 +119,10 @@ WIKILINK_TITLE = 1 << 5 WIKILINK_TEXT = 1 << 6 WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT -EXTERNAL_LINK_URL = 1 << 7 -EXTERNAL_LINK_TITLE = 1 << 8 -EXTERNAL_LINK_BRACKETS = 1 << 9 -EXTERNAL_LINK = EXTERNAL_LINK_URL + EXTERNAL_LINK_TITLE +EXT_LINK_URL = 1 << 7 +EXT_LINK_TITLE = 1 << 8 +EXT_LINK_BRACKETS = 1 << 9 +EXT_LINK = EXT_LINK_URL + EXT_LINK_TITLE + EXT_LINK_BRACKETS HEADING_LEVEL_1 = 1 << 10 HEADING_LEVEL_2 = 1 << 11 @@ -161,7 +162,8 @@ GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXTERNAL_LINK + HEADING + TAG + STYLE +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK + HEADING + TAG + STYLE UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE +INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URL diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 67a4ae6..267e7c5 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2192,9 +2192,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) if (Tokenizer_emit_char(self, this)) return NULL; } - else if (this == next && next == *"[") { - if (!(this_context & LC_WIKILINK_TITLE) && - Tokenizer_CAN_RECURSE(self)) { + else if (this == next && next == *"[" && Tokenizer_CAN_RECURSE(self)) { + if (!(this_context & AGG_INVALID_LINK)) { if (Tokenizer_parse_wikilink(self)) return NULL; } @@ -2243,9 +2242,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) return NULL; } } - else if (this == *"<") { - if (!(this_context & LC_TAG_CLOSE) && - Tokenizer_CAN_RECURSE(self)) { + else if (this == *"<" && !(this_context & LC_TAG_CLOSE)) { + if (Tokenizer_CAN_RECURSE(self)) { if (Tokenizer_parse_tag(self)) return NULL; } @@ -2389,6 +2387,11 @@ static int load_tokens(void) WikilinkSeparator = PyObject_GetAttrString(tokens, "WikilinkSeparator"); WikilinkClose = PyObject_GetAttrString(tokens, "WikilinkClose"); + ExternalLinkOpen = PyObject_GetAttrString(tokens, "ExternalLinkOpen"); + ExternalLinkSeparator = PyObject_GetAttrString(tokens, + "ExternalLinkSeparator"); + ExternalLinkClose = PyObject_GetAttrString(tokens, "ExternalLinkClose"); + HTMLEntityStart = PyObject_GetAttrString(tokens, "HTMLEntityStart"); HTMLEntityNumeric = PyObject_GetAttrString(tokens, "HTMLEntityNumeric"); HTMLEntityHex = PyObject_GetAttrString(tokens, "HTMLEntityHex"); diff --git 
a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 264360e..16c76eb 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -82,6 +82,10 @@ static PyObject* WikilinkOpen; static PyObject* WikilinkSeparator; static PyObject* WikilinkClose; +static PyObject* ExternalLinkOpen; +static PyObject* ExternalLinkSeparator; +static PyObject* ExternalLinkClose; + static PyObject* HTMLEntityStart; static PyObject* HTMLEntityNumeric; static PyObject* HTMLEntityHex; @@ -104,48 +108,53 @@ static PyObject* TagCloseClose; /* Local contexts: */ -#define LC_TEMPLATE 0x0000007 -#define LC_TEMPLATE_NAME 0x0000001 -#define LC_TEMPLATE_PARAM_KEY 0x0000002 -#define LC_TEMPLATE_PARAM_VALUE 0x0000004 - -#define LC_ARGUMENT 0x0000018 -#define LC_ARGUMENT_NAME 0x0000008 -#define LC_ARGUMENT_DEFAULT 0x0000010 - -#define LC_WIKILINK 0x0000060 -#define LC_WIKILINK_TITLE 0x0000020 -#define LC_WIKILINK_TEXT 0x0000040 - -#define LC_HEADING 0x0001F80 -#define LC_HEADING_LEVEL_1 0x0000080 -#define LC_HEADING_LEVEL_2 0x0000100 -#define LC_HEADING_LEVEL_3 0x0000200 -#define LC_HEADING_LEVEL_4 0x0000400 -#define LC_HEADING_LEVEL_5 0x0000800 -#define LC_HEADING_LEVEL_6 0x0001000 - -#define LC_TAG 0x001E000 -#define LC_TAG_OPEN 0x0002000 -#define LC_TAG_ATTR 0x0004000 -#define LC_TAG_BODY 0x0008000 -#define LC_TAG_CLOSE 0x0010000 - -#define LC_STYLE 0x01E0000 -#define LC_STYLE_ITALICS 0x0020000 -#define LC_STYLE_BOLD 0x0040000 -#define LC_STYLE_PASS_AGAIN 0x0080000 -#define LC_STYLE_SECOND_PASS 0x0100000 - -#define LC_DLTERM 0x0200000 - -#define LC_SAFETY_CHECK 0xFC00000 -#define LC_HAS_TEXT 0x0400000 -#define LC_FAIL_ON_TEXT 0x0800000 -#define LC_FAIL_NEXT 0x1000000 -#define LC_FAIL_ON_LBRACE 0x2000000 -#define LC_FAIL_ON_RBRACE 0x4000000 -#define LC_FAIL_ON_EQUALS 0x8000000 +#define LC_TEMPLATE 0x00000007 +#define LC_TEMPLATE_NAME 0x00000001 +#define LC_TEMPLATE_PARAM_KEY 0x00000002 +#define LC_TEMPLATE_PARAM_VALUE 0x00000004 + +#define LC_ARGUMENT 0x00000018 +#define LC_ARGUMENT_NAME 0x00000008 +#define LC_ARGUMENT_DEFAULT 0x00000010 + +#define LC_WIKILINK 0x00000060 +#define LC_WIKILINK_TITLE 0x00000020 +#define LC_WIKILINK_TEXT 0x00000040 + +#define LC_EXT_LINK 0x00000380 +#define LC_EXT_LINK_URL 0x00000080 +#define LC_EXT_LINK_TITLE 0x00000100 +#define LC_EXT_LINK_BRACKETS 0x00000200 + +#define LC_HEADING 0x0000FC00 +#define LC_HEADING_LEVEL_1 0x00000400 +#define LC_HEADING_LEVEL_2 0x00000800 +#define LC_HEADING_LEVEL_3 0x00001000 +#define LC_HEADING_LEVEL_4 0x00002000 +#define LC_HEADING_LEVEL_5 0x00004000 +#define LC_HEADING_LEVEL_6 0x00008000 + +#define LC_TAG 0x000F0000 +#define LC_TAG_OPEN 0x00010000 +#define LC_TAG_ATTR 0x00020000 +#define LC_TAG_BODY 0x00040000 +#define LC_TAG_CLOSE 0x00080000 + +#define LC_STYLE 0x00F00000 +#define LC_STYLE_ITALICS 0x00100000 +#define LC_STYLE_BOLD 0x00200000 +#define LC_STYLE_PASS_AGAIN 0x00400000 +#define LC_STYLE_SECOND_PASS 0x00800000 + +#define LC_DLTERM 0x01000000 + +#define LC_SAFETY_CHECK 0x7E000000 +#define LC_HAS_TEXT 0x02000000 +#define LC_FAIL_ON_TEXT 0x04000000 +#define LC_FAIL_NEXT 0x08000000 +#define LC_FAIL_ON_LBRACE 0x10000000 +#define LC_FAIL_ON_RBRACE 0x20000000 +#define LC_FAIL_ON_EQUALS 0x40000000 /* Global contexts: */ @@ -153,9 +162,10 @@ static PyObject* TagCloseClose; /* Aggregate contexts: */ -#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) -#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) 
-#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) +#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URL) /* Tag contexts: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 583d2f8..9f675ac 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -311,6 +311,11 @@ class Tokenizer(object): self._head += 1 return self._pop() + def _parse_external_link(self, brackets): + """Parse an external link at the head of the wikicode string.""" + self._emit_text(self._read()) + # raise NotImplementedError() + def _parse_heading(self): """Parse a section heading at the head of the wikicode string.""" self._global |= contexts.GL_HEADING @@ -898,8 +903,8 @@ class Tokenizer(object): return self._handle_argument_end() else: self._emit_text("}") - elif this == next == "[": - if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): + elif this == next == "[" and self._can_recurse(): + if not self._context & contexts.INVALID_LINK: self._parse_wikilink() else: self._emit_text("[") @@ -907,6 +912,11 @@ class Tokenizer(object): self._handle_wikilink_separator() elif this == next == "]" and self._context & contexts.WIKILINK: return self._handle_wikilink_end() + elif this == "[" and not self._context & contexts.INVALID_LINK: ## or this == ":" + if self._can_recurse(): + self._parse_external_link(brackets=this == "[") + else: + self._emit_text("[") elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() @@ -928,8 +938,8 @@ class Tokenizer(object): self._handle_tag_open_close() else: self._handle_invalid_tag_start() - elif this == "<": - if not self._context & contexts.TAG_CLOSE and self._can_recurse(): + elif this == "<" and not self._context & contexts.TAG_CLOSE: + if self._can_recurse(): self._parse_tag() else: self._emit_text("<") diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index 0277a51..e4ff8c4 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -12,6 +12,13 @@ output: [TemplateOpen(), ArgumentOpen(), ArgumentOpen(), Text(text="foo"), Argum --- +name: link_in_template_name +label: a wikilink inside a template name, which breaks the template +input: "{{foo[[bar]]}}" +output: [Text(text="{{foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="}}")] + +--- + name: rich_heading label: a heading with templates/wikilinks in it input: "== Head{{ing}} [[with]] {{{funky|{{stuf}}}}} ==" From 5e6e5b6301f5f50ca8585a5b73f72af49898cdf2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 Aug 2013 01:07:32 -0400 Subject: [PATCH 154/189] tag_defs.py -> definitions.py; more outline stuff --- mwparserfromhell/{tag_defs.py => definitions.py} | 2 +- mwparserfromhell/nodes/tag.py | 6 ++-- mwparserfromhell/parser/tokenizer.c | 16 +++++----- mwparserfromhell/parser/tokenizer.h | 8 ++--- mwparserfromhell/parser/tokenizer.py | 37 ++++++++++++++++++------ 5 files changed, 44 insertions(+), 25 deletions(-) rename mwparserfromhell/{tag_defs.py => definitions.py} (97%) diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/definitions.py similarity index 97% rename 
from mwparserfromhell/tag_defs.py rename to mwparserfromhell/definitions.py index 2395fc6..2d7ab0c 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/definitions.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -"""Contains data regarding certain HTML tags.""" +"""Contains data about certain markup, like HTML tags and external links.""" from __future__ import unicode_literals diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index b4aec3e..80b8a88 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -25,7 +25,7 @@ from __future__ import unicode_literals from . import Node, Text from .extras import Attribute from ..compat import str -from ..tag_defs import is_visible +from ..definitions import is_visible from ..utils import parse_anything __all__ = ["Tag"] @@ -152,7 +152,7 @@ class Tag(Node): This makes the tag look like a lone close tag. It is technically invalid and is only parsable Wikicode when the tag itself is single-only, like ``
    `` and ````. See - :py:func:`.tag_defs.is_single_only`. + :py:func:`.definitions.is_single_only`. """ return self._invalid @@ -161,7 +161,7 @@ class Tag(Node): """Whether the tag is implicitly self-closing, with no ending slash. This is only possible for specific "single" tags like ``
    `` and - ``
  • ``. See :py:func:`.tag_defs.is_single`. This field only has an + ``
  • ``. See :py:func:`.definitions.is_single`. This field only has an effect if :py:attr:`self_closing` is also ``True``. """ return self._implicit diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 267e7c5..2b74f6b 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -37,12 +37,12 @@ static int heading_level_from_context(int n) } /* - Call the given function in tag_defs, using 'tag' as a parameter, and return - its output as a bool. + Call the given function in definitions.py, using 'tag' as a parameter, and + return its output as a bool. */ -static int call_tag_def_func(const char* funcname, PyObject* tag) +static int call_def_func(const char* funcname, PyObject* tag) { - PyObject* func = PyObject_GetAttrString(tag_defs, funcname); + PyObject* func = PyObject_GetAttrString(definitions, funcname); PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL); int ans = (result == Py_True) ? 1 : 0; @@ -2416,13 +2416,13 @@ static int load_tokens(void) return 0; } -static int load_tag_defs(void) +static int load_definitions(void) { PyObject *tempmod, *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(), *fromlist = PyList_New(1), - *modname = IMPORT_NAME_FUNC("tag_defs"); + *modname = IMPORT_NAME_FUNC("definitions"); char *name = "mwparserfromhell"; if (!fromlist || !modname) @@ -2432,7 +2432,7 @@ static int load_tag_defs(void) Py_DECREF(fromlist); if (!tempmod) return -1; - tag_defs = PyObject_GetAttrString(tempmod, "tag_defs"); + definitions = PyObject_GetAttrString(tempmod, "definitions"); Py_DECREF(tempmod); return 0; } @@ -2455,7 +2455,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void) NOARGS = PyTuple_New(0); if (!EMPTY || !NOARGS) INIT_ERROR; - if (load_entitydefs() || load_tokens() || load_tag_defs()) + if (load_entitydefs() || load_tokens() || load_definitions()) INIT_ERROR; #ifdef IS_PY3K return module; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 16c76eb..41c1c1b 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -62,7 +62,7 @@ static char** entitydefs; static PyObject* EMPTY; static PyObject* NOARGS; -static PyObject* tag_defs; +static PyObject* definitions; /* Tokens: */ @@ -241,9 +241,9 @@ typedef struct { /* Macros for accessing HTML tag definitions: */ #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li") -#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag)) -#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag)) -#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag)) +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag)) /* Function prototypes: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9f675ac..07ae0b1 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,7 +26,7 @@ import re from . 
import contexts, tokens from ..compat import htmlentities -from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only +from ..definitions import get_html_tag, is_parsable, is_single, is_single_only __all__ = ["Tokenizer"] @@ -60,7 +60,7 @@ class Tokenizer(object): START = object() END = object() MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", - ":", "/", "-", "\n", END] + ":", "/", "-", "\n", START, END] MAX_DEPTH = 40 MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) @@ -311,10 +311,30 @@ class Tokenizer(object): self._head += 1 return self._pop() + def _really_parse_external_link(self, brackets): + """Really parse an external link.""" + # link = self._parse(contexts.EXT_LINK_URL) + raise BadRoute() + def _parse_external_link(self, brackets): """Parse an external link at the head of the wikicode string.""" - self._emit_text(self._read()) - # raise NotImplementedError() + reset = self._head + self._head += 1 + try: + bad_context = self._context & contexts.INVALID_LINK + if bad_context or not self._can_recurse(): + raise BadRoute() + link = self._really_parse_external_link(brackets) + except BadRoute: + self._head = reset + if not brackets and self._context & contexts.DL_TERM: + self._handle_dl_term() + else: + self._emit_text(self._read()) + else: + self._emit(tokens.ExternalLinkOpen(brackets)) + self._emit_all(link) + self._emit(tokens.ExternalLinkClose()) def _parse_heading(self): """Parse a section heading at the head of the wikicode string.""" @@ -912,11 +932,10 @@ class Tokenizer(object): self._handle_wikilink_separator() elif this == next == "]" and self._context & contexts.WIKILINK: return self._handle_wikilink_end() - elif this == "[" and not self._context & contexts.INVALID_LINK: ## or this == ":" - if self._can_recurse(): - self._parse_external_link(brackets=this == "[") - else: - self._emit_text("[") + elif this == "[": + self._parse_external_link(True) + elif this == ":" and self._read(-1) not in self.MARKERS: + self._parse_external_link(False) elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() From 5fc36cea7156fd86c848463fd6db2740462665c6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 Aug 2013 02:48:13 -0400 Subject: [PATCH 155/189] Add is_protocol(). 
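The _parse_external_link hook added above follows the tokenizer's usual speculative-parse shape: remember where the head was, attempt a route, and on BadRoute rewind and emit the opening character as plain text. A simplified standalone sketch of that pattern (illustrative only, not the project's code; it assumes an object exposing _head and _emit_text() like the tokenizer in the diff):

    class BadRoute(Exception):
        """Raised when a speculative parse route turns out to be invalid."""

    def parse_with_fallback(tokenizer, route, fallback_text):
        # Save where we started, try the route, and rewind on failure,
        # emitting the would-be marker as ordinary text instead.
        reset = tokenizer._head
        try:
            return route()
        except BadRoute:
            tokenizer._head = reset
            tokenizer._emit_text(fallback_text)
            return None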
--- mwparserfromhell/definitions.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index 2d7ab0c..7352f23 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -25,7 +25,17 @@ from __future__ import unicode_literals __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", - "is_single_only"] + "is_single_only", "is_protocol"] + +URL_PROTOCOLS = { + # [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943 + "http": True, "https": True, "ftp": True, "ftps": True, "ssh": True, + "sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False, + "sips": False, "gopher": True, "telnet": True, "nntp": True, + "worldwind": True, "mailto": False, "tel": False, "sms": False, + "news": False, "svn": True, "git": True, "mms": True, "bitcoin": False, + "magnet": False, "urn": False, "geo": False +} PARSER_BLACKLIST = [ # enwiki extensions @ 2013-06-28 @@ -70,3 +80,9 @@ def is_single(tag): def is_single_only(tag): """Return whether or not the given *tag* must exist without a close tag.""" return tag.lower() in SINGLE_ONLY + +def is_protocol(protocol, slashes=True): + """Return whether *protcol* is valid for external links.""" + if slashes: + return protocol in URL_PROTOCOLS + return protocol in URL_PROTOCOLS and not URL_PROTOCOLS[protocol] From e2d007cb9f09c617e48d1240bb08de6d3e79895a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 Aug 2013 03:14:13 -0400 Subject: [PATCH 156/189] Actually, they're called schemes, not protocols. --- mwparserfromhell/definitions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index 7352f23..1cc1eb5 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -25,9 +25,9 @@ from __future__ import unicode_literals __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", - "is_single_only", "is_protocol"] + "is_single_only", "is_scheme"] -URL_PROTOCOLS = { +URL_SCHEMES = { # [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943 "http": True, "https": True, "ftp": True, "ftps": True, "ssh": True, "sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False, @@ -81,8 +81,8 @@ def is_single_only(tag): """Return whether or not the given *tag* must exist without a close tag.""" return tag.lower() in SINGLE_ONLY -def is_protocol(protocol, slashes=True): - """Return whether *protcol* is valid for external links.""" +def is_scheme(scheme, slashes=True): + """Return whether *scheme* is valid for external links.""" if slashes: - return protocol in URL_PROTOCOLS - return protocol in URL_PROTOCOLS and not URL_PROTOCOLS[protocol] + return scheme in URL_SCHEMES + return scheme in URL_SCHEMES and not URL_SCHEMES[scheme] From 223f3fa6588390dca6c3a71d2e93c9be7d8a0fd9 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 Aug 2013 03:25:18 -0400 Subject: [PATCH 157/189] Actually, they're called URI schemes, not URL schemes. 
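The scheme table above maps each known scheme to whether it is normally written with "//"; a free link without slashes is only recognized for the schemes marked False. A rough usage sketch of the resulting is_scheme() helper (assuming the module layout from these patches; behaviour follows the code shown here):

    from mwparserfromhell.definitions import is_scheme

    assert is_scheme("http")                     # listed, normally written with //
    assert is_scheme("mailto", slashes=False)    # marked False: valid even without //
    assert not is_scheme("http", slashes=False)  # needs //, so "http:foo" stays text
    assert not is_scheme("fake")                 # unknown schemes never link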
--- mwparserfromhell/definitions.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index 1cc1eb5..ef8255e 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -27,7 +27,7 @@ from __future__ import unicode_literals __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", "is_single_only", "is_scheme"] -URL_SCHEMES = { +URI_SCHEMES = { # [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943 "http": True, "https": True, "ftp": True, "ftps": True, "ssh": True, "sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False, @@ -83,6 +83,7 @@ def is_single_only(tag): def is_scheme(scheme, slashes=True): """Return whether *scheme* is valid for external links.""" + scheme = scheme.lower() if slashes: - return scheme in URL_SCHEMES - return scheme in URL_SCHEMES and not URL_SCHEMES[scheme] + return scheme in URI_SCHEMES + return scheme in URI_SCHEMES and not URI_SCHEMES[scheme] From f3025eaafe7178a0aaedca4a70648410037fc9ec Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 00:56:39 -0400 Subject: [PATCH 158/189] Fix some wikilink-related tests. --- tests/tokenizer/wikilinks.mwtest | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/tests/tokenizer/wikilinks.mwtest b/tests/tokenizer/wikilinks.mwtest index 0682ef1..8eb381a 100644 --- a/tests/tokenizer/wikilinks.mwtest +++ b/tests/tokenizer/wikilinks.mwtest @@ -40,17 +40,17 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar|b --- -name: nested -label: a wikilink nested within the value of another -input: "[[foo|[[bar]]]]" -output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()] +name: newline_text +label: a newline in the middle of the text +input: "[[foo|foo\nbar]]" +output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="foo\nbar"), WikilinkClose()] --- -name: nested_with_text -label: a wikilink nested within the value of another, separated by other data -input: "[[foo|a[[b]]c]]" -output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()] +name: bracket_text +label: a left bracket in the middle of the text +input: "[[foo|bar[baz]]" +output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[baz"), WikilinkClose()] --- @@ -96,13 +96,34 @@ output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), --- -name: invalid_nested_text +name: invalid_nested_padding label: invalid wikilink: trying to nest in the wrong context, with a text param input: "[[foo[[bar]]|baz]]" output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="|baz]]")] --- +name: invalid_nested_text +label: invalid wikilink: a wikilink nested within the value of another +input: "[[foo|[[bar]]" +output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()] + +--- + +name: invalid_nested_text_2 +label: invalid wikilink: a wikilink nested within the value of another, two pairs of closing brackets +input: "[[foo|[[bar]]]]" +output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")] + +--- + +name: invalid_nested_text_padding +label: invalid wikilink: a wikilink nested within the value of another, separated by other data 
+input: "[[foo|a[[b]]c]]" +output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")] + +--- + name: incomplete_open_only label: incomplete wikilinks: just an open input: "[[" From d42e05a554076d43dd53568bf383ec3e265c2fe2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 00:57:34 -0400 Subject: [PATCH 159/189] Implement improved wikilink handling. --- mwparserfromhell/parser/contexts.py | 12 ++++++------ mwparserfromhell/parser/tokenizer.c | 13 ++++++------- mwparserfromhell/parser/tokenizer.h | 4 ++-- mwparserfromhell/parser/tokenizer.py | 10 ++++++++-- 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index c6d2941..0d25400 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -53,7 +53,7 @@ Local (stack-specific) contexts: * :py:const:`EXT_LINK` - * :py:const:`EXT_LINK_URL` + * :py:const:`EXT_LINK_URI` * :py:const:`EXT_LINK_TITLE` * :py:const:`EXT_LINK_BRACKETS` @@ -119,10 +119,10 @@ WIKILINK_TITLE = 1 << 5 WIKILINK_TEXT = 1 << 6 WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT -EXT_LINK_URL = 1 << 7 +EXT_LINK_URI = 1 << 7 EXT_LINK_TITLE = 1 << 8 EXT_LINK_BRACKETS = 1 << 9 -EXT_LINK = EXT_LINK_URL + EXT_LINK_TITLE + EXT_LINK_BRACKETS +EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS HEADING_LEVEL_1 = 1 << 10 HEADING_LEVEL_2 = 1 << 11 @@ -163,7 +163,7 @@ GL_HEADING = 1 << 0 # Aggregate contexts: FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK + HEADING + TAG + STYLE -UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + - TAG_CLOSE) +UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE -INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URL +INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 2b74f6b..46df405 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2050,18 +2050,17 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) if (context & LC_FAIL_NEXT) { return -1; } - if (context & LC_WIKILINK_TITLE) { - if (data == *"]" || data == *"{") + if (context & LC_WIKILINK) { + if (context & LC_WIKILINK_TEXT) + return (data == *"[" && Tokenizer_READ(self, 1) == *"[") ? -1 : 0; + else if (data == *"]" || data == *"{") self->topstack->context |= LC_FAIL_NEXT; else if (data == *"\n" || data == *"[" || data == *"}") return -1; return 0; } - if (context & LC_TAG_CLOSE) { - if (data == *"<") - return -1; - return 0; - } + if (context & LC_TAG_CLOSE) + return (data == *"<") ? 
-1 : 0; if (context & LC_TEMPLATE_NAME) { if (data == *"{" || data == *"}" || data == *"[") { self->topstack->context |= LC_FAIL_NEXT; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 41c1c1b..5961dcc 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -163,9 +163,9 @@ static PyObject* TagCloseClose; /* Aggregate contexts: */ #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) -#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) -#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URL) +#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URL) /* Tag contexts: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 07ae0b1..84de78e 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -835,12 +835,16 @@ class Tokenizer(object): context = self._context if context & contexts.FAIL_NEXT: return False - if context & contexts.WIKILINK_TITLE: - if this == "]" or this == "{": + if context & contexts.WIKILINK: + if context & contexts.WIKILINK_TEXT: + return not (this == self._read(1) == "[") + elif this == "]" or this == "{": self._context |= contexts.FAIL_NEXT elif this == "\n" or this == "[" or this == "}": return False return True + elif context & contexts.EXT_LINK_TITLE: + return this != "\n" elif context & contexts.TEMPLATE_NAME: if this == "{" or this == "}" or this == "[": self._context |= contexts.FAIL_NEXT @@ -936,6 +940,8 @@ class Tokenizer(object): self._parse_external_link(True) elif this == ":" and self._read(-1) not in self.MARKERS: self._parse_external_link(False) + elif this == "]" and self._context & contexts.EXT_LINK_TITLE: + return self._pop() elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() From da272ae10a78c8bd2be633aefab1b827c411d554 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 00:59:46 -0400 Subject: [PATCH 160/189] Start implementation of external links in Python. --- mwparserfromhell/parser/tokenizer.py | 100 +++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 5 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 84de78e..9acf32d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,7 +26,8 @@ import re from . 
import contexts, tokens from ..compat import htmlentities -from ..definitions import get_html_tag, is_parsable, is_single, is_single_only +from ..definitions import (get_html_tag, is_parsable, is_single, + is_single_only, is_scheme) __all__ = ["Tokenizer"] @@ -313,8 +314,95 @@ class Tokenizer(object): def _really_parse_external_link(self, brackets): """Really parse an external link.""" - # link = self._parse(contexts.EXT_LINK_URL) - raise BadRoute() + scheme_valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + if brackets: + self._push(contexts.EXT_LINK_URI) + if self._read() == self._read(1) == "/": + self._emit_text("//") + self._head += 2 + else: + scheme = "" + while all(char in scheme_valid for char in self._read()): + scheme += self._read() + self._emit_text(self._read()) + self._head += 1 + if self._read() != ":": + self._fail_route() + self._emit_text(":") + self._head += 1 + slashes = self._read() == self._read(1) == "/" + if slashes: + self._emit_text("//") + self._head += 2 + if not is_scheme(scheme, slashes): + self._fail_route() + else: + scheme = [] + try: + # Ugly, but we have to backtrack through the textbuffer looking + # for our scheme since it was just parsed as text: + for i in range(-1, -len(self._textbuffer) - 1, -1): + for char in reversed(self._textbuffer[i]): + if char.isspace() or char in self.MARKERS: + raise StopIteration() + if char not in scheme_valid: + raise BadRoute() + scheme.append(char) + except StopIteration: + pass + scheme = "".join(reversed(scheme)) + slashes = self._read() == self._read(1) == "/" + if not is_scheme(scheme, slashes): + raise BadRoute() + # Remove the scheme from the textbuffer, now that it's part of the + # external link: + length = len(scheme) + while length: + if length < len(self._textbuffer[-1]): + self._textbuffer[-1] = self._textbuffer[-1][:-length] + break + length -= len(self._textbuffer[-1]) + self._textbuffer.pop() + self._push(contexts.EXT_LINK_URI) + self._emit_text(scheme) + self._emit_text(":") + if slashes: + self._emit_text("//") + self._head += 2 + parentheses = False + + while True: + this, next = self._read(), self._read(1) + if this is self.END or this == "\n": + if brackets: + self._fail_route() + self._head -= 1 + return self._pop(), None + elif this == next == "{" and self._can_recurse(): + self._parse_template_or_argument() + elif this == "&": + self._parse_entity() + elif this == "]": + if not brackets: + self._head -= 1 + return self._pop(), None + elif this == "(" and not brackets and not parentheses: + parentheses = True + self._emit_text(this) + elif " " in this: ## Should be a more general whitespace check + before, after = this.split(" ", 1) + self._emit_text(before) + if brackets: + self._emit(tokens.ExternalLinkSeparator()) + self._emit_text(after) + self._context ^= contexts.EXT_LINK_URI + self._context |= contexts.EXT_LINK_TITLE + self._head += 1 + return self._parse(push=False), None + return self._pop(), " " + after + else: + self._emit_text(this) + self._head += 1 def _parse_external_link(self, brackets): """Parse an external link at the head of the wikicode string.""" @@ -324,7 +412,7 @@ class Tokenizer(object): bad_context = self._context & contexts.INVALID_LINK if bad_context or not self._can_recurse(): raise BadRoute() - link = self._really_parse_external_link(brackets) + link, extra = self._really_parse_external_link(brackets) except BadRoute: self._head = reset if not brackets and self._context & contexts.DL_TERM: @@ -332,9 +420,11 @@ class Tokenizer(object): else: 
self._emit_text(self._read()) else: - self._emit(tokens.ExternalLinkOpen(brackets)) + self._emit(tokens.ExternalLinkOpen(brackets=brackets)) self._emit_all(link) self._emit(tokens.ExternalLinkClose()) + if extra: + self._emit_text(extra) def _parse_heading(self): """Parse a section heading at the head of the wikicode string.""" From 0ecf2e42310bc36fbf220883f95836d4fe96bc7a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 01:02:27 -0400 Subject: [PATCH 161/189] Add a couple integration tests for ext links vs.
    . --- tests/tokenizer/integration.mwtest | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index e4ff8c4..083b12c 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -58,3 +58,17 @@ name: wildcard_redux label: an even wilder assortment of various things input: "{{a|b|{{c|[[d]]{{{e}}}}}}}[[f|{{{g}}}]]{{i|j= }}" output: [TemplateOpen(), Text(text="a"), TemplateParamSeparator(), Text(text="b"), TemplateParamSeparator(), TemplateOpen(), Text(text="c"), TemplateParamSeparator(), WikilinkOpen(), Text(text="d"), WikilinkClose(), ArgumentOpen(), Text(text="e"), ArgumentClose(), TemplateClose(), TemplateClose(), WikilinkOpen(), Text(text="f"), WikilinkSeparator(), ArgumentOpen(), Text(text="g"), ArgumentClose(), CommentStart(), Text(text="h"), CommentEnd(), WikilinkClose(), TemplateOpen(), Text(text="i"), TemplateParamSeparator(), Text(text="j"), TemplateParamEquals(), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), TemplateClose()] + +--- + +name: link_inside_dl +label: an external link inside a def list, such that the external link is parsed +input: ";;;mailto:example" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example"), ExternalLinkClose()] + +--- + +name: link_inside_dl_2 +label: an external link inside a def list, such that the external link is not parsed +input: ";;;malito:example" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")] From 93c51fe57c1711c674c41ea0799be5193ff3bf21 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 01:58:27 -0400 Subject: [PATCH 162/189] Tokenizer tests for external links. 
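The two cases above pit external links against definition-list markers. For a quick manual check outside the .mwtest harness, a case like link_inside_dl can be fed straight to the Python tokenizer; this is only a sketch and assumes the in-tree Tokenizer.tokenize() entry point and token equality, which the test suite relies on:

    from mwparserfromhell.parser.tokenizer import Tokenizer
    from mwparserfromhell.parser import tokens

    actual = Tokenizer().tokenize(";;;mailto:example")
    # The free link should form after the three ";" list markers:
    assert actual[-3:] == [tokens.ExternalLinkOpen(brackets=False),
                           tokens.Text(text="mailto:example"),
                           tokens.ExternalLinkClose()]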
--- tests/tokenizer/external_links.mwtest | 459 ++++++++++++++++++++++++++++++++++ 1 file changed, 459 insertions(+) create mode 100644 tests/tokenizer/external_links.mwtest diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest new file mode 100644 index 0000000..9e0ebdd --- /dev/null +++ b/tests/tokenizer/external_links.mwtest @@ -0,0 +1,459 @@ +name: basic +label: basic external link +input: "http://example.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose()] + +--- + +name: basic_brackets +label: basic external link in brackets +input: "[http://example.com/]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose()] + +--- + +name: brackets_space +label: basic external link in brackets, with a space after +input: "[http://example.com/ ]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text=""), ExternalLinkClose()] + +--- + +name: brackets_title +label: basic external link in brackets, with a title +input: "[http://example.com/ Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_multiword_title +label: basic external link in brackets, with a multi-word title +input: "[http://example.com/ Example Web Page]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example Web Page"), ExternalLinkClose()] + +--- + +name: brackets_adjacent +label: three adjacent bracket-enclosed external links +input: "[http://foo.com/ Foo][http://bar.com/ Bar]\n[http://baz.com/ Baz]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo.com/"), ExternalLinkSeparator(), Text(text="Foo"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://bar.com/"), ExternalLinkSeparator(), Text(text="Bar"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=True), Text(text="http://baz.com/"), ExternalLinkSeparator(), Text(text="Baz"), ExternalLinkClose()] + +--- + +name: brackets_newline_before +label: bracket-enclosed link with a newline before the title +input: "[http://example.com/ \nExample]" +output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" \nExample]")] + +--- + +name: brackets_newline_inside +label: bracket-enclosed link with a newline in the title +input: "[http://example.com/ Example \nWeb Page]" +output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example \nWeb Page]")] + +--- + +name: brackets_newline_after +label: bracket-enclosed link with a newline after the title +input: "[http://example.com/ Example\n]" +output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example\n]")] + +--- + +name: brackets_space_before +label: bracket-enclosed link with a space before the URL +input: "[ http://example.com Example]" +output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" Example]")] + +--- + +name: brackets_title_like_url +label: bracket-enclosed link with a title that looks like a URL +input: "[http://example.com http://example.com]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), 
ExternalLinkSeparator(), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: brackets_recursive +label: bracket-enclosed link with a bracket-enclosed link as the title +input: "[http://example.com [http://example.com]]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="[http://example.com"), ExternalLinkClose(), Text(text="]")] + +--- + +name: period_after +label: a period after a free link that is excluded +input: "http://example.com." +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=".")] + +--- + +name: colons_after +label: colons after a free link that are excluded +input: "http://example.com/foo:bar:::baz:::" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo:bar:::baz"), ExternalLinkClose(), Text(text=":::")] + +--- + +name: close_paren_after_excluded +label: a closing parenthesis after a free link that is excluded +input: "http://example.)com)" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.)com"), ExternalLinkClose(), Text(text=")")] + +--- + +name: close_paren_after_included +label: a closing parenthesis after a free link that is included because of an opening parenthesis in the URL +input: "http://example.(com)" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.(com)"), ExternalLinkClose()] + +--- + +name: open_bracket_inside +label: an open bracket inside a free link that causes it to be ended abruptly +input: "http://foobar[baz.com" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foobar"), ExternalLinkClose(), Text(text="[baz.com")] + +--- + +name: brackets_period_after +label: a period after a bracket-enclosed link that is included +input: "[http://example.com. 
Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com."), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_colons_after +label: colons after a bracket-enclosed link that are included +input: "[http://example.com/foo:bar:::baz::: Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo:bar:::baz:::"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_close_paren_after_included +label: a closing parenthesis after a bracket-enclosed link that is included +input: "[http://example.)com) Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.)com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_close_paren_after_included_2 +label: a closing parenthesis after a bracket-enclosed link that is also included +input: "[http://example.(com) Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_open_bracket_inside +label: an open bracket inside a bracket-enclosed link that causes it to switch to the title context abruptly +input: "[http://foobar[baz.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(), Text(text="[baz.com Example"), ExternalLinkClose()] + +--- + +name: adjacent_space +label: two free links separated by a space +input: "http://example.com http://example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: adjacent_newline +label: two free links separated by a newline +input: "http://example.com\nhttp://example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: adjacent_close_bracket +label: two free links separated by a close bracket +input: "http://example.com]http://example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="]"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: html_entity_in_url +label: a HTML entity parsed correctly inside a free link +input: "http://exa mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="mple.com/"), ExternalLinkClose()] + +--- + +name: template_in_url +label: a template parsed correctly inside a free link +input: "http://exa{{template}}mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), TemplateOpen(), Text(text="template"), TemplateClose(), Text(text="mple.com/"), ExternalLinkClose()] + +--- + +name: argument_in_url +label: an argument parsed correctly inside a free link +input: "http://exa{{{argument}}}mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ArgumentOpen(), Text(text="argument"), ArgumentClose(), Text(text="mple.com/"), ExternalLinkClose()] + +--- + +name: wikilink_in_url +label: a wikilink that destroys a free link +input: "http://exa[[wikilink]]mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), 
ExternalLinkClose(), WikilinkOpen(), Text(text="wikilink"), WikilinkClose(), Text(text="mple.com/")] + +--- + +name: external_link_in_url +label: a bracketed link that destroys a free link +input: "http://exa[http://example.com/]mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose(), Text(text="mple.com/")] + +--- + +name: spaces_padding +label: spaces padding a free link +input: " http://example.com " +output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" ")] + +--- + +name: text_and_spaces_padding +label: text and spaces padding a free link +input: "x http://example.com x" +output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" x")] + +--- + +name: template_before +label: a template before a free link +input: "{{foo}}http://example.com" +output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: spaces_padding_no_slashes +label: spaces padding a free link with no slashes after the colon +input: " mailto:example@example.com " +output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" ")] + +--- + +name: text_and_spaces_padding_no_slashes +label: text and spaces padding a free link with no slashes after the colon +input: "x mailto:example@example.com x" +output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" x")] + +--- + +name: template_before_no_slashes +label: a template before a free link with no slashes after the colon +input: "{{foo}}mailto:example@example.com" +output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()] + +--- + +name: no_slashes +label: a free link with no slashes after the colon +input: "mailto:example@example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()] + +--- + +name: slashes_optional +label: a free link using a scheme that doesn't need slashes, but has them anyway +input: "mailto://example@example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="mailto://example@example.com"), ExternalLinkClose()] + +--- + +name: short +label: a very short free link +input: "mailto://abc" +output: [ExternalLinkOpen(brackets=False), Text(text="mailto://abc"), ExternalLinkClose()] + +--- + +name: slashes_missing +label: slashes missing from a free link with a scheme that requires them +input: "http:example@example.com" +output: [Text(text="http:example@example.com")] + +--- + +name: no_scheme_but_slashes +label: no scheme in a free link, but slashes (protocol-relative free links are not supported) +input: "//example.com" +output: [Text(text="//example.com")] + +--- + +name: no_scheme_but_colon +label: no scheme in a free link, but a colon +input: ":example.com" +output: [Text(text=":example.com")] + +--- + +name: no_scheme_but_colon_and_slashes +label: no scheme in a free link, but a colon and slashes +input: "://example.com" +output: [Text(text="://example.com")] + +--- + +name: fake_scheme_no_slashes +label: a nonexistent scheme in a free link, without slashes +input: 
"fake:example.com" +output: [Text(text="fake:example.com")] + +--- + +name: fake_scheme_slashes +label: a nonexistent scheme in a free link, with slashes +input: "fake://example.com" +output: [Text(text="fake://example.com")] + +--- + +name: fake_scheme_brackets_no_slashes +label: a nonexistent scheme in a bracketed link, without slashes +input: "[fake:example.com]" +output: [Text(text="[fake:example.com]")] + +--- + +name: fake_scheme_brackets_slashes +label: #=a nonexistent scheme in a bracketed link, with slashes +input: "[fake://example.com]" +output: [Text(text="[fake://example.com]")] + +--- + +name: interrupted_scheme +label: an otherwise valid scheme with something in the middle of it, in a free link +input: "ht?tp://example.com" +output: [Text(text="ht?tp://example.com")] + +--- + +name: interrupted_scheme_brackets +label: an otherwise valid scheme with something in the middle of it, in a bracketed link +input: "[ht?tp://example.com]" +output: [Text(text="[ht?tp://example.com]")] + +--- + +name: no_slashes_brackets +label: no slashes after the colon in a bracketed link +input: "[mailto:example@example.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="mailto:example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: space_before_no_slashes_brackets +label: a space before a bracketed link with no slashes after the colon +input: "[ mailto:example@example.com Example]" +output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" Example]")] + +--- + +name: slashes_optional_brackets +label: a bracketed link using a scheme that doesn't need slashes, but has them anyway +input: "[mailto://example@example.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="mailto://example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: short_brackets +label: a very short link in brackets +input: "[mailto://abc Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="mailto://abc"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: slashes_missing_brackets +label: slashes missing from a scheme that requires them in a bracketed link +input: "[http:example@example.com Example]" +output: [Text(text="[http:example@example.com Example]")] + +--- + +name: protcol_relative +label: a protocol-relative link (in brackets) +input: "[//example.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="//example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: scheme_missing_but_colon_brackets +label: scheme missing from a bracketed link, but with a colon +input: "[:example.com Example]" +output: [Text(text="[:example.com Example]")] + +--- + +name: scheme_missing_but_colon_slashes_brackets +label: scheme missing from a bracketed link, but with a colon and slashes +input: "[://example.com Example]" +output: [Text(text="[://example.com Example]")] + +--- + +name: unclosed_protocol_relative +label: an unclosed protocol-relative bracketed link +input: "[//example.com" +output: [Text(text="[//example.com")] + +--- + +name: space_before_protcol_relative +label: a space before a protocol-relative bracketed link +input: "[ //example.com]" +output: [Text(text="[ //example.com]")] + +--- + +name: unclosed_just_scheme +label: an unclosed bracketed link, ending after the scheme +input: "[http" +output: [Text(text="[http")] + +--- + 
+name: unclosed_scheme_colon +label: an unclosed bracketed link, ending after the colon +input: "[http:" +output: [Text(text="[http:")] + +--- + +name: unclosed_scheme_colon_slashes +label: an unclosed bracketed link, ending after the slashes +input: "[http://" +output: [Text(text="[http://")] + +--- + +name: incomplete_scheme_colon +label: a free link with just a scheme and a colon +input: "http:" +output: [Text(text="http:")] + +--- + +name: incomplete_scheme_colon_slashes +label: a free link with just a scheme, colon, and slashes +input: "http://" +output: [Text(text="http://")] + +--- + +name: brackets_scheme_but_no_url +label: brackets around a scheme, colon, and slashes +input: "[http://]" +output: [Text(text="[http://]")] + +--- + +name: brackets_scheme_title_but_no_url +label: brackets around a scheme, colon, and slashes, with a title +input: "[http:// Example]" +output: [Text(text="[http:// Example]")] From 176290d75a22ee44c1d81e9eb688025d4e1f808e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 02:01:47 -0400 Subject: [PATCH 163/189] Add a couple more tests. --- tests/tokenizer/external_links.mwtest | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index 9e0ebdd..7e1b7a5 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -432,6 +432,13 @@ output: [Text(text="[http://")] --- +name: incomplete_bracket +label: just an open bracket +input: "[" +output: [Text(text="[")] + +--- + name: incomplete_scheme_colon label: a free link with just a scheme and a colon input: "http:" @@ -447,6 +454,13 @@ output: [Text(text="http://")] --- name: brackets_scheme_but_no_url +label: brackets around a scheme and a colon +input: "[mailto:]" +output: [Text(text="[mailto:]")] + +--- + +name: brackets_scheme_slashes_but_no_url label: brackets around a scheme, colon, and slashes input: "[http://]" output: [Text(text="[http://]")] From 44ee185377df12f525c91c6712d6c7cf43c86936 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 02:21:06 -0400 Subject: [PATCH 164/189] Fix some tests involving colons starting lines. --- tests/tokenizer/external_links.mwtest | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index 7e1b7a5..ee221e0 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -287,15 +287,15 @@ output: [Text(text="//example.com")] name: no_scheme_but_colon label: no scheme in a free link, but a colon -input: ":example.com" -output: [Text(text=":example.com")] +input: " :example.com" +output: [Text(text=" :example.com")] --- name: no_scheme_but_colon_and_slashes label: no scheme in a free link, but a colon and slashes -input: "://example.com" -output: [Text(text="://example.com")] +input: " ://example.com" +output: [Text(text=" ://example.com")] --- From 432198547af4077687606abd3a21e5458fea6530 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 02:21:19 -0400 Subject: [PATCH 165/189] Fix some external links; refactor into different methods. 
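Several of the free-link cases above (period_after, colons_after, close_paren_after_excluded, close_paren_after_included) hinge on trailing punctuation: a final ",", ";", ".", ":", "!" or "?" is left outside the link, and a trailing ")" is excluded only when the URL contains no "(". A rough standalone sketch of that trimming rule (illustrative only; the real tokenizer applies it incrementally in _handle_free_link_text, added in a later patch below):

    def split_trailing_punct(url):
        """Split a free link into (link, tail) per the rules the tests expect."""
        punct = ",;.:!?"
        if "(" not in url:
            punct += ")"   # ")" only counts as punctuation without a matching "("
        i = len(url)
        while i > 0 and url[i - 1] in punct:
            i -= 1
        return url[:i], url[i:]

    # Matches the expectations in the cases above:
    assert split_trailing_punct("http://example.com.") == ("http://example.com", ".")
    assert split_trailing_punct("http://example.com/foo:bar:::baz:::") == \
        ("http://example.com/foo:bar:::baz", ":::")
    assert split_trailing_punct("http://example.(com)") == ("http://example.(com)", "")
    assert split_trailing_punct("http://example.)com)") == ("http://example.)com", ")")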
--- mwparserfromhell/parser/tokenizer.py | 121 ++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 53 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9acf32d..29bec56 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -312,65 +312,67 @@ class Tokenizer(object): self._head += 1 return self._pop() - def _really_parse_external_link(self, brackets): - """Really parse an external link.""" - scheme_valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" - if brackets: - self._push(contexts.EXT_LINK_URI) - if self._read() == self._read(1) == "/": - self._emit_text("//") - self._head += 2 - else: - scheme = "" - while all(char in scheme_valid for char in self._read()): - scheme += self._read() - self._emit_text(self._read()) - self._head += 1 - if self._read() != ":": - self._fail_route() - self._emit_text(":") - self._head += 1 - slashes = self._read() == self._read(1) == "/" - if slashes: - self._emit_text("//") - self._head += 2 - if not is_scheme(scheme, slashes): - self._fail_route() + def _parse_bracketed_uri_scheme(self): + """Parse the URI scheme of a bracket-enclosed external link.""" + self._push(contexts.EXT_LINK_URI) + if self._read() == self._read(1) == "/": + self._emit_text("//") + self._head += 2 else: - scheme = [] - try: - # Ugly, but we have to backtrack through the textbuffer looking - # for our scheme since it was just parsed as text: - for i in range(-1, -len(self._textbuffer) - 1, -1): - for char in reversed(self._textbuffer[i]): - if char.isspace() or char in self.MARKERS: - raise StopIteration() - if char not in scheme_valid: - raise BadRoute() - scheme.append(char) - except StopIteration: - pass - scheme = "".join(reversed(scheme)) - slashes = self._read() == self._read(1) == "/" - if not is_scheme(scheme, slashes): - raise BadRoute() - # Remove the scheme from the textbuffer, now that it's part of the - # external link: - length = len(scheme) - while length: - if length < len(self._textbuffer[-1]): - self._textbuffer[-1] = self._textbuffer[-1][:-length] - break - length -= len(self._textbuffer[-1]) - self._textbuffer.pop() - self._push(contexts.EXT_LINK_URI) - self._emit_text(scheme) + valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + all_valid = lambda: all(char in valid for char in self._read()) + scheme = "" + while self._read() is not self.END and all_valid(): + scheme += self._read() + self._emit_text(self._read()) + self._head += 1 + if self._read() != ":": + self._fail_route() self._emit_text(":") + self._head += 1 + slashes = self._read() == self._read(1) == "/" if slashes: self._emit_text("//") self._head += 2 - parentheses = False + if not is_scheme(scheme, slashes): + self._fail_route() + + def _parse_free_uri_scheme(self): + """Parse the URI scheme of a free (no brackets) external link.""" + valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + scheme = [] + try: + # Ugly, but we have to backtrack through the textbuffer looking for + # our scheme since it was just parsed as text: + for i in range(-1, -len(self._textbuffer) - 1, -1): + for char in reversed(self._textbuffer[i]): + if char.isspace() or char in self.MARKERS: + raise StopIteration() + if char not in valid: + raise BadRoute() + scheme.append(char) + except StopIteration: + pass + scheme = "".join(reversed(scheme)) + slashes = self._read() == self._read(1) == "/" + if not is_scheme(scheme, slashes): + raise BadRoute() + parentheses = False + self._push(contexts.EXT_LINK_URI) + 
self._emit_text(scheme) + self._emit_text(":") + if slashes: + self._emit_text("//") + self._head += 2 + def _really_parse_external_link(self, brackets): + """Really parse an external link.""" + if brackets: + self._parse_bracketed_uri_scheme() + else: + self._parse_free_uri_scheme() + if self._read() in self.MARKERS or self._read()[0].isspace(): ## Should actually check for valid chars + self._fail_route() while True: this, next = self._read(), self._read(1) if this is self.END or this == "\n": @@ -404,6 +406,16 @@ class Tokenizer(object): self._emit_text(this) self._head += 1 + def _remove_uri_scheme_from_textbuffer(self, scheme): + """Remove the URI scheme of a new external link from the textbuffer.""" + length = len(scheme) + while length: + if length < len(self._textbuffer[-1]): + self._textbuffer[-1] = self._textbuffer[-1][:-length] + break + length -= len(self._textbuffer[-1]) + self._textbuffer.pop() + def _parse_external_link(self, brackets): """Parse an external link at the head of the wikicode string.""" reset = self._head @@ -420,6 +432,9 @@ class Tokenizer(object): else: self._emit_text(self._read()) else: + if not brackets: + scheme = link[0].text.split(":", 1)[0] + self._remove_uri_scheme_from_textbuffer(scheme) self._emit(tokens.ExternalLinkOpen(brackets=brackets)) self._emit_all(link) self._emit(tokens.ExternalLinkClose()) From be505465c06595ff9c7592aeb729d2b69ee952a7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 02:34:55 -0400 Subject: [PATCH 166/189] Alter a test that imitates strange (incorrect?) MediaWiki behavior. --- tests/tokenizer/external_links.mwtest | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index ee221e0..b517ada 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -146,9 +146,9 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), Ext --- name: brackets_open_bracket_inside -label: an open bracket inside a bracket-enclosed link that causes it to switch to the title context abruptly +label: an open bracket inside a bracket-enclosed link that is also included input: "[http://foobar[baz.com Example]" -output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(), Text(text="[baz.com Example"), ExternalLinkClose()] +output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] --- From bd10aab823562f349f433ef80525aee134c5e317 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 05:05:30 -0400 Subject: [PATCH 167/189] Finish external links. 
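The free-link case in _parse_free_uri_scheme above has to recover the scheme backwards out of text that was already emitted to the textbuffer. A condensed sketch of that backtrack over a list of text chunks (names illustrative; the real code also checks MARKERS and then applies the is_scheme slashes rule):

    VALID = "abcdefghijklmnopqrstuvwxyz0123456789+.-"

    def scheme_from_textbuffer(textbuffer):
        """Walk the buffered text backwards and collect a candidate scheme."""
        chars = []
        for chunk in reversed(textbuffer):
            for char in reversed(chunk):
                if char.isspace():
                    # Hit the start of the word: whatever we collected is the scheme.
                    return "".join(reversed(chars))
                if char not in VALID:
                    return None   # the real tokenizer raises BadRoute here
                chars.append(char)
        return "".join(reversed(chars))

    assert scheme_from_textbuffer(["foo ", "htt", "p"]) == "http"
    assert scheme_from_textbuffer(["x=htt", "p"]) is None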
--- mwparserfromhell/parser/tokenizer.py | 64 +++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 29bec56..e9768fa 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -357,7 +357,6 @@ class Tokenizer(object): slashes = self._read() == self._read(1) == "/" if not is_scheme(scheme, slashes): raise BadRoute() - parentheses = False self._push(contexts.EXT_LINK_URI) self._emit_text(scheme) self._emit_text(":") @@ -365,43 +364,75 @@ class Tokenizer(object): self._emit_text("//") self._head += 2 + def _handle_free_link_text(self, punct, tail, this): + """Handle text in a free ext link, including trailing punctuation.""" + if "(" in this and ")" in punct: + punct = punct[:-1] # ')' is not longer valid punctuation + if this.endswith(punct): + for i in range(-1, -len(this) - 1, -1): + if i == -len(this) or this[i - 1] not in punct: + break + stripped = this[:i] + if stripped and tail: + self._emit_text(tail) + tail = "" + tail += this[i:] + this = stripped + elif tail: + self._emit_text(tail) + tail = "" + self._emit_text(this) + return punct, tail + def _really_parse_external_link(self, brackets): """Really parse an external link.""" if brackets: self._parse_bracketed_uri_scheme() + invalid = ("\n", " ", "]") else: self._parse_free_uri_scheme() - if self._read() in self.MARKERS or self._read()[0].isspace(): ## Should actually check for valid chars + invalid = ("\n", " ", "[", "]") + punct = tuple(",;\.:!?)") + if self._read() is self.END or self._read()[0] in invalid: self._fail_route() + tail = "" while True: this, next = self._read(), self._read(1) if this is self.END or this == "\n": if brackets: self._fail_route() - self._head -= 1 - return self._pop(), None + return self._pop(), tail, -1 elif this == next == "{" and self._can_recurse(): + if not brackets and tail: + self._emit_text(tail) + tail = "" self._parse_template_or_argument() + elif this == "[": + if brackets: + self._emit_text("[") + else: + return self._pop(), tail, -1 + elif this == "]": + return self._pop(), tail, 0 if brackets else -1 elif this == "&": + if not brackets and tail: + self._emit_text(tail) + tail = "" self._parse_entity() - elif this == "]": - if not brackets: - self._head -= 1 - return self._pop(), None - elif this == "(" and not brackets and not parentheses: - parentheses = True - self._emit_text(this) - elif " " in this: ## Should be a more general whitespace check + elif " " in this: before, after = this.split(" ", 1) - self._emit_text(before) if brackets: + self._emit_text(before) self._emit(tokens.ExternalLinkSeparator()) self._emit_text(after) self._context ^= contexts.EXT_LINK_URI self._context |= contexts.EXT_LINK_TITLE self._head += 1 - return self._parse(push=False), None - return self._pop(), " " + after + return self._parse(push=False), None, 0 + punct, tail = self._handle_free_link_text(punct, tail, before) + return self._pop(), tail + " " + after, 0 + elif not brackets: + punct, tail = self._handle_free_link_text(punct, tail, this) else: self._emit_text(this) self._head += 1 @@ -424,7 +455,7 @@ class Tokenizer(object): bad_context = self._context & contexts.INVALID_LINK if bad_context or not self._can_recurse(): raise BadRoute() - link, extra = self._really_parse_external_link(brackets) + link, extra, delta = self._really_parse_external_link(brackets) except BadRoute: self._head = reset if not brackets and self._context & 
contexts.DL_TERM: @@ -438,6 +469,7 @@ class Tokenizer(object): self._emit(tokens.ExternalLinkOpen(brackets=brackets)) self._emit_all(link) self._emit(tokens.ExternalLinkClose()) + self._head += delta if extra: self._emit_text(extra) From 6eb72c56eb3219009f4f06b7a63f0b31d971dd9e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 14:34:45 -0400 Subject: [PATCH 168/189] Some cleanup. --- mwparserfromhell/parser/tokenizer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e9768fa..2340077 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -342,10 +342,10 @@ class Tokenizer(object): valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" scheme = [] try: - # Ugly, but we have to backtrack through the textbuffer looking for - # our scheme since it was just parsed as text: - for i in range(-1, -len(self._textbuffer) - 1, -1): - for char in reversed(self._textbuffer[i]): + # We have to backtrack through the textbuffer looking for our + # scheme since it was just parsed as text: + for chunk in reversed(self._textbuffer): + for char in reversed(chunk): if char.isspace() or char in self.MARKERS: raise StopIteration() if char not in valid: @@ -369,7 +369,7 @@ class Tokenizer(object): if "(" in this and ")" in punct: punct = punct[:-1] # ')' is not longer valid punctuation if this.endswith(punct): - for i in range(-1, -len(this) - 1, -1): + for i in reversed(range(-len(this), 0)): if i == -len(this) or this[i - 1] not in punct: break stripped = this[:i] @@ -403,7 +403,7 @@ class Tokenizer(object): self._fail_route() return self._pop(), tail, -1 elif this == next == "{" and self._can_recurse(): - if not brackets and tail: + if tail: self._emit_text(tail) tail = "" self._parse_template_or_argument() @@ -415,7 +415,7 @@ class Tokenizer(object): elif this == "]": return self._pop(), tail, 0 if brackets else -1 elif this == "&": - if not brackets and tail: + if tail: self._emit_text(tail) tail = "" self._parse_entity() From 7b84b3f0df31e634bc9390dae2f3539c3dc04d3c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 15:01:59 -0400 Subject: [PATCH 169/189] Refactor out C's is_marker(); hooks for ext links. --- mwparserfromhell/parser/tokenizer.c | 77 ++++++++++++++++++------------------- mwparserfromhell/parser/tokenizer.h | 3 +- 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 46df405..8a2d9f9 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -24,6 +24,20 @@ SOFTWARE. #include "tokenizer.h" /* + Determine whether the given Py_UNICODE is a marker. +*/ +static int is_marker(Py_UNICODE this) +{ + int i; + + for (i = 0; i < NUM_MARKERS; i++) { + if (*MARKERS[i] == this) + return 1; + } + return 0; +} + +/* Given a context, return the heading level encoded within it. */ static int heading_level_from_context(int n) @@ -37,13 +51,13 @@ static int heading_level_from_context(int n) } /* - Call the given function in definitions.py, using 'tag' as a parameter, and - return its output as a bool. + Call the given function in definitions.py, using 'input' as a parameter, + and return its output as a bool. 
*/ -static int call_def_func(const char* funcname, PyObject* tag) +static int call_def_func(const char* funcname, PyObject* input) { PyObject* func = PyObject_GetAttrString(definitions, funcname); - PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL); + PyObject* result = PyObject_CallFunctionObjArgs(func, input, NULL); int ans = (result == Py_True) ? 1 : 0; Py_DECREF(func); @@ -1238,15 +1252,8 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text) { Py_UNICODE next = Tokenizer_READ(self, 1); - int i, is_marker = 0; - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == text) { - is_marker = 1; - break; - } - } - if (!is_marker || !Tokenizer_CAN_RECURSE(self)) + if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self)) return Tokenizer_emit_char(self, text); else if (text == next && next == *"{") return Tokenizer_parse_template_or_argument(self); @@ -1264,17 +1271,11 @@ static int Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk) { PyObject *trash; - int first_time, i, is_marker = 0, escaped; + int first_time, escaped; if (data->context & TAG_NAME) { first_time = !(data->context & TAG_NOTE_SPACE); - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == chunk) { - is_marker = 1; - break; - } - } - if (is_marker || (Py_UNICODE_ISSPACE(chunk) && first_time)) { + if (is_marker(chunk) || (Py_UNICODE_ISSPACE(chunk) && first_time)) { // Tags must start with text, not spaces Tokenizer_fail_route(self); return 0; @@ -1623,7 +1624,6 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) Textbuffer* buf; PyObject *name, *tag; Py_UNICODE this; - int is_marker, i; self->head += 2; buf = Textbuffer_new(); @@ -1631,14 +1631,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) return -1; while (1) { this = Tokenizer_READ(self, pos); - is_marker = 0; - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == this) { - is_marker = 1; - break; - } - } - if (is_marker) { + if (is_marker(this)) { name = Textbuffer_render(buf); if (!name) { Textbuffer_dealloc(buf); @@ -2047,9 +2040,8 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) */ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) { - if (context & LC_FAIL_NEXT) { + if (context & LC_FAIL_NEXT) return -1; - } if (context & LC_WIKILINK) { if (context & LC_WIKILINK_TEXT) return (data == *"[" && Tokenizer_READ(self, 1) == *"[") ? -1 : 0; @@ -2059,6 +2051,8 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) return -1; return 0; } + if (context & LC_EXT_LINK_TITLE) + return (data == *"\n") ? -1 : 0; if (context & LC_TAG_CLOSE) return (data == *"<") ? 
-1 : 0; if (context & LC_TEMPLATE_NAME) { @@ -2125,7 +2119,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) */ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) { - int this_context, is_marker, i; + int this_context; Py_UNICODE this, next, next_next, last; PyObject* temp; @@ -2145,14 +2139,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) return Tokenizer_fail_route(self); } } - is_marker = 0; - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == this) { - is_marker = 1; - break; - } - } - if (!is_marker) { + if (!is_marker(this)) { if (Tokenizer_emit_char(self, this)) return NULL; self->head++; @@ -2205,6 +2192,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) } else if (this == next && next == *"]" && this_context & LC_WIKILINK) return Tokenizer_handle_wikilink_end(self); + // else if (this == *"[") { + // if (Tokenizer_parse_external_link(self, 1)) + // return NULL; + // } + // else if (this == *":" && !is_marker(last)) { + // if (Tokenizer_parse_external_link(self, 0)) + // return NULL; + // } + // else if (this == *"]" && this_context & LC_EXT_LINK_TITLE) + // return Tokenizer_pop(self); else if (this == *"=" && !(self->global & GL_HEADING)) { if (last == *"\n" || last == *"") { if (Tokenizer_parse_heading(self)) diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 5961dcc..e125068 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -238,12 +238,13 @@ typedef struct { #define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1) -/* Macros for accessing HTML tag definitions: */ +/* Macros for accessing definitions: */ #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li") #define IS_PARSABLE(tag) (call_def_func("is_parsable", tag)) #define IS_SINGLE(tag) (call_def_func("is_single", tag)) #define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag)) +#define IS_SCHEME(scheme) (call_def_func("is_scheme", scheme)) /* Function prototypes: */ From a1948b06aaa05dd6585c4af9c254dfb966165e81 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 20:03:34 -0400 Subject: [PATCH 170/189] Tokenizer_parse_bracketed/free_uri_scheme(), other adjustments --- mwparserfromhell/parser/tokenizer.c | 315 ++++++++++++++++++++++++++++++++++-- mwparserfromhell/parser/tokenizer.h | 12 +- 2 files changed, 304 insertions(+), 23 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 8a2d9f9..80f4c56 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -51,13 +51,13 @@ static int heading_level_from_context(int n) } /* - Call the given function in definitions.py, using 'input' as a parameter, - and return its output as a bool. + Call the given function in definitions.py, using 'in1' and 'in2' as + parameters, and return its output as a bool. */ -static int call_def_func(const char* funcname, PyObject* input) +static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2) { PyObject* func = PyObject_GetAttrString(definitions, funcname); - PyObject* result = PyObject_CallFunctionObjArgs(func, input, NULL); + PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL); int ans = (result == Py_True) ? 
1 : 0; Py_DECREF(func); @@ -129,8 +129,7 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) new->next = self; *this = self = new; } - self->data[self->size] = code; - self->size++; + self->data[self->size++] = code; return 0; } @@ -822,6 +821,288 @@ static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) } /* + Parse the URI scheme of a bracket-enclosed external link. +*/ +static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer* buffer; + PyObject* scheme; + Py_UNICODE this; + int slashes, i; + + if (Tokenizer_push(self, LC_EXT_LINK_URI)) + return -1; + if (Tokenizer_READ(self, 0) == *"/" && Tokenizer_READ(self, 1) == *"/") { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } + else { + buffer = Textbuffer_new(); + if (!buffer) + return -1; + while ((this = Tokenizer_READ(self, 0)) != *"") { + i = 0; + while (1) { + if (!valid[i]) + goto end_of_loop; + if (this == valid[i]) + break; + i++; + } + Textbuffer_write(&buffer, this); + if (Tokenizer_emit_char(self, this)) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + } + end_of_loop: + if (this != *":") { + Textbuffer_dealloc(buffer); + Tokenizer_fail_route(self); + return 0; + } + if (Tokenizer_emit_char(self, *":")) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + slashes = (Tokenizer_READ(self, 0) == *"/" && + Tokenizer_READ(self, 1) == *"/"); + if (slashes) { + if (Tokenizer_emit_text(self, "//")) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head += 2; + } + scheme = Textbuffer_render(buffer); + Textbuffer_dealloc(buffer); + if (!scheme) + return -1; + if (!IS_SCHEME(scheme, slashes ? Py_True : Py_False)) { + Py_DECREF(scheme); + Tokenizer_fail_route(self); + return 0; + } + Py_DECREF(scheme); + } + return 0; +} + +/* + Parse the URI scheme of a free (no brackets) external link. +*/ +static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; + PyObject *temp, *reversed, *scheme; + Py_UNICODE chunk; + int slashes, i, j; + + if (!scheme_buffer) + return -1; + // We have to backtrack through the textbuffer looking for our scheme since + // it was just parsed as text: + temp_buffer = self->topstack->textbuffer; + while (temp_buffer) { + for (i = temp_buffer->size - 1; i >= 0; i++) { + chunk = temp_buffer->data[i]; + if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + goto end_of_loop; + j = 0; + while (1) { + if (!valid[j]) { + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + if (chunk == valid[j]) + break; + j++; + } + Textbuffer_write(&scheme_buffer, chunk); + } + temp_buffer = temp_buffer->next; + } + end_of_loop: + temp = Textbuffer_render(scheme_buffer); + if (!temp) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + reversed = PyDict_GetItemString(PyEval_GetBuiltins(), "reversed"); + scheme = PyObject_CallFunctionObjArgs(reversed, temp, NULL); + Py_DECREF(temp); + if (!scheme) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + slashes = (Tokenizer_READ(self, 0) == *"/" && + Tokenizer_READ(self, 1) == *"/"); + if (!IS_SCHEME(scheme, slashes ? 
Py_True : Py_False)) { + Py_DECREF(scheme); + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + Py_DECREF(scheme); + if (Tokenizer_push(self, LC_EXT_LINK_URI)) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + while (temp_buffer) { + for (i = 0; i < temp_buffer->size; i++) { + if (Tokenizer_emit_char(self, temp_buffer->data[i])) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + } + temp_buffer = temp_buffer->next; + } + Textbuffer_dealloc(scheme_buffer); + if (Tokenizer_emit_char(self, *":")) + return -1; + if (slashes) { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } +} + +/* + Handle text in a free external link, including trailing punctuation. +*/ +static int +Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct, + Textbuffer** tail, Py_UNICODE this) +{ + // if "(" in this and ")" in punct: + // punct = punct[:-1] # ')' is not longer valid punctuation + // if this.endswith(punct): + // for i in reversed(range(-len(this), 0)): + // if i == -len(this) or this[i - 1] not in punct: + // break + // stripped = this[:i] + // if stripped and tail: + // self._emit_text(tail) + // tail = "" + // tail += this[i:] + // this = stripped + // elif tail: + // self._emit_text(tail) + // tail = "" + // self._emit_text(this) + // return punct, tail +} + +/* + Really parse an external link. +*/ +static PyObject* +Tokenizer_really_parse_external_link(Tokenizer* self, int brackets) +{ + // if brackets: + // self._parse_bracketed_uri_scheme() + // invalid = ("\n", " ", "]") + // else: + // self._parse_free_uri_scheme() + // invalid = ("\n", " ", "[", "]") + // punct = tuple(",;\.:!?)") + // if self._read() is self.END or self._read()[0] in invalid: + // self._fail_route() + // tail = "" + // while True: + // this, next = self._read(), self._read(1) + // if this is self.END or this == "\n": + // if brackets: + // self._fail_route() + // return self._pop(), tail, -1 + // elif this == next == "{" and self._can_recurse(): + // if tail: + // self._emit_text(tail) + // tail = "" + // self._parse_template_or_argument() + // elif this == "[": + // if brackets: + // self._emit_text("[") + // else: + // return self._pop(), tail, -1 + // elif this == "]": + // return self._pop(), tail, 0 if brackets else -1 + // elif this == "&": + // if tail: + // self._emit_text(tail) + // tail = "" + // self._parse_entity() + // elif " " in this: + // before, after = this.split(" ", 1) + // if brackets: + // self._emit_text(before) + // self._emit(tokens.ExternalLinkSeparator()) + // self._emit_text(after) + // self._context ^= contexts.EXT_LINK_URI + // self._context |= contexts.EXT_LINK_TITLE + // self._head += 1 + // return self._parse(push=False), None, 0 + // punct, tail = self._handle_free_link_text(punct, tail, before) + // return self._pop(), tail + " " + after, 0 + // elif not brackets: + // punct, tail = self._handle_free_link_text(punct, tail, this) + // else: + // self._emit_text(this) + // self._head += 1 +} + +/* + Remove the URI scheme of a new external link from the textbuffer. +*/ +static int +Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* scheme) +{ + // length = len(scheme) + // while length: + // if length < len(self._textbuffer[-1]): + // self._textbuffer[-1] = self._textbuffer[-1][:-length] + // break + // length -= len(self._textbuffer[-1]) + // self._textbuffer.pop() +} + +/* + Parse an external link at the head of the wikicode string. 
+*/ +static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) +{ + // reset = self._head + // self._head += 1 + // try: + // bad_context = self._context & contexts.INVALID_LINK + // if bad_context or not self._can_recurse(): + // raise BadRoute() + // link, extra, delta = self._really_parse_external_link(brackets) + // except BadRoute: + // self._head = reset + // if not brackets and self._context & contexts.DL_TERM: + // self._handle_dl_term() + // else: + // self._emit_text(self._read()) + // else: + // if not brackets: + // scheme = link[0].text.split(":", 1)[0] + // self._remove_uri_scheme_from_textbuffer(scheme) + // self._emit(tokens.ExternalLinkOpen(brackets=brackets)) + // self._emit_all(link) + // self._emit(tokens.ExternalLinkClose()) + // self._head += delta + // if extra: + // self._emit_text(extra) +} + +/* Parse a section heading at the head of the wikicode string. */ static int Tokenizer_parse_heading(Tokenizer* self) @@ -1978,9 +2259,9 @@ static int Tokenizer_handle_hr(Tokenizer* self) self->head++; } markup = Textbuffer_render(buffer); + Textbuffer_dealloc(buffer); if (!markup) return -1; - Textbuffer_dealloc(buffer); kwargs = PyDict_New(); if (!kwargs) return -1; @@ -2192,16 +2473,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) } else if (this == next && next == *"]" && this_context & LC_WIKILINK) return Tokenizer_handle_wikilink_end(self); - // else if (this == *"[") { - // if (Tokenizer_parse_external_link(self, 1)) - // return NULL; - // } - // else if (this == *":" && !is_marker(last)) { - // if (Tokenizer_parse_external_link(self, 0)) - // return NULL; - // } - // else if (this == *"]" && this_context & LC_EXT_LINK_TITLE) - // return Tokenizer_pop(self); + else if (this == *"[") { + if (Tokenizer_parse_external_link(self, 1)) + return NULL; + } + else if (this == *":" && !is_marker(last)) { + if (Tokenizer_parse_external_link(self, 0)) + return NULL; + } + else if (this == *"]" && this_context & LC_EXT_LINK_TITLE) + return Tokenizer_pop(self); else if (this == *"=" && !(self->global & GL_HEADING)) { if (last == *"\n" || last == *"") { if (Tokenizer_parse_heading(self)) diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index e125068..a49e896 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -122,7 +122,7 @@ static PyObject* TagCloseClose; #define LC_WIKILINK_TEXT 0x00000040 #define LC_EXT_LINK 0x00000380 -#define LC_EXT_LINK_URL 0x00000080 +#define LC_EXT_LINK_URI 0x00000080 #define LC_EXT_LINK_TITLE 0x00000100 #define LC_EXT_LINK_BRACKETS 0x00000200 @@ -165,7 +165,7 @@ static PyObject* TagCloseClose; #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) -#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URL) +#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URI) /* Tag contexts: */ @@ -241,10 +241,10 @@ typedef struct { /* Macros for accessing definitions: */ #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? 
"dt" : "li") -#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag)) -#define IS_SINGLE(tag) (call_def_func("is_single", tag)) -#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag)) -#define IS_SCHEME(scheme) (call_def_func("is_scheme", scheme)) +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL)) +#define IS_SCHEME(scheme, slashes) (call_def_func("is_scheme", scheme, slashes)) /* Function prototypes: */ From 6ecf15cad4c2d0f271e0de67d54869cb35f2d3b2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 20:41:45 -0400 Subject: [PATCH 171/189] Tokenizer_parse_external_link() --- mwparserfromhell/parser/tokenizer.c | 93 ++++++++++++++++++++++++------------- mwparserfromhell/parser/tokenizer.h | 1 + 2 files changed, 62 insertions(+), 32 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 80f4c56..1681398 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -358,7 +358,7 @@ static void* Tokenizer_fail_route(Tokenizer* self) } /* - Write a token to the end of the current token stack. + Write a token to the current token stack. */ static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) { @@ -379,7 +379,8 @@ static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) } /* - Write a token to the end of the current token stack. + Write a token to the current token stack, with kwargs. Steals a reference + to kwargs. */ static int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token, PyObject* kwargs, int first) @@ -997,13 +998,15 @@ Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct, // tail = "" // self._emit_text(this) // return punct, tail + return 0; } /* Really parse an external link. 
*/ static PyObject* -Tokenizer_really_parse_external_link(Tokenizer* self, int brackets) +Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, + char** extra) { // if brackets: // self._parse_bracketed_uri_scheme() @@ -1020,7 +1023,8 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets) // if this is self.END or this == "\n": // if brackets: // self._fail_route() - // return self._pop(), tail, -1 + // self.head -= 1 + // return self._pop(), tail // elif this == next == "{" and self._can_recurse(): // if tail: // self._emit_text(tail) @@ -1030,9 +1034,12 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets) // if brackets: // self._emit_text("[") // else: - // return self._pop(), tail, -1 + // self._head -= 1 + // return self._pop(), tail // elif this == "]": - // return self._pop(), tail, 0 if brackets else -1 + // if not brackets: + // self._head -= 1 + // return self._pop(), tail // elif this == "&": // if tail: // self._emit_text(tail) @@ -1047,22 +1054,24 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets) // self._context ^= contexts.EXT_LINK_URI // self._context |= contexts.EXT_LINK_TITLE // self._head += 1 - // return self._parse(push=False), None, 0 + // return self._parse(push=False), None // punct, tail = self._handle_free_link_text(punct, tail, before) - // return self._pop(), tail + " " + after, 0 + // return self._pop(), tail + " " + after // elif not brackets: // punct, tail = self._handle_free_link_text(punct, tail, this) // else: // self._emit_text(this) // self._head += 1 + return NULL; } /* Remove the URI scheme of a new external link from the textbuffer. */ static int -Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* scheme) +Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) { + // scheme = link[0].text.split(":", 1)[0] // length = len(scheme) // while length: // if length < len(self._textbuffer[-1]): @@ -1070,6 +1079,7 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* scheme) // break // length -= len(self._textbuffer[-1]) // self._textbuffer.pop() + return 0; } /* @@ -1077,29 +1087,48 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* scheme) */ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) { - // reset = self._head - // self._head += 1 - // try: - // bad_context = self._context & contexts.INVALID_LINK - // if bad_context or not self._can_recurse(): - // raise BadRoute() - // link, extra, delta = self._really_parse_external_link(brackets) - // except BadRoute: - // self._head = reset - // if not brackets and self._context & contexts.DL_TERM: - // self._handle_dl_term() - // else: - // self._emit_text(self._read()) - // else: - // if not brackets: - // scheme = link[0].text.split(":", 1)[0] - // self._remove_uri_scheme_from_textbuffer(scheme) - // self._emit(tokens.ExternalLinkOpen(brackets=brackets)) - // self._emit_all(link) - // self._emit(tokens.ExternalLinkClose()) - // self._head += delta - // if extra: - // self._emit_text(extra) + Py_ssize_t reset = self->head; + PyObject *link, *kwargs; + char* extra; + + self->head++; + if (self->topstack->context & AGG_INVALID_LINK || !(Tokenizer_CAN_RECURSE(self))) { + FAIL_ROUTE(0); + } + else + link = Tokenizer_really_parse_external_link(self, brackets, &extra); + if (BAD_ROUTE) { + self->head = reset; + if (!brackets && self->topstack->context & LC_DLTERM) + return Tokenizer_handle_dl_term(self); + return Tokenizer_emit_char(self, 
Tokenizer_READ(self, 0)); + } + if (!link) + return -1; + if (!brackets) { + if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) + return -1; + } + kwargs = PyDict_New(); + if (!kwargs) { + Py_DECREF(link); + return -1; + } + PyDict_SetItemString(kwargs, "brackets", brackets ? Py_True : Py_False); + if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) { + Py_DECREF(link); + return -1; + } + if (Tokenizer_emit_all(self, link)) { + Py_DECREF(link); + return -1; + } + Py_DECREF(link); + if (Tokenizer_emit(self, ExternalLinkClose)) + return -1; + if (extra) + return Tokenizer_emit_text(self, extra); + return 0; } /* diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index a49e896..0e41a85 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -258,6 +258,7 @@ static void TagData_dealloc(TagData*); static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); static void Tokenizer_dealloc(Tokenizer*); static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); +static int Tokenizer_handle_dl_term(Tokenizer*); static int Tokenizer_parse_tag(Tokenizer*); static PyObject* Tokenizer_parse(Tokenizer*, int, int); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); From 7dcfa3fe929c3344994517fb28e3002d844a834d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 22:15:49 -0400 Subject: [PATCH 172/189] Implement Tokenizer_really_parse_external_link(), some other fixes --- mwparserfromhell/definitions.py | 4 +- mwparserfromhell/parser/tokenizer.c | 228 ++++++++++++++++++++++-------------- mwparserfromhell/parser/tokenizer.h | 10 +- 3 files changed, 146 insertions(+), 96 deletions(-) diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index ef8255e..9449bcb 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -81,8 +81,10 @@ def is_single_only(tag): """Return whether or not the given *tag* must exist without a close tag.""" return tag.lower() in SINGLE_ONLY -def is_scheme(scheme, slashes=True): +def is_scheme(scheme, slashes=True, reverse=False): """Return whether *scheme* is valid for external links.""" + if reverse: # Convenience for C + scheme = scheme[::-1] scheme = scheme.lower() if slashes: return scheme in URI_SCHEMES diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 1681398..6f0c1a6 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -51,13 +51,14 @@ static int heading_level_from_context(int n) } /* - Call the given function in definitions.py, using 'in1' and 'in2' as + Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as parameters, and return its output as a bool. */ -static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2) +static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, + PyObject* in3) { PyObject* func = PyObject_GetAttrString(definitions, funcname); - PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL); + PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL); int ans = (result == Py_True) ? 1 : 0; Py_DECREF(func); @@ -431,6 +432,28 @@ static int Tokenizer_emit_text(Tokenizer* self, const char* text) } /* + Write the contents of another textbuffer to the current textbuffer, + deallocating it in the process. 
+*/ +static int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer) +{ + Textbuffer *original = buffer; + int i; + + while (buffer) { + for (i = 0; i < buffer->size; i++) { + if (Tokenizer_emit_char(self, buffer->data[i])) { + Textbuffer_dealloc(original); + return -1; + } + } + buffer = buffer->next; + } + Textbuffer_dealloc(original); + return 0; +} + +/* Write a series of tokens to the current stack at once. */ static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) @@ -883,7 +906,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) Textbuffer_dealloc(buffer); if (!scheme) return -1; - if (!IS_SCHEME(scheme, slashes ? Py_True : Py_False)) { + if (!IS_SCHEME(scheme, slashes, 0)) { Py_DECREF(scheme); Tokenizer_fail_route(self); return 0; @@ -900,7 +923,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) { static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; - PyObject *temp, *reversed, *scheme; + PyObject *scheme; Py_UNICODE chunk; int slashes, i, j; @@ -930,21 +953,14 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) temp_buffer = temp_buffer->next; } end_of_loop: - temp = Textbuffer_render(scheme_buffer); - if (!temp) { - Textbuffer_dealloc(scheme_buffer); - return -1; - } - reversed = PyDict_GetItemString(PyEval_GetBuiltins(), "reversed"); - scheme = PyObject_CallFunctionObjArgs(reversed, temp, NULL); - Py_DECREF(temp); + scheme = Textbuffer_render(scheme_buffer); if (!scheme) { Textbuffer_dealloc(scheme_buffer); return -1; } slashes = (Tokenizer_READ(self, 0) == *"/" && Tokenizer_READ(self, 1) == *"/"); - if (!IS_SCHEME(scheme, slashes ? Py_True : Py_False)) { + if (!IS_SCHEME(scheme, slashes, 1)) { Py_DECREF(scheme); Textbuffer_dealloc(scheme_buffer); FAIL_ROUTE(0); @@ -955,16 +971,8 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Textbuffer_dealloc(scheme_buffer); return -1; } - while (temp_buffer) { - for (i = 0; i < temp_buffer->size; i++) { - if (Tokenizer_emit_char(self, temp_buffer->data[i])) { - Textbuffer_dealloc(scheme_buffer); - return -1; - } - } - temp_buffer = temp_buffer->next; - } - Textbuffer_dealloc(scheme_buffer); + if (Tokenizer_emit_textbuffer(self, scheme_buffer)) + return -1; if (Tokenizer_emit_char(self, *":")) return -1; if (slashes) { @@ -972,15 +980,25 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) return -1; self->head += 2; } + return 0; } /* Handle text in a free external link, including trailing punctuation. 
*/ static int -Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct, +Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, Textbuffer** tail, Py_UNICODE this) { + #define PUSH_TAIL_BUFFER(tail, error) \ + if ((tail)->size || (tail)->next) { \ + Tokenizer_emit_textbuffer(self, tail); \ + tail = Textbuffer_new(); \ + if (!(tail)) \ + return error; \ + } + + // punct = tuple(",;\.:!?)") // if "(" in this and ")" in punct: // punct = punct[:-1] # ')' is not longer valid punctuation // if this.endswith(punct): @@ -998,7 +1016,7 @@ Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct, // tail = "" // self._emit_text(this) // return punct, tail - return 0; + return Tokenizer_emit_char(self, this); } /* @@ -1006,63 +1024,76 @@ Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct, */ static PyObject* Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, - char** extra) -{ - // if brackets: - // self._parse_bracketed_uri_scheme() - // invalid = ("\n", " ", "]") - // else: - // self._parse_free_uri_scheme() - // invalid = ("\n", " ", "[", "]") - // punct = tuple(",;\.:!?)") - // if self._read() is self.END or self._read()[0] in invalid: - // self._fail_route() - // tail = "" - // while True: - // this, next = self._read(), self._read(1) - // if this is self.END or this == "\n": - // if brackets: - // self._fail_route() - // self.head -= 1 - // return self._pop(), tail - // elif this == next == "{" and self._can_recurse(): - // if tail: - // self._emit_text(tail) - // tail = "" - // self._parse_template_or_argument() - // elif this == "[": - // if brackets: - // self._emit_text("[") - // else: - // self._head -= 1 - // return self._pop(), tail - // elif this == "]": - // if not brackets: - // self._head -= 1 - // return self._pop(), tail - // elif this == "&": - // if tail: - // self._emit_text(tail) - // tail = "" - // self._parse_entity() - // elif " " in this: - // before, after = this.split(" ", 1) - // if brackets: - // self._emit_text(before) - // self._emit(tokens.ExternalLinkSeparator()) - // self._emit_text(after) - // self._context ^= contexts.EXT_LINK_URI - // self._context |= contexts.EXT_LINK_TITLE - // self._head += 1 - // return self._parse(push=False), None - // punct, tail = self._handle_free_link_text(punct, tail, before) - // return self._pop(), tail + " " + after - // elif not brackets: - // punct, tail = self._handle_free_link_text(punct, tail, this) - // else: - // self._emit_text(this) - // self._head += 1 - return NULL; + Textbuffer** extra) +{ + Py_UNICODE this, next; + int parens = 0; + + if (brackets ? 
Tokenizer_parse_bracketed_uri_scheme(self) : + Tokenizer_parse_free_uri_scheme(self)) + return NULL; + if (BAD_ROUTE) + return NULL; + this = Tokenizer_READ(self, 0); + if (this == *"" || this == *"\n" || this == *" " || this == *"]") + return Tokenizer_fail_route(self); + if (!brackets && this == *"[") + return Tokenizer_fail_route(self); + while (1) { + this = Tokenizer_READ(self, 0); + next = Tokenizer_READ(self, 1); + if (this == *"" || this == *"\n") { + if (brackets) + return Tokenizer_fail_route(self); + self->head--; + return Tokenizer_pop(self); + } + if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_template_or_argument(self)) + return NULL; + } + else if (this == *"[") { + if (!brackets) { + self->head--; + return Tokenizer_pop(self); + } + if (Tokenizer_emit_char(self, *"[")) + return NULL; + } + else if (this == *"]") { + if (!brackets) + self->head--; + return Tokenizer_pop(self); + } + else if (this == *"&") { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_entity(self)) + return NULL; + } + else if (this == *" ") { + if (brackets) { + if (Tokenizer_emit(self, ExternalLinkSeparator)) + return NULL; + self->topstack->context ^= LC_EXT_LINK_URI; + self->topstack->context |= LC_EXT_LINK_TITLE; + self->head++; + return Tokenizer_parse(self, 0, 0); + } + if (Textbuffer_write(extra, *" ")) + return NULL; + return Tokenizer_pop(self); + } + else if (!brackets) { + if (Tokenizer_handle_free_link_text(self, &parens, extra, this)) + return NULL; + } + else { + if (Tokenizer_emit_char(self, this)) + return NULL; + } + self->head++; + } } /* @@ -1089,45 +1120,60 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) { Py_ssize_t reset = self->head; PyObject *link, *kwargs; - char* extra; + Textbuffer *extra; self->head++; - if (self->topstack->context & AGG_INVALID_LINK || !(Tokenizer_CAN_RECURSE(self))) { + #define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK + if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { FAIL_ROUTE(0); } - else + else { + extra = Textbuffer_new(); + if (!extra) + return -1; link = Tokenizer_really_parse_external_link(self, brackets, &extra); + } if (BAD_ROUTE) { self->head = reset; + Textbuffer_dealloc(extra); if (!brackets && self->topstack->context & LC_DLTERM) return Tokenizer_handle_dl_term(self); return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)); } - if (!link) + if (!link) { + Textbuffer_dealloc(extra); return -1; + } if (!brackets) { - if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) + if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) { + Textbuffer_dealloc(extra); return -1; + } } kwargs = PyDict_New(); if (!kwargs) { + Textbuffer_dealloc(extra); Py_DECREF(link); return -1; } PyDict_SetItemString(kwargs, "brackets", brackets ? 
Py_True : Py_False); if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) { + Textbuffer_dealloc(extra); Py_DECREF(link); return -1; } if (Tokenizer_emit_all(self, link)) { + Textbuffer_dealloc(extra); Py_DECREF(link); return -1; } Py_DECREF(link); - if (Tokenizer_emit(self, ExternalLinkClose)) + if (Tokenizer_emit(self, ExternalLinkClose)) { + Textbuffer_dealloc(extra); return -1; - if (extra) - return Tokenizer_emit_text(self, extra); + } + if (extra->size || extra->next) + return Tokenizer_emit_textbuffer(self, extra); return 0; } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 0e41a85..e437814 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -241,10 +241,11 @@ typedef struct { /* Macros for accessing definitions: */ #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li") -#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL)) -#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL)) -#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL)) -#define IS_SCHEME(scheme, slashes) (call_def_func("is_scheme", scheme, slashes)) +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL)) +#define IS_SCHEME(scheme, slashes, reverse) \ + (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False)) /* Function prototypes: */ @@ -258,6 +259,7 @@ static void TagData_dealloc(TagData*); static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); static void Tokenizer_dealloc(Tokenizer*); static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); +static int Tokenizer_parse_entity(Tokenizer*); static int Tokenizer_handle_dl_term(Tokenizer*); static int Tokenizer_parse_tag(Tokenizer*); static PyObject* Tokenizer_parse(Tokenizer*, int, int); From c1b502bbe6405f408d4d98bc85154fd338443ce8 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 01:24:35 -0400 Subject: [PATCH 173/189] Finish external links implementation. 
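This ports _handle_free_link_text() and _remove_uri_scheme_from_textbuffer() to C, including the parenthesis rule: a trailing ")" only counts as splittable punctuation if no "(" was seen earlier in the URL. A toy model of that rule (illustrative Python; the helper name is invented):

    def trailing_paren_is_punct(url_so_far):
        # A ")" balances a "(" already inside the URL, so keep it in the link.
        return "(" not in url_so_far

    # "http://example.com/foo_(bar)"  -> ")" stays inside the link
    # "(see http://example.com/foo)"  -> ")" is split off after the link

Also fixes strip_tag_name() to actually lower-case the name; it was calling rstrip() a second time instead of lower().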
--- mwparserfromhell/parser/tokenizer.c | 61 +++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 6f0c1a6..6310523 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -80,7 +80,7 @@ static PyObject* strip_tag_name(PyObject* token) Py_DECREF(text); if (!rstripped) return NULL; - lowered = PyObject_CallMethod(rstripped, "rstrip", NULL); + lowered = PyObject_CallMethod(rstripped, "lower", NULL); Py_DECREF(rstripped); return lowered; } @@ -998,24 +998,14 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, return error; \ } - // punct = tuple(",;\.:!?)") - // if "(" in this and ")" in punct: - // punct = punct[:-1] # ')' is not longer valid punctuation - // if this.endswith(punct): - // for i in reversed(range(-len(this), 0)): - // if i == -len(this) or this[i - 1] not in punct: - // break - // stripped = this[:i] - // if stripped and tail: - // self._emit_text(tail) - // tail = "" - // tail += this[i:] - // this = stripped - // elif tail: - // self._emit_text(tail) - // tail = "" - // self._emit_text(this) - // return punct, tail + if (this == *"(" && !(*parens)) + *parens = 1; + else if (this == *"," || this == *";" || this == *"\\" || this == *"." || + this == *":" || this == *"!" || this == *"?" || + (!(*parens) && this == *")")) + return Textbuffer_write(tail, this); + else + PUSH_TAIL_BUFFER(*tail, -1) return Tokenizer_emit_char(self, this); } @@ -1102,14 +1092,31 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, static int Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) { - // scheme = link[0].text.split(":", 1)[0] - // length = len(scheme) - // while length: - // if length < len(self._textbuffer[-1]): - // self._textbuffer[-1] = self._textbuffer[-1][:-length] - // break - // length -= len(self._textbuffer[-1]) - // self._textbuffer.pop() + PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"), + *split, *scheme; + Py_ssize_t length; + Textbuffer* temp; + + if (!text) + return -1; + split = PyObject_CallMethod(text, "split", "si", ":", 1); + Py_DECREF(text); + if (!split) + return -1; + scheme = PyList_GET_ITEM(split, 0); + length = PyUnicode_GET_SIZE(scheme); + while (length) { + temp = self->topstack->textbuffer; + if (length <= temp->size) { + temp->size -= length; + break; + } + length -= temp->size; + self->topstack->textbuffer = temp->next; + free(temp->data); + free(temp); + } + Py_DECREF(split); return 0; } From b9324eb658eda01a874d18cf193b6647ba3d0e5e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 02:29:00 -0400 Subject: [PATCH 174/189] Fix Python tokenizer to not generate empty Text tokens. 
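A bracketed link with nothing after the space, like "[http://example.com/ ]", was emitting Text(text="") between the separator and the close token. With the guard in _really_parse_external_link(), the expected stream looks roughly like this (illustrative, using the token classes from mwparserfromhell.parser.tokens):

    from mwparserfromhell.parser import tokens
    expected = [tokens.ExternalLinkOpen(brackets=True),
                tokens.Text(text="http://example.com/"),
                tokens.ExternalLinkSeparator(),   # no empty Text token here
                tokens.ExternalLinkClose()]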
--- mwparserfromhell/parser/tokenizer.py | 3 ++- tests/tokenizer/external_links.mwtest | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2340077..2c8d6d7 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -424,7 +424,8 @@ class Tokenizer(object): if brackets: self._emit_text(before) self._emit(tokens.ExternalLinkSeparator()) - self._emit_text(after) + if after: + self._emit_text(after) self._context ^= contexts.EXT_LINK_URI self._context |= contexts.EXT_LINK_TITLE self._head += 1 diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index b517ada..6666c05 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -15,7 +15,7 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), Exte name: brackets_space label: basic external link in brackets, with a space after input: "[http://example.com/ ]" -output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text=""), ExternalLinkClose()] +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), ExternalLinkClose()] --- From 072b956a073e15ec164edd971e156cd256a37d8a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 02:29:51 -0400 Subject: [PATCH 175/189] Make a couple tests harder. --- tests/tokenizer/external_links.mwtest | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index 6666c05..af7a570 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -91,8 +91,8 @@ output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), Exte name: colons_after label: colons after a free link that are excluded -input: "http://example.com/foo:bar:::baz:::" -output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo:bar:::baz"), ExternalLinkClose(), Text(text=":::")] +input: "http://example.com/foo:bar.:;baz!?," +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo:bar.:;baz"), ExternalLinkClose(), Text(text="!?,")] --- @@ -126,8 +126,8 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com."), Exte name: brackets_colons_after label: colons after a bracket-enclosed link that are included -input: "[http://example.com/foo:bar:::baz::: Example]" -output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo:bar:::baz:::"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] +input: "[http://example.com/foo:bar.:;baz!?, Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo:bar.:;baz!?,"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] --- From 2561cf5b5e94ee7df7878b879bcf2354e074b255 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 02:30:27 -0400 Subject: [PATCH 176/189] Fix all bugs in C implementation of external links. 
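Most of these bugs were in the free-link scheme backtrack: the scheme's characters are collected while walking the textbuffer backwards, so they accumulate back-to-front and have to be re-emitted in reverse (hence the new reverse flag on Tokenizer_emit_textbuffer() and the prev pointers on Textbuffer). A rough Python model of the collection step (simplified: the real code also distinguishes whitespace and markers, which just stop the scan, from other invalid characters, which abort the route):

    valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
    chunks = ["See ", "htt", "p"]             # textbuffer contents, oldest first
    collected, done = [], False
    for chunk in reversed(chunks):
        for char in reversed(chunk):
            if char not in valid:
                done = True
                break
            collected.append(char)
        if done:
            break
    scheme = "".join(reversed(collected))     # -> "http"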
--- mwparserfromhell/parser/tokenizer.c | 56 ++++++++++++++++++++++++------------- mwparserfromhell/parser/tokenizer.h | 3 +- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 6310523..c2ac12f 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -100,7 +100,7 @@ static Textbuffer* Textbuffer_new(void) PyErr_NoMemory(); return NULL; } - buffer->next = NULL; + buffer->prev = buffer->next = NULL; return buffer; } @@ -128,6 +128,7 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) if (!new) return -1; new->next = self; + self->prev = new; *this = self = new; } self->data[self->size++] = code; @@ -435,19 +436,33 @@ static int Tokenizer_emit_text(Tokenizer* self, const char* text) Write the contents of another textbuffer to the current textbuffer, deallocating it in the process. */ -static int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer) +static int +Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse) { Textbuffer *original = buffer; int i; - while (buffer) { - for (i = 0; i < buffer->size; i++) { - if (Tokenizer_emit_char(self, buffer->data[i])) { - Textbuffer_dealloc(original); - return -1; + if (reverse) { + do { + for (i = buffer->size - 1; i >= 0; i--) { + if (Tokenizer_emit_char(self, buffer->data[i])) { + Textbuffer_dealloc(original); + return -1; + } } - } - buffer = buffer->next; + } while ((buffer = buffer->next)); + } + else { + while (buffer->next) + buffer = buffer->next; + do { + for (i = 0; i < buffer->size; i++) { + if (Tokenizer_emit_char(self, buffer->data[i])) { + Textbuffer_dealloc(original); + return -1; + } + } + } while ((buffer = buffer->prev)); } Textbuffer_dealloc(original); return 0; @@ -933,7 +948,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) // it was just parsed as text: temp_buffer = self->topstack->textbuffer; while (temp_buffer) { - for (i = temp_buffer->size - 1; i >= 0; i++) { + for (i = temp_buffer->size - 1; i >= 0; i--) { chunk = temp_buffer->data[i]; if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) goto end_of_loop; @@ -971,7 +986,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Textbuffer_dealloc(scheme_buffer); return -1; } - if (Tokenizer_emit_textbuffer(self, scheme_buffer)) + if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1)) return -1; if (Tokenizer_emit_char(self, *":")) return -1; @@ -990,16 +1005,18 @@ static int Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, Textbuffer** tail, Py_UNICODE this) { - #define PUSH_TAIL_BUFFER(tail, error) \ - if ((tail)->size || (tail)->next) { \ - Tokenizer_emit_textbuffer(self, tail); \ - tail = Textbuffer_new(); \ - if (!(tail)) \ - return error; \ + #define PUSH_TAIL_BUFFER(tail, error) \ + if ((tail)->size || (tail)->next) { \ + Tokenizer_emit_textbuffer(self, tail, 0); \ + tail = Textbuffer_new(); \ + if (!(tail)) \ + return error; \ } - if (this == *"(" && !(*parens)) + if (this == *"(" && !(*parens)) { *parens = 1; + PUSH_TAIL_BUFFER(*tail, -1) + } else if (this == *"," || this == *";" || this == *"\\" || this == *"." || this == *":" || this == *"!" || this == *"?" 
|| (!(*parens) && this == *")")) @@ -1141,6 +1158,7 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) link = Tokenizer_really_parse_external_link(self, brackets, &extra); } if (BAD_ROUTE) { + RESET_ROUTE(); self->head = reset; Textbuffer_dealloc(extra); if (!brackets && self->topstack->context & LC_DLTERM) @@ -1180,7 +1198,7 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) return -1; } if (extra->size || extra->next) - return Tokenizer_emit_textbuffer(self, extra); + return Tokenizer_emit_textbuffer(self, extra, 0); return 0; } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index e437814..c23fe4a 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -165,7 +165,7 @@ static PyObject* TagCloseClose; #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) -#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URI) +#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK) /* Tag contexts: */ @@ -184,6 +184,7 @@ static PyObject* TagCloseClose; struct Textbuffer { Py_ssize_t size; Py_UNICODE* data; + struct Textbuffer* prev; struct Textbuffer* next; }; From f1b95758d659c9352db9a7d1c4ca4ad85f82c400 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 03:22:18 -0400 Subject: [PATCH 177/189] Squash a memory leak. --- mwparserfromhell/parser/tokenizer.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c2ac12f..3dca5c2 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1005,12 +1005,13 @@ static int Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, Textbuffer** tail, Py_UNICODE this) { - #define PUSH_TAIL_BUFFER(tail, error) \ - if ((tail)->size || (tail)->next) { \ - Tokenizer_emit_textbuffer(self, tail, 0); \ - tail = Textbuffer_new(); \ - if (!(tail)) \ - return error; \ + #define PUSH_TAIL_BUFFER(tail, error) \ + if ((tail)->size || (tail)->next) { \ + if (Tokenizer_emit_textbuffer(self, tail, 0)) \ + return error; \ + tail = Textbuffer_new(); \ + if (!(tail)) \ + return error; \ } if (this == *"(" && !(*parens)) { @@ -1172,6 +1173,7 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) if (!brackets) { if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) { Textbuffer_dealloc(extra); + Py_DECREF(link); return -1; } } @@ -1199,6 +1201,7 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) } if (extra->size || extra->next) return Tokenizer_emit_textbuffer(self, extra, 0); + Textbuffer_dealloc(extra); return 0; } From 655cdc0dab2280ad4023ab78c6421448170b188d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 19:36:53 -0400 Subject: [PATCH 178/189] TestBuilder.test_external_link() --- tests/test_builder.py | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/tests/test_builder.py b/tests/test_builder.py index 29ae65a..152ab53 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -23,8 +23,8 @@ from __future__ import unicode_literals import unittest -from mwparserfromhell.nodes 
import (Argument, Comment, Heading, HTMLEntity, - Tag, Template, Text, Wikilink) +from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, + HTMLEntity, Tag, Template, Text, Wikilink) from mwparserfromhell.nodes.extras import Attribute, Parameter from mwparserfromhell.parser import tokens from mwparserfromhell.parser.builder import Builder @@ -150,6 +150,48 @@ class TestBuilder(TreeEqualityTestCase): for test, valid in tests: self.assertWikicodeEqual(valid, self.builder.build(test)) + def test_external_link(self): + """tests for building ExternalLink nodes""" + tests = [ + ([tokens.ExternalLinkOpen(brackets=False), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"), + brackets=False)])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"))])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkSeparator(), tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"), wrap([]))])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkSeparator(), tokens.Text(text="Example"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"), + wraptext("Example"))])), + + ([tokens.ExternalLinkOpen(brackets=False), + tokens.Text(text="http://example"), tokens.Text(text=".com/foo"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example", ".com/foo"), + brackets=False)])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example"), tokens.Text(text=".com/foo"), + tokens.ExternalLinkSeparator(), tokens.Text(text="Example"), + tokens.Text(text=" Web Page"), tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example", ".com/foo"), + wraptext("Example", " Web Page"))])), + ] + for test, valid in tests: + self.assertWikicodeEqual(valid, self.builder.build(test)) + def test_html_entity(self): """tests for building HTMLEntity nodes""" tests = [ From d91c65dc1f59347cc727296b3df29d21ade01f9e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 20:05:32 -0400 Subject: [PATCH 179/189] TestExternalLink; some fixes in ExternalLink. 
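ExternalLink.__strip__() now keeps the URL of a bracket-less link and the title (if any) of a bracketed one, instead of always looking at the title. A hypothetical session showing the intended effect through strip_code() (illustrative; assumes external links parse on this branch):

    import mwparserfromhell
    mwparserfromhell.parse("http://example.com").strip_code()         # "http://example.com"
    mwparserfromhell.parse("[http://example.com]").strip_code()       # "" (no title to keep)
    mwparserfromhell.parse("[http://example.com Link]").strip_code()  # "Link"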
--- mwparserfromhell/nodes/external_link.py | 19 ++--- tests/test_external_link.py | 130 ++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 9 deletions(-) create mode 100644 tests/test_external_link.py diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index a604f9a..2ee37f3 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -53,16 +53,20 @@ class ExternalLink(Node): yield self.title, child def __strip__(self, normalize, collapse): - if self.title.strip(): - return self.title.strip_code(normalize, collapse) - return None + if self.brackets: + if self.title: + return self.title.strip_code(normalize, collapse) + return None + return self.url.strip_code(normalize, collapse) def __showtree__(self, write, get, mark): - write("[") + if self.brackets: + write("[") get(self.url) if self.title is not None: get(self.title) - write("]") + if self.brackets: + write("]") @property def url(self): @@ -85,10 +89,7 @@ class ExternalLink(Node): @title.setter def title(self, value): - if value is None: - self._title = None - else: - self._title = parse_anything(value) + self._title = None if value is None else parse_anything(value) @brackets.setter def brackets(self, value): diff --git a/tests/test_external_link.py b/tests/test_external_link.py new file mode 100644 index 0000000..13a82bf --- /dev/null +++ b/tests/test_external_link.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012-2013 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from __future__ import unicode_literals +import unittest + +from mwparserfromhell.compat import str +from mwparserfromhell.nodes import ExternalLink, Text + +from ._test_tree_equality import TreeEqualityTestCase, getnodes, wrap, wraptext + +class TestExternalLink(TreeEqualityTestCase): + """Test cases for the ExternalLink node.""" + + def test_unicode(self): + """test ExternalLink.__unicode__()""" + node = ExternalLink(wraptext("http://example.com/"), brackets=False) + self.assertEqual("http://example.com/", str(node)) + node2 = ExternalLink(wraptext("http://example.com/")) + self.assertEqual("[http://example.com/]", str(node2)) + node3 = ExternalLink(wraptext("http://example.com/"), wrap([])) + self.assertEqual("[http://example.com/ ]", str(node3)) + node4 = ExternalLink(wraptext("http://example.com/"), + wraptext("Example Web Page")) + self.assertEqual("[http://example.com/ Example Web Page]", str(node4)) + + def test_iternodes(self): + """test ExternalLink.__iternodes__()""" + node1n1 = Text("http://example.com/") + node2n1 = Text("http://example.com/") + node2n2, node2n3 = Text("Example"), Text("Page") + node1 = ExternalLink(wrap([node1n1]), brackets=False) + node2 = ExternalLink(wrap([node2n1]), wrap([node2n2, node2n3])) + gen1 = node1.__iternodes__(getnodes) + gen2 = node2.__iternodes__(getnodes) + self.assertEqual((None, node1), next(gen1)) + self.assertEqual((None, node2), next(gen2)) + self.assertEqual((node1.url, node1n1), next(gen1)) + self.assertEqual((node2.url, node2n1), next(gen2)) + self.assertEqual((node2.title, node2n2), next(gen2)) + self.assertEqual((node2.title, node2n3), next(gen2)) + self.assertRaises(StopIteration, next, gen1) + self.assertRaises(StopIteration, next, gen2) + + def test_strip(self): + """test ExternalLink.__strip__()""" + node1 = ExternalLink(wraptext("http://example.com"), brackets=False) + node2 = ExternalLink(wraptext("http://example.com")) + node3 = ExternalLink(wraptext("http://example.com"), wrap([])) + node4 = ExternalLink(wraptext("http://example.com"), wraptext("Link")) + for a in (True, False): + for b in (True, False): + self.assertEqual("http://example.com", node1.__strip__(a, b)) + self.assertEqual(None, node2.__strip__(a, b)) + self.assertEqual(None, node3.__strip__(a, b)) + self.assertEqual("Link", node4.__strip__(a, b)) + + def test_showtree(self): + """test ExternalLink.__showtree__()""" + output = [] + getter, marker = object(), object() + get = lambda code: output.append((getter, code)) + mark = lambda: output.append(marker) + node1 = ExternalLink(wraptext("http://example.com"), brackets=False) + node2 = ExternalLink(wraptext("http://example.com"), wraptext("Link")) + node1.__showtree__(output.append, get, mark) + node2.__showtree__(output.append, get, mark) + valid = [ + (getter, node1.url), "[", (getter, node2.url), + (getter, node2.title), "]"] + self.assertEqual(valid, output) + + def test_url(self): + """test getter/setter for the url attribute""" + url = wraptext("http://example.com/") + node1 = ExternalLink(url, brackets=False) + node2 = ExternalLink(url, wraptext("Example")) + self.assertIs(url, node1.url) + self.assertIs(url, node2.url) + node1.url = "mailto:héhehé@spam.com" + node2.url = "mailto:héhehé@spam.com" + self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"), node1.url) + self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"), node2.url) + + def test_title(self): + """test getter/setter for the title attribute""" + title = wraptext("Example!") + node1 = ExternalLink(wraptext("http://example.com/"), 
brackets=False) + node2 = ExternalLink(wraptext("http://example.com/"), title) + self.assertIs(None, node1.title) + self.assertIs(title, node2.title) + node2.title = None + self.assertIs(None, node2.title) + node2.title = "My Website" + self.assertWikicodeEqual(wraptext("My Website"), node2.title) + + def test_brackets(self): + """test getter/setter for the brackets attribute""" + node1 = ExternalLink(wraptext("http://example.com/"), brackets=False) + node2 = ExternalLink(wraptext("http://example.com/"), wraptext("Link")) + self.assertFalse(node1.brackets) + self.assertTrue(node2.brackets) + node1.brackets = True + node2.brackets = False + self.assertTrue(node1.brackets) + self.assertFalse(node2.brackets) + self.assertEqual("[http://example.com/]", str(node1)) + self.assertEqual("http://example.com/", str(node2)) + +if __name__ == "__main__": + unittest.main(verbosity=2) From 67f1762aa402a7dee1b96f80e8d9d2521fe8b069 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 23:23:22 -0400 Subject: [PATCH 180/189] Doc updates, and allow passing a starting context to tokenize(). --- CHANGELOG | 2 +- docs/api/mwparserfromhell.nodes.rst | 8 ++++++++ docs/api/mwparserfromhell.rst | 6 +++--- docs/changelog.rst | 2 +- mwparserfromhell/__init__.py | 3 ++- mwparserfromhell/nodes/external_link.py | 2 +- mwparserfromhell/parser/__init__.py | 9 ++++----- mwparserfromhell/parser/tokenizer.c | 7 ++++--- mwparserfromhell/parser/tokenizer.py | 5 +++-- mwparserfromhell/utils.py | 16 +++++++++++----- tests/test_parser.py | 6 +++--- 11 files changed, 41 insertions(+), 25 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 84edc60..122247f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -16,7 +16,7 @@ v0.3 (unreleased): - Renamed Template.has_param() to has() for consistency with Template's other methods; has_param() is now an alias. - The C tokenizer extension now works on Python 3 in addition to Python 2.7. -- Various fixes and cleanup. +- Various bugfixes, internal changes, and cleanup. v0.2 (released June 20, 2013): diff --git a/docs/api/mwparserfromhell.nodes.rst b/docs/api/mwparserfromhell.nodes.rst index a093c17..7043070 100644 --- a/docs/api/mwparserfromhell.nodes.rst +++ b/docs/api/mwparserfromhell.nodes.rst @@ -25,6 +25,14 @@ nodes Package :undoc-members: :show-inheritance: +:mod:`external_link` Module +--------------------------- + +.. automodule:: mwparserfromhell.nodes.external_link + :members: + :undoc-members: + :show-inheritance: + :mod:`heading` Module --------------------- diff --git a/docs/api/mwparserfromhell.rst b/docs/api/mwparserfromhell.rst index b682139..0da522e 100644 --- a/docs/api/mwparserfromhell.rst +++ b/docs/api/mwparserfromhell.rst @@ -30,10 +30,10 @@ mwparserfromhell Package :members: :undoc-members: -:mod:`tag_defs` Module ----------------------- +:mod:`definitions` Module +------------------------- -.. automodule:: mwparserfromhell.tag_defs +.. automodule:: mwparserfromhell.definitions :members: :mod:`utils` Module diff --git a/docs/changelog.rst b/docs/changelog.rst index 810f594..f43a3c9 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -26,7 +26,7 @@ Unreleased :py:meth:`~.Template.has` for consistency with :py:class:`~.Template`\ 's other methods; :py:meth:`~.has_param` is now an alias. - The C tokenizer extension now works on Python 3 in addition to Python 2.7. -- Various fixes and cleanup. +- Various bugfixes, internal changes, and cleanup. 
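
As an aside, the ExternalLink tests above pin down the node's rendering rules.
A minimal interactive sketch of the same behavior (illustrative only, not part
of any patch in this series; it assumes a tree that already parses bracketed
external links):

    import mwparserfromhell

    code = mwparserfromhell.parse("[http://example.com/ Example Web Page]")
    link = code.nodes[0]       # a single ExternalLink node
    str(link)                  # "[http://example.com/ Example Web Page]"
    str(link.url)              # "http://example.com/"
    str(link.title)            # "Example Web Page"

    link.brackets = False      # as in test_brackets above
    str(link)                  # "http://example.com/" (the title is no longer rendered)
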
v0.2 ---- diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 738d4c2..74e1616 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -34,6 +34,7 @@ __license__ = "MIT License" __version__ = "0.3.dev" __email__ = "ben.kurtovic@verizon.net" -from . import compat, nodes, parser, smart_list, string_mixin, utils, wikicode +from . import (compat, definitions, nodes, parser, smart_list, string_mixin, + utils, wikicode) parse = utils.parse_anything diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index 2ee37f3..bf1c9b1 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -70,7 +70,7 @@ class ExternalLink(Node): @property def url(self): - """The url of the link target, as a :py:class:`~.Wikicode` object.""" + """The URL of the link target, as a :py:class:`~.Wikicode` object.""" return self._url @property diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 1fb95b5..22c3dc2 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -46,16 +46,15 @@ class Parser(object): :py:class:`~.Node`\ s by the :py:class:`~.Builder`. """ - def __init__(self, text): - self.text = text + def __init__(self): if use_c and CTokenizer: self._tokenizer = CTokenizer() else: self._tokenizer = Tokenizer() self._builder = Builder() - def parse(self): - """Return a string as a parsed :py:class:`~.Wikicode` object tree.""" - tokens = self._tokenizer.tokenize(self.text) + def parse(self, text, context=0): + """Parse *text*, returning a :py:class:`~.Wikicode` object tree.""" + tokens = self._tokenizer.tokenize(text, context) code = self._builder.build(tokens) return code diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 3dca5c2..af6bf3b 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2667,8 +2667,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) { PyObject *text, *temp; + int context = 0; - if (PyArg_ParseTuple(args, "U", &text)) { + if (PyArg_ParseTuple(args, "U|i", &text, &context)) { Py_XDECREF(self->text); self->text = PySequence_Fast(text, "expected a sequence"); } @@ -2677,7 +2678,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) Py_ssize_t size; /* Failed to parse a Unicode object; try a string instead. 
*/ PyErr_Clear(); - if (!PyArg_ParseTuple(args, "s#", &encoded, &size)) + if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context)) return NULL; temp = PyUnicode_FromStringAndSize(encoded, size); if (!text) @@ -2689,7 +2690,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) } self->head = self->global = self->depth = self->cycles = 0; self->length = PyList_GET_SIZE(self->text); - return Tokenizer_parse(self, 0, 1); + return Tokenizer_parse(self, context, 1); } static int load_entitydefs(void) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2c8d6d7..6ab549a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1125,8 +1125,9 @@ class Tokenizer(object): self._emit_text(this) self._head += 1 - def tokenize(self, text): + def tokenize(self, text, context=0): """Build a list of tokens from a string of wikicode and return it.""" split = self.regex.split(text) self._text = [segment for segment in split if segment] - return self._parse() + self._head = self._global = self._depth = self._cycles = 0 + return self._parse(context) diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index 31e5ba0..758e751 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -33,7 +33,7 @@ from .smart_list import SmartList __all__ = ["parse_anything"] -def parse_anything(value): +def parse_anything(value, context=0): """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. This differs from :py:meth:`.Parser.parse` in that we accept more than just @@ -44,6 +44,12 @@ def parse_anything(value): on-the-fly by various methods of :py:class:`~.Wikicode` and others like :py:class:`~.Template`, such as :py:meth:`wikicode.insert() <.Wikicode.insert>` or setting :py:meth:`template.name <.Template.name>`. + + If given, *context* will be passed as a starting context to the parser. + This is helpful when this function is used inside node attribute setters. + For example, :py:class:`~.ExternalLink`\ 's :py:attr:`~.ExternalLink.url` + setter sets *context* to :py:mod:`contexts.EXT_LINK_URI <.contexts>` to + prevent the URL itself from becoming an :py:class:`~.ExternalLink`. 
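
A rough sketch of what that docstring paragraph describes (illustrative only;
EXT_LINK_URI is the context constant referenced above and used by the next
commit in this series):

    from mwparserfromhell.parser import contexts
    from mwparserfromhell.utils import parse_anything

    # Without a starting context, a bare URL could be tokenized as an
    # external link; starting in EXT_LINK_URI keeps it as plain text,
    # which is what node attribute setters such as ExternalLink.url want.
    url = parse_anything("http://example.com/", contexts.EXT_LINK_URI)
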
""" from .parser import Parser from .wikicode import Wikicode @@ -53,17 +59,17 @@ def parse_anything(value): elif isinstance(value, Node): return Wikicode(SmartList([value])) elif isinstance(value, str): - return Parser(value).parse() + return Parser().parse(value, context) elif isinstance(value, bytes): - return Parser(value.decode("utf8")).parse() + return Parser().parse(value.decode("utf8"), context) elif isinstance(value, int): - return Parser(str(value)).parse() + return Parser().parse(str(value), context) elif value is None: return Wikicode(SmartList()) try: nodelist = SmartList() for item in value: - nodelist += parse_anything(item).nodes + nodelist += parse_anything(item, context).nodes except TypeError: error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" raise ValueError(error.format(type(value).__name__, value)) diff --git a/tests/test_parser.py b/tests/test_parser.py index ec5f065..8760c0e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -36,9 +36,9 @@ class TestParser(TreeEqualityTestCase): def test_use_c(self): """make sure the correct tokenizer is used""" if parser.use_c: - self.assertTrue(parser.Parser(None)._tokenizer.USES_C) + self.assertTrue(parser.Parser()._tokenizer.USES_C) parser.use_c = False - self.assertFalse(parser.Parser(None)._tokenizer.USES_C) + self.assertFalse(parser.Parser()._tokenizer.USES_C) def test_parsing(self): """integration test for parsing overall""" @@ -59,7 +59,7 @@ class TestParser(TreeEqualityTestCase): ])) ]) ]) - actual = parser.Parser(text).parse() + actual = parser.Parser().parse(text) self.assertWikicodeEqual(expected, actual) if __name__ == "__main__": From d1a7d25220b0acf9bc8a43fb49d8b711431156b7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 23:36:34 -0400 Subject: [PATCH 181/189] Set right context for ExternalLink.url; Wikicode.filter_external_links() --- mwparserfromhell/nodes/external_link.py | 3 ++- mwparserfromhell/wikicode.py | 10 +++++----- tests/test_wikicode.py | 1 + 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index bf1c9b1..d74f6b3 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -85,7 +85,8 @@ class ExternalLink(Node): @url.setter def url(self, value): - self._url = parse_anything(value) + from ..parser import contexts + self._url = parse_anything(value, contexts.EXT_LINK_URI) @title.setter def title(self, value): diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index b5e854d..c3249d9 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -24,8 +24,8 @@ from __future__ import unicode_literals import re from .compat import maxsize, py3k, str -from .nodes import (Argument, Comment, Heading, HTMLEntity, Node, Tag, - Template, Text, Wikilink) +from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, + Node, Tag, Template, Text, Wikilink) from .string_mixin import StringMixIn from .utils import parse_anything @@ -509,6 +509,6 @@ class Wikicode(StringMixIn): return "\n".join(self._get_tree(self, [], marker, 0)) Wikicode._build_filter_methods( - arguments=Argument, comments=Comment, headings=Heading, - html_entities=HTMLEntity, tags=Tag, templates=Template, text=Text, - wikilinks=Wikilink) + arguments=Argument, comments=Comment, external_links=ExternalLink, + headings=Heading, html_entities=HTMLEntity, tags=Tag, templates=Template, + text=Text, 
wikilinks=Wikilink) diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py index 08cf93c..14d801c 100644 --- a/tests/test_wikicode.py +++ b/tests/test_wikicode.py @@ -276,6 +276,7 @@ class TestWikicode(TreeEqualityTestCase): self.assertEqual(["{{{e}}}"], get_filter("arguments")) self.assertIs(code.get(4), get_filter("arguments")[0]) self.assertEqual([], get_filter("comments")) + self.assertEqual([], get_filter("external_links")) self.assertEqual([], get_filter("headings")) self.assertEqual([], get_filter("html_entities")) self.assertEqual([], get_filter("tags")) From fcdc0abd22259b4aa6213a088989bbd1f9c922bd Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Aug 2013 00:05:13 -0400 Subject: [PATCH 182/189] Fix autofail contexts. --- mwparserfromhell/parser/contexts.py | 2 +- mwparserfromhell/parser/tokenizer.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 0d25400..33da8f7 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -162,7 +162,7 @@ GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK + HEADING + TAG + STYLE +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index c23fe4a..da3c57a 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -162,7 +162,7 @@ static PyObject* TagCloseClose; /* Aggregate contexts: */ -#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) #define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK) From 4d04cae7802e7a1775016e8a599d2555fe32b763 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Aug 2013 00:27:05 -0400 Subject: [PATCH 183/189] Fix a segfault with GCC. 
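
A quick sketch of how the context-passing and ExternalLink changes above fit
together at the user level (illustrative only, not part of this commit):

    import mwparserfromhell

    code = mwparserfromhell.parse("foo [http://example.com/ bar] baz")
    link = code.filter_external_links()[0]

    # The url setter re-parses its value with contexts.EXT_LINK_URI, so the
    # assigned text is not itself wrapped in a new ExternalLink node:
    link.url = "http://example.org/"
    print(code)                # foo [http://example.org/ bar] baz
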
--- mwparserfromhell/parser/tokenizer.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index af6bf3b..07d3988 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1143,28 +1143,29 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) */ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) { + #define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK + #define NOT_A_LINK \ + if (!brackets && self->topstack->context & LC_DLTERM) \ + return Tokenizer_handle_dl_term(self); \ + return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)) + Py_ssize_t reset = self->head; PyObject *link, *kwargs; - Textbuffer *extra; + Textbuffer *extra = 0; - self->head++; - #define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { - FAIL_ROUTE(0); - } - else { - extra = Textbuffer_new(); - if (!extra) - return -1; - link = Tokenizer_really_parse_external_link(self, brackets, &extra); + NOT_A_LINK; } + extra = Textbuffer_new(); + if (!extra) + return -1; + self->head++; + link = Tokenizer_really_parse_external_link(self, brackets, &extra); if (BAD_ROUTE) { RESET_ROUTE(); self->head = reset; Textbuffer_dealloc(extra); - if (!brackets && self->topstack->context & LC_DLTERM) - return Tokenizer_handle_dl_term(self); - return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)); + NOT_A_LINK; } if (!link) { Textbuffer_dealloc(extra); From fdb276239392cb7bed5efe349f351ce6e97ab705 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Aug 2013 01:05:43 -0400 Subject: [PATCH 184/189] Add a test for tag name capitalization. --- tests/tokenizer/tags.mwtest | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index dc02a51..a4ce8d8 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -548,3 +548,10 @@ name: single_only_double label: a tag that can only be single; a tag with backslashes at the beginning and end input: "foo
    bar{{baz}}" output: [Text(text="foo"), TagOpenOpen(invalid=True), Text(text="br"), TagCloseSelfclose(padding=""), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()] + +--- + +name: capitalization +label: caps should be ignored within tag names +input: "{{test}}" +output: [TagOpenOpen(), Text(text="NoWiKi"), TagCloseOpen(padding=""), Text(text="{{test}}"), TagOpenClose(), Text(text="nOwIkI"), TagCloseClose()] From bdfd0632b3541f99f2086a27987cd2707806ab14 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Aug 2013 01:32:47 -0400 Subject: [PATCH 185/189] A couple very minor nitpicks. --- mwparserfromhell/nodes/tag.py | 2 +- mwparserfromhell/parser/tokenizer.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index 80b8a88..06f43d0 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -22,7 +22,7 @@ from __future__ import unicode_literals -from . import Node, Text +from . import Node from .extras import Attribute from ..compat import str from ..definitions import is_visible diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 6ab549a..1061b9f 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -35,6 +35,7 @@ class BadRoute(Exception): """Raised internally when the current tokenization route is invalid.""" def __init__(self, context=0): + super(BadRoute, self).__init__() self.context = context From 6784ff73bf23048a0bdbcbb666cf53b830ae904f Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Aug 2013 03:06:58 -0400 Subject: [PATCH 186/189] Fix an edge case when we recurse too deeply. --- mwparserfromhell/parser/tokenizer.c | 3 ++- mwparserfromhell/parser/tokenizer.py | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 07d3988..1bc1f14 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2272,7 +2272,8 @@ static PyObject* Tokenizer_parse_style(Tokenizer* self) return NULL; return Tokenizer_pop(self); } - self->topstack->context |= LC_STYLE_PASS_AGAIN; + if (context & LC_STYLE_ITALICS) + self->topstack->context |= LC_STYLE_PASS_AGAIN; } for (i = 0; i < ticks; i++) { if (Tokenizer_emit_char(self, *"'")) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 1061b9f..8fae729 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -823,7 +823,8 @@ class Tokenizer(object): except BadRoute as route: self._head = reset if route.context & contexts.STYLE_PASS_AGAIN: - stack = self._parse(route.context | contexts.STYLE_SECOND_PASS) + new_ctx = contexts.STYLE_ITALICS | contexts.STYLE_SECOND_PASS + stack = self._parse(new_ctx) else: return self._emit_text("''") self._emit_style_tag("i", "''", stack) @@ -912,7 +913,8 @@ class Tokenizer(object): if self._context & contexts.STYLE_SECOND_PASS: self._emit_text("'") return self._pop() - self._context |= contexts.STYLE_PASS_AGAIN + if self._context & contexts.STYLE_ITALICS: + self._context |= contexts.STYLE_PASS_AGAIN self._emit_text("'" * ticks) elif ticks == 2: self._parse_italics() From c204cf489fe947d6fdc9b5094beae3a556ee01ae Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Aug 2013 16:30:42 -0400 Subject: [PATCH 187/189] Add some tests for tag edge cases. 
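
A user-level sketch of the capitalization behavior pinned down by the test
added earlier in this series (the wikitext below is chosen for illustration):

    import mwparserfromhell

    code = mwparserfromhell.parse("<NoWiKi>{{test}}</nOwIkI>")
    tag = code.filter_tags()[0]
    print(tag.tag)             # NoWiKi  (original capitalization is preserved)
    print(tag.contents)        # {{test}}  (left unparsed inside nowiki)
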
--- tests/tokenizer/tags.mwtest | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index a4ce8d8..a0d7f18 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -355,6 +355,20 @@ output: [Text(text="junk bar")] --- +name: incomplete_unclosed_close +label: incomplete tags: an unclosed close tag +input: "junk " @@ -551,6 +565,13 @@ output: [Text(text="foo"), TagOpenOpen(invalid=True), Text(text="br"), TagCloseS --- +name: single_only_close_attribute +label: a tag that can only be single; presented as a close tag with an attribute +input: "
    " +output: [TagOpenOpen(invalid=True), Text(text="br"), TagAttrStart(pad_first=" ", pad_after_eq="", pad_before_eq=""), Text(text="id"), TagAttrEquals(), TagAttrQuote(), Text(text="break"), TagCloseSelfclose(padding="", implicit=True)] + +--- + name: capitalization label: caps should be ignored within tag names input: "{{test}}" From 77092e066ca6b39512a38b485ed0f047bfc6e32b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Aug 2013 16:31:22 -0400 Subject: [PATCH 188/189] Fix C tokenizer behavior re: some single_only tag edge cases. --- mwparserfromhell/parser/tokenizer.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 1bc1f14..c9527ab 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2016,7 +2016,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) return -1; while (1) { this = Tokenizer_READ(self, pos); - if (is_marker(this)) { + if (Py_UNICODE_ISSPACE(this) || is_marker(this)) { name = Textbuffer_render(buf); if (!name) { Textbuffer_dealloc(buf); @@ -2031,16 +2031,15 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) pos++; } Textbuffer_dealloc(buf); - if (!BAD_ROUTE) { + if (!BAD_ROUTE) tag = Tokenizer_really_parse_tag(self); - if (!tag) - return -1; - } if (BAD_ROUTE) { RESET_ROUTE(); self->head = reset; return Tokenizer_emit_text(self, " Date: Sat, 24 Aug 2013 19:11:31 -0400 Subject: [PATCH 189/189] release/0.3 --- CHANGELOG | 2 +- docs/changelog.rst | 4 ++-- mwparserfromhell/__init__.py | 2 +- setup.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 122247f..67214fa 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -v0.3 (unreleased): +v0.3 (released August 24, 2013): - Added complete support for HTML Tags, including forms like foo, , and wiki-markup tags like bold ('''), italics (''), and diff --git a/docs/changelog.rst b/docs/changelog.rst index f43a3c9..b6db9d9 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -4,8 +4,8 @@ Changelog v0.3 ---- -Unreleased -(`changes `__): +`Released August 24, 2013 `_ +(`changes `__): - Added complete support for HTML :py:class:`Tags <.Tag>`, including forms like ``foo``, ````, and wiki-markup tags like bold diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 74e1616..6a45a11 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -31,7 +31,7 @@ from __future__ import unicode_literals __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2012, 2013 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.3.dev" +__version__ = "0.3" __email__ = "ben.kurtovic@verizon.net" from . import (compat, definitions, nodes, parser, smart_list, string_mixin, diff --git a/setup.py b/setup.py index 5e6d779..3ef7e0e 100644 --- a/setup.py +++ b/setup.py @@ -47,13 +47,13 @@ setup( keywords = "earwig mwparserfromhell wikipedia wiki mediawiki wikicode template parsing", license = "MIT License", classifiers = [ - "Development Status :: 3 - Alpha", + "Development Status :: 4 - Beta", "Environment :: Console", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.3", "Topic :: Text Processing :: Markup" ], )