Browse Source

Support attributes quoted with '; add required quotes in value setter.

tags/v0.4
Ben Kurtovic 9 years ago
parent
commit
b997e4cd71
15 changed files with 189 additions and 85 deletions
  1. +4
    -0
      CHANGELOG
  2. +5
    -0
      docs/changelog.rst
  3. +39
    -12
      mwparserfromhell/nodes/extras/attribute.py
  4. +10
    -7
      mwparserfromhell/nodes/tag.py
  5. +3
    -3
      mwparserfromhell/parser/builder.py
  6. +15
    -6
      mwparserfromhell/parser/tokenizer.c
  7. +1
    -0
      mwparserfromhell/parser/tokenizer.h
  8. +7
    -5
      mwparserfromhell/parser/tokenizer.py
  9. +1
    -1
      mwparserfromhell/parser/tokens.py
  10. +1
    -1
      tests/_test_tree_equality.py
  11. +34
    -14
      tests/test_attribute.py
  12. +8
    -8
      tests/test_builder.py
  13. +17
    -12
      tests/test_tag.py
  14. +1
    -1
      tests/tokenizer/integration.mwtest
  15. +43
    -15
      tests/tokenizer/tags.mwtest

+ 4
- 0
CHANGELOG View File

@@ -10,6 +10,10 @@ v0.4 (unreleased):
option, RECURSE_OTHERS, which recurses over all children except instances of
'forcetype' (for example, `code.filter_templates(code.RECURSE_OTHERS)`
returns all un-nested templates).
- The parser now understands HTML tag attributes quoted with single quotes.
When setting a tag attribute's value, quotes will be added if necessary. As
part of this, Attribute's 'quoted' attribute has been changed to 'quotes',
and is now either a string or None.
- Calling Template.remove() with a Parameter object that is not part of the
template now raises ValueError instead of doing nothing.
- Parameters with non-integer keys can no longer be created with


+ 5
- 0
docs/changelog.rst View File

@@ -18,6 +18,11 @@ Unreleased
which recurses over all children except instances of *forcetype* (for
example, ``code.filter_templates(code.RECURSE_OTHERS)`` returns all un-nested
templates).
- The parser now understands HTML tag attributes quoted with single quotes.
When setting a tag attribute's value, quotes will be added if necessary. As
part of this, :py:class:`.Attribute`\ 's :py:attr:`~.Attribute.quoted`
attribute has been changed to :py:attr:`~.Attribute.quotes`, and is now
either a string or ``None``.
- Calling :py:meth:`.Template.remove` with a :py:class:`.Parameter` object that
is not part of the template now raises :py:exc:`ValueError` instead of doing
nothing.


+ 39
- 12
mwparserfromhell/nodes/extras/attribute.py View File

@@ -36,12 +36,14 @@ class Attribute(StringMixIn):
whose value is ``"foo"``.
"""

def __init__(self, name, value=None, quoted=True, pad_first=" ",
def __init__(self, name, value=None, quotes='"', pad_first=" ",
pad_before_eq="", pad_after_eq=""):
super(Attribute, self).__init__()
if not quotes and self._value_needs_quotes(value):
raise ValueError("given value {0!r} requires quotes".format(value))
self._name = name
self._value = value
self._quoted = quoted
self._quotes = quotes
self._pad_first = pad_first
self._pad_before_eq = pad_before_eq
self._pad_after_eq = pad_after_eq
@@ -50,11 +52,18 @@ class Attribute(StringMixIn):
result = self.pad_first + str(self.name) + self.pad_before_eq
if self.value is not None:
result += "=" + self.pad_after_eq
if self.quoted:
return result + '"' + str(self.value) + '"'
if self.quotes:
return result + self.quotes + str(self.value) + self.quotes
return result + str(self.value)
return result

@staticmethod
def _value_needs_quotes(val):
"""Return the preferred quotes for the given value, or None."""
if val and any(char.isspace() for char in val):
return ('"' in val and "'" in val) or ("'" if '"' in val else '"')
return None

def _set_padding(self, attr, value):
"""Setter for the value of a padding attribute."""
if not value:
@@ -65,6 +74,14 @@ class Attribute(StringMixIn):
raise ValueError("padding must be entirely whitespace")
setattr(self, attr, value)

@staticmethod
def coerce_quotes(quotes):
"""Coerce a quote type into an acceptable value, or raise an error."""
orig, quotes = quotes, str(quotes) if quotes else None
if quotes not in [None, '"', "'"]:
raise ValueError("{0!r} is not a valid quote type".format(orig))
return quotes

@property
def name(self):
"""The name of the attribute as a :py:class:`~.Wikicode` object."""
@@ -76,9 +93,9 @@ class Attribute(StringMixIn):
return self._value

@property
def quoted(self):
"""Whether the attribute's value is quoted with double quotes."""
return self._quoted
def quotes(self):
"""How to enclose the attribute value. ``"``, ``'``, or ``None``."""
return self._quotes

@property
def pad_first(self):
@@ -101,11 +118,21 @@ class Attribute(StringMixIn):

@value.setter
def value(self, newval):
self._value = None if newval is None else parse_anything(newval)

@quoted.setter
def quoted(self, value):
self._quoted = bool(value)
if newval is None:
self._value = None
else:
code = parse_anything(newval)
quotes = self._value_needs_quotes(code)
if quotes in ['"', "'"] or (quotes is True and not self.quotes):
self._quotes = quotes
self._value = code

@quotes.setter
def quotes(self, value):
value = self.coerce_quotes(value)
if not value and self._value_needs_quotes(self.value):
raise ValueError("attribute value requires quotes")
self._quotes = value

@pad_first.setter
def pad_first(self, value):


+ 10
- 7
mwparserfromhell/nodes/tag.py View File

@@ -236,21 +236,24 @@ class Tag(Node):
return attr
raise ValueError(name)

def add(self, name, value=None, quoted=True, pad_first=" ",
def add(self, name, value=None, quotes='"', pad_first=" ",
pad_before_eq="", pad_after_eq=""):
"""Add an attribute with the given *name* and *value*.

*name* and *value* can be anything parsable by
:py:func:`.utils.parse_anything`; *value* can be omitted if the
attribute is valueless. *quoted* is a bool telling whether to wrap the
*value* in double quotes (this is recommended). *pad_first*,
*pad_before_eq*, and *pad_after_eq* are whitespace used as padding
before the name, before the equal sign (or after the name if no value),
and after the equal sign (ignored if no value), respectively.
attribute is valueless. If *quotes* is not ``None``, it should be a
string (either ``"`` or ``'``) that *value* will be wrapped in (this is
recommended). ``None`` is only legal if *value* contains no spacing.

*pad_first*, *pad_before_eq*, and *pad_after_eq* are whitespace used as
padding before the name, before the equal sign (or after the name if no
value), and after the equal sign (ignored if no value), respectively.
"""
if value is not None:
value = parse_anything(value)
attr = Attribute(parse_anything(name), value, quoted)
quotes = Attribute.coerce_quotes(quotes)
attr = Attribute(parse_anything(name), value, quotes)
attr.pad_first = pad_first
attr.pad_before_eq = pad_before_eq
attr.pad_after_eq = pad_after_eq


+ 3
- 3
mwparserfromhell/parser/builder.py View File

@@ -193,7 +193,7 @@ class Builder(object):

def _handle_attribute(self, start):
"""Handle a case where a tag attribute is at the head of the tokens."""
name, quoted = None, False
name = quotes = None
self._push()
while self._tokens:
token = self._tokens.pop()
@@ -201,7 +201,7 @@ class Builder(object):
name = self._pop()
self._push()
elif isinstance(token, tokens.TagAttrQuote):
quoted = True
quotes = token.char
elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen,
tokens.TagCloseSelfclose)):
self._tokens.append(token)
@@ -209,7 +209,7 @@ class Builder(object):
value = self._pop()
else:
name, value = self._pop(), None
return Attribute(name, value, quoted, start.pad_first,
return Attribute(name, value, quotes, start.pad_first,
start.pad_before_eq, start.pad_after_eq)
else:
self._write(self._handle_token(token))


+ 15
- 6
mwparserfromhell/parser/tokenizer.c View File

@@ -173,7 +173,7 @@ static TagData* TagData_new(void)
ALLOC_BUFFER(self->pad_first)
ALLOC_BUFFER(self->pad_before_eq)
ALLOC_BUFFER(self->pad_after_eq)
self->reset = 0;
self->quoter = self->reset = 0;
return self;
}

@@ -1566,10 +1566,18 @@ static int Tokenizer_parse_comment(Tokenizer* self)
*/
static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
{
PyObject *tokens, *kwargs, *pad_first, *pad_before_eq, *pad_after_eq;
PyObject *tokens, *kwargs, *tmp, *pad_first, *pad_before_eq, *pad_after_eq;

if (data->context & TAG_QUOTED) {
if (Tokenizer_emit_first(self, TagAttrQuote))
kwargs = PyDict_New();
if (!kwargs)
return -1;
tmp = PyUnicode_FromUnicode(&data->quoter, 1);
if (!tmp)
return -1;
PyDict_SetItemString(kwargs, "char", tmp);
Py_DECREF(tmp);
if (Tokenizer_emit_first_kwargs(self, TagAttrQuote, kwargs))
return -1;
tokens = Tokenizer_pop(self);
if (!tokens)
@@ -1721,16 +1729,17 @@ Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk)
Tokenizer_READ_BACKWARDS(self, 2) != '\\');
if (data->context & TAG_NOTE_QUOTE) {
data->context ^= TAG_NOTE_QUOTE;
if (chunk == '"' && !escaped) {
if ((chunk == '"' || chunk == '\'') && !escaped) {
data->context |= TAG_QUOTED;
data->quoter = chunk;
data->reset = self->head;
if (Tokenizer_push(self, self->topstack->context))
return -1;
data->reset = self->head;
return 0;
}
}
else if (data->context & TAG_QUOTED) {
if (chunk == '"' && !escaped) {
if (chunk == data->quoter && !escaped) {
data->context |= TAG_NOTE_SPACE;
return 0;
}


+ 1
- 0
mwparserfromhell/parser/tokenizer.h View File

@@ -206,6 +206,7 @@ typedef struct {
struct Textbuffer* pad_first;
struct Textbuffer* pad_before_eq;
struct Textbuffer* pad_after_eq;
Py_UNICODE quoter;
Py_ssize_t reset;
} TagData;



+ 7
- 5
mwparserfromhell/parser/tokenizer.py View File

@@ -53,6 +53,7 @@ class _TagOpenData(object):
def __init__(self):
self.context = self.CX_NAME
self.padding_buffer = {"first": "", "before_eq": "", "after_eq": ""}
self.quoter = None
self.reset = 0


@@ -66,7 +67,7 @@ class Tokenizer(object):
MAX_DEPTH = 40
MAX_CYCLES = 100000
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
tag_splitter = re.compile(r"([\s\"\\]+)")
tag_splitter = re.compile(r"([\s\"\'\\]+)")

def __init__(self):
self._text = None
@@ -612,7 +613,7 @@ class Tokenizer(object):
def _push_tag_buffer(self, data):
"""Write a pending tag attribute from *data* to the stack."""
if data.context & data.CX_QUOTED:
self._emit_first(tokens.TagAttrQuote())
self._emit_first(tokens.TagAttrQuote(char=data.quoter))
self._emit_all(self._pop())
buf = data.padding_buffer
self._emit_first(tokens.TagAttrStart(pad_first=buf["first"],
@@ -689,13 +690,14 @@ class Tokenizer(object):
escaped = self._read(-1) == "\\" and self._read(-2) != "\\"
if data.context & data.CX_NOTE_QUOTE:
data.context ^= data.CX_NOTE_QUOTE
if chunk == '"' and not escaped:
if chunk in "'\"" and not escaped:
data.context |= data.CX_QUOTED
self._push(self._context)
data.quoter = chunk
data.reset = self._head
self._push(self._context)
continue
elif data.context & data.CX_QUOTED:
if chunk == '"' and not escaped:
if chunk == data.quoter and not escaped:
data.context |= data.CX_NOTE_SPACE
continue
self._handle_tag_text(chunk)


+ 1
- 1
mwparserfromhell/parser/tokens.py View File

@@ -100,7 +100,7 @@ CommentEnd = make("CommentEnd") # -->
TagOpenOpen = make("TagOpenOpen") # <
TagAttrStart = make("TagAttrStart")
TagAttrEquals = make("TagAttrEquals") # =
TagAttrQuote = make("TagAttrQuote") # "
TagAttrQuote = make("TagAttrQuote") # ", '
TagCloseOpen = make("TagCloseOpen") # >
TagCloseSelfclose = make("TagCloseSelfclose") # />
TagOpenClose = make("TagOpenClose") # </


+ 1
- 1
tests/_test_tree_equality.py View File

@@ -98,7 +98,7 @@ class TreeEqualityTestCase(TestCase):
self.assertWikicodeEqual(exp_attr.name, act_attr.name)
if exp_attr.value is not None:
self.assertWikicodeEqual(exp_attr.value, act_attr.value)
self.assertIs(exp_attr.quoted, act_attr.quoted)
self.assertEqual(exp_attr.quotes, act_attr.quotes)
self.assertEqual(exp_attr.pad_first, act_attr.pad_first)
self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq)
self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq)


+ 34
- 14
tests/test_attribute.py View File

@@ -42,12 +42,14 @@ class TestAttribute(TreeEqualityTestCase):
self.assertEqual(" foo", str(node))
node2 = Attribute(wraptext("foo"), wraptext("bar"))
self.assertEqual(' foo="bar"', str(node2))
node3 = Attribute(wraptext("a"), wraptext("b"), True, "", " ", " ")
node3 = Attribute(wraptext("a"), wraptext("b"), '"', "", " ", " ")
self.assertEqual('a = "b"', str(node3))
node3 = Attribute(wraptext("a"), wraptext("b"), False, "", " ", " ")
self.assertEqual("a = b", str(node3))
node4 = Attribute(wraptext("a"), wrap([]), False, " ", "", " ")
self.assertEqual(" a= ", str(node4))
node4 = Attribute(wraptext("a"), wraptext("b"), "'", "", " ", " ")
self.assertEqual("a = 'b'", str(node4))
node5 = Attribute(wraptext("a"), wraptext("b"), None, "", " ", " ")
self.assertEqual("a = b", str(node5))
node6 = Attribute(wraptext("a"), wrap([]), None, " ", "", " ")
self.assertEqual(" a= ", str(node6))

def test_name(self):
"""test getter/setter for the name attribute"""
@@ -66,17 +68,35 @@ class TestAttribute(TreeEqualityTestCase):
self.assertWikicodeEqual(wrap([Template(wraptext("bar"))]), node.value)
node.value = None
self.assertIs(None, node.value)
node2 = Attribute(wraptext("id"), wraptext("foo"), None)
node2.value = "foo bar baz"
self.assertWikicodeEqual(wraptext("foo bar baz"), node2.value)
self.assertEqual('"', node2.quotes)
node2.value = 'foo "bar" baz'
self.assertWikicodeEqual(wraptext('foo "bar" baz'), node2.value)
self.assertEqual("'", node2.quotes)
node2.value = "foo 'bar' baz"
self.assertWikicodeEqual(wraptext("foo 'bar' baz"), node2.value)
self.assertEqual('"', node2.quotes)
node2.value = "fo\"o 'bar' b\"az"
self.assertWikicodeEqual(wraptext("fo\"o 'bar' b\"az"), node2.value)
self.assertEqual('"', node2.quotes)

def test_quoted(self):
"""test getter/setter for the quoted attribute"""
node1 = Attribute(wraptext("id"), wraptext("foo"), False)
def test_quotes(self):
"""test getter/setter for the quotes attribute"""
node1 = Attribute(wraptext("id"), wraptext("foo"), None)
node2 = Attribute(wraptext("id"), wraptext("bar"))
self.assertFalse(node1.quoted)
self.assertTrue(node2.quoted)
node1.quoted = True
node2.quoted = ""
self.assertTrue(node1.quoted)
self.assertFalse(node2.quoted)
node3 = Attribute(wraptext("id"), wraptext("foo bar baz"))
self.assertIs(None, node1.quotes)
self.assertEqual('"', node2.quotes)
node1.quotes = "'"
node2.quotes = None
self.assertEqual("'", node1.quotes)
self.assertIs(None, node2.quotes)
self.assertRaises(ValueError, setattr, node1, "quotes", "foobar")
self.assertRaises(ValueError, setattr, node3, "quotes", None)
self.assertRaises(ValueError, Attribute, wraptext("id"),
wraptext("foo bar baz"), None)

def test_padding(self):
"""test getter/setter for the padding attributes"""


+ 8
- 8
tests/test_builder.py View File

@@ -270,7 +270,7 @@ class TestBuilder(TreeEqualityTestCase):
tokens.TagAttrStart(pad_first=" ", pad_before_eq="",
pad_after_eq=""),
tokens.Text(text="name"), tokens.TagAttrEquals(),
tokens.TagAttrQuote(), tokens.Text(text="abc"),
tokens.TagAttrQuote(char='"'), tokens.Text(text="abc"),
tokens.TagCloseSelfclose(padding=" ")],
wrap([Tag(wraptext("ref"),
attrs=[Attribute(wraptext("name"), wraptext("abc"))],
@@ -298,7 +298,7 @@ class TestBuilder(TreeEqualityTestCase):
wrap([Tag(wraptext("br"), self_closing=True, invalid=True)])),

# <ref name={{abc}} foo="bar {{baz}}" abc={{de}}f ghi=j{{k}}{{l}}
# mno = "{{p}} [[q]] {{r}}">[[Source]]</ref>
# mno = '{{p}} [[q]] {{r}}'>[[Source]]</ref>
([tokens.TagOpenOpen(), tokens.Text(text="ref"),
tokens.TagAttrStart(pad_first=" ", pad_before_eq="",
pad_after_eq=""),
@@ -308,7 +308,7 @@ class TestBuilder(TreeEqualityTestCase):
tokens.TagAttrStart(pad_first=" ", pad_before_eq="",
pad_after_eq=""),
tokens.Text(text="foo"), tokens.TagAttrEquals(),
tokens.TagAttrQuote(), tokens.Text(text="bar "),
tokens.TagAttrQuote(char='"'), tokens.Text(text="bar "),
tokens.TemplateOpen(), tokens.Text(text="baz"),
tokens.TemplateClose(),
tokens.TagAttrStart(pad_first=" ", pad_before_eq="",
@@ -326,7 +326,7 @@ class TestBuilder(TreeEqualityTestCase):
tokens.TagAttrStart(pad_first=" \n ", pad_before_eq=" ",
pad_after_eq=" "),
tokens.Text(text="mno"), tokens.TagAttrEquals(),
tokens.TagAttrQuote(), tokens.TemplateOpen(),
tokens.TagAttrQuote(char="'"), tokens.TemplateOpen(),
tokens.Text(text="p"), tokens.TemplateClose(),
tokens.Text(text=" "), tokens.WikilinkOpen(),
tokens.Text(text="q"), tokens.WikilinkClose(),
@@ -338,17 +338,17 @@ class TestBuilder(TreeEqualityTestCase):
tokens.TagCloseClose()],
wrap([Tag(wraptext("ref"), wrap([Wikilink(wraptext("Source"))]), [
Attribute(wraptext("name"),
wrap([Template(wraptext("abc"))]), False),
wrap([Template(wraptext("abc"))]), None),
Attribute(wraptext("foo"), wrap([Text("bar "),
Template(wraptext("baz"))]), pad_first=" "),
Attribute(wraptext("abc"), wrap([Template(wraptext("de")),
Text("f")]), False),
Text("f")]), None),
Attribute(wraptext("ghi"), wrap([Text("j"),
Template(wraptext("k")),
Template(wraptext("l"))]), False),
Template(wraptext("l"))]), None),
Attribute(wraptext("mno"), wrap([Template(wraptext("p")),
Text(" "), Wikilink(wraptext("q")), Text(" "),
Template(wraptext("r"))]), True, " \n ", " ",
Template(wraptext("r"))]), "'", " \n ", " ",
" ")])])),

# "''italic text''"


+ 17
- 12
tests/test_tag.py View File

@@ -34,9 +34,9 @@ from ._test_tree_equality import TreeEqualityTestCase, wrap, wraptext

agen = lambda name, value: Attribute(wraptext(name), wraptext(value))
agennv = lambda name: Attribute(wraptext(name))
agennq = lambda name, value: Attribute(wraptext(name), wraptext(value), False)
agenp = lambda name, v, a, b, c: Attribute(wraptext(name), v, True, a, b, c)
agenpnv = lambda name, a, b, c: Attribute(wraptext(name), None, True, a, b, c)
agennq = lambda name, value: Attribute(wraptext(name), wraptext(value), None)
agenp = lambda name, v, a, b, c: Attribute(wraptext(name), v, '"', a, b, c)
agenpnv = lambda name, a, b, c: Attribute(wraptext(name), None, '"', a, b, c)

class TestTag(TreeEqualityTestCase):
"""Test cases for the Tag node."""
@@ -276,28 +276,33 @@ class TestTag(TreeEqualityTestCase):
"""test Tag.add()"""
node = Tag(wraptext("ref"), wraptext("cite"))
node.add("name", "value")
node.add("name", "value", quoted=False)
node.add("name", "value", quotes=None)
node.add("name", "value", quotes="'")
node.add("name")
node.add(1, False)
node.add("style", "{{foobar}}")
node.add("name", "value", True, "\n", " ", " ")
node.add("name", "value", '"', "\n", " ", " ")
attr1 = ' name="value"'
attr2 = " name=value"
attr3 = " name"
attr4 = ' 1="False"'
attr5 = ' style="{{foobar}}"'
attr6 = '\nname = "value"'
attr3 = " name='value'"
attr4 = " name"
attr5 = ' 1="False"'
attr6 = ' style="{{foobar}}"'
attr7 = '\nname = "value"'
self.assertEqual(attr1, node.attributes[0])
self.assertEqual(attr2, node.attributes[1])
self.assertEqual(attr3, node.attributes[2])
self.assertEqual(attr4, node.attributes[3])
self.assertEqual(attr5, node.attributes[4])
self.assertEqual(attr6, node.attributes[5])
self.assertEqual(attr6, node.get("name"))
self.assertEqual(attr7, node.attributes[6])
self.assertEqual(attr7, node.get("name"))
self.assertWikicodeEqual(wrap([Template(wraptext("foobar"))]),
node.attributes[4].value)
node.attributes[5].value)
self.assertEqual("".join(("<ref", attr1, attr2, attr3, attr4, attr5,
attr6, ">cite</ref>")), node)
attr6, attr7, ">cite</ref>")), node)
self.assertRaises(ValueError, node.add, "name", "foo", quotes="bar")
self.assertRaises(ValueError, node.add, "name", "a bc d", quotes=None)

def test_remove(self):
"""test Tag.remove()"""


+ 1
- 1
tests/tokenizer/integration.mwtest View File

@@ -43,7 +43,7 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t
name: rich_tags
label: a HTML tag with tons of other things in it
input: "{{dubious claim}}<ref name={{abc}} foo="bar {{baz}}" abc={{de}}f ghi=j{{k}}{{l}} \n mno = "{{p}} [[q]] {{r}}">[[Source]]</ref>"
output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" \n ", pad_before_eq=" ", pad_after_eq=" "), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" \n ", pad_before_eq=" ", pad_after_eq=" "), Text(text="mno"), TagAttrEquals(), TagAttrQuote(char="\""), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---



+ 43
- 15
tests/tokenizer/tags.mwtest View File

@@ -57,7 +57,14 @@ output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before
name: attribute_quoted
label: a tag with a single quoted attribute
input: "<ref name="foo bar"></ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_single_quoted
label: a tag with a single singly-quoted attribute
input: "<ref name='foo bar'></ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="'"), Text(text="foo bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

@@ -71,7 +78,7 @@ output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before
name: attribute_quoted_hyphen
label: a tag with a single quoted attribute, containing a hyphen
input: "<ref name="foo-bar"></ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

@@ -92,21 +99,21 @@ output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before
name: attribute_selfclosing_value_quoted
label: a self-closing tag with a single quoted attribute
input: "<ref name="foo"/>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")]
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo"), TagCloseSelfclose(padding="")]

---

name: nested_tag
label: a tag nested within the attributes of another
input: "<ref name=<span style="color: red;">foo</span>>citation</ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: nested_tag_quoted
label: a tag nested within the attributes of another, quoted
input: "<ref name="<span style="color: red;">foo</span>">citation</ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

@@ -120,7 +127,7 @@ output: [Text(text="<ref name=</ ><//>>citation</ref>")]
name: nested_troll_tag_quoted
label: a bogus tag that appears to be nested within the attributes of another, quoted
input: "<ref name="</ ><//>">citation</ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="</ ><//>"), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="</ ><//>"), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

@@ -222,6 +229,27 @@ output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_befor

---

name: quotes_in_quotes
label: singly-quoted text inside a doubly-quoted attribute
input: "<span foo="bar 'baz buzz' biz">stuff</span>"
output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="bar 'baz buzz' biz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]

---

name: quotes_in_quotes_2
label: doubly-quoted text inside a singly-quoted attribute
input: "<span foo='bar "baz buzz" biz'>stuff</span>"
output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="'"), Text(text="bar \"baz buzz\" biz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]

---

name: quotes_in_quotes_3
label: doubly-quoted text inside a singly-quoted attribute, with backslashes
input: "<span foo='bar "baz buzz\\" biz'>stuff</span>"
output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char="'"), Text(text="bar \"baz buzz\\\" biz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]

---

name: incomplete_lbracket
label: incomplete tags: just a left bracket
input: "<"
@@ -407,28 +435,28 @@ output: [Text(text="junk <></>")]
name: backslash_premature_before
label: a backslash before a quote before a space
input: "<foo attribute="this is\\" quoted">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\" quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="this is\\\" quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_premature_after
label: a backslash before a quote after a space
input: "<foo attribute="this is \\"quoted">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is \\\"quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="this is \\\"quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_premature_middle
label: a backslash before a quote in the middle of a word
input: "<foo attribute="this i\\"s quoted">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this i\\\"s quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="this i\\\"s quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_adjacent
label: escaped quotes next to unescaped quotes
input: "<foo attribute="\\"this is quoted\\"">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="\\\"this is quoted\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="\\\"this is quoted\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

@@ -442,21 +470,21 @@ output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before
name: backslash_double
label: two adjacent backslashes, which do *not* affect the quote
input: "<foo attribute="this is\\\\" quoted">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="this is\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_triple
label: three adjacent backslashes, which do *not* affect the quote
input: "<foo attribute="this is\\\\\\" quoted">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="this is\\\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_unaffecting
label: backslashes near quotes, but not immediately adjacent, thus having no effect
input: "<foo attribute="\\quote\\d" also="quote\\d\\">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="\\quote\\d"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="also"), TagAttrEquals(), Text(text="\"quote\\d\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="\\quote\\d"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="also"), TagAttrEquals(), Text(text="\"quote\\d\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

@@ -477,7 +505,7 @@ output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(t
name: unparsable_attributed
label: a tag that should not be put through the normal parser; parsed attributes
input: "{{t1}}<nowiki attr=val attr2="{{val2}}">{{t2}}</nowiki>{{t3}}"
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr"), TagAttrEquals(), Text(text="val"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr2"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="val2"), TemplateClose(), TagCloseOpen(padding=""), Text(text="{{t2}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t3"), TemplateClose()]
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr"), TagAttrEquals(), Text(text="val"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr2"), TagAttrEquals(), TagAttrQuote(char="\""), TemplateOpen(), Text(text="val2"), TemplateClose(), TagCloseOpen(padding=""), Text(text="{{t2}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t3"), TemplateClose()]

---

@@ -575,7 +603,7 @@ output: [Text(text="foo"), TagOpenOpen(invalid=True), Text(text="br"), TagCloseS
name: single_only_close_attribute
label: a tag that can only be single; presented as a close tag with an attribute
input: "</br id="break">"
output: [TagOpenOpen(invalid=True), Text(text="br"), TagAttrStart(pad_first=" ", pad_after_eq="", pad_before_eq=""), Text(text="id"), TagAttrEquals(), TagAttrQuote(), Text(text="break"), TagCloseSelfclose(padding="", implicit=True)]
output: [TagOpenOpen(invalid=True), Text(text="br"), TagAttrStart(pad_first=" ", pad_after_eq="", pad_before_eq=""), Text(text="id"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="break"), TagCloseSelfclose(padding="", implicit=True)]

---



Loading…
Cancel
Save