
Merge branch 'feature/wikimarkup_tags' into develop (closes #9)

tags/v0.3
Ben Kurtovic, 11 years ago
parent commit 5029082b58

13 changed files with 1295 additions and 169 deletions
 1. mwparserfromhell/nodes/tag.py          (+19, -15)
 2. mwparserfromhell/parser/builder.py     (+5, -6)
 3. mwparserfromhell/parser/contexts.py    (+23, -6)
 4. mwparserfromhell/parser/tokenizer.c    (+464, -60)
 5. mwparserfromhell/parser/tokenizer.h    (+53, -43)
 6. mwparserfromhell/parser/tokenizer.py   (+177, -7)
 7. mwparserfromhell/parser/tokens.py      (+1, -5)
 8. mwparserfromhell/tag_defs.py           (+9, -13)
 9. tests/_test_tree_equality.py           (+1, -1)
10. tests/test_builder.py                  (+14, -0)
11. tests/test_tag.py                      (+10, -10)
12. tests/test_tokens.py                   (+3, -3)
13. tests/tokenizer/tags_wikimarkup.mwtest (+516, -0)

mwparserfromhell/nodes/tag.py (+19, -15)

@@ -24,7 +24,7 @@ from __future__ import unicode_literals

 from . import Node, Text
 from ..compat import str
-from ..tag_defs import get_wikicode, is_visible
+from ..tag_defs import is_visible
 from ..utils import parse_anything

 __all__ = ["Tag"]
@@ -32,7 +32,7 @@ __all__ = ["Tag"]
 class Tag(Node):
     """Represents an HTML-style tag in wikicode, like ``<ref>``."""

-    def __init__(self, tag, contents=None, attrs=None, showtag=True,
+    def __init__(self, tag, contents=None, attrs=None, wiki_markup=None,
                  self_closing=False, invalid=False, implicit=False, padding="",
                  closing_tag=None):
         super(Tag, self).__init__()
@@ -42,7 +42,7 @@ class Tag(Node):
         else:
             self._contents = contents
         self._attrs = attrs if attrs else []
-        self._showtag = showtag
+        self._wiki_markup = wiki_markup
         self._self_closing = self_closing
         self._invalid = invalid
         self._implicit = implicit
@@ -53,12 +53,11 @@ class Tag(Node):
             self._closing_tag = tag

     def __unicode__(self):
-        if not self.showtag:
-            open_, close = get_wikicode(self.tag)
+        if self.wiki_markup:
             if self.self_closing:
-                return open_
+                return self.wiki_markup
             else:
-                return open_ + str(self.contents) + close
+                return self.wiki_markup + str(self.contents) + self.wiki_markup

         result = ("</" if self.invalid else "<") + str(self.tag)
         if self.attributes:
@@ -72,7 +71,7 @@ class Tag(Node):

     def __iternodes__(self, getter):
         yield None, self
-        if self.showtag:
+        if not self.wiki_markup:
             for child in getter(self.tag):
                 yield self.tag, child
             for attr in self.attributes:
@@ -84,7 +83,7 @@ class Tag(Node):
         if self.contents:
             for child in getter(self.contents):
                 yield self.contents, child
-        if not self.self_closing and self.showtag and self.closing_tag:
+        if not self.self_closing and not self.wiki_markup and self.closing_tag:
             for child in getter(self.closing_tag):
                 yield self.closing_tag, child
@@ -131,9 +130,14 @@ class Tag(Node):
         return self._attrs

     @property
-    def showtag(self):
-        """Whether to show the tag itself instead of a wikicode version."""
-        return self._showtag
+    def wiki_markup(self):
+        """The wikified version of a tag to show instead of HTML.
+
+        If set to a value, this will be displayed instead of the brackets.
+        For example, set to ``''`` to replace ``<i>`` or ``----`` to replace
+        ``<hr>``.
+        """
+        return self._wiki_markup

     @property
     def self_closing(self):
@@ -183,9 +187,9 @@ class Tag(Node):
     def contents(self, value):
         self._contents = parse_anything(value)

-    @showtag.setter
-    def showtag(self, value):
-        self._showtag = bool(value)
+    @wiki_markup.setter
+    def wiki_markup(self, value):
+        self._wiki_markup = str(value) if value else None

     @self_closing.setter
     def self_closing(self, value):
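
The rendering effect of the new attribute, as a minimal sketch (names taken from this commit; parse_anything wraps plain strings as Wikicode):

    from mwparserfromhell.nodes import Tag
    from mwparserfromhell.utils import parse_anything

    # With wiki_markup set, __unicode__ wraps the contents in the markup
    # string on both sides instead of emitting <i>...</i> brackets.
    node = Tag(parse_anything("i"), parse_anything("italic text"), wiki_markup="''")
    print(node)  # ''italic text''

    # Clearing it falls back to the HTML-style rendering.
    node.wiki_markup = None
    print(node)  # <i>italic text</i>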


mwparserfromhell/parser/builder.py (+5, -6)

@@ -207,15 +207,14 @@ class Builder(object):
         """Handle a case where a tag is at the head of the tokens."""
         close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
         implicit, attrs, contents, closing_tag = False, [], None, None
-        showtag = token.get("showtag", True)
-        invalid = token.get("invalid", False)
+        wiki_markup, invalid = token.wiki_markup, token.invalid or False
         self._push()
         while self._tokens:
             token = self._tokens.pop()
             if isinstance(token, tokens.TagAttrStart):
                 attrs.append(self._handle_attribute(token))
             elif isinstance(token, tokens.TagCloseOpen):
-                padding = token.padding
+                padding = token.padding or ""
                 tag = self._pop()
                 self._push()
             elif isinstance(token, tokens.TagOpenClose):
@@ -225,12 +224,12 @@ class Builder(object):
                 if isinstance(token, tokens.TagCloseSelfclose):
                     tag = self._pop()
                     self_closing = True
-                    padding = token.padding
-                    implicit = token.get("implicit", False)
+                    padding = token.padding or ""
+                    implicit = token.implicit or False
                 else:
                     self_closing = False
                     closing_tag = self._pop()
-                return Tag(tag, contents, attrs, showtag, self_closing,
+                return Tag(tag, contents, attrs, wiki_markup, self_closing,
                            invalid, implicit, padding, closing_tag)
             else:
                 self._write(self._handle_token(token))
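
End to end, the new token attribute lets wiki markup survive a parse/stringify round trip; a small usage sketch (assuming the usual parse() entry point and the generated filter_tags() method):

    import mwparserfromhell

    text = "''italic'' and '''bold''' text"
    code = mwparserfromhell.parse(text)
    for tag in code.filter_tags():
        print(tag.tag, repr(tag.wiki_markup))  # roughly: i "''" then b "'''"
    print(str(code) == text)  # True: nothing is lost in the round trip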


mwparserfromhell/parser/contexts.py (+23, -6)

@@ -69,6 +69,15 @@ Local (stack-specific) contexts:
     * :py:const:`TAG_BODY`
     * :py:const:`TAG_CLOSE`

+* :py:const:`STYLE`
+
+    * :py:const:`STYLE_ITALICS`
+    * :py:const:`STYLE_BOLD`
+    * :py:const:`STYLE_PASS_AGAIN`
+    * :py:const:`STYLE_SECOND_PASS`
+
+* :py:const:`DL_TERM`
+
 * :py:const:`SAFETY_CHECK`

     * :py:const:`HAS_TEXT`
@@ -115,12 +124,20 @@ TAG_BODY = 1 << 16
 TAG_CLOSE = 1 << 17
 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE

-HAS_TEXT = 1 << 18
-FAIL_ON_TEXT = 1 << 19
-FAIL_NEXT = 1 << 20
-FAIL_ON_LBRACE = 1 << 21
-FAIL_ON_RBRACE = 1 << 22
-FAIL_ON_EQUALS = 1 << 23
+STYLE_ITALICS = 1 << 18
+STYLE_BOLD = 1 << 19
+STYLE_PASS_AGAIN = 1 << 20
+STYLE_SECOND_PASS = 1 << 21
+STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS
+
+DL_TERM = 1 << 22
+
+HAS_TEXT = 1 << 23
+FAIL_ON_TEXT = 1 << 24
+FAIL_NEXT = 1 << 25
+FAIL_ON_LBRACE = 1 << 26
+FAIL_ON_RBRACE = 1 << 27
+FAIL_ON_EQUALS = 1 << 28
 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
                 FAIL_ON_RBRACE + FAIL_ON_EQUALS)
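
A minimal sketch of how these flags combine and are tested (values copied from the definitions above):

    STYLE_ITALICS = 1 << 18
    STYLE_BOLD = 1 << 19
    STYLE_PASS_AGAIN = 1 << 20
    STYLE_SECOND_PASS = 1 << 21
    STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS

    # A stack parsing "''..." with a retry requested: italics context,
    # marked for a second pass.
    context = STYLE_ITALICS | STYLE_PASS_AGAIN
    assert context & STYLE               # inside some wiki-style context
    assert context & STYLE_ITALICS       # specifically italics
    assert not (context & STYLE_BOLD)    # but not bold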




mwparserfromhell/parser/tokenizer.c (+464, -60)

@@ -29,6 +29,7 @@ SOFTWARE.
 static int heading_level_from_context(int n)
 {
     int level;
+
     n /= LC_HEADING_LEVEL_1;
     for (level = 1; n > 1; n >>= 1)
         level++;
@@ -72,6 +73,7 @@ static PyObject* strip_tag_name(PyObject* token)
 static Textbuffer* Textbuffer_new(void)
 {
     Textbuffer* buffer = malloc(sizeof(Textbuffer));
+
     if (!buffer) {
         PyErr_NoMemory();
         return NULL;
@@ -90,6 +92,7 @@ static Textbuffer* Textbuffer_new(void)
 static void Textbuffer_dealloc(Textbuffer* self)
 {
     Textbuffer* next;
+
     while (self) {
         free(self->data);
         next = self->next;
@@ -99,11 +102,12 @@ static void Textbuffer_dealloc(Textbuffer* self)
 }

 /*
-    Write text to the given textbuffer.
+    Write a Unicode codepoint to the given textbuffer.
 */
-static int Textbuffer_write(Textbuffer** this, Py_UNICODE text)
+static int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
 {
     Textbuffer* self = *this;
+
     if (self->size == TEXTBUFFER_BLOCKSIZE) {
         Textbuffer* new = Textbuffer_new();
         if (!new)
@@ -111,7 +115,7 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE text)
         new->next = self;
         *this = self = new;
     }
-    self->data[self->size] = text;
+    self->data[self->size] = code;
     self->size++;
     return 0;
 }
@@ -123,6 +127,7 @@ static PyObject* Textbuffer_render(Textbuffer* self)
 {
     PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
     PyObject *left, *concat;
+
     while (self->next) {
         self = self->next;
         left = PyUnicode_FromUnicode(self->data, self->size);
@@ -208,6 +213,7 @@ static void Tokenizer_dealloc(Tokenizer* self)
 static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
 {
     static char* kwlist[] = {NULL};
+
     if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
         return -1;
     self->text = Py_None;
@@ -223,6 +229,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
 static int Tokenizer_push(Tokenizer* self, int context)
 {
     Stack* top = malloc(sizeof(Stack));
+
     if (!top) {
         PyErr_NoMemory();
         return -1;
@@ -246,6 +253,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self)
 {
     PyObject *text, *kwargs, *token;
     Textbuffer* buffer = self->topstack->textbuffer;
+
     if (buffer->size == 0 && !buffer->next)
         return 0;
     text = Textbuffer_render(buffer);
@@ -280,6 +288,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self)
 static void Tokenizer_delete_top_of_stack(Tokenizer* self)
 {
     Stack* top = self->topstack;
+
     Py_DECREF(top->stack);
     Textbuffer_dealloc(top->textbuffer);
     self->topstack = top->next;
@@ -293,6 +302,7 @@ static void Tokenizer_delete_top_of_stack(Tokenizer* self)
 static PyObject* Tokenizer_pop(Tokenizer* self)
 {
     PyObject* stack;
+
     if (Tokenizer_push_textbuffer(self))
         return NULL;
     stack = self->topstack->stack;
@@ -309,6 +319,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
 {
     PyObject* stack;
     int context;
+
     if (Tokenizer_push_textbuffer(self))
         return NULL;
     stack = self->topstack->stack;
@@ -325,9 +336,11 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
 */
 static void* Tokenizer_fail_route(Tokenizer* self)
 {
+    int context = self->topstack->context;
     PyObject* stack = Tokenizer_pop(self);
+
     Py_XDECREF(stack);
-    FAIL_ROUTE();
+    FAIL_ROUTE(context);
     return NULL;
 }


@@ -356,11 +369,26 @@ static int Tokenizer_emit_first(Tokenizer* self, PyObject* token)
 }

 /*
-    Write text to the current textbuffer.
+    Write a Unicode codepoint to the current textbuffer.
 */
-static int Tokenizer_emit_text(Tokenizer* self, Py_UNICODE text)
+static int Tokenizer_emit_char(Tokenizer* self, Py_UNICODE code)
 {
-    return Textbuffer_write(&(self->topstack->textbuffer), text);
+    return Textbuffer_write(&(self->topstack->textbuffer), code);
+}
+
+/*
+    Write a string of text to the current textbuffer.
+*/
+static int Tokenizer_emit_text(Tokenizer* self, const char* text)
+{
+    int i = 0;
+
+    while (text[i]) {
+        if (Tokenizer_emit_char(self, text[i]))
+            return -1;
+        i++;
+    }
+    return 0;
 }

 /*
@@ -427,15 +455,10 @@ static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
 static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
 {
     PyObject* stack = Tokenizer_pop(self);
-    int i = 0;
-    while (1) {
-        if (!text[i])
-            break;
-        if (Tokenizer_emit_text(self, (Py_UNICODE) text[i])) {
-            Py_XDECREF(stack);
-            return -1;
-        }
-        i++;
+
+    if (Tokenizer_emit_text(self, text)) {
+        Py_DECREF(stack);
+        return -1;
     }
     if (stack) {
         if (PyList_GET_SIZE(stack) > 0) {
@@ -456,6 +479,7 @@ static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
 static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
 {
     Py_ssize_t index = self->head + delta;
+
     if (index >= self->length)
         return EMPTY;
     return PyList_GET_ITEM(self->text, index);
@@ -467,6 +491,7 @@ static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
 static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
 {
     Py_ssize_t index;
+
     if (delta > self->head)
         return EMPTY;
     index = self->head - delta;
@@ -751,7 +776,6 @@ static int Tokenizer_parse_wikilink(Tokenizer* self)
 {
     Py_ssize_t reset;
     PyObject *wikilink, *token;
-    int i;

     self->head += 2;
     reset = self->head - 1;
@@ -759,10 +783,8 @@ static int Tokenizer_parse_wikilink(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        for (i = 0; i < 2; i++) {
-            if (Tokenizer_emit_text(self, *"["))
-                return -1;
-        }
+        if (Tokenizer_emit_text(self, "[["))
+            return -1;
         return 0;
     }
     if (!wikilink)
@@ -847,7 +869,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
         RESET_ROUTE();
         self->head = reset + best - 1;
         for (i = 0; i < best; i++) {
-            if (Tokenizer_emit_text(self, *"="))
+            if (Tokenizer_emit_char(self, *"="))
                 return -1;
         }
         self->global ^= GL_HEADING;
@@ -885,7 +907,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
     if (heading->level < best) {
         diff = best - heading->level;
         for (i = 0; i < diff; i++) {
-            if (Tokenizer_emit_text(self, *"=")) {
+            if (Tokenizer_emit_char(self, *"=")) {
                 Py_DECREF(heading->title);
                 free(heading);
                 return -1;
@@ -936,7 +958,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
     if (level < best) {
         diff = best - level;
         for (i = 0; i < diff; i++) {
-            if (Tokenizer_emit_text(self, *"="))
+            if (Tokenizer_emit_char(self, *"="))
                 return NULL;
         }
     }
@@ -944,7 +966,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
     }
     else {
         for (i = 0; i < best; i++) {
-            if (Tokenizer_emit_text(self, *"=")) {
+            if (Tokenizer_emit_char(self, *"=")) {
                 Py_DECREF(after->title);
                 free(after);
                 return NULL;
@@ -1160,7 +1182,7 @@ static int Tokenizer_parse_entity(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        if (Tokenizer_emit_text(self, *"&"))
+        if (Tokenizer_emit_char(self, *"&"))
             return -1;
         return 0;
     }
@@ -1182,24 +1204,14 @@ static int Tokenizer_parse_comment(Tokenizer* self)
 {
     Py_ssize_t reset = self->head + 3;
     PyObject *token, *comment;
-    int i;

     self->head += 4;
     comment = Tokenizer_parse(self, LC_COMMENT, 1);
     if (BAD_ROUTE) {
-        const char* text = "<!--";
         RESET_ROUTE();
         self->head = reset;
-        i = 0;
-        while (1) {
-            if (!text[i])
-                return 0;
-            if (Tokenizer_emit_text(self, (Py_UNICODE) text[i])) {
-                Py_XDECREF(text);
-                return -1;
-            }
-            i++;
-        }
+        if (Tokenizer_emit_text(self, "<!--"))
+            return -1;
         return 0;
     }
     if (!comment)
@@ -1317,7 +1329,7 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
         return -1;
     }
     if (ctx & TAG_QUOTED && !(ctx & TAG_NOTE_SPACE)) {
-        if (Tokenizer_emit_text(self, text))
+        if (Tokenizer_emit_char(self, text))
             return -1;
     }
     else if (data->context & TAG_ATTR_READY)
@@ -1342,14 +1354,14 @@ static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text)
         }
     }
     if (!is_marker || !Tokenizer_CAN_RECURSE(self))
-        return Tokenizer_emit_text(self, text);
+        return Tokenizer_emit_char(self, text);
     else if (text == next && next == *"{")
         return Tokenizer_parse_template_or_argument(self);
     else if (text == next && next == *"[")
         return Tokenizer_parse_wikilink(self);
     else if (text == *"<")
         return Tokenizer_parse_tag(self);
-    return Tokenizer_emit_text(self, text);
+    return Tokenizer_emit_char(self, text);
 }

 /*
@@ -1574,7 +1586,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
                 return NULL;
             return Tokenizer_parse(self, 0, 0);
         }
-        if (Tokenizer_emit_text(self, this))
+        if (Tokenizer_emit_char(self, this))
            return NULL;
     }
 }
@@ -1776,7 +1788,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
                 return -1;
             }
             if (!IS_SINGLE_ONLY(name))
-                FAIL_ROUTE();
+                FAIL_ROUTE(0);
             break;
         }
         Textbuffer_write(&buf, this);
@@ -1790,8 +1802,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        return (Tokenizer_emit_text(self, *"<") ||
-                Tokenizer_emit_text(self, *"/"));
+        return Tokenizer_emit_text(self, "</");
     }
     // Set invalid=True flag of TagOpenOpen
     if (PyObject_SetAttrString(PyList_GET_ITEM(tag, 0), "invalid", Py_True))
@@ -1812,7 +1823,7 @@ static int Tokenizer_parse_tag(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        return Tokenizer_emit_text(self, *"<");
+        return Tokenizer_emit_char(self, *"<");
     }
     if (!tag) {
         return -1;
@@ -1823,12 +1834,382 @@
 }

 /*
+    Write the body of a tag and the tokens that should surround it.
+*/
+static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag,
+                                    const char* ticks, PyObject* body)
+{
+    PyObject *markup, *kwargs, *token;
+
+    markup = PyBytes_FromString(ticks);
+    if (!markup)
+        return -1;
+    kwargs = PyDict_New();
+    if (!kwargs) {
+        Py_DECREF(markup);
+        return -1;
+    }
+    PyDict_SetItemString(kwargs, "wiki_markup", markup);
+    Py_DECREF(markup);
+    token = PyObject_Call(TagOpenOpen, NOARGS, kwargs);
+    if (!token) {
+        Py_DECREF(kwargs);
+        return -1;
+    }
+    Py_DECREF(kwargs);
+    if (Tokenizer_emit(self, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    if (Tokenizer_emit_text(self, tag))
+        return -1;
+    token = PyObject_CallObject(TagCloseOpen, NULL);
+    if (!token)
+        return -1;
+    if (Tokenizer_emit(self, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    if (Tokenizer_emit_all(self, body))
+        return -1;
+    token = PyObject_CallObject(TagOpenClose, NULL);
+    if (!token)
+        return -1;
+    if (Tokenizer_emit(self, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    if (Tokenizer_emit_text(self, tag))
+        return -1;
+    token = PyObject_CallObject(TagCloseClose, NULL);
+    if (!token)
+        return -1;
+    if (Tokenizer_emit(self, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    Py_DECREF(body);
+    return 0;
+}
+
+/*
+    Parse wiki-style italics.
+*/
+static int Tokenizer_parse_italics(Tokenizer* self)
+{
+    Py_ssize_t reset = self->head;
+    int context;
+    PyObject *stack;
+
+    stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
+    if (BAD_ROUTE) {
+        RESET_ROUTE();
+        self->head = reset;
+        if (BAD_ROUTE_CONTEXT & LC_STYLE_PASS_AGAIN) {
+            context = LC_STYLE_ITALICS | LC_STYLE_SECOND_PASS;
+            stack = Tokenizer_parse(self, context, 1);
+        }
+        else
+            return Tokenizer_emit_text(self, "''");
+    }
+    if (!stack)
+        return -1;
+    return Tokenizer_emit_style_tag(self, "i", "''", stack);
+}
+
+/*
+    Parse wiki-style bold.
+*/
+static int Tokenizer_parse_bold(Tokenizer* self)
+{
+    Py_ssize_t reset = self->head;
+    PyObject *stack;
+
+    stack = Tokenizer_parse(self, LC_STYLE_BOLD, 1);
+    if (BAD_ROUTE) {
+        RESET_ROUTE();
+        self->head = reset;
+        if (self->topstack->context & LC_STYLE_SECOND_PASS)
+            return Tokenizer_emit_char(self, *"'") ? -1 : 1;
+        if (self->topstack->context & LC_STYLE_ITALICS) {
+            self->topstack->context |= LC_STYLE_PASS_AGAIN;
+            return Tokenizer_emit_text(self, "'''");
+        }
+        if (Tokenizer_emit_char(self, *"'"))
+            return -1;
+        return Tokenizer_parse_italics(self);
+    }
+    if (!stack)
+        return -1;
+    return Tokenizer_emit_style_tag(self, "b", "'''", stack);
+}
+
+/*
+    Parse wiki-style italics and bold together (i.e., five ticks).
+*/
+static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
+{
+    Py_ssize_t reset = self->head;
+    PyObject *stack, *stack2;
+
+    stack = Tokenizer_parse(self, LC_STYLE_BOLD, 1);
+    if (BAD_ROUTE) {
+        RESET_ROUTE();
+        self->head = reset;
+        stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
+        if (BAD_ROUTE) {
+            RESET_ROUTE();
+            self->head = reset;
+            return Tokenizer_emit_text(self, "'''''");
+        }
+        if (!stack)
+            return -1;
+        reset = self->head;
+        stack2 = Tokenizer_parse(self, LC_STYLE_BOLD, 1);
+        if (BAD_ROUTE) {
+            RESET_ROUTE();
+            self->head = reset;
+            if (Tokenizer_emit_text(self, "'''"))
+                return -1;
+            return Tokenizer_emit_style_tag(self, "i", "''", stack);
+        }
+        if (!stack2)
+            return -1;
+        if (Tokenizer_push(self, 0))
+            return -1;
+        if (Tokenizer_emit_style_tag(self, "i", "''", stack))
+            return -1;
+        if (Tokenizer_emit_all(self, stack2))
+            return -1;
+        Py_DECREF(stack2);
+        stack2 = Tokenizer_pop(self);
+        if (!stack2)
+            return -1;
+        return Tokenizer_emit_style_tag(self, "b", "'''", stack2);
+    }
+    if (!stack)
+        return -1;
+    reset = self->head;
+    stack2 = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
+    if (BAD_ROUTE) {
+        RESET_ROUTE();
+        self->head = reset;
+        if (Tokenizer_emit_text(self, "''"))
+            return -1;
+        return Tokenizer_emit_style_tag(self, "b", "'''", stack);
+    }
+    if (!stack2)
+        return -1;
+    if (Tokenizer_push(self, 0))
+        return -1;
+    if (Tokenizer_emit_style_tag(self, "b", "'''", stack))
+        return -1;
+    if (Tokenizer_emit_all(self, stack2))
+        return -1;
+    Py_DECREF(stack2);
+    stack2 = Tokenizer_pop(self);
+    if (!stack2)
+        return -1;
+    return Tokenizer_emit_style_tag(self, "i", "''", stack2);
+}
+
+/*
+    Parse wiki-style formatting (''/''' for italics/bold).
+*/
+static PyObject* Tokenizer_parse_style(Tokenizer* self)
+{
+    int context = self->topstack->context, ticks = 2, i;
+
+    self->head += 2;
+    while (Tokenizer_READ(self, 0) == *"'") {
+        self->head++;
+        ticks++;
+    }
+    if (ticks > 5) {
+        for (i = 0; i < ticks - 5; i++) {
+            if (Tokenizer_emit_char(self, *"'"))
+                return NULL;
+        }
+        ticks = 5;
+    }
+    else if (ticks == 4) {
+        if (Tokenizer_emit_char(self, *"'"))
+            return NULL;
+        ticks = 3;
+    }
+    if ((context & LC_STYLE_ITALICS && (ticks == 2 || ticks == 5)) ||
+        (context & LC_STYLE_BOLD && (ticks == 3 || ticks == 5))) {
+        if (ticks == 5)
+            self->head -= context & LC_STYLE_ITALICS ? 3 : 2;
+        return Tokenizer_pop(self);
+    }
+    if (!Tokenizer_CAN_RECURSE(self)) {
+        if (ticks == 3) {
+            if (context & LC_STYLE_SECOND_PASS) {
+                if (Tokenizer_emit_char(self, *"'"))
+                    return NULL;
+                return Tokenizer_pop(self);
+            }
+            self->topstack->context |= LC_STYLE_PASS_AGAIN;
+        }
+        for (i = 0; i < ticks; i++) {
+            if (Tokenizer_emit_char(self, *"'"))
+                return NULL;
+        }
+    }
+    else if (ticks == 2) {
+        if (Tokenizer_parse_italics(self))
+            return NULL;
+    }
+    else if (ticks == 3) {
+        switch (Tokenizer_parse_bold(self)) {
+            case 1:
+                return Tokenizer_pop(self);
+            case -1:
+                return NULL;
+        }
+    }
+    else {
+        if (Tokenizer_parse_italics_and_bold(self))
+            return NULL;
+    }
+    self->head--;
+    return Py_None;
+}
+
+/*
+    Handle a list marker at the head (#, *, ;, :).
+*/
+static int Tokenizer_handle_list_marker(Tokenizer* self)
+{
+    PyObject *markup = Tokenizer_read(self, 0), *kwargs, *token;
+    Py_UNICODE code = *PyUnicode_AS_UNICODE(markup);
+
+    if (code == *";")
+        self->topstack->context |= LC_DLTERM;
+    kwargs = PyDict_New();
+    if (!kwargs)
+        return -1;
+    PyDict_SetItemString(kwargs, "wiki_markup", markup);
+    token = PyObject_Call(TagOpenOpen, NOARGS, kwargs);
+    if (!token) {
+        Py_DECREF(kwargs);
+        return -1;
+    }
+    Py_DECREF(kwargs);
+    if (Tokenizer_emit(self, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    if (Tokenizer_emit_text(self, GET_HTML_TAG(code)))
+        return -1;
+    token = PyObject_CallObject(TagCloseSelfclose, NULL);
+    if (!token)
+        return -1;
+    if (Tokenizer_emit(self, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    return 0;
+}
+
+/*
+    Handle a wiki-style list (#, *, ;, :).
+*/
+static int Tokenizer_handle_list(Tokenizer* self)
+{
+    Py_UNICODE marker = Tokenizer_READ(self, 1);
+
+    if (Tokenizer_handle_list_marker(self))
+        return -1;
+    while (marker == *"#" || marker == *"*" || marker == *";" ||
+           marker == *":") {
+        self->head++;
+        if (Tokenizer_handle_list_marker(self))
+            return -1;
+        marker = Tokenizer_READ(self, 1);
+    }
+    return 0;
+}
+
+/*
+    Handle a wiki-style horizontal rule (----) in the string.
+*/
+static int Tokenizer_handle_hr(Tokenizer* self)
+{
+    PyObject *markup, *kwargs, *token;
+    Textbuffer *buffer = Textbuffer_new();
+    int i;
+
+    if (!buffer)
+        return -1;
+    self->head += 3;
+    for (i = 0; i < 4; i++) {
+        if (Textbuffer_write(&buffer, *"-"))
+            return -1;
+    }
+    while (Tokenizer_READ(self, 1) == *"-") {
+        if (Textbuffer_write(&buffer, *"-"))
+            return -1;
+        self->head++;
+    }
+    markup = Textbuffer_render(buffer);
+    if (!markup)
+        return -1;
+    Textbuffer_dealloc(buffer);
+    kwargs = PyDict_New();
+    if (!kwargs)
+        return -1;
+    PyDict_SetItemString(kwargs, "wiki_markup", markup);
+    Py_DECREF(markup);
+    token = PyObject_Call(TagOpenOpen, NOARGS, kwargs);
+    if (!token) {
+        Py_DECREF(kwargs);
+        return -1;
+    }
+    Py_DECREF(kwargs);
+    if (Tokenizer_emit(self, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    if (Tokenizer_emit_text(self, "hr"))
+        return -1;
+    token = PyObject_CallObject(TagCloseSelfclose, NULL);
+    if (!token)
+        return -1;
+    if (Tokenizer_emit(self, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+    Py_DECREF(token);
+    return 0;
+}
+
+/*
+    Handle the term in a description list ('foo' in ';foo:bar').
+*/
+static int Tokenizer_handle_dl_term(Tokenizer* self)
+{
+    self->topstack->context ^= LC_DLTERM;
+    if (Tokenizer_READ(self, 0) == *":")
+        return Tokenizer_handle_list_marker(self);
+    return Tokenizer_emit_char(self, *"\n");
+}
+
+/*
     Handle the end of the stream of wikitext.
 */
 static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
 {
     static int fail_contexts = (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK |
-                                LC_HEADING | LC_COMMENT | LC_TAG);
+                                LC_HEADING | LC_COMMENT | LC_TAG | LC_STYLE);
     static int double_fail = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
     PyObject *token, *text, *trash;
     int single;
@@ -1943,7 +2324,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
     static int double_unsafe = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
     int this_context, is_marker, i;
     Py_UNICODE this, next, next_next, last;
-    PyObject* trash;
+    PyObject* temp;

     if (push) {
         if (Tokenizer_push(self, context))
@@ -1955,8 +2336,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         if (this_context & unsafe_contexts) {
             if (Tokenizer_verify_safe(self, this_context, this) < 0) {
                 if (this_context & double_unsafe) {
-                    trash = Tokenizer_pop(self);
-                    Py_XDECREF(trash);
+                    temp = Tokenizer_pop(self);
+                    Py_XDECREF(temp);
                 }
                 return Tokenizer_fail_route(self);
             }
@@ -1969,7 +2350,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
             }
         }
         if (!is_marker) {
-            if (Tokenizer_emit_text(self, this))
+            if (Tokenizer_emit_char(self, this))
                 return NULL;
             self->head++;
             continue;
@@ -1977,12 +2358,13 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         if (this == *"")
            return Tokenizer_handle_end(self, this_context);
         next = Tokenizer_READ(self, 1);
+        last = Tokenizer_READ_BACKWARDS(self, 1);
         if (this_context & LC_COMMENT) {
             if (this == next && next == *"-") {
                 if (Tokenizer_READ(self, 2) == *">")
                     return Tokenizer_pop(self);
             }
-            if (Tokenizer_emit_text(self, this))
+            if (Tokenizer_emit_char(self, this))
                 return NULL;
         }
         else if (this == next && next == *"{") {
@@ -1990,7 +2372,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
             if (Tokenizer_parse_template_or_argument(self))
                 return NULL;
             }
-            else if (Tokenizer_emit_text(self, this))
+            else if (Tokenizer_emit_char(self, this))
                 return NULL;
         }
         else if (this == *"|" && this_context & LC_TEMPLATE) {
@@ -2011,7 +2393,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
             if (Tokenizer_READ(self, 2) == *"}") {
                 return Tokenizer_handle_argument_end(self);
             }
-            if (Tokenizer_emit_text(self, this))
+            if (Tokenizer_emit_char(self, this))
                 return NULL;
         }
         else if (this == next && next == *"[") {
@@ -2020,7 +2402,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
                 if (Tokenizer_parse_wikilink(self))
                     return NULL;
             }
-            else if (Tokenizer_emit_text(self, this))
+            else if (Tokenizer_emit_char(self, this))
                 return NULL;
         }
         else if (this == *"|" && this_context & LC_WIKILINK_TITLE) {
@@ -2030,12 +2412,11 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
         else if (this == next && next == *"]" && this_context & LC_WIKILINK)
             return Tokenizer_handle_wikilink_end(self);
         else if (this == *"=" && !(self->global & GL_HEADING)) {
-            last = Tokenizer_READ_BACKWARDS(self, 1);
             if (last == *"\n" || last == *"") {
                 if (Tokenizer_parse_heading(self))
                     return NULL;
             }
-            else if (Tokenizer_emit_text(self, this))
+            else if (Tokenizer_emit_char(self, this))
                 return NULL;
         }
         else if (this == *"=" && this_context & LC_HEADING)
@@ -2052,7 +2433,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
                 if (Tokenizer_parse_comment(self))
                     return NULL;
             }
-            else if (Tokenizer_emit_text(self, this))
+            else if (Tokenizer_emit_char(self, this))
                 return NULL;
         }
         else if (this == *"<" && next == *"/" &&
@@ -2072,12 +2453,35 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
                 if (Tokenizer_parse_tag(self))
                     return NULL;
             }
-            else if (Tokenizer_emit_text(self, this))
+            else if (Tokenizer_emit_char(self, this))
                 return NULL;
         }
         else if (this == *">" && this_context & LC_TAG_CLOSE)
             return Tokenizer_handle_tag_close_close(self);
-        else if (Tokenizer_emit_text(self, this))
+        else if (this == next && next == *"'") {
+            temp = Tokenizer_parse_style(self);
+            if (temp != Py_None)
+                return temp;
+        }
+        else if (last == *"\n" || last == *"") {
+            if (this == *"#" || this == *"*" || this == *";" || this == *":") {
+                if (Tokenizer_handle_list(self))
+                    return NULL;
+            }
+            else if (this == *"-" && this == next &&
+                     this == Tokenizer_READ(self, 2) &&
+                     this == Tokenizer_READ(self, 3)) {
+                if (Tokenizer_handle_hr(self))
+                    return NULL;
+            }
+            else if (Tokenizer_emit_char(self, this))
+                return NULL;
+        }
+        else if ((this == *"\n" || this == *":") && this_context & LC_DLTERM) {
+            if (Tokenizer_handle_dl_term(self))
+                return NULL;
+        }
+        else if (Tokenizer_emit_char(self, this))
             return NULL;
         self->head++;
     }
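
The tick-normalization rules at the top of Tokenizer_parse_style are easier to see in isolation; a Python sketch of just that step (same rules as the C code, function name hypothetical):

    def normalize_ticks(ticks):
        """Return (literal_text, effective_ticks) per the rules above."""
        if ticks > 5:                  # more than five: extras are plain text
            return "'" * (ticks - 5), 5
        if ticks == 4:                 # four: one literal quote, then bold
            return "'", 3
        return "", ticks

    assert normalize_ticks(2) == ("", 2)     # italics
    assert normalize_ticks(4) == ("'", 3)    # 'bold
    assert normalize_ticks(7) == ("''", 5)   # ''bold-italics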


mwparserfromhell/parser/tokenizer.h (+53, -43)

@@ -41,20 +41,21 @@ SOFTWARE.
 #define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

 static const char* MARKERS[] = {
-    "{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-",
-    "\n", ""};
+    "{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", ":", "/",
+    "-", "\n", ""};

-#define NUM_MARKERS 17
+#define NUM_MARKERS 18
 #define TEXTBUFFER_BLOCKSIZE 1024
 #define MAX_DEPTH 40
 #define MAX_CYCLES 100000
 #define MAX_BRACES 255
 #define MAX_ENTITY_SIZE 8

-static int route_state = 0;
-#define BAD_ROUTE (route_state)
-#define FAIL_ROUTE() (route_state = 1)
-#define RESET_ROUTE() (route_state = 0)
+static int route_state = 0, route_context = 0;
+#define BAD_ROUTE route_state
+#define BAD_ROUTE_CONTEXT route_context
+#define FAIL_ROUTE(context) route_state = 1; route_context = context
+#define RESET_ROUTE() route_state = 0

 static char** entitydefs;

@@ -102,42 +103,50 @@ static PyObject* TagCloseClose;

 /* Local contexts: */

-#define LC_TEMPLATE             0x000007
-#define LC_TEMPLATE_NAME        0x000001
-#define LC_TEMPLATE_PARAM_KEY   0x000002
-#define LC_TEMPLATE_PARAM_VALUE 0x000004
-
-#define LC_ARGUMENT         0x000018
-#define LC_ARGUMENT_NAME    0x000008
-#define LC_ARGUMENT_DEFAULT 0x000010
-
-#define LC_WIKILINK       0x000060
-#define LC_WIKILINK_TITLE 0x000020
-#define LC_WIKILINK_TEXT  0x000040
-
-#define LC_HEADING         0x001F80
-#define LC_HEADING_LEVEL_1 0x000080
-#define LC_HEADING_LEVEL_2 0x000100
-#define LC_HEADING_LEVEL_3 0x000200
-#define LC_HEADING_LEVEL_4 0x000400
-#define LC_HEADING_LEVEL_5 0x000800
-#define LC_HEADING_LEVEL_6 0x001000
-
-#define LC_COMMENT 0x002000
-
-#define LC_TAG       0x03C000
-#define LC_TAG_OPEN  0x004000
-#define LC_TAG_ATTR  0x008000
-#define LC_TAG_BODY  0x010000
-#define LC_TAG_CLOSE 0x020000
-
-#define LC_SAFETY_CHECK   0xFC0000
-#define LC_HAS_TEXT       0x040000
-#define LC_FAIL_ON_TEXT   0x080000
-#define LC_FAIL_NEXT      0x100000
-#define LC_FAIL_ON_LBRACE 0x200000
-#define LC_FAIL_ON_RBRACE 0x400000
-#define LC_FAIL_ON_EQUALS 0x800000
+#define LC_TEMPLATE             0x00000007
+#define LC_TEMPLATE_NAME        0x00000001
+#define LC_TEMPLATE_PARAM_KEY   0x00000002
+#define LC_TEMPLATE_PARAM_VALUE 0x00000004
+
+#define LC_ARGUMENT         0x00000018
+#define LC_ARGUMENT_NAME    0x00000008
+#define LC_ARGUMENT_DEFAULT 0x00000010
+
+#define LC_WIKILINK       0x00000060
+#define LC_WIKILINK_TITLE 0x00000020
+#define LC_WIKILINK_TEXT  0x00000040
+
+#define LC_HEADING         0x00001F80
+#define LC_HEADING_LEVEL_1 0x00000080
+#define LC_HEADING_LEVEL_2 0x00000100
+#define LC_HEADING_LEVEL_3 0x00000200
+#define LC_HEADING_LEVEL_4 0x00000400
+#define LC_HEADING_LEVEL_5 0x00000800
+#define LC_HEADING_LEVEL_6 0x00001000
+
+#define LC_COMMENT 0x00002000
+
+#define LC_TAG       0x0003C000
+#define LC_TAG_OPEN  0x00004000
+#define LC_TAG_ATTR  0x00008000
+#define LC_TAG_BODY  0x00010000
+#define LC_TAG_CLOSE 0x00020000
+
+#define LC_STYLE             0x003C0000
+#define LC_STYLE_ITALICS     0x00040000
+#define LC_STYLE_BOLD        0x00080000
+#define LC_STYLE_PASS_AGAIN  0x00100000
+#define LC_STYLE_SECOND_PASS 0x00200000
+
+#define LC_DLTERM 0x00400000
+
+#define LC_SAFETY_CHECK   0x1F800000
+#define LC_HAS_TEXT       0x00800000
+#define LC_FAIL_ON_TEXT   0x01000000
+#define LC_FAIL_NEXT      0x02000000
+#define LC_FAIL_ON_LBRACE 0x04000000
+#define LC_FAIL_ON_RBRACE 0x08000000
+#define LC_FAIL_ON_EQUALS 0x10000000

 /* Global contexts: */

@@ -211,6 +220,7 @@ typedef struct {

 /* Macros for accessing HTML tag definitions: */

+#define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li")
 #define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag))
 #define IS_SINGLE(tag) (call_tag_def_func("is_single", tag))
 #define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag))
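
A quick consistency check on the widened bit layout (constants copied from the macros above; the assertions are illustrative, not part of the build):

    LC_STYLE_ITALICS = 0x00040000
    LC_STYLE_BOLD = 0x00080000
    LC_STYLE_PASS_AGAIN = 0x00100000
    LC_STYLE_SECOND_PASS = 0x00200000
    LC_STYLE = 0x003C0000
    LC_DLTERM = 0x00400000

    # The group mask is exactly the union of its four flags, and the new
    # flags collide with neither LC_DLTERM nor the shifted safety checks.
    assert LC_STYLE == (LC_STYLE_ITALICS | LC_STYLE_BOLD |
                        LC_STYLE_PASS_AGAIN | LC_STYLE_SECOND_PASS)
    assert not (LC_STYLE & LC_DLTERM)
    assert not (LC_STYLE & 0x1F800000)  # LC_SAFETY_CHECK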


mwparserfromhell/parser/tokenizer.py (+177, -7)

@@ -26,13 +26,15 @@ import re

 from . import contexts, tokens
 from ..compat import htmlentities
-from ..tag_defs import is_parsable, is_single, is_single_only
+from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only

 __all__ = ["Tokenizer"]


 class BadRoute(Exception):
     """Raised internally when the current tokenization route is invalid."""
-    pass
+
+    def __init__(self, context=0):
+        self.context = context


 class _TagOpenData(object):
@@ -57,11 +59,11 @@ class Tokenizer(object):
     USES_C = False
     START = object()
     END = object()
-    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
-               "/", "-", "\n", END]
+    MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
+               ":", "/", "-", "\n", END]
     MAX_DEPTH = 40
     MAX_CYCLES = 100000
-    regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
+    regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
     tag_splitter = re.compile(r"([\s\"\\]+)")

     def __init__(self):
@@ -132,8 +134,9 @@ class Tokenizer(object):
         Discards the current stack/context/textbuffer and raises
         :py:exc:`~.BadRoute`.
         """
+        context = self._context
         self._pop()
-        raise BadRoute()
+        raise BadRoute(context)

     def _emit(self, token):
         """Write a token to the end of the current token stack."""
@@ -629,10 +632,164 @@
         else:
             self._emit_all(tag)

+    def _emit_style_tag(self, tag, markup, body):
+        """Write the body of a tag and the tokens that should surround it."""
+        self._emit(tokens.TagOpenOpen(wiki_markup=markup))
+        self._emit_text(tag)
+        self._emit(tokens.TagCloseOpen())
+        self._emit_all(body)
+        self._emit(tokens.TagOpenClose())
+        self._emit_text(tag)
+        self._emit(tokens.TagCloseClose())
+
+    def _parse_italics(self):
+        """Parse wiki-style italics."""
+        reset = self._head
+        try:
+            stack = self._parse(contexts.STYLE_ITALICS)
+        except BadRoute as route:
+            self._head = reset
+            if route.context & contexts.STYLE_PASS_AGAIN:
+                stack = self._parse(route.context | contexts.STYLE_SECOND_PASS)
+            else:
+                return self._emit_text("''")
+        self._emit_style_tag("i", "''", stack)
+
+    def _parse_bold(self):
+        """Parse wiki-style bold."""
+        reset = self._head
+        try:
+            stack = self._parse(contexts.STYLE_BOLD)
+        except BadRoute:
+            self._head = reset
+            if self._context & contexts.STYLE_SECOND_PASS:
+                self._emit_text("'")
+                return True
+            elif self._context & contexts.STYLE_ITALICS:
+                self._context |= contexts.STYLE_PASS_AGAIN
+                self._emit_text("'''")
+            else:
+                self._emit_text("'")
+                self._parse_italics()
+        else:
+            self._emit_style_tag("b", "'''", stack)
+
+    def _parse_italics_and_bold(self):
+        """Parse wiki-style italics and bold together (i.e., five ticks)."""
+        reset = self._head
+        try:
+            stack = self._parse(contexts.STYLE_BOLD)
+        except BadRoute:
+            self._head = reset
+            try:
+                stack = self._parse(contexts.STYLE_ITALICS)
+            except BadRoute:
+                self._head = reset
+                self._emit_text("'''''")
+            else:
+                reset = self._head
+                try:
+                    stack2 = self._parse(contexts.STYLE_BOLD)
+                except BadRoute:
+                    self._head = reset
+                    self._emit_text("'''")
+                    self._emit_style_tag("i", "''", stack)
+                else:
+                    self._push()
+                    self._emit_style_tag("i", "''", stack)
+                    self._emit_all(stack2)
+                    self._emit_style_tag("b", "'''", self._pop())
+        else:
+            reset = self._head
+            try:
+                stack2 = self._parse(contexts.STYLE_ITALICS)
+            except BadRoute:
+                self._head = reset
+                self._emit_text("''")
+                self._emit_style_tag("b", "'''", stack)
+            else:
+                self._push()
+                self._emit_style_tag("b", "'''", stack)
+                self._emit_all(stack2)
+                self._emit_style_tag("i", "''", self._pop())
+
+    def _parse_style(self):
+        """Parse wiki-style formatting (``''``/``'''`` for italics/bold)."""
+        self._head += 2
+        ticks = 2
+        while self._read() == "'":
+            self._head += 1
+            ticks += 1
+        italics = self._context & contexts.STYLE_ITALICS
+        bold = self._context & contexts.STYLE_BOLD
+
+        if ticks > 5:
+            self._emit_text("'" * (ticks - 5))
+            ticks = 5
+        elif ticks == 4:
+            self._emit_text("'")
+            ticks = 3
+
+        if (italics and ticks in (2, 5)) or (bold and ticks in (3, 5)):
+            if ticks == 5:
+                self._head -= 3 if italics else 2
+            return self._pop()
+        elif not self._can_recurse():
+            if ticks == 3:
+                if self._context & contexts.STYLE_SECOND_PASS:
+                    self._emit_text("'")
+                    return self._pop()
+                self._context |= contexts.STYLE_PASS_AGAIN
+            self._emit_text("'" * ticks)
+        elif ticks == 2:
+            self._parse_italics()
+        elif ticks == 3:
+            if self._parse_bold():
+                return self._pop()
+        elif ticks == 5:
+            self._parse_italics_and_bold()
+        self._head -= 1
+
+    def _handle_list_marker(self):
+        """Handle a list marker at the head (``#``, ``*``, ``;``, ``:``)."""
+        markup = self._read()
+        if markup == ";":
+            self._context |= contexts.DL_TERM
+        self._emit(tokens.TagOpenOpen(wiki_markup=markup))
+        self._emit_text(get_html_tag(markup))
+        self._emit(tokens.TagCloseSelfclose())
+
+    def _handle_list(self):
+        """Handle a wiki-style list (``#``, ``*``, ``;``, ``:``)."""
+        self._handle_list_marker()
+        while self._read(1) in ("#", "*", ";", ":"):
+            self._head += 1
+            self._handle_list_marker()
+
+    def _handle_hr(self):
+        """Handle a wiki-style horizontal rule (``----``) in the string."""
+        length = 4
+        self._head += 3
+        while self._read(1) == "-":
+            length += 1
+            self._head += 1
+        self._emit(tokens.TagOpenOpen(wiki_markup="-" * length))
+        self._emit_text("hr")
+        self._emit(tokens.TagCloseSelfclose())
+
+    def _handle_dl_term(self):
+        """Handle the term in a description list (``foo`` in ``;foo:bar``)."""
+        self._context ^= contexts.DL_TERM
+        if self._read() == ":":
+            self._handle_list_marker()
+        else:
+            self._emit_text("\n")
+
     def _handle_end(self):
         """Handle the end of the stream of wikitext."""
         fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
-                contexts.HEADING | contexts.COMMENT | contexts.TAG)
+                contexts.HEADING | contexts.COMMENT | contexts.TAG |
+                contexts.STYLE)
         double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
         if self._context & fail:
             if self._context & contexts.TAG_BODY:
@@ -782,6 +939,19 @@
                 self._emit_text("<")
             elif this == ">" and self._context & contexts.TAG_CLOSE:
                 return self._handle_tag_close_close()
+            elif this == next == "'":
+                result = self._parse_style()
+                if result is not None:
+                    return result
+            elif self._read(-1) in ("\n", self.START):
+                if this in ("#", "*", ";", ":"):
+                    self._handle_list()
+                elif this == next == self._read(2) == self._read(3) == "-":
+                    self._handle_hr()
+                else:
+                    self._emit_text(this)
+            elif this in ("\n", ":") and self._context & contexts.DL_TERM:
+                self._handle_dl_term()
             else:
                 self._emit_text(this)
             self._head += 1
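
A short usage sketch of the pure-Python tokenizer on the new syntax (expected output matches the basic_italics test case below; repr formatting abbreviated):

    from mwparserfromhell.parser.tokenizer import Tokenizer

    print(Tokenizer().tokenize("''text''"))
    # [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(),
    #  Text(text="text"), TagOpenClose(), Text(text="i"), TagCloseClose()]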


mwparserfromhell/parser/tokens.py (+1, -5)

@@ -55,7 +55,7 @@ class Token(object):
         return False

     def __getattr__(self, key):
-        return self._kwargs[key]
+        return self._kwargs.get(key)

     def __setattr__(self, key, value):
         self._kwargs[key] = value
@@ -63,10 +63,6 @@ class Token(object):
     def __delattr__(self, key):
         del self._kwargs[key]

-    def get(self, key, default=None):
-        """Same as :py:meth:`__getattr__`, but has a *default* if missing."""
-        return self._kwargs.get(key, default)
-

 def make(name):
     """Create a new Token class using ``type()`` and add it to ``__all__``."""


mwparserfromhell/tag_defs.py (+9, -13)

@@ -24,7 +24,7 @@

 from __future__ import unicode_literals

-__all__ = ["get_wikicode", "is_parsable", "is_visible", "is_single",
+__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
            "is_single_only"]

 PARSER_BLACKLIST = [
@@ -44,20 +44,16 @@ INVISIBLE_TAGS = [
 SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
 SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]

-WIKICODE = {
-    "i": {"open": "''", "close": "''"},
-    "b": {"open": "'''", "close": "'''"},
-    "ul": {"open": "*"},
-    "ol": {"open": "#"},
-    "dt": {"open": ";"},
-    "dd": {"open": ":"},
-    "hr": {"open": "----"},
+MARKUP_TO_HTML = {
+    "#": "li",
+    "*": "li",
+    ";": "dt",
+    ":": "dd"
 }

-def get_wikicode(tag):
-    """Return the appropriate wikicode before and after the given *tag*."""
-    data = WIKICODE[tag.lower()]
-    return (data.get("open"), data.get("close"))
+def get_html_tag(markup):
+    """Return the HTML tag associated with the given wiki-markup."""
+    return MARKUP_TO_HTML[markup]

 def is_parsable(tag):
     """Return if the given *tag*'s contents should be passed to the parser."""


tests/_test_tree_equality.py (+1, -1)

@@ -106,7 +106,7 @@ class TreeEqualityTestCase(TestCase):
         self.assertEqual(exp_attr.pad_first, act_attr.pad_first)
         self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq)
         self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq)
-        self.assertIs(expected.showtag, actual.showtag)
+        self.assertIs(expected.wiki_markup, actual.wiki_markup)
         self.assertIs(expected.self_closing, actual.self_closing)
         self.assertIs(expected.invalid, actual.invalid)
         self.assertIs(expected.implicit, actual.implicit)


tests/test_builder.py (+14, -0)

@@ -303,6 +303,20 @@ class TestBuilder(TreeEqualityTestCase):
                       Text(" "), Wikilink(wraptext("q")), Text(" "),
                       Template(wraptext("r"))]), True, " \n ", " ",
                   " ")])])),
+
+            # "''italic text''"
+            ([tokens.TagOpenOpen(wiki_markup="''"), tokens.Text(text="i"),
+              tokens.TagCloseOpen(), tokens.Text(text="italic text"),
+              tokens.TagOpenClose(), tokens.Text(text="i"),
+              tokens.TagCloseClose()],
+             wrap([Tag(wraptext("i"), wraptext("italic text"),
+                       wiki_markup="''")])),
+
+            # * bullet
+            ([tokens.TagOpenOpen(wiki_markup="*"), tokens.Text(text="li"),
+              tokens.TagCloseSelfclose(), tokens.Text(text=" bullet")],
+             wrap([Tag(wraptext("li"), wiki_markup="*", self_closing=True),
+                   Text(" bullet")])),
         ]
         for test, valid in tests:
             self.assertWikicodeEqual(valid, self.builder.build(test))


tests/test_tag.py (+10, -10)

@@ -50,8 +50,8 @@ class TestTag(TreeEqualityTestCase):
                     implicit=True)
         node7 = Tag(wraptext("br"), self_closing=True, invalid=True,
                     padding=" ")
-        node8 = Tag(wraptext("hr"), showtag=False, self_closing=True)
-        node9 = Tag(wraptext("i"), wraptext("italics!"), showtag=False)
+        node8 = Tag(wraptext("hr"), wiki_markup="----", self_closing=True)
+        node9 = Tag(wraptext("i"), wraptext("italics!"), wiki_markup="''")

         self.assertEqual("<ref></ref>", str(node1))
         self.assertEqual('<span style="color: red;">foo</span>', str(node2))
@@ -72,7 +72,7 @@ class TestTag(TreeEqualityTestCase):
         # <ref>foobar</ref>
         node1 = Tag(wrap([node1n1]), wrap([node1n2]))
         # '''bold text'''
-        node2 = Tag(wraptext("i"), wrap([node2n1]), showtag=False)
+        node2 = Tag(wraptext("b"), wrap([node2n1]), wiki_markup="'''")
         # <img id="foo" class="bar" />
         node3 = Tag(wrap([node3n1]),
                     attrs=[Attribute(wrap([node3n2]), wrap([node3n3])),
@@ -158,15 +158,15 @@ class TestTag(TreeEqualityTestCase):
         self.assertEqual([], node1.attributes)
         self.assertIs(attrs, node2.attributes)

-    def test_showtag(self):
-        """test getter/setter for the showtag attribute"""
+    def test_wiki_markup(self):
+        """test getter/setter for the wiki_markup attribute"""
         node = Tag(wraptext("i"), wraptext("italic text"))
-        self.assertTrue(node.showtag)
-        node.showtag = False
-        self.assertFalse(node.showtag)
+        self.assertIs(None, node.wiki_markup)
+        node.wiki_markup = "''"
+        self.assertEqual("''", node.wiki_markup)
         self.assertEqual("''italic text''", node)
-        node.showtag = 1
-        self.assertTrue(node.showtag)
+        node.wiki_markup = False
+        self.assertFalse(node.wiki_markup)
         self.assertEqual("<i>italic text</i>", node)

     def test_self_closing(self):


tests/test_tokens.py (+3, -3)

@@ -44,8 +44,8 @@ class TestTokens(unittest.TestCase):

         self.assertEqual("bar", token2.foo)
         self.assertEqual(123, token2.baz)
-        self.assertRaises(KeyError, lambda: token1.foo)
-        self.assertRaises(KeyError, lambda: token2.bar)
+        self.assertFalse(token1.foo)
+        self.assertFalse(token2.bar)

         token1.spam = "eggs"
         token2.foo = "ham"
@@ -53,7 +53,7 @@ class TestTokens(unittest.TestCase):

         self.assertEqual("eggs", token1.spam)
         self.assertEqual("ham", token2.foo)
-        self.assertRaises(KeyError, lambda: token2.baz)
+        self.assertFalse(token2.baz)
         self.assertRaises(KeyError, delattr, token2, "baz")

     def test_repr(self):


tests/tokenizer/tags_wikimarkup.mwtest (+516, -0)

@@ -0,0 +1,516 @@
name: basic_italics
label: basic italic text
input: "''text''"
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: basic_bold
label: basic bold text
input: "'''text'''"
output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: basic_ul
label: basic unordered list
input: "*text"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="text")]

---

name: basic_ol
label: basic ordered list
input: "#text"
output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="text")]

---

name: basic_dt
label: basic description term
input: ";text"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="text")]

---

name: basic_dd
label: basic description item
input: ":text"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="text")]

---

name: basic_hr
label: basic horizontal rule
input: "----"
output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose()]

---

name: complex_italics
label: italics with a lot in them
input: "''this is a&nbsp;test of [[Italic text|italics]] with {{plenty|of|stuff}}''"
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of "), WikilinkOpen(), Text(text="Italic text"), WikilinkSeparator(), Text(text="italics"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose(), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: multiline_italics
label: italics spanning multiple lines
input: "foo\nbar''testing\ntext\nspanning\n\n\n\n\nmultiple\nlines''foo\n\nbar"
output: [Text(text="foo\nbar"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="testing\ntext\nspanning\n\n\n\n\nmultiple\nlines"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="foo\n\nbar")]

---

name: unending_italics
label: italics without an ending tag
input: "''unending formatting!"
output: [Text(text="''unending formatting!")]

---

name: misleading_italics_end
label: italics with something that looks like an end but isn't
input: "''this is 'not' the en'd'<nowiki>''</nowiki>"
output: [Text(text="''this is 'not' the en'd'"), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="''"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()]

---

name: italics_start_outside_end_inside
label: italics that start outside a link and end inside it
input: "''foo[[bar|baz'']]spam"
output: [Text(text="''foo"), WikilinkOpen(), Text(text="bar"), WikilinkSeparator(), Text(text="baz''"), WikilinkClose(), Text(text="spam")]

---

name: italics_start_inside_end_outside
label: italics that start inside a link and end outside it
input: "[[foo|''bar]]baz''spam"
output: [Text(text="[[foo|"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar]]baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="spam")]

---

name: complex_bold
label: bold with a lot in it
input: "'''this is a&nbsp;test of [[Bold text|bold]] with {{plenty|of|stuff}}'''"
output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of "), WikilinkOpen(), Text(text="Bold text"), WikilinkSeparator(), Text(text="bold"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose(), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: multiline_bold
label: bold spanning multiple lines
input: "foo\nbar'''testing\ntext\nspanning\n\n\n\n\nmultiple\nlines'''foo\n\nbar"
output: [Text(text="foo\nbar"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="testing\ntext\nspanning\n\n\n\n\nmultiple\nlines"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="foo\n\nbar")]

---

name: unending_bold
label: bold without an ending tag
input: "'''unending formatting!"
output: [Text(text="'''unending formatting!")]

---

name: misleading_bold_end
label: bold with something that looks like an end but isn't
input: "'''this is 'not' the en''d'<nowiki>'''</nowiki>"
output: [Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="this is 'not' the en"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="d'"), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="'''"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()]

---

name: bold_start_outside_end_inside
label: bold that starts outside a link and ends inside it
input: "'''foo[[bar|baz''']]spam"
output: [Text(text="'''foo"), WikilinkOpen(), Text(text="bar"), WikilinkSeparator(), Text(text="baz'''"), WikilinkClose(), Text(text="spam")]

---

name: bold_start_inside_end_outside
label: bold that starts inside a link and ends outside it
input: "[[foo|'''bar]]baz'''spam"
output: [Text(text="[[foo|"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bar]]baz"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="spam")]

---

name: bold_and_italics
label: bold and italics together
input: "this is '''''bold and italic text'''''!"
output: [Text(text="this is "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold and italic text"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="!")]

---

name: both_then_bold
label: text that starts bold/italic, then is just bold
input: "'''''both''bold'''"
output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: both_then_italics
label: text that starts bold/italic, then is just italic
input: "'''''both'''italics''"
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: bold_then_both
label: text that starts just bold, then is bold/italic
input: "'''bold''both'''''"
output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: italics_then_both
label: text that starts just italic, then is bold/italic
input: "''italics'''both'''''"
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: italics_then_bold
label: text that starts italic, then is bold
input: "none''italics'''''bold'''none"
output: [Text(text="none"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="none")]

---

name: bold_then_italics
label: text that starts bold, then is italic
input: "none'''bold'''''italics''none"
output: [Text(text="none"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="none")]

---

name: five_three
label: five ticks to open, three to close (bold)
input: "'''''foobar'''"
output: [Text(text="''"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="foobar"), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: five_two
label: five ticks to open, two to close (bold)
input: "'''''foobar''"
output: [Text(text="'''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="foobar"), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: four
label: four ticks
input: "foo ''''bar'''' baz"
output: [Text(text="foo '"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bar'"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text=" baz")]

---

name: four_two
label: four ticks to open, two to close
input: "foo ''''bar'' baz"
output: [Text(text="foo ''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text=" baz")]

---

name: two_three
label: two ticks to open, three to close
input: "foo ''bar''' baz"
output: [Text(text="foo "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar'"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text=" baz")]

---

name: two_four
label: two ticks to open, four to close
input: "foo ''bar'''' baz"
output: [Text(text="foo "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar''"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text=" baz")]

---

name: two_three_two
label: two ticks to open, three to close, two afterwards
input: "foo ''bar''' baz''"
output: [Text(text="foo "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar''' baz"), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: two_four_four
label: two ticks to open, four to close, four afterwards
input: "foo ''bar'''' baz''''"
output: [Text(text="foo ''bar'"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text=" baz'"), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: seven
label: seven ticks
input: "'''''''seven'''''''"
output: [Text(text="''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="seven''"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: complex_ul
label: ul with a lot in it
input: "* this is a&nbsp;test of an [[Unordered list|ul]] with {{plenty|of|stuff}}"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="Unordered list"), WikilinkSeparator(), Text(text="ul"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()]

---

name: ul_multiline_template
label: ul with a template that spans multiple lines
input: "* this has a template with a {{line|\nbreak}}\nthis is not part of the list"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")]

---

name: ul_adjacent
label: multiple adjacent uls
input: "a\n*b\n*c\nd\n*e\nf"
output: [Text(text="a\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf")]

---

name: ul_depths
label: multiple adjacent uls, with differing depths
input: "*a\n**b\n***c\n********d\n**e\nf\n***g"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="g")]

---

name: ul_space_before
label: uls with space before them
input: "foo *bar\n *baz\n*buzz"
output: [Text(text="foo *bar\n *baz\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="buzz")]

---

name: ul_interruption
label: high-depth ul with something blocking it
input: "**f*oobar"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="f*oobar")]

---

name: complex_ol
label: ol with a lot in it
input: "# this is a&nbsp;test of an [[Ordered list|ol]] with {{plenty|of|stuff}}"
output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="Ordered list"), WikilinkSeparator(), Text(text="ol"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()]

---

name: ol_multiline_template
label: ol with a template that spans multiple lines
input: "# this has a template with a {{line|\nbreak}}\nthis is not part of the list"
output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")]

---

name: ol_adjacent
label: multiple adjacent ols
input: "a\n#b\n#c\nd\n#e\nf"
output: [Text(text="a\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf")]

---

name: ol_depths
label: multiple adjacent ols, with differing depths
input: "#a\n##b\n###c\n########d\n##e\nf\n###g"
output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="g")]

---

name: ol_space_before
label: ols with space before them
input: "foo #bar\n #baz\n#buzz"
output: [Text(text="foo #bar\n #baz\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="buzz")]

---

name: ol_interruption
label: high-depth ol with something blocking it
input: "##f#oobar"
output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="f#oobar")]

---

name: ul_ol_mix
label: a mix of adjacent uls and ols
input: "*a\n*#b\n*##c\n*##*#*#*d\n*#e\nf\n##*g"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="g")]

---

name: complex_dt
label: dt with a lot in it
input: "; this is a&nbsp;test of an [[description term|dt]] with {{plenty|of|stuff}}"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description term"), WikilinkSeparator(), Text(text="dt"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()]

---

name: dt_multiline_template
label: dt with a template that spans multiple lines
input: "; this has a template with a {{line|\nbreak}}\nthis is not part of the list"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")]

---

name: dt_adjacent
label: multiple adjacent dts
input: "a\n;b\n;c\nd\n;e\nf"
output: [Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf")]

---

name: dt_depths
label: multiple adjacent dts, with differing depths
input: ";a\n;;b\n;;;c\n;;;;;;;;d\n;;e\nf\n;;;g"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="g")]

---

name: dt_space_before
label: dts with space before them
input: "foo ;bar\n ;baz\n;buzz"
output: [Text(text="foo ;bar\n ;baz\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="buzz")]

---

name: dt_interruption
label: high-depth dt with something blocking it
input: ";;f;oobar"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="f;oobar")]

---

name: complex_dd
label: dd with a lot in it
input: ": this is a&nbsp;test of an [[description item|dd]] with {{plenty|of|stuff}}"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description item"), WikilinkSeparator(), Text(text="dd"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()]

---

name: dd_multiline_template
label: dd with a template that spans multiple lines
input: ": this has a template with a {{line|\nbreak}}\nthis is not part of the list"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")]

---

name: dd_adjacent
label: multiple adjacent dds
input: "a\n:b\n:c\nd\n:e\nf"
output: [Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf")]

---

name: dd_depths
label: multiple adjacent dds, with differing depths
input: ":a\n::b\n:::c\n::::::::d\n::e\nf\n:::g"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="g")]

---

name: dd_space_before
label: dds with space before them
input: "foo :bar\n :baz\n:buzz"
output: [Text(text="foo :bar\n :baz\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="buzz")]

---

name: dd_interruption
label: high-depth dd with something blocking it
input: "::f:oobar"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="f:oobar")]

---

name: dt_dd_mix
label: a mix of adjacent dts and dds
input: ";a\n;:b\n;::c\n;::;:;:;d\n;:e\nf\n::;g"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="g")]

---

name: dt_dd_mix2
label: the correct usage of a dt/dd unit, as in a dl
input: ";foo:bar:baz"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")]

---

name: dt_dd_mix3
label: another example of correct (but strange) dt/dd usage
input: ":;;::foo:bar:baz"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")]

---

name: ul_ol_dt_dd_mix
label: an assortment of uls, ols, dds, and dts
input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="foo\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="foo\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="foo")]

---

name: hr_text_before
label: text before an otherwise-valid hr
input: "foo----"
output: [Text(text="foo----")]

---

name: hr_text_after
label: text after a valid hr
input: "----bar"
output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="bar")]

---

name: hr_text_before_after
label: text at both ends of an otherwise-valid hr
input: "foo----bar"
output: [Text(text="foo----bar")]

---

name: hr_newlines
label: newlines surrounding a valid hr
input: "foo\n----\nbar"
output: [Text(text="foo\n"), TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="\nbar")]

---

name: hr_adjacent
label: two adjacent hrs
input: "----\n----"
output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="\n"), TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose()]

---

name: hr_adjacent_space
label: two adjacent hrs, with a space before the second one, making it invalid
input: "----\n ----"
output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="\n ----")]

---

name: hr_short
label: an invalid three-hyphen-long hr
input: "---"
output: [Text(text="---")]

---

name: hr_long
label: a very long, valid hr
input: "------------------------------------------"
output: [TagOpenOpen(wiki_markup="------------------------------------------"), Text(text="hr"), TagCloseSelfclose()]

---

name: hr_interruption_short
label: an hr that is interrupted, making it invalid
input: "---x-"
output: [Text(text="---x-")]

---

name: hr_interruption_long
label: an hr that is interrupted, but the first part remains valid because it is long enough
input: "----x--"
output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="x--")]
