Browse Source

Merge branch 'feature/wikimarkup_tags' into develop (closes #9)

tags/v0.3
Ben Kurtovic 10 years ago
parent
commit
5029082b58
13 changed files with 1295 additions and 169 deletions
  1. +19
    -15
      mwparserfromhell/nodes/tag.py
  2. +5
    -6
      mwparserfromhell/parser/builder.py
  3. +23
    -6
      mwparserfromhell/parser/contexts.py
  4. +464
    -60
      mwparserfromhell/parser/tokenizer.c
  5. +53
    -43
      mwparserfromhell/parser/tokenizer.h
  6. +177
    -7
      mwparserfromhell/parser/tokenizer.py
  7. +1
    -5
      mwparserfromhell/parser/tokens.py
  8. +9
    -13
      mwparserfromhell/tag_defs.py
  9. +1
    -1
      tests/_test_tree_equality.py
  10. +14
    -0
      tests/test_builder.py
  11. +10
    -10
      tests/test_tag.py
  12. +3
    -3
      tests/test_tokens.py
  13. +516
    -0
      tests/tokenizer/tags_wikimarkup.mwtest

+ 19
- 15
mwparserfromhell/nodes/tag.py View File

@@ -24,7 +24,7 @@ from __future__ import unicode_literals

from . import Node, Text
from ..compat import str
from ..tag_defs import get_wikicode, is_visible
from ..tag_defs import is_visible
from ..utils import parse_anything

__all__ = ["Tag"]
@@ -32,7 +32,7 @@ __all__ = ["Tag"]
class Tag(Node):
"""Represents an HTML-style tag in wikicode, like ``<ref>``."""

def __init__(self, tag, contents=None, attrs=None, showtag=True,
def __init__(self, tag, contents=None, attrs=None, wiki_markup=None,
self_closing=False, invalid=False, implicit=False, padding="",
closing_tag=None):
super(Tag, self).__init__()
@@ -42,7 +42,7 @@ class Tag(Node):
else:
self._contents = contents
self._attrs = attrs if attrs else []
self._showtag = showtag
self._wiki_markup = wiki_markup
self._self_closing = self_closing
self._invalid = invalid
self._implicit = implicit
@@ -53,12 +53,11 @@ class Tag(Node):
self._closing_tag = tag

def __unicode__(self):
if not self.showtag:
open_, close = get_wikicode(self.tag)
if self.wiki_markup:
if self.self_closing:
return open_
return self.wiki_markup
else:
return open_ + str(self.contents) + close
return self.wiki_markup + str(self.contents) + self.wiki_markup

result = ("</" if self.invalid else "<") + str(self.tag)
if self.attributes:
@@ -72,7 +71,7 @@ class Tag(Node):

def __iternodes__(self, getter):
yield None, self
if self.showtag:
if not self.wiki_markup:
for child in getter(self.tag):
yield self.tag, child
for attr in self.attributes:
@@ -84,7 +83,7 @@ class Tag(Node):
if self.contents:
for child in getter(self.contents):
yield self.contents, child
if not self.self_closing and self.showtag and self.closing_tag:
if not self.self_closing and not self.wiki_markup and self.closing_tag:
for child in getter(self.closing_tag):
yield self.closing_tag, child

@@ -131,9 +130,14 @@ class Tag(Node):
return self._attrs

@property
def showtag(self):
"""Whether to show the tag itself instead of a wikicode version."""
return self._showtag
def wiki_markup(self):
"""The wikified version of a tag to show instead of HTML.

If set to a value, this will be displayed instead of the brackets.
For example, set to ``''`` to replace ``<i>`` or ``----`` to replace
``<hr>``.
"""
return self._wiki_markup

@property
def self_closing(self):
@@ -183,9 +187,9 @@ class Tag(Node):
def contents(self, value):
self._contents = parse_anything(value)

@showtag.setter
def showtag(self, value):
self._showtag = bool(value)
@wiki_markup.setter
def wiki_markup(self, value):
self._wiki_markup = str(value) if value else None

@self_closing.setter
def self_closing(self, value):


+ 5
- 6
mwparserfromhell/parser/builder.py View File

@@ -207,15 +207,14 @@ class Builder(object):
"""Handle a case where a tag is at the head of the tokens."""
close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
implicit, attrs, contents, closing_tag = False, [], None, None
showtag = token.get("showtag", True)
invalid = token.get("invalid", False)
wiki_markup, invalid = token.wiki_markup, token.invalid or False
self._push()
while self._tokens:
token = self._tokens.pop()
if isinstance(token, tokens.TagAttrStart):
attrs.append(self._handle_attribute(token))
elif isinstance(token, tokens.TagCloseOpen):
padding = token.padding
padding = token.padding or ""
tag = self._pop()
self._push()
elif isinstance(token, tokens.TagOpenClose):
@@ -225,12 +224,12 @@ class Builder(object):
if isinstance(token, tokens.TagCloseSelfclose):
tag = self._pop()
self_closing = True
padding = token.padding
implicit = token.get("implicit", False)
padding = token.padding or ""
implicit = token.implicit or False
else:
self_closing = False
closing_tag = self._pop()
return Tag(tag, contents, attrs, showtag, self_closing,
return Tag(tag, contents, attrs, wiki_markup, self_closing,
invalid, implicit, padding, closing_tag)
else:
self._write(self._handle_token(token))


+ 23
- 6
mwparserfromhell/parser/contexts.py View File

@@ -69,6 +69,15 @@ Local (stack-specific) contexts:
* :py:const:`TAG_BODY`
* :py:const:`TAG_CLOSE`

* :py:const:`STYLE`

* :py:const:`STYLE_ITALICS`
* :py:const:`STYLE_BOLD`
* :py:const:`STYLE_PASS_AGAIN`
* :py:const:`STYLE_SECOND_PASS`

* :py:const:`DL_TERM`

* :py:const:`SAFETY_CHECK`

* :py:const:`HAS_TEXT`
@@ -115,12 +124,20 @@ TAG_BODY = 1 << 16
TAG_CLOSE = 1 << 17
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE

HAS_TEXT = 1 << 18
FAIL_ON_TEXT = 1 << 19
FAIL_NEXT = 1 << 20
FAIL_ON_LBRACE = 1 << 21
FAIL_ON_RBRACE = 1 << 22
FAIL_ON_EQUALS = 1 << 23
STYLE_ITALICS = 1 << 18
STYLE_BOLD = 1 << 19
STYLE_PASS_AGAIN = 1 << 20
STYLE_SECOND_PASS = 1 << 21
STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS

DL_TERM = 1 << 22

HAS_TEXT = 1 << 23
FAIL_ON_TEXT = 1 << 24
FAIL_NEXT = 1 << 25
FAIL_ON_LBRACE = 1 << 26
FAIL_ON_RBRACE = 1 << 27
FAIL_ON_EQUALS = 1 << 28
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS)



+ 464
- 60
mwparserfromhell/parser/tokenizer.c View File

@@ -29,6 +29,7 @@ SOFTWARE.
static int heading_level_from_context(int n)
{
int level;

n /= LC_HEADING_LEVEL_1;
for (level = 1; n > 1; n >>= 1)
level++;
@@ -72,6 +73,7 @@ static PyObject* strip_tag_name(PyObject* token)
static Textbuffer* Textbuffer_new(void)
{
Textbuffer* buffer = malloc(sizeof(Textbuffer));

if (!buffer) {
PyErr_NoMemory();
return NULL;
@@ -90,6 +92,7 @@ static Textbuffer* Textbuffer_new(void)
static void Textbuffer_dealloc(Textbuffer* self)
{
Textbuffer* next;

while (self) {
free(self->data);
next = self->next;
@@ -99,11 +102,12 @@ static void Textbuffer_dealloc(Textbuffer* self)
}

/*
Write text to the given textbuffer.
Write a Unicode codepoint to the given textbuffer.
*/
static int Textbuffer_write(Textbuffer** this, Py_UNICODE text)
static int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
{
Textbuffer* self = *this;

if (self->size == TEXTBUFFER_BLOCKSIZE) {
Textbuffer* new = Textbuffer_new();
if (!new)
@@ -111,7 +115,7 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE text)
new->next = self;
*this = self = new;
}
self->data[self->size] = text;
self->data[self->size] = code;
self->size++;
return 0;
}
@@ -123,6 +127,7 @@ static PyObject* Textbuffer_render(Textbuffer* self)
{
PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
PyObject *left, *concat;

while (self->next) {
self = self->next;
left = PyUnicode_FromUnicode(self->data, self->size);
@@ -208,6 +213,7 @@ static void Tokenizer_dealloc(Tokenizer* self)
static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
{
static char* kwlist[] = {NULL};

if (!PyArg_ParseTupleAndKeywords(args, kwds, "", kwlist))
return -1;
self->text = Py_None;
@@ -223,6 +229,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
static int Tokenizer_push(Tokenizer* self, int context)
{
Stack* top = malloc(sizeof(Stack));

if (!top) {
PyErr_NoMemory();
return -1;
@@ -246,6 +253,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self)
{
PyObject *text, *kwargs, *token;
Textbuffer* buffer = self->topstack->textbuffer;

if (buffer->size == 0 && !buffer->next)
return 0;
text = Textbuffer_render(buffer);
@@ -280,6 +288,7 @@ static int Tokenizer_push_textbuffer(Tokenizer* self)
static void Tokenizer_delete_top_of_stack(Tokenizer* self)
{
Stack* top = self->topstack;

Py_DECREF(top->stack);
Textbuffer_dealloc(top->textbuffer);
self->topstack = top->next;
@@ -293,6 +302,7 @@ static void Tokenizer_delete_top_of_stack(Tokenizer* self)
static PyObject* Tokenizer_pop(Tokenizer* self)
{
PyObject* stack;

if (Tokenizer_push_textbuffer(self))
return NULL;
stack = self->topstack->stack;
@@ -309,6 +319,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
{
PyObject* stack;
int context;

if (Tokenizer_push_textbuffer(self))
return NULL;
stack = self->topstack->stack;
@@ -325,9 +336,11 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
*/
static void* Tokenizer_fail_route(Tokenizer* self)
{
int context = self->topstack->context;
PyObject* stack = Tokenizer_pop(self);

Py_XDECREF(stack);
FAIL_ROUTE();
FAIL_ROUTE(context);
return NULL;
}

@@ -356,11 +369,26 @@ static int Tokenizer_emit_first(Tokenizer* self, PyObject* token)
}

/*
Write text to the current textbuffer.
Write a Unicode codepoint to the current textbuffer.
*/
static int Tokenizer_emit_text(Tokenizer* self, Py_UNICODE text)
static int Tokenizer_emit_char(Tokenizer* self, Py_UNICODE code)
{
return Textbuffer_write(&(self->topstack->textbuffer), text);
return Textbuffer_write(&(self->topstack->textbuffer), code);
}

/*
    Write a NUL-terminated string to the current textbuffer, one character at
    a time. Returns 0 on success, or -1 as soon as a character cannot be
    written.
*/
static int Tokenizer_emit_text(Tokenizer* self, const char* text)
{
    const char* cursor;

    for (cursor = text; *cursor; cursor++) {
        if (Tokenizer_emit_char(self, *cursor))
            return -1;
    }
    return 0;
}

/*
@@ -427,15 +455,10 @@ static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
{
PyObject* stack = Tokenizer_pop(self);
int i = 0;
while (1) {
if (!text[i])
break;
if (Tokenizer_emit_text(self, (Py_UNICODE) text[i])) {
Py_XDECREF(stack);
return -1;
}
i++;

if (Tokenizer_emit_text(self, text)) {
Py_DECREF(stack);
return -1;
}
if (stack) {
if (PyList_GET_SIZE(stack) > 0) {
@@ -456,6 +479,7 @@ static int Tokenizer_emit_text_then_stack(Tokenizer* self, const char* text)
static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
{
Py_ssize_t index = self->head + delta;

if (index >= self->length)
return EMPTY;
return PyList_GET_ITEM(self->text, index);
@@ -467,6 +491,7 @@ static PyObject* Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
{
Py_ssize_t index;

if (delta > self->head)
return EMPTY;
index = self->head - delta;
@@ -751,7 +776,6 @@ static int Tokenizer_parse_wikilink(Tokenizer* self)
{
Py_ssize_t reset;
PyObject *wikilink, *token;
int i;

self->head += 2;
reset = self->head - 1;
@@ -759,10 +783,8 @@ static int Tokenizer_parse_wikilink(Tokenizer* self)
if (BAD_ROUTE) {
RESET_ROUTE();
self->head = reset;
for (i = 0; i < 2; i++) {
if (Tokenizer_emit_text(self, *"["))
return -1;
}
if (Tokenizer_emit_text(self, "[["))
return -1;
return 0;
}
if (!wikilink)
@@ -847,7 +869,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
RESET_ROUTE();
self->head = reset + best - 1;
for (i = 0; i < best; i++) {
if (Tokenizer_emit_text(self, *"="))
if (Tokenizer_emit_char(self, *"="))
return -1;
}
self->global ^= GL_HEADING;
@@ -885,7 +907,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
if (heading->level < best) {
diff = best - heading->level;
for (i = 0; i < diff; i++) {
if (Tokenizer_emit_text(self, *"=")) {
if (Tokenizer_emit_char(self, *"=")) {
Py_DECREF(heading->title);
free(heading);
return -1;
@@ -936,7 +958,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
if (level < best) {
diff = best - level;
for (i = 0; i < diff; i++) {
if (Tokenizer_emit_text(self, *"="))
if (Tokenizer_emit_char(self, *"="))
return NULL;
}
}
@@ -944,7 +966,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
}
else {
for (i = 0; i < best; i++) {
if (Tokenizer_emit_text(self, *"=")) {
if (Tokenizer_emit_char(self, *"=")) {
Py_DECREF(after->title);
free(after);
return NULL;
@@ -1160,7 +1182,7 @@ static int Tokenizer_parse_entity(Tokenizer* self)
if (BAD_ROUTE) {
RESET_ROUTE();
self->head = reset;
if (Tokenizer_emit_text(self, *"&"))
if (Tokenizer_emit_char(self, *"&"))
return -1;
return 0;
}
@@ -1182,24 +1204,14 @@ static int Tokenizer_parse_comment(Tokenizer* self)
{
Py_ssize_t reset = self->head + 3;
PyObject *token, *comment;
int i;

self->head += 4;
comment = Tokenizer_parse(self, LC_COMMENT, 1);
if (BAD_ROUTE) {
const char* text = "<!--";
RESET_ROUTE();
self->head = reset;
i = 0;
while (1) {
if (!text[i])
return 0;
if (Tokenizer_emit_text(self, (Py_UNICODE) text[i])) {
Py_XDECREF(text);
return -1;
}
i++;
}
if (Tokenizer_emit_text(self, "<!--"))
return -1;
return 0;
}
if (!comment)
@@ -1317,7 +1329,7 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
return -1;
}
if (ctx & TAG_QUOTED && !(ctx & TAG_NOTE_SPACE)) {
if (Tokenizer_emit_text(self, text))
if (Tokenizer_emit_char(self, text))
return -1;
}
else if (data->context & TAG_ATTR_READY)
@@ -1342,14 +1354,14 @@ static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text)
}
}
if (!is_marker || !Tokenizer_CAN_RECURSE(self))
return Tokenizer_emit_text(self, text);
return Tokenizer_emit_char(self, text);
else if (text == next && next == *"{")
return Tokenizer_parse_template_or_argument(self);
else if (text == next && next == *"[")
return Tokenizer_parse_wikilink(self);
else if (text == *"<")
return Tokenizer_parse_tag(self);
return Tokenizer_emit_text(self, text);
return Tokenizer_emit_char(self, text);
}

/*
@@ -1574,7 +1586,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
return NULL;
return Tokenizer_parse(self, 0, 0);
}
if (Tokenizer_emit_text(self, this))
if (Tokenizer_emit_char(self, this))
return NULL;
}
}
@@ -1776,7 +1788,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
return -1;
}
if (!IS_SINGLE_ONLY(name))
FAIL_ROUTE();
FAIL_ROUTE(0);
break;
}
Textbuffer_write(&buf, this);
@@ -1790,8 +1802,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
if (BAD_ROUTE) {
RESET_ROUTE();
self->head = reset;
return (Tokenizer_emit_text(self, *"<") ||
Tokenizer_emit_text(self, *"/"));
return Tokenizer_emit_text(self, "</");
}
// Set invalid=True flag of TagOpenOpen
if (PyObject_SetAttrString(PyList_GET_ITEM(tag, 0), "invalid", Py_True))
@@ -1812,7 +1823,7 @@ static int Tokenizer_parse_tag(Tokenizer* self)
if (BAD_ROUTE) {
RESET_ROUTE();
self->head = reset;
return Tokenizer_emit_text(self, *"<");
return Tokenizer_emit_char(self, *"<");
}
if (!tag) {
return -1;
@@ -1823,12 +1834,382 @@ static int Tokenizer_parse_tag(Tokenizer* self)
}

/*
    Write the body of a tag and the tokens that should surround it.

    Emits, in order: TagOpenOpen(wiki_markup=ticks), the tag name,
    TagCloseOpen, every token in body, TagOpenClose, the tag name again, and
    TagCloseClose.

    This function takes ownership of body: the reference is released on every
    path, including failures, so callers must not touch body afterwards.
    Returns 0 on success and -1 on failure.
*/
static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag,
                                    const char* ticks, PyObject* body)
{
    PyObject *markup, *kwargs, *token;

    markup = PyBytes_FromString(ticks);
    if (!markup)
        goto error;
    kwargs = PyDict_New();
    if (!kwargs) {
        Py_DECREF(markup);
        goto error;
    }
    // The dict takes its own reference to markup, so ours is dropped
    // regardless of whether the insertion succeeded.
    if (PyDict_SetItemString(kwargs, "wiki_markup", markup)) {
        Py_DECREF(markup);
        Py_DECREF(kwargs);
        goto error;
    }
    Py_DECREF(markup);
    token = PyObject_Call(TagOpenOpen, NOARGS, kwargs);
    Py_DECREF(kwargs);
    if (!token)
        goto error;
    if (Tokenizer_emit(self, token)) {
        Py_DECREF(token);
        goto error;
    }
    Py_DECREF(token);
    if (Tokenizer_emit_text(self, tag))
        goto error;
    token = PyObject_CallObject(TagCloseOpen, NULL);
    if (!token)
        goto error;
    if (Tokenizer_emit(self, token)) {
        Py_DECREF(token);
        goto error;
    }
    Py_DECREF(token);
    if (Tokenizer_emit_all(self, body))
        goto error;
    token = PyObject_CallObject(TagOpenClose, NULL);
    if (!token)
        goto error;
    if (Tokenizer_emit(self, token)) {
        Py_DECREF(token);
        goto error;
    }
    Py_DECREF(token);
    if (Tokenizer_emit_text(self, tag))
        goto error;
    token = PyObject_CallObject(TagCloseClose, NULL);
    if (!token)
        goto error;
    if (Tokenizer_emit(self, token)) {
        Py_DECREF(token);
        goto error;
    }
    Py_DECREF(token);
    Py_DECREF(body);
    return 0;

    error:
    // Previously body was only released on success and leaked here.
    Py_DECREF(body);
    return -1;
}

/*
    Parse wiki-style italics.

    The head is expected to sit just past the opening ''. If the first parse
    fails and the failing context carries LC_STYLE_PASS_AGAIN, the text is
    parsed once more with LC_STYLE_SECOND_PASS set. If no usable route is
    found, the two ticks are emitted as plain text instead.

    Returns 0 on success (or after falling back to plain text) and -1 on
    error.
*/
static int Tokenizer_parse_italics(Tokenizer* self)
{
    Py_ssize_t reset = self->head;
    int context;
    PyObject *stack;

    stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
    if (BAD_ROUTE) {
        RESET_ROUTE();
        self->head = reset;
        if (!(BAD_ROUTE_CONTEXT & LC_STYLE_PASS_AGAIN))
            return Tokenizer_emit_text(self, "''");
        context = LC_STYLE_ITALICS | LC_STYLE_SECOND_PASS;
        stack = Tokenizer_parse(self, context, 1);
        if (BAD_ROUTE) {
            // The second pass can fail as well; without this check the
            // route flag stays set and -1 is returned with no exception.
            RESET_ROUTE();
            self->head = reset;
            return Tokenizer_emit_text(self, "''");
        }
    }
    if (!stack)
        return -1;
    return Tokenizer_emit_style_tag(self, "i", "''", stack);
}

/*
    Parse wiki-style bold.

    Returns 0 when the ticks were handled (as a <b> tag or as text), 1 when
    the caller should pop the current stack (an apostrophe was emitted during
    a second italics pass), and -1 on error.
*/
static int Tokenizer_parse_bold(Tokenizer* self)
{
    Py_ssize_t start = self->head;
    PyObject *body = Tokenizer_parse(self, LC_STYLE_BOLD, 1);

    if (!BAD_ROUTE) {
        if (!body)
            return -1;
        return Tokenizer_emit_style_tag(self, "b", "'''", body);
    }

    // No closing ''' was found: rewind and decide what the ticks mean.
    RESET_ROUTE();
    self->head = start;
    if (self->topstack->context & LC_STYLE_SECOND_PASS) {
        if (Tokenizer_emit_char(self, *"'"))
            return -1;
        return 1;
    }
    if (self->topstack->context & LC_STYLE_ITALICS) {
        // Ask the enclosing italics parse to run again.
        self->topstack->context |= LC_STYLE_PASS_AGAIN;
        return Tokenizer_emit_text(self, "'''");
    }
    // Treat ''' as a literal apostrophe followed by an italics opener.
    if (Tokenizer_emit_char(self, *"'"))
        return -1;
    return Tokenizer_parse_italics(self);
}

/*
    Parse wiki-style italics and bold together (i.e., five ticks).

    Bold is tried first, then italics. Whichever style closes first becomes
    the inner tag. If the other style also closes, it wraps the inner tag plus
    the text that follows it; otherwise its ticks are emitted as plain text
    ahead of the inner tag. If neither closes, all five ticks are emitted as
    text.

    Returns 0 on success and -1 on error. Stacks that have not yet been handed
    to Tokenizer_emit_style_tag() are released on every error path.
*/
static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
{
    Py_ssize_t reset = self->head;
    PyObject *stack, *stack2;
    const char *inner_tag = "b", *inner_ticks = "'''";
    const char *outer_tag = "i", *outer_ticks = "''";
    int outer_context = LC_STYLE_ITALICS;

    stack = Tokenizer_parse(self, LC_STYLE_BOLD, 1);
    if (BAD_ROUTE) {
        RESET_ROUTE();
        self->head = reset;
        stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
        if (BAD_ROUTE) {
            RESET_ROUTE();
            self->head = reset;
            return Tokenizer_emit_text(self, "'''''");
        }
        // Italics closed first, so bold becomes the (possible) outer tag.
        inner_tag = "i";
        inner_ticks = "''";
        outer_tag = "b";
        outer_ticks = "'''";
        outer_context = LC_STYLE_BOLD;
    }
    if (!stack)
        return -1;
    reset = self->head;
    stack2 = Tokenizer_parse(self, outer_context, 1);
    if (BAD_ROUTE) {
        RESET_ROUTE();
        self->head = reset;
        // The outer style never closes: its ticks are plain text.
        if (Tokenizer_emit_text(self, outer_ticks)) {
            Py_DECREF(stack);
            return -1;
        }
        return Tokenizer_emit_style_tag(self, inner_tag, inner_ticks, stack);
    }
    if (!stack2) {
        Py_DECREF(stack);
        return -1;
    }
    // Build the outer body on a scratch stack: the inner tag, then the rest.
    if (Tokenizer_push(self, 0)) {
        Py_DECREF(stack);
        Py_DECREF(stack2);
        return -1;
    }
    if (Tokenizer_emit_style_tag(self, inner_tag, inner_ticks, stack)) {
        Py_DECREF(stack2);
        return -1;
    }
    if (Tokenizer_emit_all(self, stack2)) {
        Py_DECREF(stack2);
        return -1;
    }
    Py_DECREF(stack2);
    stack2 = Tokenizer_pop(self);
    if (!stack2)
        return -1;
    return Tokenizer_emit_style_tag(self, outer_tag, outer_ticks, stack2);
}

/*
    Parse wiki-style formatting (''/''' for italics/bold).

    Counts the run of ticks at the head and normalises it to 2, 3 or 5,
    emitting any surplus ticks as text. The run then either closes the
    current style context (the stack is popped and returned) or opens a new
    style.

    Returns Py_None (used as a sentinel, without a new reference) when parsing
    should simply continue, a popped stack when the current context ends, or
    NULL on error.
*/
static PyObject* Tokenizer_parse_style(Tokenizer* self)
{
    int context = self->topstack->context;
    int ticks = 2, surplus, written, outcome;
    int closes_italics, closes_bold;

    self->head += 2;
    for (; Tokenizer_READ(self, 0) == *"'"; self->head++)
        ticks++;

    // Six or more ticks behave like five, and four behave like three; the
    // leading extras are ordinary apostrophes.
    if (ticks > 5)
        surplus = ticks - 5;
    else if (ticks == 4)
        surplus = 1;
    else
        surplus = 0;
    ticks -= surplus;
    while (surplus-- > 0) {
        if (Tokenizer_emit_char(self, *"'"))
            return NULL;
    }

    closes_italics = (context & LC_STYLE_ITALICS) &&
                     (ticks == 2 || ticks == 5);
    closes_bold = (context & LC_STYLE_BOLD) && (ticks == 3 || ticks == 5);
    if (closes_italics || closes_bold) {
        // With five ticks only part of the run closes this context; step
        // back so the remainder is read again by the caller.
        if (ticks == 5)
            self->head -= (context & LC_STYLE_ITALICS) ? 3 : 2;
        return Tokenizer_pop(self);
    }

    if (!Tokenizer_CAN_RECURSE(self)) {
        if (ticks == 3) {
            if (context & LC_STYLE_SECOND_PASS) {
                if (Tokenizer_emit_char(self, *"'"))
                    return NULL;
                return Tokenizer_pop(self);
            }
            self->topstack->context |= LC_STYLE_PASS_AGAIN;
        }
        for (written = 0; written < ticks; written++) {
            if (Tokenizer_emit_char(self, *"'"))
                return NULL;
        }
    }
    else if (ticks == 2) {
        if (Tokenizer_parse_italics(self))
            return NULL;
    }
    else if (ticks == 3) {
        outcome = Tokenizer_parse_bold(self);
        if (outcome == 1)
            return Tokenizer_pop(self);
        if (outcome == -1)
            return NULL;
    }
    else {
        if (Tokenizer_parse_italics_and_bold(self))
            return NULL;
    }
    self->head--;
    return Py_None;
}

/*
    Handle a list marker at the head (#, *, ;, :).

    Emits TagOpenOpen(wiki_markup=<marker>), the HTML tag name chosen by
    GET_HTML_TAG, and TagCloseSelfclose. A ';' marker also sets LC_DLTERM on
    the current context. Returns 0 on success and -1 on failure.
*/
static int Tokenizer_handle_list_marker(Tokenizer* self)
{
    // markup is not released here; the dict below takes its own reference.
    PyObject *markup = Tokenizer_read(self, 0), *kwargs, *token;
    Py_UNICODE code = *PyUnicode_AS_UNICODE(markup);

    if (code == *";")
        self->topstack->context |= LC_DLTERM;
    kwargs = PyDict_New();
    if (!kwargs)
        return -1;
    // The insertion can fail; don't build a token from a half-filled dict.
    if (PyDict_SetItemString(kwargs, "wiki_markup", markup)) {
        Py_DECREF(kwargs);
        return -1;
    }
    token = PyObject_Call(TagOpenOpen, NOARGS, kwargs);
    Py_DECREF(kwargs);
    if (!token)
        return -1;
    if (Tokenizer_emit(self, token)) {
        Py_DECREF(token);
        return -1;
    }
    Py_DECREF(token);
    if (Tokenizer_emit_text(self, GET_HTML_TAG(code)))
        return -1;
    token = PyObject_CallObject(TagCloseSelfclose, NULL);
    if (!token)
        return -1;
    if (Tokenizer_emit(self, token)) {
        Py_DECREF(token);
        return -1;
    }
    Py_DECREF(token);
    return 0;
}

/*
    Handle a wiki-style list (#, *, ;, :).

    Processes the marker at the head, then keeps advancing over and
    processing any list markers that immediately follow it. Returns 0 on
    success and -1 on failure.
*/
static int Tokenizer_handle_list(Tokenizer* self)
{
    Py_UNICODE upcoming = Tokenizer_READ(self, 1);

    if (Tokenizer_handle_list_marker(self))
        return -1;
    for (;;) {
        if (upcoming != *"#" && upcoming != *"*" && upcoming != *";" &&
                upcoming != *":")
            break;
        self->head++;
        if (Tokenizer_handle_list_marker(self))
            return -1;
        upcoming = Tokenizer_READ(self, 1);
    }
    return 0;
}

/*
    Handle a wiki-style horizontal rule (----) in the string.

    Collects the run of four or more dashes starting at the head and emits
    TagOpenOpen(wiki_markup=<dashes>), the text "hr", and TagCloseSelfclose.
    The head is left on the last dash. Returns 0 on success and -1 on
    failure; the temporary text buffer and the markup string are released on
    every path.
*/
static int Tokenizer_handle_hr(Tokenizer* self)
{
    PyObject *markup, *kwargs, *token;
    Textbuffer *buffer = Textbuffer_new();
    int i;

    if (!buffer)
        return -1;
    self->head += 3;
    for (i = 0; i < 4; i++) {
        if (Textbuffer_write(&buffer, *"-")) {
            Textbuffer_dealloc(buffer);
            return -1;
        }
    }
    while (Tokenizer_READ(self, 1) == *"-") {
        if (Textbuffer_write(&buffer, *"-")) {
            Textbuffer_dealloc(buffer);
            return -1;
        }
        self->head++;
    }
    markup = Textbuffer_render(buffer);
    // The buffer is finished with whether or not rendering succeeded.
    Textbuffer_dealloc(buffer);
    if (!markup)
        return -1;
    kwargs = PyDict_New();
    if (!kwargs) {
        Py_DECREF(markup);
        return -1;
    }
    if (PyDict_SetItemString(kwargs, "wiki_markup", markup)) {
        Py_DECREF(markup);
        Py_DECREF(kwargs);
        return -1;
    }
    Py_DECREF(markup);
    token = PyObject_Call(TagOpenOpen, NOARGS, kwargs);
    Py_DECREF(kwargs);
    if (!token)
        return -1;
    if (Tokenizer_emit(self, token)) {
        Py_DECREF(token);
        return -1;
    }
    Py_DECREF(token);
    if (Tokenizer_emit_text(self, "hr"))
        return -1;
    token = PyObject_CallObject(TagCloseSelfclose, NULL);
    if (!token)
        return -1;
    if (Tokenizer_emit(self, token)) {
        Py_DECREF(token);
        return -1;
    }
    Py_DECREF(token);
    return 0;
}

/*
    Handle the term in a description list ('foo' in ';foo:bar').

    Toggles LC_DLTERM on the current context. A ':' at the head is then
    handled as a list marker; anything else results in a newline being
    written as text.
*/
static int Tokenizer_handle_dl_term(Tokenizer* self)
{
    int at_colon;

    self->topstack->context ^= LC_DLTERM;
    at_colon = (Tokenizer_READ(self, 0) == *":");
    return at_colon ? Tokenizer_handle_list_marker(self)
                    : Tokenizer_emit_char(self, *"\n");
}

/*
Handle the end of the stream of wikitext.
*/
static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
{
static int fail_contexts = (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK |
LC_HEADING | LC_COMMENT | LC_TAG);
LC_HEADING | LC_COMMENT | LC_TAG | LC_STYLE);
static int double_fail = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
PyObject *token, *text, *trash;
int single;
@@ -1943,7 +2324,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static int double_unsafe = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
int this_context, is_marker, i;
Py_UNICODE this, next, next_next, last;
PyObject* trash;
PyObject* temp;

if (push) {
if (Tokenizer_push(self, context))
@@ -1955,8 +2336,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (this_context & unsafe_contexts) {
if (Tokenizer_verify_safe(self, this_context, this) < 0) {
if (this_context & double_unsafe) {
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
temp = Tokenizer_pop(self);
Py_XDECREF(temp);
}
return Tokenizer_fail_route(self);
}
@@ -1969,7 +2350,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
}
}
if (!is_marker) {
if (Tokenizer_emit_text(self, this))
if (Tokenizer_emit_char(self, this))
return NULL;
self->head++;
continue;
@@ -1977,12 +2358,13 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (this == *"")
return Tokenizer_handle_end(self, this_context);
next = Tokenizer_READ(self, 1);
last = Tokenizer_READ_BACKWARDS(self, 1);
if (this_context & LC_COMMENT) {
if (this == next && next == *"-") {
if (Tokenizer_READ(self, 2) == *">")
return Tokenizer_pop(self);
}
if (Tokenizer_emit_text(self, this))
if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (this == next && next == *"{") {
@@ -1990,7 +2372,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (Tokenizer_parse_template_or_argument(self))
return NULL;
}
else if (Tokenizer_emit_text(self, this))
else if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (this == *"|" && this_context & LC_TEMPLATE) {
@@ -2011,7 +2393,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (Tokenizer_READ(self, 2) == *"}") {
return Tokenizer_handle_argument_end(self);
}
if (Tokenizer_emit_text(self, this))
if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (this == next && next == *"[") {
@@ -2020,7 +2402,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (Tokenizer_parse_wikilink(self))
return NULL;
}
else if (Tokenizer_emit_text(self, this))
else if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (this == *"|" && this_context & LC_WIKILINK_TITLE) {
@@ -2030,12 +2412,11 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
else if (this == next && next == *"]" && this_context & LC_WIKILINK)
return Tokenizer_handle_wikilink_end(self);
else if (this == *"=" && !(self->global & GL_HEADING)) {
last = Tokenizer_READ_BACKWARDS(self, 1);
if (last == *"\n" || last == *"") {
if (Tokenizer_parse_heading(self))
return NULL;
}
else if (Tokenizer_emit_text(self, this))
else if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (this == *"=" && this_context & LC_HEADING)
@@ -2052,7 +2433,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (Tokenizer_parse_comment(self))
return NULL;
}
else if (Tokenizer_emit_text(self, this))
else if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (this == *"<" && next == *"/" &&
@@ -2072,12 +2453,35 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (Tokenizer_parse_tag(self))
return NULL;
}
else if (Tokenizer_emit_text(self, this))
else if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (this == *">" && this_context & LC_TAG_CLOSE)
return Tokenizer_handle_tag_close_close(self);
else if (Tokenizer_emit_text(self, this))
else if (this == next && next == *"'") {
temp = Tokenizer_parse_style(self);
if (temp != Py_None)
return temp;
}
else if (last == *"\n" || last == *"") {
if (this == *"#" || this == *"*" || this == *";" || this == *":") {
if (Tokenizer_handle_list(self))
return NULL;
}
else if (this == *"-" && this == next &&
this == Tokenizer_READ(self, 2) &&
this == Tokenizer_READ(self, 3)) {
if (Tokenizer_handle_hr(self))
return NULL;
}
else if (Tokenizer_emit_char(self, this))
return NULL;
}
else if ((this == *"\n" || this == *":") && this_context & LC_DLTERM) {
if (Tokenizer_handle_dl_term(self))
return NULL;
}
else if (Tokenizer_emit_char(self, this))
return NULL;
self->head++;
}


+ 53
- 43
mwparserfromhell/parser/tokenizer.h View File

@@ -41,20 +41,21 @@ SOFTWARE.
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

static const char* MARKERS[] = {
"{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-",
"\n", ""};
"{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", ":", "/",
"-", "\n", ""};

#define NUM_MARKERS 17
#define NUM_MARKERS 18
#define TEXTBUFFER_BLOCKSIZE 1024
#define MAX_DEPTH 40
#define MAX_CYCLES 100000
#define MAX_BRACES 255
#define MAX_ENTITY_SIZE 8

static int route_state = 0;
#define BAD_ROUTE (route_state)
#define FAIL_ROUTE() (route_state = 1)
#define RESET_ROUTE() (route_state = 0)
/* Route-failure state shared by the tokenizer functions. route_state is
   non-zero while a failed route is pending; route_context records the
   context value passed to FAIL_ROUTE. */
static int route_state = 0, route_context = 0;
#define BAD_ROUTE (route_state)
#define BAD_ROUTE_CONTEXT (route_context)
/* Wrapped in do/while so both assignments stay inside an unbraced
   `if (...) FAIL_ROUTE(x);` rather than only the first one. */
#define FAIL_ROUTE(context) \
    do { route_state = 1; route_context = (context); } while (0)
#define RESET_ROUTE() (route_state = 0)

static char** entitydefs;

@@ -102,42 +103,50 @@ static PyObject* TagCloseClose;

/* Local contexts: */

#define LC_TEMPLATE 0x000007
#define LC_TEMPLATE_NAME 0x000001
#define LC_TEMPLATE_PARAM_KEY 0x000002
#define LC_TEMPLATE_PARAM_VALUE 0x000004

#define LC_ARGUMENT 0x000018
#define LC_ARGUMENT_NAME 0x000008
#define LC_ARGUMENT_DEFAULT 0x000010

#define LC_WIKILINK 0x000060
#define LC_WIKILINK_TITLE 0x000020
#define LC_WIKILINK_TEXT 0x000040

#define LC_HEADING 0x001F80
#define LC_HEADING_LEVEL_1 0x000080
#define LC_HEADING_LEVEL_2 0x000100
#define LC_HEADING_LEVEL_3 0x000200
#define LC_HEADING_LEVEL_4 0x000400
#define LC_HEADING_LEVEL_5 0x000800
#define LC_HEADING_LEVEL_6 0x001000

#define LC_COMMENT 0x002000

#define LC_TAG 0x03C000
#define LC_TAG_OPEN 0x004000
#define LC_TAG_ATTR 0x008000
#define LC_TAG_BODY 0x010000
#define LC_TAG_CLOSE 0x020000

#define LC_SAFETY_CHECK 0xFC0000
#define LC_HAS_TEXT 0x040000
#define LC_FAIL_ON_TEXT 0x080000
#define LC_FAIL_NEXT 0x100000
#define LC_FAIL_ON_LBRACE 0x200000
#define LC_FAIL_ON_RBRACE 0x400000
#define LC_FAIL_ON_EQUALS 0x800000
#define LC_TEMPLATE 0x00000007
#define LC_TEMPLATE_NAME 0x00000001
#define LC_TEMPLATE_PARAM_KEY 0x00000002
#define LC_TEMPLATE_PARAM_VALUE 0x00000004

#define LC_ARGUMENT 0x00000018
#define LC_ARGUMENT_NAME 0x00000008
#define LC_ARGUMENT_DEFAULT 0x00000010

#define LC_WIKILINK 0x00000060
#define LC_WIKILINK_TITLE 0x00000020
#define LC_WIKILINK_TEXT 0x00000040

#define LC_HEADING 0x00001F80
#define LC_HEADING_LEVEL_1 0x00000080
#define LC_HEADING_LEVEL_2 0x00000100
#define LC_HEADING_LEVEL_3 0x00000200
#define LC_HEADING_LEVEL_4 0x00000400
#define LC_HEADING_LEVEL_5 0x00000800
#define LC_HEADING_LEVEL_6 0x00001000

#define LC_COMMENT 0x00002000

#define LC_TAG 0x0003C000
#define LC_TAG_OPEN 0x00004000
#define LC_TAG_ATTR 0x00008000
#define LC_TAG_BODY 0x00010000
#define LC_TAG_CLOSE 0x00020000

#define LC_STYLE 0x003C0000
#define LC_STYLE_ITALICS 0x00040000
#define LC_STYLE_BOLD 0x00080000
#define LC_STYLE_PASS_AGAIN 0x00100000
#define LC_STYLE_SECOND_PASS 0x00200000

#define LC_DLTERM 0x00400000

#define LC_SAFETY_CHECK 0x1F800000
#define LC_HAS_TEXT 0x00800000
#define LC_FAIL_ON_TEXT 0x01000000
#define LC_FAIL_NEXT 0x02000000
#define LC_FAIL_ON_LBRACE 0x04000000
#define LC_FAIL_ON_RBRACE 0x08000000
#define LC_FAIL_ON_EQUALS 0x10000000

/* Global contexts: */

@@ -211,6 +220,7 @@ typedef struct {

/* Macros for accessing HTML tag definitions: */

#define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li")
#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag))
#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag))
#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag))


+ 177
- 7
mwparserfromhell/parser/tokenizer.py View File

@@ -26,13 +26,15 @@ import re

from . import contexts, tokens
from ..compat import htmlentities
from ..tag_defs import is_parsable, is_single, is_single_only
from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only

__all__ = ["Tokenizer"]

class BadRoute(Exception):
"""Raised internally when the current tokenization route is invalid."""
pass

def __init__(self, context=0):
self.context = context


class _TagOpenData(object):
@@ -57,11 +59,11 @@ class Tokenizer(object):
USES_C = False
START = object()
END = object()
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
"/", "-", "\n", END]
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
":", "/", "-", "\n", END]
MAX_DEPTH = 40
MAX_CYCLES = 100000
regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
tag_splitter = re.compile(r"([\s\"\\]+)")

def __init__(self):
@@ -132,8 +134,9 @@ class Tokenizer(object):
Discards the current stack/context/textbuffer and raises
:py:exc:`~.BadRoute`.
"""
context = self._context
self._pop()
raise BadRoute()
raise BadRoute(context)

def _emit(self, token):
"""Write a token to the end of the current token stack."""
@@ -629,10 +632,164 @@ class Tokenizer(object):
else:
self._emit_all(tag)

def _emit_style_tag(self, tag, markup, body):
    """Emit *body* surrounded by the tokens of an opening and closing *tag*.

    The opening token carries *markup* as its ``wiki_markup``; the tag name
    *tag* is written as text inside both the opening and the closing tag.
    """
    # Opening tag, tagged with the wiki markup that produced it.
    opener = tokens.TagOpenOpen(wiki_markup=markup)
    self._emit(opener)
    self._emit_text(tag)
    self._emit(tokens.TagCloseOpen())

    # Everything that was parsed between the two markers.
    self._emit_all(body)

    # Closing tag.
    self._emit(tokens.TagOpenClose())
    self._emit_text(tag)
    self._emit(tokens.TagCloseClose())

def _parse_italics(self):
    """Parse wiki-style italics.

    On success the parsed body is emitted as an ``i`` tag whose wiki markup
    is ``''``. If the route fails, the head is restored; when the failed
    route's context has ``STYLE_PASS_AGAIN`` set, the parse is retried once
    with ``STYLE_SECOND_PASS`` added. If there is nothing to retry, or the
    retry fails as well, the two ticks are emitted as plain text.
    """
    reset = self._head
    try:
        stack = self._parse(contexts.STYLE_ITALICS)
    except BadRoute as route:
        self._head = reset
        if not route.context & contexts.STYLE_PASS_AGAIN:
            return self._emit_text("''")
        try:
            stack = self._parse(route.context | contexts.STYLE_SECOND_PASS)
        except BadRoute:
            # Without this guard a failure of the second pass would escape
            # this method with the head left wherever the retry stopped and
            # the ticks never emitted; fall back to plain text instead.
            self._head = reset
            return self._emit_text("''")
    self._emit_style_tag("i", "''", stack)

def _parse_bold(self):
    """Parse wiki-style bold.

    Returns ``True`` only when the bold route fails while the current
    context has ``STYLE_SECOND_PASS`` set (after emitting a single tick as
    text); every other path returns ``None``.
    """
    start = self._head
    try:
        body = self._parse(contexts.STYLE_BOLD)
    except BadRoute:
        self._head = start
    else:
        self._emit_style_tag("b", "'''", body)
        return None

    # The bold route failed; decide how to treat the three ticks.
    if self._context & contexts.STYLE_SECOND_PASS:
        self._emit_text("'")
        return True
    if self._context & contexts.STYLE_ITALICS:
        # Flag the enclosing italics context and keep the ticks as text.
        self._context |= contexts.STYLE_PASS_AGAIN
        self._emit_text("'''")
        return None
    # Treat the first tick as text and the remaining two as italics.
    self._emit_text("'")
    self._parse_italics()
    return None

def _parse_italics_and_bold(self):
    """Parse wiki-style italics and bold together (i.e., five ticks).

    Bold is attempted first and italics second. Whichever style parses is
    then followed by an attempt at the other one; if that second attempt
    fails, its ticks are emitted as text in front of the single tag. If
    neither style parses, all five ticks are emitted as text.
    """
    bold = (contexts.STYLE_BOLD, "b", "'''")
    italics = (contexts.STYLE_ITALICS, "i", "''")

    def wrap_with(outer, inner, inner_stack):
        """Try *outer* around an already-parsed *inner* and emit the result."""
        outer_context, outer_tag, outer_markup = outer
        inner_tag, inner_markup = inner[1], inner[2]
        saved = self._head
        try:
            remainder = self._parse(outer_context)
        except BadRoute:
            # Only the inner style is valid: outer ticks become text.
            self._head = saved
            self._emit_text(outer_markup)
            self._emit_style_tag(inner_tag, inner_markup, inner_stack)
            return
        # Both are valid: the inner tag plus the remainder form the body of
        # the outer tag.
        self._push()
        self._emit_style_tag(inner_tag, inner_markup, inner_stack)
        self._emit_all(remainder)
        self._emit_style_tag(outer_tag, outer_markup, self._pop())

    start = self._head
    try:
        bold_stack = self._parse(bold[0])
    except BadRoute:
        self._head = start
    else:
        wrap_with(italics, bold, bold_stack)
        return

    try:
        italics_stack = self._parse(italics[0])
    except BadRoute:
        self._head = start
        self._emit_text("'''''")
    else:
        wrap_with(bold, italics, italics_stack)

def _parse_style(self):
    """Parse wiki-style formatting (``''``/``'''`` for italics/bold).

    Counts the run of ticks at the head, normalizes it to two, three or
    five (surplus ticks are emitted as text), and then either closes the
    current italics/bold context by returning the popped stack or tries to
    open a new style. Returns ``None`` when no stack was popped.
    """
    self._head += 2
    count = 2
    while self._read() == "'":
        self._head += 1
        count += 1
    in_italics = self._context & contexts.STYLE_ITALICS
    in_bold = self._context & contexts.STYLE_BOLD

    # Normalize the run: four ticks are one text tick plus bold, and
    # anything past five is text followed by five ticks.
    if count == 4:
        self._emit_text("'")
        count = 3
    elif count > 5:
        self._emit_text("'" * (count - 5))
        count = 5

    closes_italics = in_italics and count in (2, 5)
    closes_bold = in_bold and count in (3, 5)
    if closes_italics or closes_bold:
        if count == 5:
            # Give back the ticks that do not belong to this closing marker.
            self._head -= 3 if in_italics else 2
        return self._pop()

    if not self._can_recurse():
        if count == 3:
            if self._context & contexts.STYLE_SECOND_PASS:
                self._emit_text("'")
                return self._pop()
            self._context |= contexts.STYLE_PASS_AGAIN
        self._emit_text("'" * count)
    elif count == 2:
        self._parse_italics()
    elif count == 3:
        if self._parse_bold():
            return self._pop()
    elif count == 5:
        self._parse_italics_and_bold()
    self._head -= 1

def _handle_list_marker(self):
    """Handle a list marker at the head (``#``, ``*``, ``;``, ``:``).

    Emits a self-closing tag whose wiki markup is the marker itself and
    whose name comes from ``get_html_tag``. A ``;`` marker additionally
    sets ``DL_TERM`` on the current context.
    """
    marker = self._read()
    if marker == ";":
        self._context |= contexts.DL_TERM
    opener = tokens.TagOpenOpen(wiki_markup=marker)
    self._emit(opener)
    self._emit_text(get_html_tag(marker))
    self._emit(tokens.TagCloseSelfclose())

def _handle_list(self):
    """Handle a wiki-style list (``#``, ``*``, ``;``, ``:``).

    Handles the marker at the head, then keeps advancing and handling
    markers for as long as the next character is another list marker.
    """
    self._handle_list_marker()
    while True:
        if self._read(1) not in ("#", "*", ";", ":"):
            break
        self._head += 1
        self._handle_list_marker()

def _handle_hr(self):
    """Handle a wiki-style horizontal rule (``----``) in the string.

    Moves the head to the last hyphen of the run and emits a self-closing
    ``hr`` tag whose wiki markup is the full run of hyphens.
    """
    # The head sits on the first of at least four hyphens; jump to the fourth.
    self._head += 3
    extra = 0
    while self._read(1) == "-":
        self._head += 1
        extra += 1
    rule = "-" * (4 + extra)
    self._emit(tokens.TagOpenOpen(wiki_markup=rule))
    self._emit_text("hr")
    self._emit(tokens.TagCloseSelfclose())

def _handle_dl_term(self):
    """Handle the term in a description list (``foo`` in ``;foo:bar``).

    Toggles ``DL_TERM`` on the current context, then treats a ``:`` at the
    head as a list marker; otherwise a newline is emitted as text.
    """
    self._context ^= contexts.DL_TERM
    if self._read() != ":":
        self._emit_text("\n")
        return
    self._handle_list_marker()

def _handle_end(self):
"""Handle the end of the stream of wikitext."""
fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
contexts.HEADING | contexts.COMMENT | contexts.TAG)
contexts.HEADING | contexts.COMMENT | contexts.TAG |
contexts.STYLE)
double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
if self._context & fail:
if self._context & contexts.TAG_BODY:
@@ -782,6 +939,19 @@ class Tokenizer(object):
self._emit_text("<")
elif this == ">" and self._context & contexts.TAG_CLOSE:
return self._handle_tag_close_close()
elif this == next == "'":
result = self._parse_style()
if result is not None:
return result
elif self._read(-1) in ("\n", self.START):
if this in ("#", "*", ";", ":"):
self._handle_list()
elif this == next == self._read(2) == self._read(3) == "-":
self._handle_hr()
else:
self._emit_text(this)
elif this in ("\n", ":") and self._context & contexts.DL_TERM:
self._handle_dl_term()
else:
self._emit_text(this)
self._head += 1


+ 1
- 5
mwparserfromhell/parser/tokens.py View File

@@ -55,7 +55,7 @@ class Token(object):
return False

def __getattr__(self, key):
return self._kwargs[key]
return self._kwargs.get(key)

def __setattr__(self, key, value):
self._kwargs[key] = value
@@ -63,10 +63,6 @@ class Token(object):
def __delattr__(self, key):
del self._kwargs[key]

def get(self, key, default=None):
"""Same as :py:meth:`__getattr__`, but has a *default* if missing."""
return self._kwargs.get(key, default)


def make(name):
"""Create a new Token class using ``type()`` and add it to ``__all__``."""


+ 9
- 13
mwparserfromhell/tag_defs.py View File

@@ -24,7 +24,7 @@

from __future__ import unicode_literals

__all__ = ["get_wikicode", "is_parsable", "is_visible", "is_single",
__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
"is_single_only"]

PARSER_BLACKLIST = [
@@ -44,20 +44,16 @@ INVISIBLE_TAGS = [
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]

WIKICODE = {
"i": {"open": "''", "close": "''"},
"b": {"open": "'''", "close": "'''"},
"ul": {"open": "*"},
"ol": {"open": "#"},
"dt": {"open": ";"},
"dd": {"open": ":"},
"hr": {"open": "----"},
MARKUP_TO_HTML = {
"#": "li",
"*": "li",
";": "dt",
":": "dd"
}

def get_wikicode(tag):
"""Return the appropriate wikicode before and after the given *tag*."""
data = WIKICODE[tag.lower()]
return (data.get("open"), data.get("close"))
def get_html_tag(markup):
    """Return the HTML tag associated with the given wiki-markup.

    The tag is looked up in ``MARKUP_TO_HTML``; markup that is not a key of
    that mapping raises :py:exc:`KeyError`.
    """
    html_tag = MARKUP_TO_HTML[markup]
    return html_tag

def is_parsable(tag):
"""Return if the given *tag*'s contents should be passed to the parser."""


+ 1
- 1
tests/_test_tree_equality.py View File

@@ -106,7 +106,7 @@ class TreeEqualityTestCase(TestCase):
self.assertEqual(exp_attr.pad_first, act_attr.pad_first)
self.assertEqual(exp_attr.pad_before_eq, act_attr.pad_before_eq)
self.assertEqual(exp_attr.pad_after_eq, act_attr.pad_after_eq)
self.assertIs(expected.showtag, actual.showtag)
self.assertIs(expected.wiki_markup, actual.wiki_markup)
self.assertIs(expected.self_closing, actual.self_closing)
self.assertIs(expected.invalid, actual.invalid)
self.assertIs(expected.implicit, actual.implicit)


+ 14
- 0
tests/test_builder.py View File

@@ -303,6 +303,20 @@ class TestBuilder(TreeEqualityTestCase):
Text(" "), Wikilink(wraptext("q")), Text(" "),
Template(wraptext("r"))]), True, " \n ", " ",
" ")])])),

# "''italic text''"
([tokens.TagOpenOpen(wiki_markup="''"), tokens.Text(text="i"),
tokens.TagCloseOpen(), tokens.Text(text="italic text"),
tokens.TagOpenClose(), tokens.Text(text="i"),
tokens.TagCloseClose()],
wrap([Tag(wraptext("i"), wraptext("italic text"),
wiki_markup="''")])),

# * bullet
([tokens.TagOpenOpen(wiki_markup="*"), tokens.Text(text="li"),
tokens.TagCloseSelfclose(), tokens.Text(text=" bullet")],
wrap([Tag(wraptext("li"), wiki_markup="*", self_closing=True),
Text(" bullet")])),
]
for test, valid in tests:
self.assertWikicodeEqual(valid, self.builder.build(test))


+ 10
- 10
tests/test_tag.py View File

@@ -50,8 +50,8 @@ class TestTag(TreeEqualityTestCase):
implicit=True)
node7 = Tag(wraptext("br"), self_closing=True, invalid=True,
padding=" ")
node8 = Tag(wraptext("hr"), showtag=False, self_closing=True)
node9 = Tag(wraptext("i"), wraptext("italics!"), showtag=False)
node8 = Tag(wraptext("hr"), wiki_markup="----", self_closing=True)
node9 = Tag(wraptext("i"), wraptext("italics!"), wiki_markup="''")

self.assertEqual("<ref></ref>", str(node1))
self.assertEqual('<span style="color: red;">foo</span>', str(node2))
@@ -72,7 +72,7 @@ class TestTag(TreeEqualityTestCase):
# <ref>foobar</ref>
node1 = Tag(wrap([node1n1]), wrap([node1n2]))
# '''bold text'''
node2 = Tag(wraptext("i"), wrap([node2n1]), showtag=False)
node2 = Tag(wraptext("b"), wrap([node2n1]), wiki_markup="'''")
# <img id="foo" class="bar" />
node3 = Tag(wrap([node3n1]),
attrs=[Attribute(wrap([node3n2]), wrap([node3n3])),
@@ -158,15 +158,15 @@ class TestTag(TreeEqualityTestCase):
self.assertEqual([], node1.attributes)
self.assertIs(attrs, node2.attributes)

def test_showtag(self):
"""test getter/setter for the showtag attribute"""
def test_wiki_markup(self):
"""test getter/setter for the wiki_markup attribute"""
node = Tag(wraptext("i"), wraptext("italic text"))
self.assertTrue(node.showtag)
node.showtag = False
self.assertFalse(node.showtag)
self.assertIs(None, node.wiki_markup)
node.wiki_markup = "''"
self.assertEqual("''", node.wiki_markup)
self.assertEqual("''italic text''", node)
node.showtag = 1
self.assertTrue(node.showtag)
node.wiki_markup = False
self.assertFalse(node.wiki_markup)
self.assertEqual("<i>italic text</i>", node)

def test_self_closing(self):


+ 3
- 3
tests/test_tokens.py View File

@@ -44,8 +44,8 @@ class TestTokens(unittest.TestCase):

self.assertEqual("bar", token2.foo)
self.assertEqual(123, token2.baz)
self.assertRaises(KeyError, lambda: token1.foo)
self.assertRaises(KeyError, lambda: token2.bar)
self.assertFalse(token1.foo)
self.assertFalse(token2.bar)

token1.spam = "eggs"
token2.foo = "ham"
@@ -53,7 +53,7 @@ class TestTokens(unittest.TestCase):

self.assertEqual("eggs", token1.spam)
self.assertEqual("ham", token2.foo)
self.assertRaises(KeyError, lambda: token2.baz)
self.assertFalse(token2.baz)
self.assertRaises(KeyError, delattr, token2, "baz")

def test_repr(self):


+ 516
- 0
tests/tokenizer/tags_wikimarkup.mwtest View File

@@ -0,0 +1,516 @@
name: basic_italics
label: basic italic text
input: "''text''"
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: basic_bold
label: basic bold text
input: "'''text'''"
output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="text"), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: basic_ul
label: basic unordered list
input: "*text"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="text")]

---

name: basic_ol
label: basic ordered list
input: "#text"
output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="text")]

---

name: basic_dt
label: basic description term
input: ";text"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="text")]

---

name: basic_dd
label: basic description item
input: ":text"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="text")]

---

name: basic_hr
label: basic horizontal rule
input: "----"
output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose()]

---

name: complex_italics
label: italics with a lot in them
input: "''this is a&nbsp;test of [[Italic text|italics]] with {{plenty|of|stuff}}''"
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of "), WikilinkOpen(), Text(text="Italic text"), WikilinkSeparator(), Text(text="italics"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose(), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: multiline_italics
label: italics spanning multiple lines
input: "foo\nbar''testing\ntext\nspanning\n\n\n\n\nmultiple\nlines''foo\n\nbar"
output: [Text(text="foo\nbar"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="testing\ntext\nspanning\n\n\n\n\nmultiple\nlines"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="foo\n\nbar")]

---

name: unending_italics
label: italics without an ending tag
input: "''unending formatting!"
output: [Text(text="''unending formatting!")]

---

name: misleading_italics_end
label: italics with something that looks like an end but isn't
input: "''this is 'not' the en'd'<nowiki>''</nowiki>"
output: [Text(text="''this is 'not' the en'd'"), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="''"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()]

---

name: italics_start_outside_end_inside
label: italics that start outside a link and end inside it
input: "''foo[[bar|baz'']]spam"
output: [Text(text="''foo"), WikilinkOpen(), Text(text="bar"), WikilinkSeparator(), Text(text="baz''"), WikilinkClose(), Text(text="spam")]

---

name: italics_start_inside_end_outside
label: italics that start inside a link and end outside it
input: "[[foo|''bar]]baz''spam"
output: [Text(text="[[foo|"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar]]baz"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="spam")]

---

name: complex_bold
label: bold with a lot in it
input: "'''this is a&nbsp;test of [[Bold text|bold]] with {{plenty|of|stuff}}'''"
output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of "), WikilinkOpen(), Text(text="Bold text"), WikilinkSeparator(), Text(text="bold"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose(), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: multiline_bold
label: bold spanning multiple lines
input: "foo\nbar'''testing\ntext\nspanning\n\n\n\n\nmultiple\nlines'''foo\n\nbar"
output: [Text(text="foo\nbar"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="testing\ntext\nspanning\n\n\n\n\nmultiple\nlines"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="foo\n\nbar")]

---

name: unending_bold
label: bold without an ending tag
input: "'''unending formatting!"
output: [Text(text="'''unending formatting!")]

---

name: misleading_bold_end
label: bold with something that looks like an end but isn't
input: "'''this is 'not' the en''d'<nowiki>'''</nowiki>"
output: [Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="this is 'not' the en"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="d'"), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="'''"), TagOpenClose(), Text(text="nowiki"), TagCloseClose()]

---

name: bold_start_outside_end_inside
label: bold that starts outside a link and ends inside it
input: "'''foo[[bar|baz''']]spam"
output: [Text(text="'''foo"), WikilinkOpen(), Text(text="bar"), WikilinkSeparator(), Text(text="baz'''"), WikilinkClose(), Text(text="spam")]

---

name: bold_start_inside_end_outside
label: bold that starts inside a link and ends outside it
input: "[[foo|'''bar]]baz'''spam"
output: [Text(text="[[foo|"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bar]]baz"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="spam")]

---

name: bold_and_italics
label: bold and italics together
input: "this is '''''bold and italic text'''''!"
output: [Text(text="this is "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold and italic text"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="!")]

---

name: both_then_bold
label: text that starts bold/italic, then is just bold
input: "'''''both''bold'''"
output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: both_then_italics
label: text that starts bold/italic, then is just italic
input: "'''''both'''italics''"
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: bold_then_both
label: text that starts just bold, then is bold/italic
input: "'''bold''both'''''"
output: [TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: italics_then_both
label: text that starts just italic, then is bold/italic
input: "''italics'''both'''''"
output: [TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="both"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: italics_then_bold
label: text that starts italic, then is bold
input: "none''italics'''''bold'''none"
output: [Text(text="none"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="none")]

---

name: bold_then_italics
label: text that starts bold, then is italic
input: "none'''bold'''''italics''none"
output: [Text(text="none"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bold"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="italics"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text="none")]

---

name: five_three
label: five ticks to open, three to close (bold)
input: "'''''foobar'''"
output: [Text(text="''"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="foobar"), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: five_two
label: five ticks to open, two to close (italics)
input: "'''''foobar''"
output: [Text(text="'''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="foobar"), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: four
label: four ticks
input: "foo ''''bar'''' baz"
output: [Text(text="foo '"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="bar'"), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text=" baz")]

---

name: four_two
label: four ticks to open, two to close
input: "foo ''''bar'' baz"
output: [Text(text="foo ''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text=" baz")]

---

name: two_three
label: two ticks to open, three to close
input: "foo ''bar''' baz"
output: [Text(text="foo "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar'"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text=" baz")]

---

name: two_four
label: two ticks to open, four to close
input: "foo ''bar'''' baz"
output: [Text(text="foo "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar''"), TagOpenClose(), Text(text="i"), TagCloseClose(), Text(text=" baz")]

---

name: two_three_two
label: two ticks to open, three to close, two afterwards
input: "foo ''bar''' baz''"
output: [Text(text="foo "), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar''' baz"), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: two_four_four
label: two ticks to open, four to close, four afterwards
input: "foo ''bar'''' baz''''"
output: [Text(text="foo ''bar'"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text=" baz'"), TagOpenClose(), Text(text="b"), TagCloseClose()]

---

name: seven
label: seven ticks
input: "'''''''seven'''''''"
output: [Text(text="''"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="seven''"), TagOpenClose(), Text(text="b"), TagCloseClose(), TagOpenClose(), Text(text="i"), TagCloseClose()]

---

name: complex_ul
label: ul with a lot in it
input: "* this is a&nbsp;test of an [[Unordered list|ul]] with {{plenty|of|stuff}}"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="Unordered list"), WikilinkSeparator(), Text(text="ul"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()]

---

name: ul_multiline_template
label: ul with a template that spans multiple lines
input: "* this has a template with a {{line|\nbreak}}\nthis is not part of the list"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")]

---

name: ul_adjacent
label: multiple adjacent uls
input: "a\n*b\n*c\nd\n*e\nf"
output: [Text(text="a\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf")]

---

name: ul_depths
label: multiple adjacent uls, with differing depths
input: "*a\n**b\n***c\n********d\n**e\nf\n***g"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="g")]

---

name: ul_space_before
label: uls with space before them
input: "foo *bar\n *baz\n*buzz"
output: [Text(text="foo *bar\n *baz\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="buzz")]

---

name: ul_interruption
label: high-depth ul with something blocking it
input: "**f*oobar"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="f*oobar")]

---

name: complex_ol
label: ol with a lot in it
input: "# this is a&nbsp;test of an [[Ordered list|ol]] with {{plenty|of|stuff}}"
output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="Ordered list"), WikilinkSeparator(), Text(text="ol"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()]

---

name: ol_multiline_template
label: ol with a template that spans multiple lines
input: "# this has a template with a {{line|\nbreak}}\nthis is not part of the list"
output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")]

---

name: ol_adjacent
label: multiple adjacent ols
input: "a\n#b\n#c\nd\n#e\nf"
output: [Text(text="a\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf")]

---

name: ol_depths
label: multiple adjacent ols, with differing depths
input: "#a\n##b\n###c\n########d\n##e\nf\n###g"
output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="g")]

---

name: ol_space_before
label: ols with space before them
input: "foo #bar\n #baz\n#buzz"
output: [Text(text="foo #bar\n #baz\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="buzz")]

---

name: ol_interruption
label: high-depth ol with something blocking it
input: "##f#oobar"
output: [TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="f#oobar")]

---

name: ul_ol_mix
label: a mix of adjacent uls and ols
input: "*a\n*#b\n*##c\n*##*#*#*d\n*#e\nf\n##*g"
output: [TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="g")]

---

name: complex_dt
label: dt with a lot in it
input: "; this is a&nbsp;test of an [[description term|dt]] with {{plenty|of|stuff}}"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description term"), WikilinkSeparator(), Text(text="dt"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()]

---

name: dt_multiline_template
label: dt with a template that spans multiple lines
input: "; this has a template with a {{line|\nbreak}}\nthis is not part of the list"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")]

---

name: dt_adjacent
label: multiple adjacent dts
input: "a\n;b\n;c\nd\n;e\nf"
output: [Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf")]

---

name: dt_depths
label: multiple adjacent dts, with differing depths
input: ";a\n;;b\n;;;c\n;;;;;;;;d\n;;e\nf\n;;;g"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="g")]

---

name: dt_space_before
label: dts with space before them
input: "foo ;bar\n ;baz\n;buzz"
output: [Text(text="foo ;bar\n ;baz\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="buzz")]

---

name: dt_interruption
label: high-depth dt with something blocking it
input: ";;f;oobar"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="f;oobar")]

---

name: complex_dd
label: dd with a lot in it
input: ": this is a&nbsp;test of an [[description item|dd]] with {{plenty|of|stuff}}"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description item"), WikilinkSeparator(), Text(text="dd"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()]

---

name: dd_multiline_template
label: dd with a template that spans multiple lines
input: ": this has a template with a {{line|\nbreak}}\nthis is not part of the list"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this has a template with a "), TemplateOpen(), Text(text="line"), TemplateParamSeparator(), Text(text="\nbreak"), TemplateClose(), Text(text="\nthis is not part of the list")]

---

name: dd_adjacent
label: multiple adjacent dds
input: "a\n:b\n:c\nd\n:e\nf"
output: [Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf")]

---

name: dd_depths
label: multiple adjacent dds, with differing depths
input: ":a\n::b\n:::c\n::::::::d\n::e\nf\n:::g"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="g")]

---

name: dd_space_before
label: dds with space before them
input: "foo :bar\n :baz\n:buzz"
output: [Text(text="foo :bar\n :baz\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="buzz")]

---

name: dd_interruption
label: high-depth dd with something blocking it
input: "::f:oobar"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="f:oobar")]

---

name: dt_dd_mix
label: a mix of adjacent dts and dds
input: ";a\n;:b\n;::c\n;::;:;:;d\n;:e\nf\n::;g"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="d\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="g")]

---

name: dt_dd_mix2
label: the correct usage of a dt/dd unit, as in a dl
input: ";foo:bar:baz"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")]

---

name: dt_dd_mix3
label: another example of correct (but strange) dt/dd usage
input: ":;;::foo:bar:baz"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")]

---

name: ul_ol_dt_dd_mix
label: an assortment of uls, ols, dds, and dts
input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), Text(text="foo\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo\n"), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="foo\n"), TagOpenOpen(wiki_markup="*"), Text(text="li"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup="#"), Text(text="li"), TagCloseSelfclose(), Text(text="foo")]

---

name: hr_text_before
label: text before an otherwise-valid hr
input: "foo----"
output: [Text(text="foo----")]

---

name: hr_text_after
label: text after a valid hr
input: "----bar"
output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="bar")]

---

name: hr_text_before_after
label: text at both ends of an otherwise-valid hr
input: "foo----bar"
output: [Text(text="foo----bar")]

---

name: hr_newlines
label: newlines surrounding a valid hr
input: "foo\n----\nbar"
output: [Text(text="foo\n"), TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="\nbar")]

---

name: hr_adjacent
label: two adjacent hrs
input: "----\n----"
output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="\n"), TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose()]

---

name: hr_adjacent_space
label: two adjacent hrs, with a space before the second one, making it invalid
input: "----\n ----"
output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="\n ----")]

---

name: hr_short
label: an invalid three-hyphen-long hr
input: "---"
output: [Text(text="---")]

---

name: hr_long
label: a very long, valid hr
input: "------------------------------------------"
output: [TagOpenOpen(wiki_markup="------------------------------------------"), Text(text="hr"), TagCloseSelfclose()]

---

name: hr_interruption_short
label: a hr that is interrupted, making it invalid
input: "---x-"
output: [Text(text="---x-")]

---

name: hr_interruption_long
label: a hr that is interrupted, but the first part remains valid because it is long enough
input: "----x--"
output: [TagOpenOpen(wiki_markup="----"), Text(text="hr"), TagCloseSelfclose(), Text(text="x--")]

Loading…
Cancel
Save