Browse Source

Merge branch 'feature/tables' into develop (closes #81)

tags/v0.4
Ben Kurtovic 10 years ago
parent
commit
45ed8c445c
14 changed files with 1357 additions and 108 deletions
  1. +1
    -0
      CHANGELOG
  2. +1
    -0
      docs/changelog.rst
  3. +1
    -1
      mwparserfromhell/definitions.py
  4. +54
    -7
      mwparserfromhell/nodes/tag.py
  5. +6
    -1
      mwparserfromhell/parser/builder.py
  6. +22
    -2
      mwparserfromhell/parser/contexts.py
  7. +491
    -33
      mwparserfromhell/parser/tokenizer.c
  8. +65
    -54
      mwparserfromhell/parser/tokenizer.h
  9. +218
    -7
      mwparserfromhell/parser/tokenizer.py
  10. +8
    -3
      tests/_test_tokenizer.py
  11. +41
    -0
      tests/test_roundtripping.py
  12. +32
    -0
      tests/test_tag.py
  13. +410
    -0
      tests/tokenizer/tables.mwtest
  14. +7
    -0
      tests/tokenizer/tags_wikimarkup.mwtest

+ 1
- 0
CHANGELOG View File

@@ -2,6 +2,7 @@ v0.4 (unreleased):


- The parser is now distributed with Windows binaries, fixing an issue that - The parser is now distributed with Windows binaries, fixing an issue that
prevented Windows users from using the C tokenizer. prevented Windows users from using the C tokenizer.
- Added support for parsing wikicode tables (patches by David Winegar).
- Added a script to test for memory leaks in scripts/memtest.py. - Added a script to test for memory leaks in scripts/memtest.py.
- Added a script to do releases in scripts/release.sh. - Added a script to do releases in scripts/release.sh.
- skip_style_tags can now be passed to mwparserfromhell.parse() (previously, - skip_style_tags can now be passed to mwparserfromhell.parse() (previously,


+ 1
- 0
docs/changelog.rst View File

@@ -9,6 +9,7 @@ Unreleased


- The parser is now distributed with Windows binaries, fixing an issue that - The parser is now distributed with Windows binaries, fixing an issue that
prevented Windows users from using the C tokenizer. prevented Windows users from using the C tokenizer.
- Added support for parsing wikicode tables (patches by David Winegar).
- Added a script to test for memory leaks in :file:`scripts/memtest.py`. - Added a script to test for memory leaks in :file:`scripts/memtest.py`.
- Added a script to do releases in :file:`scripts/release.sh`. - Added a script to do releases in :file:`scripts/release.sh`.
- *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse() - *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse()


+ 1
- 1
mwparserfromhell/definitions.py View File

@@ -52,7 +52,7 @@ INVISIBLE_TAGS = [


# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 # [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]


MARKUP_TO_HTML = { MARKUP_TO_HTML = {
"#": "li", "#": "li",


+ 54
- 7
mwparserfromhell/nodes/tag.py View File

@@ -35,7 +35,8 @@ class Tag(Node):


def __init__(self, tag, contents=None, attrs=None, wiki_markup=None, def __init__(self, tag, contents=None, attrs=None, wiki_markup=None,
self_closing=False, invalid=False, implicit=False, padding="", self_closing=False, invalid=False, implicit=False, padding="",
closing_tag=None):
closing_tag=None, wiki_style_separator=None,
closing_wiki_markup=None):
super(Tag, self).__init__() super(Tag, self).__init__()
self._tag = tag self._tag = tag
if contents is None and not self_closing: if contents is None and not self_closing:
@@ -52,13 +53,28 @@ class Tag(Node):
self._closing_tag = closing_tag self._closing_tag = closing_tag
else: else:
self._closing_tag = tag self._closing_tag = tag
self._wiki_style_separator = wiki_style_separator
if closing_wiki_markup is not None:
self._closing_wiki_markup = closing_wiki_markup
elif wiki_markup and not self_closing:
self._closing_wiki_markup = wiki_markup
else:
self._closing_wiki_markup = None


def __unicode__(self): def __unicode__(self):
if self.wiki_markup: if self.wiki_markup:
if self.attributes:
attrs = "".join([str(attr) for attr in self.attributes])
else:
attrs = ""
padding = self.padding or ""
separator = self.wiki_style_separator or ""
close = self.closing_wiki_markup or ""
if self.self_closing: if self.self_closing:
return self.wiki_markup
return self.wiki_markup + attrs + padding + separator
else: else:
return self.wiki_markup + str(self.contents) + self.wiki_markup
return self.wiki_markup + attrs + padding + separator + \
str(self.contents) + close


result = ("</" if self.invalid else "<") + str(self.tag) result = ("</" if self.invalid else "<") + str(self.tag)
if self.attributes: if self.attributes:
@@ -73,10 +89,10 @@ class Tag(Node):
def __children__(self): def __children__(self):
if not self.wiki_markup: if not self.wiki_markup:
yield self.tag yield self.tag
for attr in self.attributes:
yield attr.name
if attr.value is not None:
yield attr.value
for attr in self.attributes:
yield attr.name
if attr.value is not None:
yield attr.value
if self.contents: if self.contents:
yield self.contents yield self.contents
if not self.self_closing and not self.wiki_markup and self.closing_tag: if not self.self_closing and not self.wiki_markup and self.closing_tag:
@@ -174,6 +190,27 @@ class Tag(Node):
""" """
return self._closing_tag return self._closing_tag


@property
def wiki_style_separator(self):
"""The separator between the padding and content in a wiki markup tag.

Essentially the wiki equivalent of the TagCloseOpen.
"""
return self._wiki_style_separator

@property
def closing_wiki_markup(self):
"""The wikified version of the closing tag to show instead of HTML.

If set to a value, this will be displayed instead of the close tag
brackets. If tag is :attr:`self_closing` is ``True`` then this is not
displayed. If :attr:`wiki_markup` is set and this has not been set, this
is set to the value of :attr:`wiki_markup`. If this has been set and
:attr:`wiki_markup` is set to a ``False`` value, this is set to
``None``.
"""
return self._closing_wiki_markup

@tag.setter @tag.setter
def tag(self, value): def tag(self, value):
self._tag = self._closing_tag = parse_anything(value) self._tag = self._closing_tag = parse_anything(value)
@@ -185,6 +222,8 @@ class Tag(Node):
@wiki_markup.setter @wiki_markup.setter
def wiki_markup(self, value): def wiki_markup(self, value):
self._wiki_markup = str(value) if value else None self._wiki_markup = str(value) if value else None
if not value or not self.closing_wiki_markup:
self._closing_wiki_markup = self._wiki_markup


@self_closing.setter @self_closing.setter
def self_closing(self, value): def self_closing(self, value):
@@ -212,6 +251,14 @@ class Tag(Node):
def closing_tag(self, value): def closing_tag(self, value):
self._closing_tag = parse_anything(value) self._closing_tag = parse_anything(value)


@wiki_style_separator.setter
def wiki_style_separator(self, value):
self._wiki_style_separator = str(value) if value else None

@closing_wiki_markup.setter
def closing_wiki_markup(self, value):
self._closing_wiki_markup = str(value) if value else None

def has(self, name): def has(self, name):
"""Return whether any attribute in the tag has the given *name*. """Return whether any attribute in the tag has the given *name*.




+ 6
- 1
mwparserfromhell/parser/builder.py View File

@@ -249,20 +249,24 @@ class Builder(object):
close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose) close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
implicit, attrs, contents, closing_tag = False, [], None, None implicit, attrs, contents, closing_tag = False, [], None, None
wiki_markup, invalid = token.wiki_markup, token.invalid or False wiki_markup, invalid = token.wiki_markup, token.invalid or False
wiki_style_separator, closing_wiki_markup = None, wiki_markup
self._push() self._push()
while self._tokens: while self._tokens:
token = self._tokens.pop() token = self._tokens.pop()
if isinstance(token, tokens.TagAttrStart): if isinstance(token, tokens.TagAttrStart):
attrs.append(self._handle_attribute(token)) attrs.append(self._handle_attribute(token))
elif isinstance(token, tokens.TagCloseOpen): elif isinstance(token, tokens.TagCloseOpen):
wiki_style_separator = token.wiki_markup
padding = token.padding or "" padding = token.padding or ""
tag = self._pop() tag = self._pop()
self._push() self._push()
elif isinstance(token, tokens.TagOpenClose): elif isinstance(token, tokens.TagOpenClose):
closing_wiki_markup = token.wiki_markup
contents = self._pop() contents = self._pop()
self._push() self._push()
elif isinstance(token, close_tokens): elif isinstance(token, close_tokens):
if isinstance(token, tokens.TagCloseSelfclose): if isinstance(token, tokens.TagCloseSelfclose):
closing_wiki_markup = token.wiki_markup
tag = self._pop() tag = self._pop()
self_closing = True self_closing = True
padding = token.padding or "" padding = token.padding or ""
@@ -271,7 +275,8 @@ class Builder(object):
self_closing = False self_closing = False
closing_tag = self._pop() closing_tag = self._pop()
return Tag(tag, contents, attrs, wiki_markup, self_closing, return Tag(tag, contents, attrs, wiki_markup, self_closing,
invalid, implicit, padding, closing_tag)
invalid, implicit, padding, closing_tag,
wiki_style_separator, closing_wiki_markup)
else: else:
self._write(self._handle_token(token)) self._write(self._handle_token(token))
raise ParserError("_handle_tag() missed a close token") raise ParserError("_handle_tag() missed a close token")


+ 22
- 2
mwparserfromhell/parser/contexts.py View File

@@ -90,6 +90,15 @@ Local (stack-specific) contexts:
* :const:`FAIL_ON_RBRACE` * :const:`FAIL_ON_RBRACE`
* :const:`FAIL_ON_EQUALS` * :const:`FAIL_ON_EQUALS`


* :const:`TABLE`

* :const:`TABLE_OPEN`
* :const:`TABLE_CELL_OPEN`
* :const:`TABLE_CELL_STYLE`
* :const:`TABLE_TD_LINE`
* :const:`TABLE_TH_LINE`
* :const:`TABLE_CELL_LINE_CONTEXTS`

Global contexts: Global contexts:


* :const:`GL_HEADING` * :const:`GL_HEADING`
@@ -155,15 +164,26 @@ FAIL_ON_EQUALS = 1 << 29
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS) FAIL_ON_RBRACE + FAIL_ON_EQUALS)


TABLE_OPEN = 1 << 30
TABLE_CELL_OPEN = 1 << 31
TABLE_CELL_STYLE = 1 << 32
TABLE_ROW_OPEN = 1 << 33
TABLE_TD_LINE = 1 << 34
TABLE_TH_LINE = 1 << 35
TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
TABLE_TD_LINE + TABLE_TH_LINE)

# Global contexts: # Global contexts:


GL_HEADING = 1 << 0 GL_HEADING = 1 << 0


# Aggregate contexts: # Aggregate contexts:


FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE
FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG +
STYLE + TABLE)
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN
NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK

+ 491
- 33
mwparserfromhell/parser/tokenizer.c View File

@@ -241,7 +241,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
/* /*
Add a new token stack, context, and textbuffer to the list. Add a new token stack, context, and textbuffer to the list.
*/ */
static int Tokenizer_push(Tokenizer* self, int context)
static int Tokenizer_push(Tokenizer* self, uint64_t context)
{ {
Stack* top = malloc(sizeof(Stack)); Stack* top = malloc(sizeof(Stack));


@@ -333,7 +333,7 @@ static PyObject* Tokenizer_pop(Tokenizer* self)
static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
{ {
PyObject* stack; PyObject* stack;
int context;
uint64_t context;


if (Tokenizer_push_textbuffer(self)) if (Tokenizer_push_textbuffer(self))
return NULL; return NULL;
@@ -351,7 +351,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
*/ */
static void* Tokenizer_fail_route(Tokenizer* self) static void* Tokenizer_fail_route(Tokenizer* self)
{ {
int context = self->topstack->context;
uint64_t context = self->topstack->context;
PyObject* stack = Tokenizer_pop(self); PyObject* stack = Tokenizer_pop(self);


Py_XDECREF(stack); Py_XDECREF(stack);
@@ -676,11 +676,8 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
RESET_ROUTE(); RESET_ROUTE();
for (i = 0; i < braces; i++) text[i] = '{'; for (i = 0; i < braces; i++) text[i] = '{';
text[braces] = '\0'; text[braces] = '\0';
if (Tokenizer_emit_text_then_stack(self, text)) {
Py_XDECREF(text);
if (Tokenizer_emit_text_then_stack(self, text))
return -1; return -1;
}
Py_XDECREF(text);
return 0; return 0;
} }
else else
@@ -1034,7 +1031,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
{ {
// Built from Tokenizer_parse()'s end sentinels: // Built from Tokenizer_parse()'s end sentinels:
Py_UNICODE after = Tokenizer_READ(self, 2); Py_UNICODE after = Tokenizer_READ(self, 2);
int ctx = self->topstack->context;
uint64_t ctx = self->topstack->context;


return (!this || this == '\n' || this == '[' || this == ']' || return (!this || this == '\n' || this == '[' || this == ']' ||
this == '<' || this == '>' || (this == '\'' && next == '\'') || this == '<' || this == '>' || (this == '\'' && next == '\'') ||
@@ -1629,9 +1626,9 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
static int static int
Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
{ {
int ctx = data->context;
int end_of_value = (ctx & TAG_ATTR_VALUE &&
!(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));
uint64_t ctx = data->context;
uint64_t end_of_value = (ctx & TAG_ATTR_VALUE &&
!(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));


if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) { if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) {
if (Tokenizer_push_tag_buffer(self, data)) if (Tokenizer_push_tag_buffer(self, data))
@@ -2153,7 +2150,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag,
static int Tokenizer_parse_italics(Tokenizer* self) static int Tokenizer_parse_italics(Tokenizer* self)
{ {
Py_ssize_t reset = self->head; Py_ssize_t reset = self->head;
int context;
uint64_t context;
PyObject *stack; PyObject *stack;


stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1); stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
@@ -2273,7 +2270,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
*/ */
static PyObject* Tokenizer_parse_style(Tokenizer* self) static PyObject* Tokenizer_parse_style(Tokenizer* self)
{ {
int context = self->topstack->context, ticks = 2, i;
uint64_t context = self->topstack->context, ticks = 2, i;


self->head += 2; self->head += 2;
while (Tokenizer_READ(self, 0) == '\'') { while (Tokenizer_READ(self, 0) == '\'') {
@@ -2426,9 +2423,363 @@ static int Tokenizer_handle_dl_term(Tokenizer* self)
} }


/* /*
Emit a table tag.
*/
static int
Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
const char* tag, PyObject* style, PyObject* padding,
const char* close_open_markup, PyObject* contents,
const char* open_close_markup)
{
PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs,
*close_open_markup_unicode, *open_close_kwargs,
*open_close_markup_unicode;

open_open_kwargs = PyDict_New();
if (!open_open_kwargs)
goto fail_decref_all;
open_open_markup_unicode = PyUnicode_FromString(open_open_markup);
if (!open_open_markup_unicode) {
Py_DECREF(open_open_kwargs);
goto fail_decref_all;
}
PyDict_SetItemString(open_open_kwargs, "wiki_markup",
open_open_markup_unicode);
Py_DECREF(open_open_markup_unicode);
if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs))
goto fail_decref_all;
if (Tokenizer_emit_text(self, tag))
goto fail_decref_all;

if (style) {
if (Tokenizer_emit_all(self, style))
goto fail_decref_all;
Py_DECREF(style);
}

close_open_kwargs = PyDict_New();
if (!close_open_kwargs)
goto fail_decref_padding_contents;
if (close_open_markup && strlen(close_open_markup) != 0) {
close_open_markup_unicode = PyUnicode_FromString(close_open_markup);
if (!close_open_markup_unicode) {
Py_DECREF(close_open_kwargs);
goto fail_decref_padding_contents;
}
PyDict_SetItemString(close_open_kwargs, "wiki_markup",
close_open_markup_unicode);
Py_DECREF(close_open_markup_unicode);
}
PyDict_SetItemString(close_open_kwargs, "padding", padding);
Py_DECREF(padding);
if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs))
goto fail_decref_contents;

if (contents) {
if (Tokenizer_emit_all(self, contents))
goto fail_decref_contents;
Py_DECREF(contents);
}

open_close_kwargs = PyDict_New();
if (!open_close_kwargs)
return -1;
open_close_markup_unicode = PyUnicode_FromString(open_close_markup);
if (!open_close_markup_unicode) {
Py_DECREF(open_close_kwargs);
return -1;
}
PyDict_SetItemString(open_close_kwargs, "wiki_markup",
open_close_markup_unicode);
Py_DECREF(open_close_markup_unicode);
if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs))
return -1;
if (Tokenizer_emit_text(self, tag))
return -1;
if (Tokenizer_emit(self, TagCloseClose))
return -1;
return 0;

fail_decref_all:
Py_XDECREF(style);
fail_decref_padding_contents:
Py_DECREF(padding);
fail_decref_contents:
Py_DECREF(contents);
return -1;
}

/*
Handle style attributes for a table until an ending token.
*/
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
{
TagData *data = TagData_new();
PyObject *padding, *trash;
Py_UNICODE this;
int can_exit;

if (!data)
return NULL;
data->context = TAG_ATTR_READY;

while (1) {
this = Tokenizer_READ(self, 0);
can_exit = (!(data->context & TAG_QUOTED) || data->context & TAG_NOTE_SPACE);
if (this == end_token && can_exit) {
if (data->context & (TAG_ATTR_NAME | TAG_ATTR_VALUE)) {
if (Tokenizer_push_tag_buffer(self, data)) {
TagData_dealloc(data);
return NULL;
}
}
if (Py_UNICODE_ISSPACE(this))
Textbuffer_write(&(data->pad_first), this);
padding = Textbuffer_render(data->pad_first);
TagData_dealloc(data);
if (!padding)
return NULL;
return padding;
}
else if (!this || this == end_token) {
if (self->topstack->context & LC_TAG_ATTR) {
if (data->context & TAG_QUOTED) {
// Unclosed attribute quote: reset, don't die
data->context = TAG_ATTR_VALUE;
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
self->head = data->reset;
continue;
}
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
}
TagData_dealloc(data);
return Tokenizer_fail_route(self);
}
else {
if (Tokenizer_handle_tag_data(self, data, this) || BAD_ROUTE) {
TagData_dealloc(data);
return NULL;
}
}
self->head++;
}
}

/*
Parse a wikicode table by starting with the first line.
*/
static int Tokenizer_parse_table(Tokenizer* self)
{
Py_ssize_t reset = self->head + 1;
PyObject *style, *padding;
PyObject *table = NULL;
self->head += 2;

if(Tokenizer_push(self, LC_TABLE_OPEN))
return -1;
padding = Tokenizer_handle_table_style(self, '\n');
if (BAD_ROUTE) {
RESET_ROUTE();
self->head = reset;
if (Tokenizer_emit_text(self, "{|"))
return -1;
return 0;
}
if (!padding)
return -1;
style = Tokenizer_pop(self);
if (!style) {
Py_DECREF(padding);
return -1;
}

self->head++;
table = Tokenizer_parse(self, LC_TABLE_OPEN, 1);
if (BAD_ROUTE) {
RESET_ROUTE();
Py_DECREF(padding);
Py_DECREF(style);
self->head = reset;
if (Tokenizer_emit_text(self, "{|"))
return -1;
return 0;
}
if (!table) {
Py_DECREF(padding);
Py_DECREF(style);
return -1;
}

if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL,
table, "|}"))
return -1;
// Offset displacement done by _parse()
self->head--;
return 0;
}

/*
Parse as style until end of the line, then continue.
*/
static int Tokenizer_handle_table_row(Tokenizer* self)
{
PyObject *padding, *style, *row, *trash;
self->head += 2;

if (!Tokenizer_CAN_RECURSE(self)) {
if (Tokenizer_emit_text(self, "|-"))
return -1;
self->head -= 1;
return 0;
}

if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN))
return -1;
padding = Tokenizer_handle_table_style(self, '\n');
if (BAD_ROUTE) {
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
return 0;
}
if (!padding)
return -1;
style = Tokenizer_pop(self);
if (!style) {
Py_DECREF(padding);
return -1;
}

// Don't parse the style separator
self->head++;
row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1);
if (!row) {
Py_DECREF(padding);
Py_DECREF(style);
return -1;
}

if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, ""))
return -1;
// Offset displacement done by _parse()
self->head--;
return 0;
}

/*
Parse as normal syntax unless we hit a style marker, then parse style
as HTML attributes and the remainder as normal syntax.
*/
static int
Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
const char *tag, uint64_t line_context)
{
uint64_t old_context = self->topstack->context;
uint64_t cell_context;
Py_ssize_t reset;
PyObject *padding, *cell, *style = NULL;
const char *close_open_markup = NULL;

self->head += strlen(markup);
reset = self->head;

if (!Tokenizer_CAN_RECURSE(self)) {
if (Tokenizer_emit_text(self, markup))
return -1;
self->head--;
return 0;
}

cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
LC_TABLE_CELL_STYLE | line_context, 1);
if (!cell)
return -1;
cell_context = self->topstack->context;
self->topstack->context = old_context;

if (cell_context & LC_TABLE_CELL_STYLE) {
Py_DECREF(cell);
self->head = reset;
if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
line_context))
return -1;
padding = Tokenizer_handle_table_style(self, '|');
if (!padding)
return -1;
style = Tokenizer_pop(self);
if (!style) {
Py_DECREF(padding);
return -1;
}
// Don't parse the style separator
self->head++;
cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
line_context, 1);
if (!cell) {
Py_DECREF(padding);
Py_DECREF(style);
return -1;
}
cell_context = self->topstack->context;
self->topstack->context = old_context;
}
else {
padding = PyUnicode_FromString("");
if (!padding) {
Py_DECREF(cell);
return -1;
}
}

if (style) {
close_open_markup = "|";
}
if (Tokenizer_emit_table_tag(self, markup, tag, style, padding,
close_open_markup, cell, ""))
return -1;
// Keep header/cell line contexts
self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE);
// Offset displacement done by parse()
self->head--;
return 0;
}

/*
Returns the context, stack, and whether to reset the cell for style
in a tuple.
*/
static PyObject*
Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style)
{
if (reset_for_style)
self->topstack->context |= LC_TABLE_CELL_STYLE;
else
self->topstack->context &= ~LC_TABLE_CELL_STYLE;
return Tokenizer_pop_keeping_context(self);
}

/*
Return the stack in order to handle the table row end.
*/
static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
{
return Tokenizer_pop(self);
}

/*
Return the stack in order to handle the table end.
*/
static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
{
self->head += 2;
return Tokenizer_pop(self);
}

/*
Handle the end of the stream of wikitext. Handle the end of the stream of wikitext.
*/ */
static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
{ {
PyObject *token, *text, *trash; PyObject *token, *text, *trash;
int single; int single;
@@ -2444,9 +2795,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
if (single) if (single)
return Tokenizer_handle_single_tag_end(self); return Tokenizer_handle_single_tag_end(self);
} }
else if (context & AGG_DOUBLE) {
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
else {
if (context & LC_TABLE_CELL_OPEN) {
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
context = self->topstack->context;
}
if (context & AGG_DOUBLE) {
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
}
} }
return Tokenizer_fail_route(self); return Tokenizer_fail_route(self);
} }
@@ -2457,7 +2815,8 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
Make sure we are not trying to write an invalid character. Return 0 if Make sure we are not trying to write an invalid character. Return 0 if
everything is safe, or -1 if the route must be failed. everything is safe, or -1 if the route must be failed.
*/ */
static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
static int
Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
{ {
if (context & LC_FAIL_NEXT) if (context & LC_FAIL_NEXT)
return -1; return -1;
@@ -2508,7 +2867,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
} }
else if (context & LC_FAIL_ON_LBRACE) { else if (context & LC_FAIL_ON_LBRACE) {
if (data == '{' || (Tokenizer_READ_BACKWARDS(self, 1) == '{' && if (data == '{' || (Tokenizer_READ_BACKWARDS(self, 1) == '{' &&
Tokenizer_READ_BACKWARDS(self, 2) == '{')) {
Tokenizer_READ_BACKWARDS(self, 2) == '{')) {
if (context & LC_TEMPLATE) if (context & LC_TEMPLATE)
self->topstack->context |= LC_FAIL_ON_EQUALS; self->topstack->context |= LC_FAIL_ON_EQUALS;
else else
@@ -2533,12 +2892,30 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
} }


/* /*
Returns whether the current head has leading whitespace.
TODO: treat comments and templates as whitespace, allow fail on non-newline spaces.
*/
static int Tokenizer_has_leading_whitespace(Tokenizer* self)
{
int offset = 1;
Py_UNICODE current_character;
while (1) {
current_character = Tokenizer_READ_BACKWARDS(self, offset);
if (!current_character || current_character == '\n')
return 1;
else if (!Py_UNICODE_ISSPACE(current_character))
return 0;
offset++;
}
}

/*
Parse the wikicode string, using context for when to stop. If push is true, Parse the wikicode string, using context for when to stop. If push is true,
we will push a new context, otherwise we won't and context will be ignored. we will push a new context, otherwise we won't and context will be ignored.
*/ */
static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
{ {
int this_context;
uint64_t this_context;
Py_UNICODE this, next, next_next, last; Py_UNICODE this, next, next_next, last;
PyObject* temp; PyObject* temp;


@@ -2667,22 +3044,99 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (temp != Py_None) if (temp != Py_None)
return temp; return temp;
} }
else if (!last || last == '\n') {
if (this == '#' || this == '*' || this == ';' || this == ':') {
if (Tokenizer_handle_list(self))
else if ((!last || last == '\n') && (this == '#' || this == '*' || this == ';' || this == ':')) {
if (Tokenizer_handle_list(self))
return NULL;
}
else if ((!last || last == '\n') && (this == '-' && this == next &&
this == Tokenizer_READ(self, 2) &&
this == Tokenizer_READ(self, 3))) {
if (Tokenizer_handle_hr(self))
return NULL;
}
else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) {
if (Tokenizer_handle_dl_term(self))
return NULL;
// Kill potential table contexts
if (this == '\n')
self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS;
}

// Start of table parsing
else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) {
if (Tokenizer_CAN_RECURSE(self)) {
if (Tokenizer_parse_table(self))
return NULL;
}
else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next))
return NULL;
else
self->head++;
}
else if (this_context & LC_TABLE_OPEN) {
if (this == '|' && next == '|' && this_context & LC_TABLE_TD_LINE) {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "||", "td", LC_TABLE_TD_LINE))
return NULL; return NULL;
} }
else if (this == '-' && this == next &&
this == Tokenizer_READ(self, 2) &&
this == Tokenizer_READ(self, 3)) {
if (Tokenizer_handle_hr(self))
else if (this == '|' && next == '|' && this_context & LC_TABLE_TH_LINE) {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "||", "th", LC_TABLE_TH_LINE))
return NULL;
}
else if (this == '!' && next == '!' && this_context & LC_TABLE_TH_LINE) {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "!!", "th", LC_TABLE_TH_LINE))
return NULL;
}
else if (this == '|' && this_context & LC_TABLE_CELL_STYLE) {
return Tokenizer_handle_table_cell_end(self, 1);
}
// On newline, clear out cell line contexts
else if (this == '\n' && this_context & LC_TABLE_CELL_LINE_CONTEXTS) {
self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS;
if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (Tokenizer_has_leading_whitespace(self)) {
if (this == '|' && next == '}') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
if (this_context & LC_TABLE_ROW_OPEN)
return Tokenizer_handle_table_row_end(self);
else
return Tokenizer_handle_table_end(self);
}
else if (this == '|' && next == '-') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
if (this_context & LC_TABLE_ROW_OPEN)
return Tokenizer_handle_table_row_end(self);
else if (Tokenizer_handle_table_row(self))
return NULL;
}
else if (this == '|') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "|", "td", LC_TABLE_TD_LINE))
return NULL;
}
else if (this == '!') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "!", "th", LC_TABLE_TH_LINE))
return NULL;
}
else if (Tokenizer_emit_char(self, this))
return NULL; return NULL;
} }
else if (Tokenizer_emit_char(self, this)) else if (Tokenizer_emit_char(self, this))
return NULL; return NULL;
}
else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) {
if (Tokenizer_handle_dl_term(self))
// Raise BadRoute to table start
if (BAD_ROUTE)
return NULL; return NULL;
} }
else if (Tokenizer_emit_char(self, this)) else if (Tokenizer_emit_char(self, this))
@@ -2697,7 +3151,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{ {
PyObject *text, *temp, *tokens; PyObject *text, *temp, *tokens;
int context = 0, skip_style_tags = 0;
uint64_t context = 0;
int skip_style_tags = 0;


if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
Py_XDECREF(self->text); Py_XDECREF(self->text);
@@ -2725,7 +3180,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
self->skip_style_tags = skip_style_tags; self->skip_style_tags = skip_style_tags;
tokens = Tokenizer_parse(self, context, 1); tokens = Tokenizer_parse(self, context, 1);


if (!tokens && !PyErr_Occurred()) {
if ((!tokens && !PyErr_Occurred()) || self->topstack) {
if (!ParserError) { if (!ParserError) {
if (load_exceptions()) if (load_exceptions())
return NULL; return NULL;
@@ -2734,6 +3189,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
RESET_ROUTE(); RESET_ROUTE();
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
} }
else if (self->topstack)
PyErr_SetString(ParserError,
"C tokenizer exited with non-empty token stack");
else else
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly"); PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
return NULL; return NULL;


+ 65
- 54
mwparserfromhell/parser/tokenizer.h View File

@@ -29,6 +29,7 @@ SOFTWARE.
#include <math.h> #include <math.h>
#include <structmember.h> #include <structmember.h>
#include <bytesobject.h> #include <bytesobject.h>
#include <stdint.h>


#if PY_MAJOR_VERSION >= 3 #if PY_MAJOR_VERSION >= 3
#define IS_PY3K #define IS_PY3K
@@ -43,16 +44,17 @@ SOFTWARE.


static const char MARKERS[] = { static const char MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
'-', '\n', '\0'};
'-', '!', '\n', '\0'};


#define NUM_MARKERS 18
#define NUM_MARKERS 19
#define TEXTBUFFER_BLOCKSIZE 1024 #define TEXTBUFFER_BLOCKSIZE 1024
#define MAX_DEPTH 40 #define MAX_DEPTH 40
#define MAX_CYCLES 100000 #define MAX_CYCLES 100000
#define MAX_BRACES 255 #define MAX_BRACES 255
#define MAX_ENTITY_SIZE 8 #define MAX_ENTITY_SIZE 8


static int route_state = 0, route_context = 0;
static int route_state = 0;
static uint64_t route_context = 0;
#define BAD_ROUTE route_state #define BAD_ROUTE route_state
#define BAD_ROUTE_CONTEXT route_context #define BAD_ROUTE_CONTEXT route_context
#define FAIL_ROUTE(context) route_state = 1; route_context = context #define FAIL_ROUTE(context) route_state = 1; route_context = context
@@ -109,52 +111,61 @@ static PyObject* TagCloseClose;


/* Local contexts: */ /* Local contexts: */


#define LC_TEMPLATE 0x00000007
#define LC_TEMPLATE_NAME 0x00000001
#define LC_TEMPLATE_PARAM_KEY 0x00000002
#define LC_TEMPLATE_PARAM_VALUE 0x00000004

#define LC_ARGUMENT 0x00000018
#define LC_ARGUMENT_NAME 0x00000008
#define LC_ARGUMENT_DEFAULT 0x00000010

#define LC_WIKILINK 0x00000060
#define LC_WIKILINK_TITLE 0x00000020
#define LC_WIKILINK_TEXT 0x00000040

#define LC_EXT_LINK 0x00000180
#define LC_EXT_LINK_URI 0x00000080
#define LC_EXT_LINK_TITLE 0x00000100

#define LC_HEADING 0x00007E00
#define LC_HEADING_LEVEL_1 0x00000200
#define LC_HEADING_LEVEL_2 0x00000400
#define LC_HEADING_LEVEL_3 0x00000800
#define LC_HEADING_LEVEL_4 0x00001000
#define LC_HEADING_LEVEL_5 0x00002000
#define LC_HEADING_LEVEL_6 0x00004000

#define LC_TAG 0x00078000
#define LC_TAG_OPEN 0x00008000
#define LC_TAG_ATTR 0x00010000
#define LC_TAG_BODY 0x00020000
#define LC_TAG_CLOSE 0x00040000

#define LC_STYLE 0x00780000
#define LC_STYLE_ITALICS 0x00080000
#define LC_STYLE_BOLD 0x00100000
#define LC_STYLE_PASS_AGAIN 0x00200000
#define LC_STYLE_SECOND_PASS 0x00400000

#define LC_DLTERM 0x00800000

#define LC_SAFETY_CHECK 0x3F000000
#define LC_HAS_TEXT 0x01000000
#define LC_FAIL_ON_TEXT 0x02000000
#define LC_FAIL_NEXT 0x04000000
#define LC_FAIL_ON_LBRACE 0x08000000
#define LC_FAIL_ON_RBRACE 0x10000000
#define LC_FAIL_ON_EQUALS 0x20000000
#define LC_TEMPLATE 0x0000000000000007
#define LC_TEMPLATE_NAME 0x0000000000000001
#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002
#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004

#define LC_ARGUMENT 0x0000000000000018
#define LC_ARGUMENT_NAME 0x0000000000000008
#define LC_ARGUMENT_DEFAULT 0x0000000000000010

#define LC_WIKILINK 0x0000000000000060
#define LC_WIKILINK_TITLE 0x0000000000000020
#define LC_WIKILINK_TEXT 0x0000000000000040

#define LC_EXT_LINK 0x0000000000000180
#define LC_EXT_LINK_URI 0x0000000000000080
#define LC_EXT_LINK_TITLE 0x0000000000000100

#define LC_HEADING 0x0000000000007E00
#define LC_HEADING_LEVEL_1 0x0000000000000200
#define LC_HEADING_LEVEL_2 0x0000000000000400
#define LC_HEADING_LEVEL_3 0x0000000000000800
#define LC_HEADING_LEVEL_4 0x0000000000001000
#define LC_HEADING_LEVEL_5 0x0000000000002000
#define LC_HEADING_LEVEL_6 0x0000000000004000

#define LC_TAG 0x0000000000078000
#define LC_TAG_OPEN 0x0000000000008000
#define LC_TAG_ATTR 0x0000000000010000
#define LC_TAG_BODY 0x0000000000020000
#define LC_TAG_CLOSE 0x0000000000040000

#define LC_STYLE 0x0000000000780000
#define LC_STYLE_ITALICS 0x0000000000080000
#define LC_STYLE_BOLD 0x0000000000100000
#define LC_STYLE_PASS_AGAIN 0x0000000000200000
#define LC_STYLE_SECOND_PASS 0x0000000000400000

#define LC_DLTERM 0x0000000000800000

#define LC_SAFETY_CHECK 0x000000003F000000
#define LC_HAS_TEXT 0x0000000001000000
#define LC_FAIL_ON_TEXT 0x0000000002000000
#define LC_FAIL_NEXT 0x0000000004000000
#define LC_FAIL_ON_LBRACE 0x0000000008000000
#define LC_FAIL_ON_RBRACE 0x0000000010000000
#define LC_FAIL_ON_EQUALS 0x0000000020000000

#define LC_TABLE 0x0000000FC0000000
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000
#define LC_TABLE_OPEN 0x0000000040000000
#define LC_TABLE_CELL_OPEN 0x0000000080000000
#define LC_TABLE_CELL_STYLE 0x0000000100000000
#define LC_TABLE_ROW_OPEN 0x0000000200000000
#define LC_TABLE_TD_LINE 0x0000000400000000
#define LC_TABLE_TH_LINE 0x0000000800000000


/* Global contexts: */ /* Global contexts: */


@@ -162,9 +173,9 @@ static PyObject* TagCloseClose;


/* Aggregate contexts: */ /* Aggregate contexts: */


#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE)
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) #define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)


@@ -191,7 +202,7 @@ struct Textbuffer {


struct Stack { struct Stack {
PyObject* stack; PyObject* stack;
int context;
uint64_t context;
struct Textbuffer* textbuffer; struct Textbuffer* textbuffer;
struct Stack* next; struct Stack* next;
}; };
@@ -202,7 +213,7 @@ typedef struct {
} HeadingData; } HeadingData;


typedef struct { typedef struct {
int context;
uint64_t context;
struct Textbuffer* pad_first; struct Textbuffer* pad_first;
struct Textbuffer* pad_before_eq; struct Textbuffer* pad_before_eq;
struct Textbuffer* pad_after_eq; struct Textbuffer* pad_after_eq;
@@ -267,7 +278,7 @@ static int Tokenizer_parse_entity(Tokenizer*);
static int Tokenizer_parse_comment(Tokenizer*); static int Tokenizer_parse_comment(Tokenizer*);
static int Tokenizer_handle_dl_term(Tokenizer*); static int Tokenizer_handle_dl_term(Tokenizer*);
static int Tokenizer_parse_tag(Tokenizer*); static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int, int);
static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);


static int load_exceptions(void); static int load_exceptions(void);


+ 218
- 7
mwparserfromhell/parser/tokenizer.py View File

@@ -63,7 +63,7 @@ class Tokenizer(object):
START = object() START = object()
END = object() END = object()
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
":", "/", "-", "\n", START, END]
":", "/", "-", "!", "\n", START, END]
MAX_DEPTH = 40 MAX_DEPTH = 40
MAX_CYCLES = 100000 MAX_CYCLES = 100000
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
@@ -991,12 +991,166 @@ class Tokenizer(object):
else: else:
self._emit_text("\n") self._emit_text("\n")


def _emit_table_tag(self, open_open_markup, tag, style, padding,
close_open_markup, contents, open_close_markup):
"""Emit a table tag."""
self._emit(tokens.TagOpenOpen(wiki_markup=open_open_markup))
self._emit_text(tag)
if style:
self._emit_all(style)
if close_open_markup:
self._emit(tokens.TagCloseOpen(wiki_markup=close_open_markup,
padding=padding))
else:
self._emit(tokens.TagCloseOpen(padding=padding))
if contents:
self._emit_all(contents)
self._emit(tokens.TagOpenClose(wiki_markup=open_close_markup))
self._emit_text(tag)
self._emit(tokens.TagCloseClose())

def _handle_table_style(self, end_token):
"""Handle style attributes for a table until ``end_token``."""
data = _TagOpenData()
data.context = _TagOpenData.CX_ATTR_READY
while True:
this = self._read()
can_exit = (not data.context & data.CX_QUOTED or
data.context & data.CX_NOTE_SPACE)
if this == end_token and can_exit:
if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
self._push_tag_buffer(data)
if this.isspace():
data.padding_buffer["first"] += this
return data.padding_buffer["first"]
elif this is self.END or this == end_token:
if self._context & contexts.TAG_ATTR:
if data.context & data.CX_QUOTED:
# Unclosed attribute quote: reset, don't die
data.context = data.CX_ATTR_VALUE
self._pop()
self._head = data.reset
continue
self._pop()
self._fail_route()
else:
self._handle_tag_data(data, this)
self._head += 1

def _parse_table(self):
"""Parse a wikicode table by starting with the first line."""
reset = self._head + 1
self._head += 2
self._push(contexts.TABLE_OPEN)
try:
padding = self._handle_table_style("\n")
except BadRoute:
self._head = reset
self._emit_text("{|")
return
style = self._pop()

self._head += 1
try:
table = self._parse(contexts.TABLE_OPEN)
except BadRoute:
self._head = reset
self._emit_text("{|")
return

self._emit_table_tag("{|", "table", style, padding, None, table, "|}")
# Offset displacement done by _parse():
self._head -= 1

def _handle_table_row(self):
"""Parse as style until end of the line, then continue."""
self._head += 2
if not self._can_recurse():
self._emit_text("|-")
self._head -= 1
return

self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
try:
padding = self._handle_table_style("\n")
except BadRoute:
self._pop()
raise
style = self._pop()

# Don't parse the style separator:
self._head += 1
row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)

self._emit_table_tag("|-", "tr", style, padding, None, row, "")
# Offset displacement done by parse():
self._head -= 1

def _handle_table_cell(self, markup, tag, line_context):
"""Parse as normal syntax unless we hit a style marker, then parse
style as HTML attributes and the remainder as normal syntax."""
old_context = self._context
padding, style = "", None
self._head += len(markup)
reset = self._head
if not self._can_recurse():
self._emit_text(markup)
self._head -= 1
return

cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
line_context | contexts.TABLE_CELL_STYLE)
cell_context = self._context
self._context = old_context
reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
if reset_for_style:
self._head = reset
self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
line_context)
padding = self._handle_table_style("|")
style = self._pop()
# Don't parse the style separator:
self._head += 1
cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
line_context)
cell_context = self._context
self._context = old_context

close_open_markup = "|" if reset_for_style else None
self._emit_table_tag(markup, tag, style, padding, close_open_markup,
cell, "")
# Keep header/cell line contexts:
self._context |= cell_context & (contexts.TABLE_TH_LINE |
contexts.TABLE_TD_LINE)
# Offset displacement done by parse():
self._head -= 1

def _handle_table_cell_end(self, reset_for_style=False):
"""Returns the current context, with the TABLE_CELL_STYLE flag set if
it is necessary to reset and parse style attributes."""
if reset_for_style:
self._context |= contexts.TABLE_CELL_STYLE
else:
self._context &= ~contexts.TABLE_CELL_STYLE
return self._pop(keep_context=True)

def _handle_table_row_end(self):
"""Return the stack in order to handle the table row end."""
return self._pop()

def _handle_table_end(self):
"""Return the stack in order to handle the table end."""
self._head += 2
return self._pop()

def _handle_end(self): def _handle_end(self):
"""Handle the end of the stream of wikitext.""" """Handle the end of the stream of wikitext."""
if self._context & contexts.FAIL: if self._context & contexts.FAIL:
if self._context & contexts.TAG_BODY: if self._context & contexts.TAG_BODY:
if is_single(self._stack[1].text): if is_single(self._stack[1].text):
return self._handle_single_tag_end() return self._handle_single_tag_end()
if self._context & contexts.TABLE_CELL_OPEN:
self._pop()
if self._context & contexts.DOUBLE: if self._context & contexts.DOUBLE:
self._pop() self._pop()
self._fail_route() self._fail_route()
@@ -1144,15 +1298,68 @@ class Tokenizer(object):
result = self._parse_style() result = self._parse_style()
if result is not None: if result is not None:
return result return result
elif self._read(-1) in ("\n", self.START):
if this in ("#", "*", ";", ":"):
elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"):
self._handle_list() self._handle_list()
elif this == next == self._read(2) == self._read(3) == "-":
elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-":
self._handle_hr() self._handle_hr()
else:
self._emit_text(this)
elif this in ("\n", ":") and self._context & contexts.DL_TERM: elif this in ("\n", ":") and self._context & contexts.DL_TERM:
self._handle_dl_term() self._handle_dl_term()
if this == "\n":
# Kill potential table contexts
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
# Start of table parsing
elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
if self._can_recurse():
self._parse_table()
else:
self._emit_text("{|")
elif self._context & contexts.TABLE_OPEN:
if this == next == "|" and self._context & contexts.TABLE_TD_LINE:
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE)
elif this == next == "|" and self._context & contexts.TABLE_TH_LINE:
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE)
elif this == next == "!" and self._context & contexts.TABLE_TH_LINE:
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE)
elif this == "|" and self._context & contexts.TABLE_CELL_STYLE:
return self._handle_table_cell_end(reset_for_style=True)
# on newline, clear out cell line contexts
elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS:
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
self._emit_text(this)
elif (self._read(-1) in ("\n", self.START) or
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
if this == "|" and next == "}":
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
if self._context & contexts.TABLE_ROW_OPEN:
return self._handle_table_row_end()
return self._handle_table_end()
elif this == "|" and next == "-":
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
if self._context & contexts.TABLE_ROW_OPEN:
return self._handle_table_row_end()
self._handle_table_row()
elif this == "|":
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
self._handle_table_cell("|", "td", contexts.TABLE_TD_LINE)
elif this == "!":
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
self._handle_table_cell("!", "th", contexts.TABLE_TH_LINE)
else:
self._emit_text(this)
else:
self._emit_text(this)

else: else:
self._emit_text(this) self._emit_text(this)
self._head += 1 self._head += 1
@@ -1164,6 +1371,10 @@ class Tokenizer(object):
self._text = [segment for segment in split if segment] self._text = [segment for segment in split if segment]
self._head = self._global = self._depth = self._cycles = 0 self._head = self._global = self._depth = self._cycles = 0
try: try:
return self._parse(context)
tokens = self._parse(context)
except BadRoute: # pragma: no cover (untestable/exceptional case) except BadRoute: # pragma: no cover (untestable/exceptional case)
raise ParserError("Python tokenizer exited with BadRoute") raise ParserError("Python tokenizer exited with BadRoute")
if self._stacks: # pragma: no cover (untestable/exceptional case)
err = "Python tokenizer exited with non-empty token stack"
raise ParserError(err)
return tokens

+ 8
- 3
tests/_test_tokenizer.py View File

@@ -25,8 +25,9 @@ import codecs
from os import listdir, path from os import listdir, path
import sys import sys


from mwparserfromhell.compat import py3k
from mwparserfromhell.compat import py3k, str
from mwparserfromhell.parser import tokens from mwparserfromhell.parser import tokens
from mwparserfromhell.parser.builder import Builder


class _TestParseError(Exception): class _TestParseError(Exception):
"""Raised internally when a test could not be parsed.""" """Raised internally when a test could not be parsed."""
@@ -50,8 +51,12 @@ class TokenizerTestCase(object):
*label* for the method's docstring. *label* for the method's docstring.
""" """
def inner(self): def inner(self):
expected = data["output"]
actual = self.tokenizer().tokenize(data["input"])
if hasattr(self, "roundtrip"):
expected = data["input"]
actual = str(Builder().build(data["output"][:]))
else:
expected = data["output"]
actual = self.tokenizer().tokenize(data["input"])
self.assertEqual(expected, actual) self.assertEqual(expected, actual)
if not py3k: if not py3k:
inner.__name__ = funcname.encode("utf8") inner.__name__ = funcname.encode("utf8")


+ 41
- 0
tests/test_roundtripping.py View File

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2014 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

try:
import unittest2 as unittest
except ImportError:
import unittest

from ._test_tokenizer import TokenizerTestCase

class TestRoundtripping(TokenizerTestCase, unittest.TestCase):
"""Test cases for roundtripping tokens back to wikitext."""

@classmethod
def setUpClass(cls):
cls.roundtrip = True


if __name__ == "__main__":
unittest.main(verbosity=2)

+ 32
- 0
tests/test_tag.py View File

@@ -226,6 +226,38 @@ class TestTag(TreeEqualityTestCase):
self.assertWikicodeEqual(parsed, node.closing_tag) self.assertWikicodeEqual(parsed, node.closing_tag)
self.assertEqual("<ref>foobar</ref {{ignore me}}>", node) self.assertEqual("<ref>foobar</ref {{ignore me}}>", node)


def test_wiki_style_separator(self):
"""test getter/setter for wiki_style_separator attribute"""
node = Tag(wraptext("table"), wraptext("\n"))
self.assertIs(None, node.wiki_style_separator)
node.wiki_style_separator = "|"
self.assertEqual("|", node.wiki_style_separator)
node.wiki_markup = "{"
self.assertEqual("{|\n{", node)
node2 = Tag(wraptext("table"), wraptext("\n"), wiki_style_separator="|")
self.assertEqual("|", node.wiki_style_separator)

def test_closing_wiki_markup(self):
"""test getter/setter for closing_wiki_markup attribute"""
node = Tag(wraptext("table"), wraptext("\n"))
self.assertIs(None, node.closing_wiki_markup)
node.wiki_markup = "{|"
self.assertEqual("{|", node.closing_wiki_markup)
node.closing_wiki_markup = "|}"
self.assertEqual("|}", node.closing_wiki_markup)
self.assertEqual("{|\n|}", node)
node.wiki_markup = "!!"
self.assertEqual("|}", node.closing_wiki_markup)
self.assertEqual("!!\n|}", node)
node.wiki_markup = False
self.assertFalse(node.closing_wiki_markup)
self.assertEqual("<table>\n</table>", node)
node2 = Tag(wraptext("table"), wraptext("\n"),
attrs=[agen("id", "foo")], wiki_markup="{|",
closing_wiki_markup="|}")
self.assertEqual("|}", node2.closing_wiki_markup)
self.assertEqual('{| id="foo"\n|}', node2)

def test_has(self): def test_has(self):
"""test Tag.has()""" """test Tag.has()"""
node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")]) node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")])


+ 410
- 0
tests/tokenizer/tables.mwtest View File

@@ -0,0 +1,410 @@
name: empty_table
label: parsing an empty table
input: "{|\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: inline_table
label: tables with a close on the same line are not valid
input: "{||}"
output: [Text(text="{||}")]

---

name: no_table_close_simple
label: no table close on inline table
input: "{| "
output: [Text(text="{| ")]

---

name: no_table_close_newline
label: no table close with a newline
input: "{| \n "
output: [Text(text="{| \n ")]

---

name: no_table_close_inside_cell
label: no table close while inside of a cell
input: "{| \n| "
output: [Text(text="{| \n| ")]

---

name: no_table_close_inside_cell_after_newline
label: no table close while inside of a cell after a newline
input: "{| \n| \n "
output: [Text(text="{| \n| \n ")]

---

name: no_table_close_inside_cell_with_attributes
label: no table close while inside of a cell with attributes
input: "{| \n| red | test"
output: [Text(text="{| \n| red | test")]

---

name: no_table_close_inside_row
label: no table close while inside of a row
input: "{| \n|- "
output: [Text(text="{| \n|- ")]

---

name: no_table_close_inside_row_after_newline
label: no table close while inside of a row after a newline
input: "{| \n|- \n "
output: [Text(text="{| \n|- \n ")]

---

name: no_table_close_row_and_cell
label: no table close while inside a cell inside a row
input: "{| \n|- \n|"
output: [Text(text="{| \n|- \n|")]

---

name: no_table_close_attributes
label: don't parse attributes as attributes if the table doesn't exist
input: "{| border="1""
output: [Text(text="{| border=\"1\"")]

---

name: no_table_close_unclosed_attributes
label: don't parse unclosed attributes if the table doesn't exist
input: "{| border="
output: [Text(text="{| border=")]

---

name: no_table_close_row_attributes
label: don't parse row attributes as attributes if the table doesn't exist
input: "{| |- border="1""
output: [Text(text="{| |- border=\"1\"")]

---

name: no_table_close_cell
label: don't parse cells if the table doesn't close
input: "{| | border="1"| test || red | foo"
output: [Text(text="{| | border=\"1\"| test || red | foo")]

---

name: crazy_no_table_close
label: lots of opened wiki syntax without closes
input: "{{{ {{ {| <ref"
output: [Text(text="{{{ {{ {| <ref")]

---

name: leading_whitespace_table
label: handle leading whitespace for a table
input: "foo \n \t {|\n|}"
output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: whitespace_after_table
label: handle whitespace after a table close
input: "{|\n|}\n \t "
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text="\n \t ")]

---

name: different_whitespace_after_table
label: handle spaces after a table close
input: "{|\n|} \n "
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n ")]

---

name: characters_after_table
label: handle characters after a table close
input: "{|\n|} tsta"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" tsta")]

---

name: characters_after_inline_table
label: handle characters after an inline table close
input: "{| |} tsta"
output: [Text(text="{| |} tsta")]

---

name: leading_characters_table
label: don't parse as a table when leading characters are not newline or whitespace
input: "foo \n foo \t {|\n|}"
output: [Text(text="foo \n foo \t {|\n|}")]

---

name: table_row_simple
label: simple table row
input: "{|\n |- \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_row_multiple
label: simple table row
input: "{|\n |- \n|- \n |-\n |}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_simple
label: simple table cell
input: "{|\n | foo \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_inline
label: multiple inline table cells
input: "{|\n | foo || bar || test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_multiple
label: multiple table cells (non-inline)
input: "{|\n| foo \n| bar \n| test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_header_simple
label: simple header cell
input: "{|\n ! foo \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_header_inline
label: multiple inline header cells
input: "{|\n ! foo || bar !! test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_header_multiple
label: multiple table header cells (non-inline)
input: "{|\n! foo \n! bar \n! test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: nested_cells_and_rows
label: combination of cells and rows in a table
input: "{|\n|- \n| foo \n|- \n| bar\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_fake_close
label: looks like a table close but is not
input: "{|\n | |} \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text="} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_more_fake_close
label: looks like a table close but is not
input: "{|\n || |} \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" |} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_extra_close
label: process second close as text
input: "{| \n |} \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n|}")]

---

name: nowiki_inside_table
label: nowiki handles pipe characters in tables
input: "{|\n | foo <nowiki>| |- {| |} || ! !!</nowiki> bar \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_text_outside_cell
label: parse text inside table but outside of a cell
input: "{|\n bar \n | foo \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: no_table_cell_with_leading_characters
label: fail to create a table cell when there are leading non-whitespace characters
input: "{|\n bar | foo \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: no_table_row_with_leading_characters
label: fail to create a table row when there are leading non-whitespace characters
input: "{|\n bar |- foo \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: template_inside_table_cell
label: template within table cell
input: "{|\n |{{foo\n|bar=baz}} \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_attributes
label: parse table cell style attributes
input: "{| \n | name="foo bar"| test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_empty_attributes
label: parse table cell with style markers but no attributes
input: "{| \n | | test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_with_dash
label: parse a situation in which a cell line looks like a row line
input: "{|\n ||- \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_attributes_quote_with_pipe
label: pipe inside an attribute quote should still be used as a style separator
input: "{| \n | name="foo|bar"| test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_attributes_name_with_pipe
label: pipe inside an attribute name should still be used as a style separator
input: "{| \n | name|="foo bar" | test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_attributes_pipe_after_equals
label: pipe inside an attribute should still be used as a style separator after an equals
input: "{| \n | name=|"foo|bar"| test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseOpen(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_attributes_templates
label: pipe inside attributes shouldn't be style separator
input: "{| \n | {{comment|template=baz}} | test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: header_cell_attributes
label: parse header cell style attributes
input: "{| \n ! name="foo bar"| test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: inline_cell_attributes
label: parse cell style attributes of inline cells
input: "{| \n ! name="foo bar" | test ||color="red"| markup!!foo | time \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_row_attributes
label: parse table row style attributes
input: "{| \n |- name="foo bar"\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_row_attributes_crazy_whitespace
label: parse table row style attributes with different whitespace
input: "{| \t \n |- \t name="foo bar" \t \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=" \t \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_attributes
label: parse table style attributes
input: "{| name="foo bar"\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: inline_table_attributes
label: handle attributes in inline tables
input: "{| foo="tee bar" |}"
output: [Text(text='{| foo="tee bar" |}')]

---

name: table_incorrect_attributes
label: parse incorrect table style attributes
input: "{| name="foo\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: templates_in_table_attribute
label: templates in the attributes of a table, after the start
input: "{| {{class}}="{{wikitable}}"\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="class"), TemplateClose(), TagAttrEquals(), TagAttrQuote(char="\""), TemplateOpen(), Text(text="wikitable"), TemplateClose(), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: templates_in_table_attribute_2
label: templates in the attributes of a table, after the start
input: "{|{{foo}} \n | name="foo bar" | test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: inappropriate_marker_at_line_start
label: an inappropriate marker (a right bracket) at the start of a line in the table
input: "{|\n}"
output: [Text(text="{|\n}")]

---

name: fake_close_near_start
label: a fake closing token at the end of the first line in the table
input: "{| class="wikitable" style="text-align: center; width=100%;|}\n|\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"text-align:"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="center;"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="width"), TagAttrEquals(), Text(text="100%;|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: fake_close_near_start_2
label: a fake closing token at the end of the first line in the table
input: "{| class="wikitable|}"\n|\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: junk_after_table_start
label: ignore more junk on the first line of the table
input: "{| class="wikitable" | foobar\n|\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="foobar"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: junk_after_table_row
label: ignore junk on the first line of a table row
input: "{|\n|- foo="bar" | baz\n|blerp\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="bar"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="baz"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="blerp\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---


name: dt_dd_mix4
label: another example of correct dt/dd usage, with a trigger for a specific parse route
input: ";foo]:bar"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo]"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")]

---

name: ul_ol_dt_dd_mix
label: an assortment of uls, ols, dds, and dts
input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo"

