Browse Source

Merge branch 'feature/tables' into develop (closes #81)

tags/v0.4
Ben Kurtovic 9 years ago
parent
commit
45ed8c445c
14 changed files with 1357 additions and 108 deletions
  1. +1
    -0
      CHANGELOG
  2. +1
    -0
      docs/changelog.rst
  3. +1
    -1
      mwparserfromhell/definitions.py
  4. +54
    -7
      mwparserfromhell/nodes/tag.py
  5. +6
    -1
      mwparserfromhell/parser/builder.py
  6. +22
    -2
      mwparserfromhell/parser/contexts.py
  7. +491
    -33
      mwparserfromhell/parser/tokenizer.c
  8. +65
    -54
      mwparserfromhell/parser/tokenizer.h
  9. +218
    -7
      mwparserfromhell/parser/tokenizer.py
  10. +8
    -3
      tests/_test_tokenizer.py
  11. +41
    -0
      tests/test_roundtripping.py
  12. +32
    -0
      tests/test_tag.py
  13. +410
    -0
      tests/tokenizer/tables.mwtest
  14. +7
    -0
      tests/tokenizer/tags_wikimarkup.mwtest

+ 1
- 0
CHANGELOG View File

@@ -2,6 +2,7 @@ v0.4 (unreleased):

- The parser is now distributed with Windows binaries, fixing an issue that
prevented Windows users from using the C tokenizer.
- Added support for parsing wikicode tables (patches by David Winegar).
- Added a script to test for memory leaks in scripts/memtest.py.
- Added a script to do releases in scripts/release.sh.
- skip_style_tags can now be passed to mwparserfromhell.parse() (previously,


+ 1
- 0
docs/changelog.rst View File

@@ -9,6 +9,7 @@ Unreleased

- The parser is now distributed with Windows binaries, fixing an issue that
prevented Windows users from using the C tokenizer.
- Added support for parsing wikicode tables (patches by David Winegar).
- Added a script to test for memory leaks in :file:`scripts/memtest.py`.
- Added a script to do releases in :file:`scripts/release.sh`.
- *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse()


+ 1
- 1
mwparserfromhell/definitions.py View File

@@ -52,7 +52,7 @@ INVISIBLE_TAGS = [

# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]

MARKUP_TO_HTML = {
"#": "li",


+ 54
- 7
mwparserfromhell/nodes/tag.py View File

@@ -35,7 +35,8 @@ class Tag(Node):

def __init__(self, tag, contents=None, attrs=None, wiki_markup=None,
self_closing=False, invalid=False, implicit=False, padding="",
closing_tag=None):
closing_tag=None, wiki_style_separator=None,
closing_wiki_markup=None):
super(Tag, self).__init__()
self._tag = tag
if contents is None and not self_closing:
@@ -52,13 +53,28 @@ class Tag(Node):
self._closing_tag = closing_tag
else:
self._closing_tag = tag
self._wiki_style_separator = wiki_style_separator
if closing_wiki_markup is not None:
self._closing_wiki_markup = closing_wiki_markup
elif wiki_markup and not self_closing:
self._closing_wiki_markup = wiki_markup
else:
self._closing_wiki_markup = None

def __unicode__(self):
if self.wiki_markup:
if self.attributes:
attrs = "".join([str(attr) for attr in self.attributes])
else:
attrs = ""
padding = self.padding or ""
separator = self.wiki_style_separator or ""
close = self.closing_wiki_markup or ""
if self.self_closing:
return self.wiki_markup
return self.wiki_markup + attrs + padding + separator
else:
return self.wiki_markup + str(self.contents) + self.wiki_markup
return self.wiki_markup + attrs + padding + separator + \
str(self.contents) + close

result = ("</" if self.invalid else "<") + str(self.tag)
if self.attributes:
@@ -73,10 +89,10 @@ class Tag(Node):
def __children__(self):
if not self.wiki_markup:
yield self.tag
for attr in self.attributes:
yield attr.name
if attr.value is not None:
yield attr.value
for attr in self.attributes:
yield attr.name
if attr.value is not None:
yield attr.value
if self.contents:
yield self.contents
if not self.self_closing and not self.wiki_markup and self.closing_tag:
@@ -174,6 +190,27 @@ class Tag(Node):
"""
return self._closing_tag

@property
def wiki_style_separator(self):
"""The separator between the padding and content in a wiki markup tag.

Essentially the wiki equivalent of the TagCloseOpen.
"""
return self._wiki_style_separator

@property
def closing_wiki_markup(self):
"""The wikified version of the closing tag to show instead of HTML.

If set to a value, this will be displayed instead of the close tag
brackets. If :attr:`self_closing` is ``True`` then this is not
displayed. If :attr:`wiki_markup` is set and this has not been set, this
is set to the value of :attr:`wiki_markup`. If this has been set and
:attr:`wiki_markup` is set to a ``False`` value, this is set to
``None``.
"""
return self._closing_wiki_markup

@tag.setter
def tag(self, value):
self._tag = self._closing_tag = parse_anything(value)
@@ -185,6 +222,8 @@ class Tag(Node):
@wiki_markup.setter
def wiki_markup(self, value):
self._wiki_markup = str(value) if value else None
if not value or not self.closing_wiki_markup:
self._closing_wiki_markup = self._wiki_markup

@self_closing.setter
def self_closing(self, value):
@@ -212,6 +251,14 @@ class Tag(Node):
def closing_tag(self, value):
self._closing_tag = parse_anything(value)

@wiki_style_separator.setter
def wiki_style_separator(self, value):
self._wiki_style_separator = str(value) if value else None

@closing_wiki_markup.setter
def closing_wiki_markup(self, value):
self._closing_wiki_markup = str(value) if value else None

def has(self, name):
"""Return whether any attribute in the tag has the given *name*.



+ 6
- 1
mwparserfromhell/parser/builder.py View File

@@ -249,20 +249,24 @@ class Builder(object):
close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
implicit, attrs, contents, closing_tag = False, [], None, None
wiki_markup, invalid = token.wiki_markup, token.invalid or False
wiki_style_separator, closing_wiki_markup = None, wiki_markup
self._push()
while self._tokens:
token = self._tokens.pop()
if isinstance(token, tokens.TagAttrStart):
attrs.append(self._handle_attribute(token))
elif isinstance(token, tokens.TagCloseOpen):
wiki_style_separator = token.wiki_markup
padding = token.padding or ""
tag = self._pop()
self._push()
elif isinstance(token, tokens.TagOpenClose):
closing_wiki_markup = token.wiki_markup
contents = self._pop()
self._push()
elif isinstance(token, close_tokens):
if isinstance(token, tokens.TagCloseSelfclose):
closing_wiki_markup = token.wiki_markup
tag = self._pop()
self_closing = True
padding = token.padding or ""
@@ -271,7 +275,8 @@ class Builder(object):
self_closing = False
closing_tag = self._pop()
return Tag(tag, contents, attrs, wiki_markup, self_closing,
invalid, implicit, padding, closing_tag)
invalid, implicit, padding, closing_tag,
wiki_style_separator, closing_wiki_markup)
else:
self._write(self._handle_token(token))
raise ParserError("_handle_tag() missed a close token")


+ 22
- 2
mwparserfromhell/parser/contexts.py View File

@@ -90,6 +90,15 @@ Local (stack-specific) contexts:
* :const:`FAIL_ON_RBRACE`
* :const:`FAIL_ON_EQUALS`

* :const:`TABLE`

* :const:`TABLE_OPEN`
* :const:`TABLE_CELL_OPEN`
* :const:`TABLE_CELL_STYLE`
* :const:`TABLE_TD_LINE`
* :const:`TABLE_TH_LINE`
* :const:`TABLE_CELL_LINE_CONTEXTS`

Global contexts:

* :const:`GL_HEADING`
@@ -155,15 +164,26 @@ FAIL_ON_EQUALS = 1 << 29
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS)

TABLE_OPEN = 1 << 30
TABLE_CELL_OPEN = 1 << 31
TABLE_CELL_STYLE = 1 << 32
TABLE_ROW_OPEN = 1 << 33
TABLE_TD_LINE = 1 << 34
TABLE_TH_LINE = 1 << 35
TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
TABLE_TD_LINE + TABLE_TH_LINE)

# Global contexts:

GL_HEADING = 1 << 0

# Aggregate contexts:

FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE
FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG +
STYLE + TABLE)
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN
NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK

+ 491
- 33
mwparserfromhell/parser/tokenizer.c View File

@@ -241,7 +241,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
/*
Add a new token stack, context, and textbuffer to the list.
*/
static int Tokenizer_push(Tokenizer* self, int context)
static int Tokenizer_push(Tokenizer* self, uint64_t context)
{
Stack* top = malloc(sizeof(Stack));

@@ -333,7 +333,7 @@ static PyObject* Tokenizer_pop(Tokenizer* self)
static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
{
PyObject* stack;
int context;
uint64_t context;

if (Tokenizer_push_textbuffer(self))
return NULL;
@@ -351,7 +351,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
*/
static void* Tokenizer_fail_route(Tokenizer* self)
{
int context = self->topstack->context;
uint64_t context = self->topstack->context;
PyObject* stack = Tokenizer_pop(self);

Py_XDECREF(stack);
@@ -676,11 +676,8 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
RESET_ROUTE();
for (i = 0; i < braces; i++) text[i] = '{';
text[braces] = '\0';
if (Tokenizer_emit_text_then_stack(self, text)) {
Py_XDECREF(text);
if (Tokenizer_emit_text_then_stack(self, text))
return -1;
}
Py_XDECREF(text);
return 0;
}
else
@@ -1034,7 +1031,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
{
// Built from Tokenizer_parse()'s end sentinels:
Py_UNICODE after = Tokenizer_READ(self, 2);
int ctx = self->topstack->context;
uint64_t ctx = self->topstack->context;

return (!this || this == '\n' || this == '[' || this == ']' ||
this == '<' || this == '>' || (this == '\'' && next == '\'') ||
@@ -1629,9 +1626,9 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
static int
Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
{
int ctx = data->context;
int end_of_value = (ctx & TAG_ATTR_VALUE &&
!(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));
uint64_t ctx = data->context;
uint64_t end_of_value = (ctx & TAG_ATTR_VALUE &&
!(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));

if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) {
if (Tokenizer_push_tag_buffer(self, data))
@@ -2153,7 +2150,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag,
static int Tokenizer_parse_italics(Tokenizer* self)
{
Py_ssize_t reset = self->head;
int context;
uint64_t context;
PyObject *stack;

stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
@@ -2273,7 +2270,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
*/
static PyObject* Tokenizer_parse_style(Tokenizer* self)
{
int context = self->topstack->context, ticks = 2, i;
uint64_t context = self->topstack->context, ticks = 2, i;

self->head += 2;
while (Tokenizer_READ(self, 0) == '\'') {
@@ -2426,9 +2423,363 @@ static int Tokenizer_handle_dl_term(Tokenizer* self)
}

/*
    Emit the token run for one wiki-markup table element: a TagOpenOpen
    carrying *open_open_markup* (e.g. "{|"), the HTML *tag* name as text,
    the attribute *style* tokens, a TagCloseOpen with *padding* (and
    *close_open_markup* when a cell had a style separator), the *contents*
    tokens, and finally the TagOpenClose/TagCloseClose pair.

    Steals references to *style*, *padding*, and *contents* (each is
    DECREF'd here on success and on the error paths); the char* arguments
    are borrowed C strings. Returns 0 on success, -1 on error.
*/
static int
Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
                         const char* tag, PyObject* style, PyObject* padding,
                         const char* close_open_markup, PyObject* contents,
                         const char* open_close_markup)
{
    PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs,
             *close_open_markup_unicode, *open_close_kwargs,
             *open_close_markup_unicode;

    open_open_kwargs = PyDict_New();
    if (!open_open_kwargs)
        goto fail_decref_all;
    open_open_markup_unicode = PyUnicode_FromString(open_open_markup);
    if (!open_open_markup_unicode) {
        Py_DECREF(open_open_kwargs);
        goto fail_decref_all;
    }
    /* PyDict_SetItemString adds its own reference, so ours is released. */
    PyDict_SetItemString(open_open_kwargs, "wiki_markup",
                         open_open_markup_unicode);
    Py_DECREF(open_open_markup_unicode);
    if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs))
        goto fail_decref_all;
    if (Tokenizer_emit_text(self, tag))
        goto fail_decref_all;

    if (style) {
        if (Tokenizer_emit_all(self, style))
            goto fail_decref_all;
        Py_DECREF(style);
    }

    close_open_kwargs = PyDict_New();
    if (!close_open_kwargs)
        goto fail_decref_padding_contents;
    /* close_open_markup is only supplied for cells with a style separator. */
    if (close_open_markup && strlen(close_open_markup) != 0) {
        close_open_markup_unicode = PyUnicode_FromString(close_open_markup);
        if (!close_open_markup_unicode) {
            Py_DECREF(close_open_kwargs);
            goto fail_decref_padding_contents;
        }
        PyDict_SetItemString(close_open_kwargs, "wiki_markup",
                             close_open_markup_unicode);
        Py_DECREF(close_open_markup_unicode);
    }
    PyDict_SetItemString(close_open_kwargs, "padding", padding);
    Py_DECREF(padding);
    if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs))
        goto fail_decref_contents;

    if (contents) {
        if (Tokenizer_emit_all(self, contents))
            goto fail_decref_contents;
        Py_DECREF(contents);
    }

    open_close_kwargs = PyDict_New();
    if (!open_close_kwargs)
        return -1;
    open_close_markup_unicode = PyUnicode_FromString(open_close_markup);
    if (!open_close_markup_unicode) {
        Py_DECREF(open_close_kwargs);
        return -1;
    }
    PyDict_SetItemString(open_close_kwargs, "wiki_markup",
                         open_close_markup_unicode);
    Py_DECREF(open_close_markup_unicode);
    if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs))
        return -1;
    if (Tokenizer_emit_text(self, tag))
        return -1;
    if (Tokenizer_emit(self, TagCloseClose))
        return -1;
    return 0;

    /* Error ladder: each label releases the stolen arguments not yet
       consumed above. NOTE(review): fail_decref_contents uses Py_DECREF
       (not Py_XDECREF), so these paths assume *contents* is non-NULL —
       confirm at the call sites. */
fail_decref_all:
    Py_XDECREF(style);
fail_decref_padding_contents:
    Py_DECREF(padding);
fail_decref_contents:
    Py_DECREF(contents);
    return -1;
}

/*
    Handle style attributes for a table until an ending token.

    Consumes attribute text from the current head until *end_token* is
    reached outside of a quoted value ('\n' for table/row style lines,
    '|' for a cell's style separator). Parsed attributes are pushed onto
    the current token stack via Tokenizer_push_tag_buffer(). Returns a
    new reference to the leading-padding string on success, or NULL on
    error or a failed route.
*/
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
{
    TagData *data = TagData_new();
    PyObject *padding, *trash;
    Py_UNICODE this;
    int can_exit;

    if (!data)
        return NULL;
    data->context = TAG_ATTR_READY;

    while (1) {
        this = Tokenizer_READ(self, 0);
        /* We may only stop at end_token while not inside a quoted value. */
        can_exit = (!(data->context & TAG_QUOTED) || data->context & TAG_NOTE_SPACE);
        if (this == end_token && can_exit) {
            /* Flush a partially-built attribute before exiting. */
            if (data->context & (TAG_ATTR_NAME | TAG_ATTR_VALUE)) {
                if (Tokenizer_push_tag_buffer(self, data)) {
                    TagData_dealloc(data);
                    return NULL;
                }
            }
            if (Py_UNICODE_ISSPACE(this))
                Textbuffer_write(&(data->pad_first), this);
            padding = Textbuffer_render(data->pad_first);
            TagData_dealloc(data);
            if (!padding)
                return NULL;
            return padding;
        }
        else if (!this || this == end_token) {
            /* End of input, or end_token hit while still inside quotes. */
            if (self->topstack->context & LC_TAG_ATTR) {
                if (data->context & TAG_QUOTED) {
                    // Unclosed attribute quote: reset, don't die
                    data->context = TAG_ATTR_VALUE;
                    trash = Tokenizer_pop(self);
                    Py_XDECREF(trash);
                    self->head = data->reset;
                    continue;
                }
                trash = Tokenizer_pop(self);
                Py_XDECREF(trash);
            }
            TagData_dealloc(data);
            return Tokenizer_fail_route(self);
        }
        else {
            if (Tokenizer_handle_tag_data(self, data, this) || BAD_ROUTE) {
                TagData_dealloc(data);
                return NULL;
            }
        }
        self->head++;
    }
}

/*
    Parse a wikicode table by starting with the first line.

    Called with the head on the '{' of "{|". If the route fails, the
    "{|" markup is re-emitted as plain text instead. Returns 0 on
    success, -1 on error.
*/
static int Tokenizer_parse_table(Tokenizer* self)
{
    Py_ssize_t reset = self->head + 1;
    PyObject *style, *padding;
    PyObject *table = NULL;
    self->head += 2;  /* skip past the "{|" marker */

    if(Tokenizer_push(self, LC_TABLE_OPEN))
        return -1;
    /* Everything up to the first newline is the table's style attributes. */
    padding = Tokenizer_handle_table_style(self, '\n');
    if (BAD_ROUTE) {
        RESET_ROUTE();
        self->head = reset;
        if (Tokenizer_emit_text(self, "{|"))
            return -1;
        return 0;
    }
    if (!padding)
        return -1;
    style = Tokenizer_pop(self);
    if (!style) {
        Py_DECREF(padding);
        return -1;
    }

    self->head++;  /* move past the newline that ended the style line */
    table = Tokenizer_parse(self, LC_TABLE_OPEN, 1);
    if (BAD_ROUTE) {
        RESET_ROUTE();
        Py_DECREF(padding);
        Py_DECREF(style);
        self->head = reset;
        if (Tokenizer_emit_text(self, "{|"))
            return -1;
        return 0;
    }
    if (!table) {
        Py_DECREF(padding);
        Py_DECREF(style);
        return -1;
    }

    /* Tokenizer_emit_table_tag() steals style, padding, and table. */
    if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL,
                                 table, "|}"))
        return -1;
    // Offset displacement done by _parse()
    self->head--;
    return 0;
}

/*
    Parse as style until end of the line, then continue.

    Handles a "|-" row marker: the rest of the line becomes the row's
    style attributes, and the remainder is parsed as the row's contents.
    Returns 0 on success, -1 on error.
*/
static int Tokenizer_handle_table_row(Tokenizer* self)
{
    PyObject *padding, *style, *row, *trash;
    self->head += 2;  /* skip past the "|-" marker */

    if (!Tokenizer_CAN_RECURSE(self)) {
        /* Recursion limit reached: emit the marker as plain text. */
        if (Tokenizer_emit_text(self, "|-"))
            return -1;
        self->head -= 1;
        return 0;
    }

    if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN))
        return -1;
    padding = Tokenizer_handle_table_style(self, '\n');
    if (BAD_ROUTE) {
        /* Failed style route: discard the pushed stack and give up. */
        trash = Tokenizer_pop(self);
        Py_XDECREF(trash);
        return 0;
    }
    if (!padding)
        return -1;
    style = Tokenizer_pop(self);
    if (!style) {
        Py_DECREF(padding);
        return -1;
    }

    // Don't parse the style separator
    self->head++;
    row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1);
    if (!row) {
        Py_DECREF(padding);
        Py_DECREF(style);
        return -1;
    }

    /* Tokenizer_emit_table_tag() steals style, padding, and row. */
    if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, ""))
        return -1;
    // Offset displacement done by _parse()
    self->head--;
    return 0;
}

/*
    Parse as normal syntax unless we hit a style marker, then parse style
    as HTML attributes and the remainder as normal syntax.

    *markup* is the wiki markup that opened the cell ("|", "||", "!", or
    "!!"), *tag* the equivalent HTML tag ("td" or "th"), and *line_context*
    the LC_TABLE_TD_LINE/LC_TABLE_TH_LINE flag to carry through the parse.
    Returns 0 on success, -1 on error.
*/
static int
Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
                            const char *tag, uint64_t line_context)
{
    uint64_t old_context = self->topstack->context;
    uint64_t cell_context;
    Py_ssize_t reset;
    PyObject *padding, *cell, *style = NULL;
    const char *close_open_markup = NULL;

    self->head += strlen(markup);
    reset = self->head;

    if (!Tokenizer_CAN_RECURSE(self)) {
        /* Recursion limit reached: emit the marker as plain text. */
        if (Tokenizer_emit_text(self, markup))
            return -1;
        self->head--;
        return 0;
    }

    /* First pass: parse the whole cell assuming no style separator. */
    cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                           LC_TABLE_CELL_STYLE | line_context, 1);
    if (!cell)
        return -1;
    cell_context = self->topstack->context;
    self->topstack->context = old_context;

    if (cell_context & LC_TABLE_CELL_STYLE) {
        /* A '|' style separator was found: rewind and re-parse, reading
           the text before the separator as attributes this time. */
        Py_DECREF(cell);
        self->head = reset;
        if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                          line_context))
            return -1;
        padding = Tokenizer_handle_table_style(self, '|');
        if (!padding)
            return -1;
        style = Tokenizer_pop(self);
        if (!style) {
            Py_DECREF(padding);
            return -1;
        }
        // Don't parse the style separator
        self->head++;
        cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                               line_context, 1);
        if (!cell) {
            Py_DECREF(padding);
            Py_DECREF(style);
            return -1;
        }
        cell_context = self->topstack->context;
        self->topstack->context = old_context;
    }
    else {
        padding = PyUnicode_FromString("");
        if (!padding) {
            Py_DECREF(cell);
            return -1;
        }
    }

    if (style) {
        close_open_markup = "|";
    }
    /* Tokenizer_emit_table_tag() steals style, padding, and cell. */
    if (Tokenizer_emit_table_tag(self, markup, tag, style, padding,
                                 close_open_markup, cell, ""))
        return -1;
    // Keep header/cell line contexts
    self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE);
    // Offset displacement done by parse()
    self->head--;
    return 0;
}

/*
    Return the current stack so the caller can close the cell, keeping
    the cell's context readable; *reset_for_style* records (via the
    LC_TABLE_CELL_STYLE flag) whether the cell must be re-parsed with a
    style attribute section.
*/
static PyObject*
Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style)
{
    if (reset_for_style)
        self->topstack->context |= LC_TABLE_CELL_STYLE;
    else
        self->topstack->context &= ~LC_TABLE_CELL_STYLE;
    return Tokenizer_pop_keeping_context(self);
}

/*
    Return the stack in order to handle the table row end.
*/
static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
{
    /* The row's tokens are simply popped; the caller emits the tag. */
    return Tokenizer_pop(self);
}

/*
    Return the stack in order to handle the table end.
*/
static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
{
    self->head += 2;  /* consume the closing "|}" marker */
    return Tokenizer_pop(self);
}

/*
Handle the end of the stream of wikitext.
*/
static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
{
PyObject *token, *text, *trash;
int single;
@@ -2444,9 +2795,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
if (single)
return Tokenizer_handle_single_tag_end(self);
}
else if (context & AGG_DOUBLE) {
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
else {
if (context & LC_TABLE_CELL_OPEN) {
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
context = self->topstack->context;
}
if (context & AGG_DOUBLE) {
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
}
}
return Tokenizer_fail_route(self);
}
@@ -2457,7 +2815,8 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
Make sure we are not trying to write an invalid character. Return 0 if
everything is safe, or -1 if the route must be failed.
*/
static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
static int
Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
{
if (context & LC_FAIL_NEXT)
return -1;
@@ -2508,7 +2867,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
}
else if (context & LC_FAIL_ON_LBRACE) {
if (data == '{' || (Tokenizer_READ_BACKWARDS(self, 1) == '{' &&
Tokenizer_READ_BACKWARDS(self, 2) == '{')) {
Tokenizer_READ_BACKWARDS(self, 2) == '{')) {
if (context & LC_TEMPLATE)
self->topstack->context |= LC_FAIL_ON_EQUALS;
else
@@ -2533,12 +2892,30 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
}

/*
    Return 1 if the current head is preceded only by whitespace back to
    the start of its line (or the start of input), 0 otherwise.
    TODO: treat comments and templates as whitespace, allow fail on non-newline spaces.
*/
static int Tokenizer_has_leading_whitespace(Tokenizer* self)
{
    Py_UNICODE ch;
    int offset;

    /* Walk backwards: a line boundary proves success, any non-space
       character proves failure. */
    for (offset = 1; ; offset++) {
        ch = Tokenizer_READ_BACKWARDS(self, offset);
        if (!ch || ch == '\n')
            return 1;
        if (!Py_UNICODE_ISSPACE(ch))
            return 0;
    }
}

/*
Parse the wikicode string, using context for when to stop. If push is true,
we will push a new context, otherwise we won't and context will be ignored.
*/
static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
{
int this_context;
uint64_t this_context;
Py_UNICODE this, next, next_next, last;
PyObject* temp;

@@ -2667,22 +3044,99 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (temp != Py_None)
return temp;
}
else if (!last || last == '\n') {
if (this == '#' || this == '*' || this == ';' || this == ':') {
if (Tokenizer_handle_list(self))
else if ((!last || last == '\n') && (this == '#' || this == '*' || this == ';' || this == ':')) {
if (Tokenizer_handle_list(self))
return NULL;
}
else if ((!last || last == '\n') && (this == '-' && this == next &&
this == Tokenizer_READ(self, 2) &&
this == Tokenizer_READ(self, 3))) {
if (Tokenizer_handle_hr(self))
return NULL;
}
else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) {
if (Tokenizer_handle_dl_term(self))
return NULL;
// Kill potential table contexts
if (this == '\n')
self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS;
}

// Start of table parsing
else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) {
if (Tokenizer_CAN_RECURSE(self)) {
if (Tokenizer_parse_table(self))
return NULL;
}
else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next))
return NULL;
else
self->head++;
}
else if (this_context & LC_TABLE_OPEN) {
if (this == '|' && next == '|' && this_context & LC_TABLE_TD_LINE) {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "||", "td", LC_TABLE_TD_LINE))
return NULL;
}
else if (this == '-' && this == next &&
this == Tokenizer_READ(self, 2) &&
this == Tokenizer_READ(self, 3)) {
if (Tokenizer_handle_hr(self))
else if (this == '|' && next == '|' && this_context & LC_TABLE_TH_LINE) {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "||", "th", LC_TABLE_TH_LINE))
return NULL;
}
else if (this == '!' && next == '!' && this_context & LC_TABLE_TH_LINE) {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "!!", "th", LC_TABLE_TH_LINE))
return NULL;
}
else if (this == '|' && this_context & LC_TABLE_CELL_STYLE) {
return Tokenizer_handle_table_cell_end(self, 1);
}
// On newline, clear out cell line contexts
else if (this == '\n' && this_context & LC_TABLE_CELL_LINE_CONTEXTS) {
self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS;
if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (Tokenizer_has_leading_whitespace(self)) {
if (this == '|' && next == '}') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
if (this_context & LC_TABLE_ROW_OPEN)
return Tokenizer_handle_table_row_end(self);
else
return Tokenizer_handle_table_end(self);
}
else if (this == '|' && next == '-') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
if (this_context & LC_TABLE_ROW_OPEN)
return Tokenizer_handle_table_row_end(self);
else if (Tokenizer_handle_table_row(self))
return NULL;
}
else if (this == '|') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "|", "td", LC_TABLE_TD_LINE))
return NULL;
}
else if (this == '!') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "!", "th", LC_TABLE_TH_LINE))
return NULL;
}
else if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (Tokenizer_emit_char(self, this))
return NULL;
}
else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) {
if (Tokenizer_handle_dl_term(self))
// Raise BadRoute to table start
if (BAD_ROUTE)
return NULL;
}
else if (Tokenizer_emit_char(self, this))
@@ -2697,7 +3151,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
PyObject *text, *temp, *tokens;
int context = 0, skip_style_tags = 0;
uint64_t context = 0;
int skip_style_tags = 0;

if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
Py_XDECREF(self->text);
@@ -2725,7 +3180,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
self->skip_style_tags = skip_style_tags;
tokens = Tokenizer_parse(self, context, 1);

if (!tokens && !PyErr_Occurred()) {
if ((!tokens && !PyErr_Occurred()) || self->topstack) {
if (!ParserError) {
if (load_exceptions())
return NULL;
@@ -2734,6 +3189,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
RESET_ROUTE();
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
}
else if (self->topstack)
PyErr_SetString(ParserError,
"C tokenizer exited with non-empty token stack");
else
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
return NULL;


+ 65
- 54
mwparserfromhell/parser/tokenizer.h View File

@@ -29,6 +29,7 @@ SOFTWARE.
#include <math.h>
#include <structmember.h>
#include <bytesobject.h>
#include <stdint.h>

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
@@ -43,16 +44,17 @@ SOFTWARE.

static const char MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
'-', '\n', '\0'};
'-', '!', '\n', '\0'};

#define NUM_MARKERS 18
#define NUM_MARKERS 19
#define TEXTBUFFER_BLOCKSIZE 1024
#define MAX_DEPTH 40
#define MAX_CYCLES 100000
#define MAX_BRACES 255
#define MAX_ENTITY_SIZE 8

static int route_state = 0, route_context = 0;
static int route_state = 0;
static uint64_t route_context = 0;
#define BAD_ROUTE route_state
#define BAD_ROUTE_CONTEXT route_context
#define FAIL_ROUTE(context) route_state = 1; route_context = context
@@ -109,52 +111,61 @@ static PyObject* TagCloseClose;

/* Local contexts: */

#define LC_TEMPLATE 0x00000007
#define LC_TEMPLATE_NAME 0x00000001
#define LC_TEMPLATE_PARAM_KEY 0x00000002
#define LC_TEMPLATE_PARAM_VALUE 0x00000004

#define LC_ARGUMENT 0x00000018
#define LC_ARGUMENT_NAME 0x00000008
#define LC_ARGUMENT_DEFAULT 0x00000010

#define LC_WIKILINK 0x00000060
#define LC_WIKILINK_TITLE 0x00000020
#define LC_WIKILINK_TEXT 0x00000040

#define LC_EXT_LINK 0x00000180
#define LC_EXT_LINK_URI 0x00000080
#define LC_EXT_LINK_TITLE 0x00000100

#define LC_HEADING 0x00007E00
#define LC_HEADING_LEVEL_1 0x00000200
#define LC_HEADING_LEVEL_2 0x00000400
#define LC_HEADING_LEVEL_3 0x00000800
#define LC_HEADING_LEVEL_4 0x00001000
#define LC_HEADING_LEVEL_5 0x00002000
#define LC_HEADING_LEVEL_6 0x00004000

#define LC_TAG 0x00078000
#define LC_TAG_OPEN 0x00008000
#define LC_TAG_ATTR 0x00010000
#define LC_TAG_BODY 0x00020000
#define LC_TAG_CLOSE 0x00040000

#define LC_STYLE 0x00780000
#define LC_STYLE_ITALICS 0x00080000
#define LC_STYLE_BOLD 0x00100000
#define LC_STYLE_PASS_AGAIN 0x00200000
#define LC_STYLE_SECOND_PASS 0x00400000

#define LC_DLTERM 0x00800000

#define LC_SAFETY_CHECK 0x3F000000
#define LC_HAS_TEXT 0x01000000
#define LC_FAIL_ON_TEXT 0x02000000
#define LC_FAIL_NEXT 0x04000000
#define LC_FAIL_ON_LBRACE 0x08000000
#define LC_FAIL_ON_RBRACE 0x10000000
#define LC_FAIL_ON_EQUALS 0x20000000
#define LC_TEMPLATE 0x0000000000000007
#define LC_TEMPLATE_NAME 0x0000000000000001
#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002
#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004

#define LC_ARGUMENT 0x0000000000000018
#define LC_ARGUMENT_NAME 0x0000000000000008
#define LC_ARGUMENT_DEFAULT 0x0000000000000010

#define LC_WIKILINK 0x0000000000000060
#define LC_WIKILINK_TITLE 0x0000000000000020
#define LC_WIKILINK_TEXT 0x0000000000000040

#define LC_EXT_LINK 0x0000000000000180
#define LC_EXT_LINK_URI 0x0000000000000080
#define LC_EXT_LINK_TITLE 0x0000000000000100

#define LC_HEADING 0x0000000000007E00
#define LC_HEADING_LEVEL_1 0x0000000000000200
#define LC_HEADING_LEVEL_2 0x0000000000000400
#define LC_HEADING_LEVEL_3 0x0000000000000800
#define LC_HEADING_LEVEL_4 0x0000000000001000
#define LC_HEADING_LEVEL_5 0x0000000000002000
#define LC_HEADING_LEVEL_6 0x0000000000004000

#define LC_TAG 0x0000000000078000
#define LC_TAG_OPEN 0x0000000000008000
#define LC_TAG_ATTR 0x0000000000010000
#define LC_TAG_BODY 0x0000000000020000
#define LC_TAG_CLOSE 0x0000000000040000

#define LC_STYLE 0x0000000000780000
#define LC_STYLE_ITALICS 0x0000000000080000
#define LC_STYLE_BOLD 0x0000000000100000
#define LC_STYLE_PASS_AGAIN 0x0000000000200000
#define LC_STYLE_SECOND_PASS 0x0000000000400000

#define LC_DLTERM 0x0000000000800000

#define LC_SAFETY_CHECK 0x000000003F000000
#define LC_HAS_TEXT 0x0000000001000000
#define LC_FAIL_ON_TEXT 0x0000000002000000
#define LC_FAIL_NEXT 0x0000000004000000
#define LC_FAIL_ON_LBRACE 0x0000000008000000
#define LC_FAIL_ON_RBRACE 0x0000000010000000
#define LC_FAIL_ON_EQUALS 0x0000000020000000

#define LC_TABLE 0x0000000FC0000000
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000
#define LC_TABLE_OPEN 0x0000000040000000
#define LC_TABLE_CELL_OPEN 0x0000000080000000
#define LC_TABLE_CELL_STYLE 0x0000000100000000
#define LC_TABLE_ROW_OPEN 0x0000000200000000
#define LC_TABLE_TD_LINE 0x0000000400000000
#define LC_TABLE_TH_LINE 0x0000000800000000

/* Global contexts: */

@@ -162,9 +173,9 @@ static PyObject* TagCloseClose;

/* Aggregate contexts: */

#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE)
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)

@@ -191,7 +202,7 @@ struct Textbuffer {

struct Stack {
PyObject* stack;
int context;
uint64_t context;
struct Textbuffer* textbuffer;
struct Stack* next;
};
@@ -202,7 +213,7 @@ typedef struct {
} HeadingData;

typedef struct {
int context;
uint64_t context;
struct Textbuffer* pad_first;
struct Textbuffer* pad_before_eq;
struct Textbuffer* pad_after_eq;
@@ -267,7 +278,7 @@ static int Tokenizer_parse_entity(Tokenizer*);
static int Tokenizer_parse_comment(Tokenizer*);
static int Tokenizer_handle_dl_term(Tokenizer*);
static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int, int);
static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

static int load_exceptions(void);


+ 218
- 7
mwparserfromhell/parser/tokenizer.py View File

@@ -63,7 +63,7 @@ class Tokenizer(object):
START = object()
END = object()
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
":", "/", "-", "\n", START, END]
":", "/", "-", "!", "\n", START, END]
MAX_DEPTH = 40
MAX_CYCLES = 100000
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
@@ -991,12 +991,166 @@ class Tokenizer(object):
else:
self._emit_text("\n")

def _emit_table_tag(self, open_open_markup, tag, style, padding,
                    close_open_markup, contents, open_close_markup):
    """Write the token sequence for one table-related tag pair.

    Emits an opening tag (with optional *style* attribute tokens and an
    optional *close_open_markup* such as the cell style separator), the
    cell/row/table *contents*, and the matching closing tag.
    """
    emit = self._emit
    emit(tokens.TagOpenOpen(wiki_markup=open_open_markup))
    self._emit_text(tag)
    if style:
        self._emit_all(style)
    # Build the close-of-opening-tag token first, then emit it.
    if close_open_markup:
        close_open = tokens.TagCloseOpen(wiki_markup=close_open_markup,
                                         padding=padding)
    else:
        close_open = tokens.TagCloseOpen(padding=padding)
    emit(close_open)
    if contents:
        self._emit_all(contents)
    emit(tokens.TagOpenClose(wiki_markup=open_close_markup))
    self._emit_text(tag)
    emit(tokens.TagCloseClose())

def _handle_table_style(self, end_token):
    """Handle style attributes for a table until ``end_token``.

    Returns the padding collected before *end_token*; the parsed
    attribute tokens are left on the current stack for the caller to
    retrieve with ``_pop()``. Raises BadRoute (via ``_fail_route``) if
    the stream ends or *end_token* appears in an unrecoverable spot.
    """
    data = _TagOpenData()
    # Begin in the "ready to read an attribute" state.
    data.context = _TagOpenData.CX_ATTR_READY
    while True:
        this = self._read()
        # We may only exit on end_token while not inside a quoted
        # attribute value (unless a space was already noted there).
        can_exit = (not data.context & data.CX_QUOTED or
                    data.context & data.CX_NOTE_SPACE)
        if this == end_token and can_exit:
            # Flush a partially-read attribute before returning.
            if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
                self._push_tag_buffer(data)
            if this.isspace():
                data.padding_buffer["first"] += this
            return data.padding_buffer["first"]
        elif this is self.END or this == end_token:
            if self._context & contexts.TAG_ATTR:
                if data.context & data.CX_QUOTED:
                    # Unclosed attribute quote: reset, don't die
                    data.context = data.CX_ATTR_VALUE
                    self._pop()
                    self._head = data.reset
                    continue
                self._pop()
            self._fail_route()
        else:
            self._handle_tag_data(data, this)
        self._head += 1

def _parse_table(self):
    """Parse a wikicode table by starting with the first line.

    On any parse failure the head is rewound and ``{|`` is emitted as
    plain text instead.
    """
    reset = self._head + 1
    self._head += 2  # Skip past the "{|" marker.
    self._push(contexts.TABLE_OPEN)
    try:
        # Everything up to the first newline is the table's style.
        padding = self._handle_table_style("\n")
    except BadRoute:
        self._head = reset
        self._emit_text("{|")
        return
    style = self._pop()

    self._head += 1
    try:
        # Parse the table body until the matching "|}".
        table = self._parse(contexts.TABLE_OPEN)
    except BadRoute:
        self._head = reset
        self._emit_text("{|")
        return

    self._emit_table_tag("{|", "table", style, padding, None, table, "|}")
    # Offset displacement done by _parse():
    self._head -= 1

def _handle_table_row(self):
    """Parse as style until end of the line, then continue.

    Invoked on a ``|-`` row marker; emits a ``tr`` tag containing the
    row's style attributes and contents.
    """
    self._head += 2  # Skip past the "|-" marker.
    if not self._can_recurse():
        # Recursion limit reached: treat the marker as plain text.
        self._emit_text("|-")
        self._head -= 1
        return

    self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
    try:
        padding = self._handle_table_style("\n")
    except BadRoute:
        # Discard this row's stack before propagating the failure.
        self._pop()
        raise
    style = self._pop()

    # Don't parse the style separator:
    self._head += 1
    row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)

    self._emit_table_tag("|-", "tr", style, padding, None, row, "")
    # Offset displacement done by _parse():
    self._head -= 1

def _handle_table_cell(self, markup, tag, line_context):
    """Parse as normal syntax unless we hit a style marker, then parse
    style as HTML attributes and the remainder as normal syntax.

    *markup* is the cell's wiki markup ("|", "||", "!", or "!!"), *tag*
    the HTML tag to emit ("td" or "th"), and *line_context* the
    TABLE_TD_LINE/TABLE_TH_LINE flag for this line.
    """
    old_context = self._context
    padding, style = "", None
    self._head += len(markup)
    reset = self._head
    if not self._can_recurse():
        # Recursion limit reached: treat the marker as plain text.
        self._emit_text(markup)
        self._head -= 1
        return

    # First pass: parse assuming a style separator may appear.
    cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                       line_context | contexts.TABLE_CELL_STYLE)
    cell_context = self._context
    self._context = old_context
    reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
    if reset_for_style:
        # A "|" style separator was found: rewind, re-read the leading
        # chunk as HTML attributes, then re-parse the true contents.
        self._head = reset
        self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                   line_context)
        padding = self._handle_table_style("|")
        style = self._pop()
        # Don't parse the style separator:
        self._head += 1
        cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                           line_context)
        cell_context = self._context
        self._context = old_context

    close_open_markup = "|" if reset_for_style else None
    self._emit_table_tag(markup, tag, style, padding, close_open_markup,
                         cell, "")
    # Keep header/cell line contexts:
    self._context |= cell_context & (contexts.TABLE_TH_LINE |
                                     contexts.TABLE_TD_LINE)
    # Offset displacement done by _parse():
    self._head -= 1

def _handle_table_cell_end(self, reset_for_style=False):
    """Pop and return the cell's stack, keeping its context.

    The TABLE_CELL_STYLE flag is set on the context when the caller
    needs to reset and parse style attributes, and cleared otherwise.
    """
    flag = contexts.TABLE_CELL_STYLE
    self._context = (self._context | flag if reset_for_style
                     else self._context & ~flag)
    return self._pop(keep_context=True)

def _handle_table_row_end(self):
    """Return the stack in order to handle the table row end.

    The caller is responsible for emitting the row's tag tokens.
    """
    return self._pop()

def _handle_table_end(self):
    """Return the stack in order to handle the table end."""
    self._head += 2  # Skip past the "|}" marker.
    return self._pop()

def _handle_end(self):
"""Handle the end of the stream of wikitext."""
if self._context & contexts.FAIL:
if self._context & contexts.TAG_BODY:
if is_single(self._stack[1].text):
return self._handle_single_tag_end()
if self._context & contexts.TABLE_CELL_OPEN:
self._pop()
if self._context & contexts.DOUBLE:
self._pop()
self._fail_route()
@@ -1144,15 +1298,68 @@ class Tokenizer(object):
result = self._parse_style()
if result is not None:
return result
elif self._read(-1) in ("\n", self.START):
if this in ("#", "*", ";", ":"):
elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"):
self._handle_list()
elif this == next == self._read(2) == self._read(3) == "-":
elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-":
self._handle_hr()
else:
self._emit_text(this)
elif this in ("\n", ":") and self._context & contexts.DL_TERM:
self._handle_dl_term()
if this == "\n":
# Kill potential table contexts
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
# Start of table parsing
elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
if self._can_recurse():
self._parse_table()
else:
self._emit_text("{|")
elif self._context & contexts.TABLE_OPEN:
if this == next == "|" and self._context & contexts.TABLE_TD_LINE:
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE)
elif this == next == "|" and self._context & contexts.TABLE_TH_LINE:
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE)
elif this == next == "!" and self._context & contexts.TABLE_TH_LINE:
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE)
elif this == "|" and self._context & contexts.TABLE_CELL_STYLE:
return self._handle_table_cell_end(reset_for_style=True)
# on newline, clear out cell line contexts
elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS:
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
self._emit_text(this)
elif (self._read(-1) in ("\n", self.START) or
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
if this == "|" and next == "}":
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
if self._context & contexts.TABLE_ROW_OPEN:
return self._handle_table_row_end()
return self._handle_table_end()
elif this == "|" and next == "-":
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
if self._context & contexts.TABLE_ROW_OPEN:
return self._handle_table_row_end()
self._handle_table_row()
elif this == "|":
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
self._handle_table_cell("|", "td", contexts.TABLE_TD_LINE)
elif this == "!":
if self._context & contexts.TABLE_CELL_OPEN:
return self._handle_table_cell_end()
self._handle_table_cell("!", "th", contexts.TABLE_TH_LINE)
else:
self._emit_text(this)
else:
self._emit_text(this)

else:
self._emit_text(this)
self._head += 1
@@ -1164,6 +1371,10 @@ class Tokenizer(object):
self._text = [segment for segment in split if segment]
self._head = self._global = self._depth = self._cycles = 0
try:
return self._parse(context)
tokens = self._parse(context)
except BadRoute: # pragma: no cover (untestable/exceptional case)
raise ParserError("Python tokenizer exited with BadRoute")
if self._stacks: # pragma: no cover (untestable/exceptional case)
err = "Python tokenizer exited with non-empty token stack"
raise ParserError(err)
return tokens

+ 8
- 3
tests/_test_tokenizer.py View File

@@ -25,8 +25,9 @@ import codecs
from os import listdir, path
import sys

from mwparserfromhell.compat import py3k
from mwparserfromhell.compat import py3k, str
from mwparserfromhell.parser import tokens
from mwparserfromhell.parser.builder import Builder

class _TestParseError(Exception):
"""Raised internally when a test could not be parsed."""
@@ -50,8 +51,12 @@ class TokenizerTestCase(object):
*label* for the method's docstring.
"""
def inner(self):
expected = data["output"]
actual = self.tokenizer().tokenize(data["input"])
if hasattr(self, "roundtrip"):
expected = data["input"]
actual = str(Builder().build(data["output"][:]))
else:
expected = data["output"]
actual = self.tokenizer().tokenize(data["input"])
self.assertEqual(expected, actual)
if not py3k:
inner.__name__ = funcname.encode("utf8")


+ 41
- 0
tests/test_roundtripping.py View File

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2014 Ben Kurtovic <ben.kurtovic@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals

try:
import unittest2 as unittest
except ImportError:
import unittest

from ._test_tokenizer import TokenizerTestCase

class TestRoundtripping(TokenizerTestCase, unittest.TestCase):
    """Test cases for roundtripping tokens back to wikitext."""

    @classmethod
    def setUpClass(cls):
        # TokenizerTestCase checks hasattr(self, "roundtrip"): when the
        # attribute exists, each test builds wikitext from its expected
        # token output and compares it to the original input, instead of
        # comparing tokenizer output against the expected tokens.
        cls.roundtrip = True


# Allow running this test module directly from the command line.
if __name__ == "__main__":
    unittest.main(verbosity=2)

+ 32
- 0
tests/test_tag.py View File

@@ -226,6 +226,38 @@ class TestTag(TreeEqualityTestCase):
self.assertWikicodeEqual(parsed, node.closing_tag)
self.assertEqual("<ref>foobar</ref {{ignore me}}>", node)

def test_wiki_style_separator(self):
    """test getter/setter for wiki_style_separator attribute"""
    node = Tag(wraptext("table"), wraptext("\n"))
    self.assertIs(None, node.wiki_style_separator)
    node.wiki_style_separator = "|"
    self.assertEqual("|", node.wiki_style_separator)
    node.wiki_markup = "{"
    self.assertEqual("{|\n{", node)
    # Also verify the constructor's keyword argument. The original
    # assertion here re-checked *node*, leaving *node2* untested.
    node2 = Tag(wraptext("table"), wraptext("\n"), wiki_style_separator="|")
    self.assertEqual("|", node2.wiki_style_separator)

def test_closing_wiki_markup(self):
    """test getter/setter for closing_wiki_markup attribute"""
    node = Tag(wraptext("table"), wraptext("\n"))
    self.assertIs(None, node.closing_wiki_markup)
    # Assigning wiki_markup also populates closing_wiki_markup:
    node.wiki_markup = "{|"
    self.assertEqual("{|", node.closing_wiki_markup)
    node.closing_wiki_markup = "|}"
    self.assertEqual("|}", node.closing_wiki_markup)
    self.assertEqual("{|\n|}", node)
    # Changing wiki_markup afterwards leaves the explicit closing
    # markup untouched:
    node.wiki_markup = "!!"
    self.assertEqual("|}", node.closing_wiki_markup)
    self.assertEqual("!!\n|}", node)
    # Unsetting wiki_markup clears closing_wiki_markup and reverts the
    # node to HTML-style rendering:
    node.wiki_markup = False
    self.assertFalse(node.closing_wiki_markup)
    self.assertEqual("<table>\n</table>", node)
    # Both values may also be given directly to the constructor:
    node2 = Tag(wraptext("table"), wraptext("\n"),
                attrs=[agen("id", "foo")], wiki_markup="{|",
                closing_wiki_markup="|}")
    self.assertEqual("|}", node2.closing_wiki_markup)
    self.assertEqual('{| id="foo"\n|}', node2)

def test_has(self):
"""test Tag.has()"""
node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")])


+ 410
- 0
tests/tokenizer/tables.mwtest View File

@@ -0,0 +1,410 @@
name: empty_table
label: parsing an empty table
input: "{|\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: inline_table
label: tables with a close on the same line are not valid
input: "{||}"
output: [Text(text="{||}")]

---

name: no_table_close_simple
label: no table close on inline table
input: "{| "
output: [Text(text="{| ")]

---

name: no_table_close_newline
label: no table close with a newline
input: "{| \n "
output: [Text(text="{| \n ")]

---

name: no_table_close_inside_cell
label: no table close while inside of a cell
input: "{| \n| "
output: [Text(text="{| \n| ")]

---

name: no_table_close_inside_cell_after_newline
label: no table close while inside of a cell after a newline
input: "{| \n| \n "
output: [Text(text="{| \n| \n ")]

---

name: no_table_close_inside_cell_with_attributes
label: no table close while inside of a cell with attributes
input: "{| \n| red | test"
output: [Text(text="{| \n| red | test")]

---

name: no_table_close_inside_row
label: no table close while inside of a row
input: "{| \n|- "
output: [Text(text="{| \n|- ")]

---

name: no_table_close_inside_row_after_newline
label: no table close while inside of a row after a newline
input: "{| \n|- \n "
output: [Text(text="{| \n|- \n ")]

---

name: no_table_close_row_and_cell
label: no table close while inside a cell inside a row
input: "{| \n|- \n|"
output: [Text(text="{| \n|- \n|")]

---

name: no_table_close_attributes
label: don't parse attributes as attributes if the table doesn't exist
input: "{| border="1""
output: [Text(text="{| border=\"1\"")]

---

name: no_table_close_unclosed_attributes
label: don't parse unclosed attributes if the table doesn't exist
input: "{| border="
output: [Text(text="{| border=")]

---

name: no_table_close_row_attributes
label: don't parse row attributes as attributes if the table doesn't exist
input: "{| |- border="1""
output: [Text(text="{| |- border=\"1\"")]

---

name: no_table_close_cell
label: don't parse cells if the table doesn't close
input: "{| | border="1"| test || red | foo"
output: [Text(text="{| | border=\"1\"| test || red | foo")]

---

name: crazy_no_table_close
label: lots of opened wiki syntax without closes
input: "{{{ {{ {| <ref"
output: [Text(text="{{{ {{ {| <ref")]

---

name: leading_whitespace_table
label: handle leading whitespace for a table
input: "foo \n \t {|\n|}"
output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: whitespace_after_table
label: handle whitespace after a table close
input: "{|\n|}\n \t "
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text="\n \t ")]

---

name: different_whitespace_after_table
label: handle spaces after a table close
input: "{|\n|} \n "
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n ")]

---

name: characters_after_table
label: handle characters after a table close
input: "{|\n|} tsta"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" tsta")]

---

name: characters_after_inline_table
label: handle characters after an inline table close
input: "{| |} tsta"
output: [Text(text="{| |} tsta")]

---

name: leading_characters_table
label: don't parse as a table when leading characters are not newline or whitespace
input: "foo \n foo \t {|\n|}"
output: [Text(text="foo \n foo \t {|\n|}")]

---

name: table_row_simple
label: simple table row
input: "{|\n |- \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_row_multiple
label: multiple table rows
input: "{|\n |- \n|- \n |-\n |}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_simple
label: simple table cell
input: "{|\n | foo \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_inline
label: multiple inline table cells
input: "{|\n | foo || bar || test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_multiple
label: multiple table cells (non-inline)
input: "{|\n| foo \n| bar \n| test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_header_simple
label: simple header cell
input: "{|\n ! foo \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_header_inline
label: multiple inline header cells
input: "{|\n ! foo || bar !! test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_header_multiple
label: multiple table header cells (non-inline)
input: "{|\n! foo \n! bar \n! test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: nested_cells_and_rows
label: combination of cells and rows in a table
input: "{|\n|- \n| foo \n|- \n| bar\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_fake_close
label: looks like a table close but is not
input: "{|\n | |} \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text="} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_more_fake_close
label: looks like a table close but is not
input: "{|\n || |} \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" |} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_extra_close
label: process second close as text
input: "{| \n |} \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n|}")]

---

name: nowiki_inside_table
label: nowiki handles pipe characters in tables
input: "{|\n | foo <nowiki>| |- {| |} || ! !!</nowiki> bar \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_text_outside_cell
label: parse text inside table but outside of a cell
input: "{|\n bar \n | foo \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: no_table_cell_with_leading_characters
label: fail to create a table cell when there are leading non-whitespace characters
input: "{|\n bar | foo \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: no_table_row_with_leading_characters
label: fail to create a table row when there are leading non-whitespace characters
input: "{|\n bar |- foo \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: template_inside_table_cell
label: template within table cell
input: "{|\n |{{foo\n|bar=baz}} \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_attributes
label: parse table cell style attributes
input: "{| \n | name="foo bar"| test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_empty_attributes
label: parse table cell with style markers but no attributes
input: "{| \n | | test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_with_dash
label: parse a situation in which a cell line looks like a row line
input: "{|\n ||- \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_attributes_quote_with_pipe
label: pipe inside an attribute quote should still be used as a style separator
input: "{| \n | name="foo|bar"| test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_attributes_name_with_pipe
label: pipe inside an attribute name should still be used as a style separator
input: "{| \n | name|="foo bar" | test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_attributes_pipe_after_equals
label: pipe inside an attribute should still be used as a style separator after an equals
input: "{| \n | name=|"foo|bar"| test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseOpen(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_cell_attributes_templates
label: pipe inside attributes shouldn't be style separator
input: "{| \n | {{comment|template=baz}} | test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: header_cell_attributes
label: parse header cell style attributes
input: "{| \n ! name="foo bar"| test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: inline_cell_attributes
label: parse cell style attributes of inline cells
input: "{| \n ! name="foo bar" | test ||color="red"| markup!!foo | time \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_row_attributes
label: parse table row style attributes
input: "{| \n |- name="foo bar"\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_row_attributes_crazy_whitespace
label: parse table row style attributes with different whitespace
input: "{| \t \n |- \t name="foo bar" \t \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=" \t \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: table_attributes
label: parse table style attributes
input: "{| name="foo bar"\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: inline_table_attributes
label: handle attributes in inline tables
input: "{| foo="tee bar" |}"
output: [Text(text='{| foo="tee bar" |}')]

---

name: table_incorrect_attributes
label: parse incorrect table style attributes
input: "{| name="foo\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: templates_in_table_attribute
label: templates in the attributes of a table, after the start
input: "{| {{class}}="{{wikitable}}"\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="class"), TemplateClose(), TagAttrEquals(), TagAttrQuote(char="\""), TemplateOpen(), Text(text="wikitable"), TemplateClose(), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: templates_in_table_attribute_2
label: templates in the attributes of a table, directly after the start with no separating space
input: "{|{{foo}} \n | name="foo bar" | test \n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: inappropriate_marker_at_line_start
label: an inappropriate marker (a closing curly brace) at the start of a line in the table
input: "{|\n}"
output: [Text(text="{|\n}")]

---

name: fake_close_near_start
label: a fake closing token at the end of the first line in the table
input: "{| class="wikitable" style="text-align: center; width=100%;|}\n|\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"text-align:"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="center;"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="width"), TagAttrEquals(), Text(text="100%;|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: fake_close_near_start_2
label: a fake closing token inside a quoted attribute value on the first line of the table
input: "{| class="wikitable|}"\n|\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: junk_after_table_start
label: ignore more junk on the first line of the table
input: "{| class="wikitable" | foobar\n|\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="foobar"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

---

name: junk_after_table_row
label: ignore junk on the first line of a table row
input: "{|\n|- foo="bar" | baz\n|blerp\n|}"
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="bar"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="baz"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="blerp\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()]

+ 7
- 0
tests/tokenizer/tags_wikimarkup.mwtest View File

@@ -447,6 +447,13 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tag

---

name: dt_dd_mix4
label: another example of correct dt/dd usage, with a trigger for a specific parse route
input: ";foo]:bar"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo]"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")]

---

name: ul_ol_dt_dd_mix
label: an assortment of uls, ols, dds, and dts
input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo"


Loading…
Cancel
Save