@@ -2,6 +2,7 @@ v0.4 (unreleased): | |||
- The parser is now distributed with Windows binaries, fixing an issue that | |||
prevented Windows users from using the C tokenizer. | |||
- Added support for parsing wikicode tables (patches by David Winegar). | |||
- Added a script to test for memory leaks in scripts/memtest.py. | |||
- Added a script to do releases in scripts/release.sh. | |||
- skip_style_tags can now be passed to mwparserfromhell.parse() (previously, | |||
@@ -9,6 +9,7 @@ Unreleased | |||
- The parser is now distributed with Windows binaries, fixing an issue that | |||
prevented Windows users from using the C tokenizer. | |||
- Added support for parsing wikicode tables (patches by David Winegar). | |||
- Added a script to test for memory leaks in :file:`scripts/memtest.py`. | |||
- Added a script to do releases in :file:`scripts/release.sh`. | |||
- *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse() | |||
@@ -52,7 +52,7 @@ INVISIBLE_TAGS = [ | |||
# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 | |||
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] | |||
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] | |||
SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"] | |||
MARKUP_TO_HTML = { | |||
"#": "li", | |||
@@ -35,7 +35,8 @@ class Tag(Node): | |||
def __init__(self, tag, contents=None, attrs=None, wiki_markup=None, | |||
self_closing=False, invalid=False, implicit=False, padding="", | |||
closing_tag=None): | |||
closing_tag=None, wiki_style_separator=None, | |||
closing_wiki_markup=None): | |||
super(Tag, self).__init__() | |||
self._tag = tag | |||
if contents is None and not self_closing: | |||
@@ -52,13 +53,28 @@ class Tag(Node): | |||
self._closing_tag = closing_tag | |||
else: | |||
self._closing_tag = tag | |||
self._wiki_style_separator = wiki_style_separator | |||
if closing_wiki_markup is not None: | |||
self._closing_wiki_markup = closing_wiki_markup | |||
elif wiki_markup and not self_closing: | |||
self._closing_wiki_markup = wiki_markup | |||
else: | |||
self._closing_wiki_markup = None | |||
def __unicode__(self): | |||
if self.wiki_markup: | |||
if self.attributes: | |||
attrs = "".join([str(attr) for attr in self.attributes]) | |||
else: | |||
attrs = "" | |||
padding = self.padding or "" | |||
separator = self.wiki_style_separator or "" | |||
close = self.closing_wiki_markup or "" | |||
if self.self_closing: | |||
return self.wiki_markup | |||
return self.wiki_markup + attrs + padding + separator | |||
else: | |||
return self.wiki_markup + str(self.contents) + self.wiki_markup | |||
return self.wiki_markup + attrs + padding + separator + \ | |||
str(self.contents) + close | |||
result = ("</" if self.invalid else "<") + str(self.tag) | |||
if self.attributes: | |||
@@ -73,10 +89,10 @@ class Tag(Node): | |||
def __children__(self): | |||
if not self.wiki_markup: | |||
yield self.tag | |||
for attr in self.attributes: | |||
yield attr.name | |||
if attr.value is not None: | |||
yield attr.value | |||
for attr in self.attributes: | |||
yield attr.name | |||
if attr.value is not None: | |||
yield attr.value | |||
if self.contents: | |||
yield self.contents | |||
if not self.self_closing and not self.wiki_markup and self.closing_tag: | |||
@@ -174,6 +190,27 @@ class Tag(Node): | |||
""" | |||
return self._closing_tag | |||
@property | |||
def wiki_style_separator(self): | |||
"""The separator between the padding and content in a wiki markup tag. | |||
Essentially the wiki equivalent of the TagCloseOpen. | |||
""" | |||
return self._wiki_style_separator | |||
@property | |||
def closing_wiki_markup(self): | |||
"""The wikified version of the closing tag to show instead of HTML. | |||
If set to a value, this will be displayed instead of the close tag | |||
brackets. If tag is :attr:`self_closing` is ``True`` then this is not | |||
displayed. If :attr:`wiki_markup` is set and this has not been set, this | |||
is set to the value of :attr:`wiki_markup`. If this has been set and | |||
:attr:`wiki_markup` is set to a ``False`` value, this is set to | |||
``None``. | |||
""" | |||
return self._closing_wiki_markup | |||
@tag.setter | |||
def tag(self, value): | |||
self._tag = self._closing_tag = parse_anything(value) | |||
@@ -185,6 +222,8 @@ class Tag(Node): | |||
@wiki_markup.setter | |||
def wiki_markup(self, value): | |||
self._wiki_markup = str(value) if value else None | |||
if not value or not self.closing_wiki_markup: | |||
self._closing_wiki_markup = self._wiki_markup | |||
@self_closing.setter | |||
def self_closing(self, value): | |||
@@ -212,6 +251,14 @@ class Tag(Node): | |||
def closing_tag(self, value): | |||
self._closing_tag = parse_anything(value) | |||
@wiki_style_separator.setter | |||
def wiki_style_separator(self, value): | |||
self._wiki_style_separator = str(value) if value else None | |||
@closing_wiki_markup.setter | |||
def closing_wiki_markup(self, value): | |||
self._closing_wiki_markup = str(value) if value else None | |||
def has(self, name): | |||
"""Return whether any attribute in the tag has the given *name*. | |||
@@ -249,20 +249,24 @@ class Builder(object): | |||
close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose) | |||
implicit, attrs, contents, closing_tag = False, [], None, None | |||
wiki_markup, invalid = token.wiki_markup, token.invalid or False | |||
wiki_style_separator, closing_wiki_markup = None, wiki_markup | |||
self._push() | |||
while self._tokens: | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.TagAttrStart): | |||
attrs.append(self._handle_attribute(token)) | |||
elif isinstance(token, tokens.TagCloseOpen): | |||
wiki_style_separator = token.wiki_markup | |||
padding = token.padding or "" | |||
tag = self._pop() | |||
self._push() | |||
elif isinstance(token, tokens.TagOpenClose): | |||
closing_wiki_markup = token.wiki_markup | |||
contents = self._pop() | |||
self._push() | |||
elif isinstance(token, close_tokens): | |||
if isinstance(token, tokens.TagCloseSelfclose): | |||
closing_wiki_markup = token.wiki_markup | |||
tag = self._pop() | |||
self_closing = True | |||
padding = token.padding or "" | |||
@@ -271,7 +275,8 @@ class Builder(object): | |||
self_closing = False | |||
closing_tag = self._pop() | |||
return Tag(tag, contents, attrs, wiki_markup, self_closing, | |||
invalid, implicit, padding, closing_tag) | |||
invalid, implicit, padding, closing_tag, | |||
wiki_style_separator, closing_wiki_markup) | |||
else: | |||
self._write(self._handle_token(token)) | |||
raise ParserError("_handle_tag() missed a close token") | |||
@@ -90,6 +90,15 @@ Local (stack-specific) contexts: | |||
* :const:`FAIL_ON_RBRACE` | |||
* :const:`FAIL_ON_EQUALS` | |||
* :const:`TABLE` | |||
* :const:`TABLE_OPEN` | |||
* :const:`TABLE_CELL_OPEN` | |||
* :const:`TABLE_CELL_STYLE` | |||
* :const:`TABLE_TD_LINE` | |||
* :const:`TABLE_TH_LINE` | |||
* :const:`TABLE_CELL_LINE_CONTEXTS` | |||
Global contexts: | |||
* :const:`GL_HEADING` | |||
@@ -155,15 +164,26 @@ FAIL_ON_EQUALS = 1 << 29 | |||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | |||
FAIL_ON_RBRACE + FAIL_ON_EQUALS) | |||
TABLE_OPEN = 1 << 30 | |||
TABLE_CELL_OPEN = 1 << 31 | |||
TABLE_CELL_STYLE = 1 << 32 | |||
TABLE_ROW_OPEN = 1 << 33 | |||
TABLE_TD_LINE = 1 << 34 | |||
TABLE_TH_LINE = 1 << 35 | |||
TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE | |||
TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + | |||
TABLE_TD_LINE + TABLE_TH_LINE) | |||
# Global contexts: | |||
GL_HEADING = 1 << 0 | |||
# Aggregate contexts: | |||
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE | |||
FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + | |||
STYLE + TABLE) | |||
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + | |||
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) | |||
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE | |||
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN | |||
NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI | |||
NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK |
@@ -241,7 +241,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) | |||
/* | |||
Add a new token stack, context, and textbuffer to the list. | |||
*/ | |||
static int Tokenizer_push(Tokenizer* self, int context) | |||
static int Tokenizer_push(Tokenizer* self, uint64_t context) | |||
{ | |||
Stack* top = malloc(sizeof(Stack)); | |||
@@ -333,7 +333,7 @@ static PyObject* Tokenizer_pop(Tokenizer* self) | |||
static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) | |||
{ | |||
PyObject* stack; | |||
int context; | |||
uint64_t context; | |||
if (Tokenizer_push_textbuffer(self)) | |||
return NULL; | |||
@@ -351,7 +351,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) | |||
*/ | |||
static void* Tokenizer_fail_route(Tokenizer* self) | |||
{ | |||
int context = self->topstack->context; | |||
uint64_t context = self->topstack->context; | |||
PyObject* stack = Tokenizer_pop(self); | |||
Py_XDECREF(stack); | |||
@@ -676,11 +676,8 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) | |||
RESET_ROUTE(); | |||
for (i = 0; i < braces; i++) text[i] = '{'; | |||
text[braces] = '\0'; | |||
if (Tokenizer_emit_text_then_stack(self, text)) { | |||
Py_XDECREF(text); | |||
if (Tokenizer_emit_text_then_stack(self, text)) | |||
return -1; | |||
} | |||
Py_XDECREF(text); | |||
return 0; | |||
} | |||
else | |||
@@ -1034,7 +1031,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) | |||
{ | |||
// Built from Tokenizer_parse()'s end sentinels: | |||
Py_UNICODE after = Tokenizer_READ(self, 2); | |||
int ctx = self->topstack->context; | |||
uint64_t ctx = self->topstack->context; | |||
return (!this || this == '\n' || this == '[' || this == ']' || | |||
this == '<' || this == '>' || (this == '\'' && next == '\'') || | |||
@@ -1629,9 +1626,9 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data) | |||
static int | |||
Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) | |||
{ | |||
int ctx = data->context; | |||
int end_of_value = (ctx & TAG_ATTR_VALUE && | |||
!(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE))); | |||
uint64_t ctx = data->context; | |||
uint64_t end_of_value = (ctx & TAG_ATTR_VALUE && | |||
!(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE))); | |||
if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) { | |||
if (Tokenizer_push_tag_buffer(self, data)) | |||
@@ -2153,7 +2150,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag, | |||
static int Tokenizer_parse_italics(Tokenizer* self) | |||
{ | |||
Py_ssize_t reset = self->head; | |||
int context; | |||
uint64_t context; | |||
PyObject *stack; | |||
stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1); | |||
@@ -2273,7 +2270,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self) | |||
*/ | |||
static PyObject* Tokenizer_parse_style(Tokenizer* self) | |||
{ | |||
int context = self->topstack->context, ticks = 2, i; | |||
uint64_t context = self->topstack->context, ticks = 2, i; | |||
self->head += 2; | |||
while (Tokenizer_READ(self, 0) == '\'') { | |||
@@ -2426,9 +2423,363 @@ static int Tokenizer_handle_dl_term(Tokenizer* self) | |||
} | |||
/* | |||
Emit a table tag. | |||
*/ | |||
static int | |||
Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, | |||
const char* tag, PyObject* style, PyObject* padding, | |||
const char* close_open_markup, PyObject* contents, | |||
const char* open_close_markup) | |||
{ | |||
PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs, | |||
*close_open_markup_unicode, *open_close_kwargs, | |||
*open_close_markup_unicode; | |||
open_open_kwargs = PyDict_New(); | |||
if (!open_open_kwargs) | |||
goto fail_decref_all; | |||
open_open_markup_unicode = PyUnicode_FromString(open_open_markup); | |||
if (!open_open_markup_unicode) { | |||
Py_DECREF(open_open_kwargs); | |||
goto fail_decref_all; | |||
} | |||
PyDict_SetItemString(open_open_kwargs, "wiki_markup", | |||
open_open_markup_unicode); | |||
Py_DECREF(open_open_markup_unicode); | |||
if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs)) | |||
goto fail_decref_all; | |||
if (Tokenizer_emit_text(self, tag)) | |||
goto fail_decref_all; | |||
if (style) { | |||
if (Tokenizer_emit_all(self, style)) | |||
goto fail_decref_all; | |||
Py_DECREF(style); | |||
} | |||
close_open_kwargs = PyDict_New(); | |||
if (!close_open_kwargs) | |||
goto fail_decref_padding_contents; | |||
if (close_open_markup && strlen(close_open_markup) != 0) { | |||
close_open_markup_unicode = PyUnicode_FromString(close_open_markup); | |||
if (!close_open_markup_unicode) { | |||
Py_DECREF(close_open_kwargs); | |||
goto fail_decref_padding_contents; | |||
} | |||
PyDict_SetItemString(close_open_kwargs, "wiki_markup", | |||
close_open_markup_unicode); | |||
Py_DECREF(close_open_markup_unicode); | |||
} | |||
PyDict_SetItemString(close_open_kwargs, "padding", padding); | |||
Py_DECREF(padding); | |||
if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs)) | |||
goto fail_decref_contents; | |||
if (contents) { | |||
if (Tokenizer_emit_all(self, contents)) | |||
goto fail_decref_contents; | |||
Py_DECREF(contents); | |||
} | |||
open_close_kwargs = PyDict_New(); | |||
if (!open_close_kwargs) | |||
return -1; | |||
open_close_markup_unicode = PyUnicode_FromString(open_close_markup); | |||
if (!open_close_markup_unicode) { | |||
Py_DECREF(open_close_kwargs); | |||
return -1; | |||
} | |||
PyDict_SetItemString(open_close_kwargs, "wiki_markup", | |||
open_close_markup_unicode); | |||
Py_DECREF(open_close_markup_unicode); | |||
if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs)) | |||
return -1; | |||
if (Tokenizer_emit_text(self, tag)) | |||
return -1; | |||
if (Tokenizer_emit(self, TagCloseClose)) | |||
return -1; | |||
return 0; | |||
fail_decref_all: | |||
Py_XDECREF(style); | |||
fail_decref_padding_contents: | |||
Py_DECREF(padding); | |||
fail_decref_contents: | |||
Py_DECREF(contents); | |||
return -1; | |||
} | |||
/* | |||
Handle style attributes for a table until an ending token. | |||
*/ | |||
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token) | |||
{ | |||
TagData *data = TagData_new(); | |||
PyObject *padding, *trash; | |||
Py_UNICODE this; | |||
int can_exit; | |||
if (!data) | |||
return NULL; | |||
data->context = TAG_ATTR_READY; | |||
while (1) { | |||
this = Tokenizer_READ(self, 0); | |||
can_exit = (!(data->context & TAG_QUOTED) || data->context & TAG_NOTE_SPACE); | |||
if (this == end_token && can_exit) { | |||
if (data->context & (TAG_ATTR_NAME | TAG_ATTR_VALUE)) { | |||
if (Tokenizer_push_tag_buffer(self, data)) { | |||
TagData_dealloc(data); | |||
return NULL; | |||
} | |||
} | |||
if (Py_UNICODE_ISSPACE(this)) | |||
Textbuffer_write(&(data->pad_first), this); | |||
padding = Textbuffer_render(data->pad_first); | |||
TagData_dealloc(data); | |||
if (!padding) | |||
return NULL; | |||
return padding; | |||
} | |||
else if (!this || this == end_token) { | |||
if (self->topstack->context & LC_TAG_ATTR) { | |||
if (data->context & TAG_QUOTED) { | |||
// Unclosed attribute quote: reset, don't die | |||
data->context = TAG_ATTR_VALUE; | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
self->head = data->reset; | |||
continue; | |||
} | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
} | |||
TagData_dealloc(data); | |||
return Tokenizer_fail_route(self); | |||
} | |||
else { | |||
if (Tokenizer_handle_tag_data(self, data, this) || BAD_ROUTE) { | |||
TagData_dealloc(data); | |||
return NULL; | |||
} | |||
} | |||
self->head++; | |||
} | |||
} | |||
/* | |||
Parse a wikicode table by starting with the first line. | |||
*/ | |||
static int Tokenizer_parse_table(Tokenizer* self) | |||
{ | |||
Py_ssize_t reset = self->head + 1; | |||
PyObject *style, *padding; | |||
PyObject *table = NULL; | |||
self->head += 2; | |||
if(Tokenizer_push(self, LC_TABLE_OPEN)) | |||
return -1; | |||
padding = Tokenizer_handle_table_style(self, '\n'); | |||
if (BAD_ROUTE) { | |||
RESET_ROUTE(); | |||
self->head = reset; | |||
if (Tokenizer_emit_text(self, "{|")) | |||
return -1; | |||
return 0; | |||
} | |||
if (!padding) | |||
return -1; | |||
style = Tokenizer_pop(self); | |||
if (!style) { | |||
Py_DECREF(padding); | |||
return -1; | |||
} | |||
self->head++; | |||
table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); | |||
if (BAD_ROUTE) { | |||
RESET_ROUTE(); | |||
Py_DECREF(padding); | |||
Py_DECREF(style); | |||
self->head = reset; | |||
if (Tokenizer_emit_text(self, "{|")) | |||
return -1; | |||
return 0; | |||
} | |||
if (!table) { | |||
Py_DECREF(padding); | |||
Py_DECREF(style); | |||
return -1; | |||
} | |||
if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL, | |||
table, "|}")) | |||
return -1; | |||
// Offset displacement done by _parse() | |||
self->head--; | |||
return 0; | |||
} | |||
/* | |||
Parse as style until end of the line, then continue. | |||
*/ | |||
static int Tokenizer_handle_table_row(Tokenizer* self) | |||
{ | |||
PyObject *padding, *style, *row, *trash; | |||
self->head += 2; | |||
if (!Tokenizer_CAN_RECURSE(self)) { | |||
if (Tokenizer_emit_text(self, "|-")) | |||
return -1; | |||
self->head -= 1; | |||
return 0; | |||
} | |||
if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) | |||
return -1; | |||
padding = Tokenizer_handle_table_style(self, '\n'); | |||
if (BAD_ROUTE) { | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
return 0; | |||
} | |||
if (!padding) | |||
return -1; | |||
style = Tokenizer_pop(self); | |||
if (!style) { | |||
Py_DECREF(padding); | |||
return -1; | |||
} | |||
// Don't parse the style separator | |||
self->head++; | |||
row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); | |||
if (!row) { | |||
Py_DECREF(padding); | |||
Py_DECREF(style); | |||
return -1; | |||
} | |||
if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, "")) | |||
return -1; | |||
// Offset displacement done by _parse() | |||
self->head--; | |||
return 0; | |||
} | |||
/* | |||
Parse as normal syntax unless we hit a style marker, then parse style | |||
as HTML attributes and the remainder as normal syntax. | |||
*/ | |||
static int | |||
Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, | |||
const char *tag, uint64_t line_context) | |||
{ | |||
uint64_t old_context = self->topstack->context; | |||
uint64_t cell_context; | |||
Py_ssize_t reset; | |||
PyObject *padding, *cell, *style = NULL; | |||
const char *close_open_markup = NULL; | |||
self->head += strlen(markup); | |||
reset = self->head; | |||
if (!Tokenizer_CAN_RECURSE(self)) { | |||
if (Tokenizer_emit_text(self, markup)) | |||
return -1; | |||
self->head--; | |||
return 0; | |||
} | |||
cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | | |||
LC_TABLE_CELL_STYLE | line_context, 1); | |||
if (!cell) | |||
return -1; | |||
cell_context = self->topstack->context; | |||
self->topstack->context = old_context; | |||
if (cell_context & LC_TABLE_CELL_STYLE) { | |||
Py_DECREF(cell); | |||
self->head = reset; | |||
if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | | |||
line_context)) | |||
return -1; | |||
padding = Tokenizer_handle_table_style(self, '|'); | |||
if (!padding) | |||
return -1; | |||
style = Tokenizer_pop(self); | |||
if (!style) { | |||
Py_DECREF(padding); | |||
return -1; | |||
} | |||
// Don't parse the style separator | |||
self->head++; | |||
cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | | |||
line_context, 1); | |||
if (!cell) { | |||
Py_DECREF(padding); | |||
Py_DECREF(style); | |||
return -1; | |||
} | |||
cell_context = self->topstack->context; | |||
self->topstack->context = old_context; | |||
} | |||
else { | |||
padding = PyUnicode_FromString(""); | |||
if (!padding) { | |||
Py_DECREF(cell); | |||
return -1; | |||
} | |||
} | |||
if (style) { | |||
close_open_markup = "|"; | |||
} | |||
if (Tokenizer_emit_table_tag(self, markup, tag, style, padding, | |||
close_open_markup, cell, "")) | |||
return -1; | |||
// Keep header/cell line contexts | |||
self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE); | |||
// Offset displacement done by parse() | |||
self->head--; | |||
return 0; | |||
} | |||
/* | |||
Returns the context, stack, and whether to reset the cell for style | |||
in a tuple. | |||
*/ | |||
static PyObject* | |||
Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) | |||
{ | |||
if (reset_for_style) | |||
self->topstack->context |= LC_TABLE_CELL_STYLE; | |||
else | |||
self->topstack->context &= ~LC_TABLE_CELL_STYLE; | |||
return Tokenizer_pop_keeping_context(self); | |||
} | |||
/* | |||
Return the stack in order to handle the table row end. | |||
*/ | |||
static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) | |||
{ | |||
return Tokenizer_pop(self); | |||
} | |||
/* | |||
Return the stack in order to handle the table end. | |||
*/ | |||
static PyObject* Tokenizer_handle_table_end(Tokenizer* self) | |||
{ | |||
self->head += 2; | |||
return Tokenizer_pop(self); | |||
} | |||
/* | |||
Handle the end of the stream of wikitext. | |||
*/ | |||
static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) | |||
static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) | |||
{ | |||
PyObject *token, *text, *trash; | |||
int single; | |||
@@ -2444,9 +2795,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) | |||
if (single) | |||
return Tokenizer_handle_single_tag_end(self); | |||
} | |||
else if (context & AGG_DOUBLE) { | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
else { | |||
if (context & LC_TABLE_CELL_OPEN) { | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
context = self->topstack->context; | |||
} | |||
if (context & AGG_DOUBLE) { | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
} | |||
} | |||
return Tokenizer_fail_route(self); | |||
} | |||
@@ -2457,7 +2815,8 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) | |||
Make sure we are not trying to write an invalid character. Return 0 if | |||
everything is safe, or -1 if the route must be failed. | |||
*/ | |||
static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||
static int | |||
Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) | |||
{ | |||
if (context & LC_FAIL_NEXT) | |||
return -1; | |||
@@ -2508,7 +2867,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||
} | |||
else if (context & LC_FAIL_ON_LBRACE) { | |||
if (data == '{' || (Tokenizer_READ_BACKWARDS(self, 1) == '{' && | |||
Tokenizer_READ_BACKWARDS(self, 2) == '{')) { | |||
Tokenizer_READ_BACKWARDS(self, 2) == '{')) { | |||
if (context & LC_TEMPLATE) | |||
self->topstack->context |= LC_FAIL_ON_EQUALS; | |||
else | |||
@@ -2533,12 +2892,30 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||
} | |||
/* | |||
Returns whether the current head has leading whitespace. | |||
TODO: treat comments and templates as whitespace, allow fail on non-newline spaces. | |||
*/ | |||
static int Tokenizer_has_leading_whitespace(Tokenizer* self) | |||
{ | |||
int offset = 1; | |||
Py_UNICODE current_character; | |||
while (1) { | |||
current_character = Tokenizer_READ_BACKWARDS(self, offset); | |||
if (!current_character || current_character == '\n') | |||
return 1; | |||
else if (!Py_UNICODE_ISSPACE(current_character)) | |||
return 0; | |||
offset++; | |||
} | |||
} | |||
/* | |||
Parse the wikicode string, using context for when to stop. If push is true, | |||
we will push a new context, otherwise we won't and context will be ignored. | |||
*/ | |||
static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) | |||
{ | |||
int this_context; | |||
uint64_t this_context; | |||
Py_UNICODE this, next, next_next, last; | |||
PyObject* temp; | |||
@@ -2667,22 +3044,99 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
if (temp != Py_None) | |||
return temp; | |||
} | |||
else if (!last || last == '\n') { | |||
if (this == '#' || this == '*' || this == ';' || this == ':') { | |||
if (Tokenizer_handle_list(self)) | |||
else if ((!last || last == '\n') && (this == '#' || this == '*' || this == ';' || this == ':')) { | |||
if (Tokenizer_handle_list(self)) | |||
return NULL; | |||
} | |||
else if ((!last || last == '\n') && (this == '-' && this == next && | |||
this == Tokenizer_READ(self, 2) && | |||
this == Tokenizer_READ(self, 3))) { | |||
if (Tokenizer_handle_hr(self)) | |||
return NULL; | |||
} | |||
else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) { | |||
if (Tokenizer_handle_dl_term(self)) | |||
return NULL; | |||
// Kill potential table contexts | |||
if (this == '\n') | |||
self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; | |||
} | |||
// Start of table parsing | |||
else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) { | |||
if (Tokenizer_CAN_RECURSE(self)) { | |||
if (Tokenizer_parse_table(self)) | |||
return NULL; | |||
} | |||
else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next)) | |||
return NULL; | |||
else | |||
self->head++; | |||
} | |||
else if (this_context & LC_TABLE_OPEN) { | |||
if (this == '|' && next == '|' && this_context & LC_TABLE_TD_LINE) { | |||
if (this_context & LC_TABLE_CELL_OPEN) | |||
return Tokenizer_handle_table_cell_end(self, 0); | |||
else if (Tokenizer_handle_table_cell(self, "||", "td", LC_TABLE_TD_LINE)) | |||
return NULL; | |||
} | |||
else if (this == '-' && this == next && | |||
this == Tokenizer_READ(self, 2) && | |||
this == Tokenizer_READ(self, 3)) { | |||
if (Tokenizer_handle_hr(self)) | |||
else if (this == '|' && next == '|' && this_context & LC_TABLE_TH_LINE) { | |||
if (this_context & LC_TABLE_CELL_OPEN) | |||
return Tokenizer_handle_table_cell_end(self, 0); | |||
else if (Tokenizer_handle_table_cell(self, "||", "th", LC_TABLE_TH_LINE)) | |||
return NULL; | |||
} | |||
else if (this == '!' && next == '!' && this_context & LC_TABLE_TH_LINE) { | |||
if (this_context & LC_TABLE_CELL_OPEN) | |||
return Tokenizer_handle_table_cell_end(self, 0); | |||
else if (Tokenizer_handle_table_cell(self, "!!", "th", LC_TABLE_TH_LINE)) | |||
return NULL; | |||
} | |||
else if (this == '|' && this_context & LC_TABLE_CELL_STYLE) { | |||
return Tokenizer_handle_table_cell_end(self, 1); | |||
} | |||
// On newline, clear out cell line contexts | |||
else if (this == '\n' && this_context & LC_TABLE_CELL_LINE_CONTEXTS) { | |||
self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; | |||
if (Tokenizer_emit_char(self, this)) | |||
return NULL; | |||
} | |||
else if (Tokenizer_has_leading_whitespace(self)) { | |||
if (this == '|' && next == '}') { | |||
if (this_context & LC_TABLE_CELL_OPEN) | |||
return Tokenizer_handle_table_cell_end(self, 0); | |||
if (this_context & LC_TABLE_ROW_OPEN) | |||
return Tokenizer_handle_table_row_end(self); | |||
else | |||
return Tokenizer_handle_table_end(self); | |||
} | |||
else if (this == '|' && next == '-') { | |||
if (this_context & LC_TABLE_CELL_OPEN) | |||
return Tokenizer_handle_table_cell_end(self, 0); | |||
if (this_context & LC_TABLE_ROW_OPEN) | |||
return Tokenizer_handle_table_row_end(self); | |||
else if (Tokenizer_handle_table_row(self)) | |||
return NULL; | |||
} | |||
else if (this == '|') { | |||
if (this_context & LC_TABLE_CELL_OPEN) | |||
return Tokenizer_handle_table_cell_end(self, 0); | |||
else if (Tokenizer_handle_table_cell(self, "|", "td", LC_TABLE_TD_LINE)) | |||
return NULL; | |||
} | |||
else if (this == '!') { | |||
if (this_context & LC_TABLE_CELL_OPEN) | |||
return Tokenizer_handle_table_cell_end(self, 0); | |||
else if (Tokenizer_handle_table_cell(self, "!", "th", LC_TABLE_TH_LINE)) | |||
return NULL; | |||
} | |||
else if (Tokenizer_emit_char(self, this)) | |||
return NULL; | |||
} | |||
else if (Tokenizer_emit_char(self, this)) | |||
return NULL; | |||
} | |||
else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) { | |||
if (Tokenizer_handle_dl_term(self)) | |||
// Raise BadRoute to table start | |||
if (BAD_ROUTE) | |||
return NULL; | |||
} | |||
else if (Tokenizer_emit_char(self, this)) | |||
@@ -2697,7 +3151,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||
{ | |||
PyObject *text, *temp, *tokens; | |||
int context = 0, skip_style_tags = 0; | |||
uint64_t context = 0; | |||
int skip_style_tags = 0; | |||
if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { | |||
Py_XDECREF(self->text); | |||
@@ -2725,7 +3180,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||
self->skip_style_tags = skip_style_tags; | |||
tokens = Tokenizer_parse(self, context, 1); | |||
if (!tokens && !PyErr_Occurred()) { | |||
if ((!tokens && !PyErr_Occurred()) || self->topstack) { | |||
if (!ParserError) { | |||
if (load_exceptions()) | |||
return NULL; | |||
@@ -2734,6 +3189,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||
RESET_ROUTE(); | |||
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | |||
} | |||
else if (self->topstack) | |||
PyErr_SetString(ParserError, | |||
"C tokenizer exited with non-empty token stack"); | |||
else | |||
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly"); | |||
return NULL; | |||
@@ -29,6 +29,7 @@ SOFTWARE. | |||
#include <math.h> | |||
#include <structmember.h> | |||
#include <bytesobject.h> | |||
#include <stdint.h> | |||
#if PY_MAJOR_VERSION >= 3 | |||
#define IS_PY3K | |||
@@ -43,16 +44,17 @@ SOFTWARE. | |||
static const char MARKERS[] = { | |||
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', | |||
'-', '\n', '\0'}; | |||
'-', '!', '\n', '\0'}; | |||
#define NUM_MARKERS 18 | |||
#define NUM_MARKERS 19 | |||
#define TEXTBUFFER_BLOCKSIZE 1024 | |||
#define MAX_DEPTH 40 | |||
#define MAX_CYCLES 100000 | |||
#define MAX_BRACES 255 | |||
#define MAX_ENTITY_SIZE 8 | |||
static int route_state = 0, route_context = 0; | |||
static int route_state = 0; | |||
static uint64_t route_context = 0; | |||
#define BAD_ROUTE route_state | |||
#define BAD_ROUTE_CONTEXT route_context | |||
#define FAIL_ROUTE(context) route_state = 1; route_context = context | |||
@@ -109,52 +111,61 @@ static PyObject* TagCloseClose; | |||
/* Local contexts: */ | |||
#define LC_TEMPLATE 0x00000007 | |||
#define LC_TEMPLATE_NAME 0x00000001 | |||
#define LC_TEMPLATE_PARAM_KEY 0x00000002 | |||
#define LC_TEMPLATE_PARAM_VALUE 0x00000004 | |||
#define LC_ARGUMENT 0x00000018 | |||
#define LC_ARGUMENT_NAME 0x00000008 | |||
#define LC_ARGUMENT_DEFAULT 0x00000010 | |||
#define LC_WIKILINK 0x00000060 | |||
#define LC_WIKILINK_TITLE 0x00000020 | |||
#define LC_WIKILINK_TEXT 0x00000040 | |||
#define LC_EXT_LINK 0x00000180 | |||
#define LC_EXT_LINK_URI 0x00000080 | |||
#define LC_EXT_LINK_TITLE 0x00000100 | |||
#define LC_HEADING 0x00007E00 | |||
#define LC_HEADING_LEVEL_1 0x00000200 | |||
#define LC_HEADING_LEVEL_2 0x00000400 | |||
#define LC_HEADING_LEVEL_3 0x00000800 | |||
#define LC_HEADING_LEVEL_4 0x00001000 | |||
#define LC_HEADING_LEVEL_5 0x00002000 | |||
#define LC_HEADING_LEVEL_6 0x00004000 | |||
#define LC_TAG 0x00078000 | |||
#define LC_TAG_OPEN 0x00008000 | |||
#define LC_TAG_ATTR 0x00010000 | |||
#define LC_TAG_BODY 0x00020000 | |||
#define LC_TAG_CLOSE 0x00040000 | |||
#define LC_STYLE 0x00780000 | |||
#define LC_STYLE_ITALICS 0x00080000 | |||
#define LC_STYLE_BOLD 0x00100000 | |||
#define LC_STYLE_PASS_AGAIN 0x00200000 | |||
#define LC_STYLE_SECOND_PASS 0x00400000 | |||
#define LC_DLTERM 0x00800000 | |||
#define LC_SAFETY_CHECK 0x3F000000 | |||
#define LC_HAS_TEXT 0x01000000 | |||
#define LC_FAIL_ON_TEXT 0x02000000 | |||
#define LC_FAIL_NEXT 0x04000000 | |||
#define LC_FAIL_ON_LBRACE 0x08000000 | |||
#define LC_FAIL_ON_RBRACE 0x10000000 | |||
#define LC_FAIL_ON_EQUALS 0x20000000 | |||
#define LC_TEMPLATE 0x0000000000000007 | |||
#define LC_TEMPLATE_NAME 0x0000000000000001 | |||
#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002 | |||
#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004 | |||
#define LC_ARGUMENT 0x0000000000000018 | |||
#define LC_ARGUMENT_NAME 0x0000000000000008 | |||
#define LC_ARGUMENT_DEFAULT 0x0000000000000010 | |||
#define LC_WIKILINK 0x0000000000000060 | |||
#define LC_WIKILINK_TITLE 0x0000000000000020 | |||
#define LC_WIKILINK_TEXT 0x0000000000000040 | |||
#define LC_EXT_LINK 0x0000000000000180 | |||
#define LC_EXT_LINK_URI 0x0000000000000080 | |||
#define LC_EXT_LINK_TITLE 0x0000000000000100 | |||
#define LC_HEADING 0x0000000000007E00 | |||
#define LC_HEADING_LEVEL_1 0x0000000000000200 | |||
#define LC_HEADING_LEVEL_2 0x0000000000000400 | |||
#define LC_HEADING_LEVEL_3 0x0000000000000800 | |||
#define LC_HEADING_LEVEL_4 0x0000000000001000 | |||
#define LC_HEADING_LEVEL_5 0x0000000000002000 | |||
#define LC_HEADING_LEVEL_6 0x0000000000004000 | |||
#define LC_TAG 0x0000000000078000 | |||
#define LC_TAG_OPEN 0x0000000000008000 | |||
#define LC_TAG_ATTR 0x0000000000010000 | |||
#define LC_TAG_BODY 0x0000000000020000 | |||
#define LC_TAG_CLOSE 0x0000000000040000 | |||
#define LC_STYLE 0x0000000000780000 | |||
#define LC_STYLE_ITALICS 0x0000000000080000 | |||
#define LC_STYLE_BOLD 0x0000000000100000 | |||
#define LC_STYLE_PASS_AGAIN 0x0000000000200000 | |||
#define LC_STYLE_SECOND_PASS 0x0000000000400000 | |||
#define LC_DLTERM 0x0000000000800000 | |||
#define LC_SAFETY_CHECK 0x000000003F000000 | |||
#define LC_HAS_TEXT 0x0000000001000000 | |||
#define LC_FAIL_ON_TEXT 0x0000000002000000 | |||
#define LC_FAIL_NEXT 0x0000000004000000 | |||
#define LC_FAIL_ON_LBRACE 0x0000000008000000 | |||
#define LC_FAIL_ON_RBRACE 0x0000000010000000 | |||
#define LC_FAIL_ON_EQUALS 0x0000000020000000 | |||
#define LC_TABLE 0x0000000FC0000000 | |||
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 | |||
#define LC_TABLE_OPEN 0x0000000040000000 | |||
#define LC_TABLE_CELL_OPEN 0x0000000080000000 | |||
#define LC_TABLE_CELL_STYLE 0x0000000100000000 | |||
#define LC_TABLE_ROW_OPEN 0x0000000200000000 | |||
#define LC_TABLE_TD_LINE 0x0000000400000000 | |||
#define LC_TABLE_TH_LINE 0x0000000800000000 | |||
/* Global contexts: */ | |||
@@ -162,9 +173,9 @@ static PyObject* TagCloseClose; | |||
/* Aggregate contexts: */ | |||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) | |||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) | |||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | |||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) | |||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN) | |||
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) | |||
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) | |||
@@ -191,7 +202,7 @@ struct Textbuffer { | |||
struct Stack { | |||
PyObject* stack; | |||
int context; | |||
uint64_t context; | |||
struct Textbuffer* textbuffer; | |||
struct Stack* next; | |||
}; | |||
@@ -202,7 +213,7 @@ typedef struct { | |||
} HeadingData; | |||
typedef struct { | |||
int context; | |||
uint64_t context; | |||
struct Textbuffer* pad_first; | |||
struct Textbuffer* pad_before_eq; | |||
struct Textbuffer* pad_after_eq; | |||
@@ -267,7 +278,7 @@ static int Tokenizer_parse_entity(Tokenizer*); | |||
static int Tokenizer_parse_comment(Tokenizer*); | |||
static int Tokenizer_handle_dl_term(Tokenizer*); | |||
static int Tokenizer_parse_tag(Tokenizer*); | |||
static PyObject* Tokenizer_parse(Tokenizer*, int, int); | |||
static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int); | |||
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | |||
static int load_exceptions(void); | |||
@@ -63,7 +63,7 @@ class Tokenizer(object): | |||
START = object() | |||
END = object() | |||
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", | |||
":", "/", "-", "\n", START, END] | |||
":", "/", "-", "!", "\n", START, END] | |||
MAX_DEPTH = 40 | |||
MAX_CYCLES = 100000 | |||
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) | |||
@@ -991,12 +991,166 @@ class Tokenizer(object): | |||
else: | |||
self._emit_text("\n") | |||
def _emit_table_tag(self, open_open_markup, tag, style, padding,
                    close_open_markup, contents, open_close_markup):
    """Emit the token sequence for a single wiki-markup table tag.

    Produces TagOpenOpen + tag name (+ optional style attribute tokens),
    a TagCloseOpen carrying *padding* (and *close_open_markup* when one
    exists), the cell/row/table *contents*, then the closing tag tokens.
    """
    self._emit(tokens.TagOpenOpen(wiki_markup=open_open_markup))
    self._emit_text(tag)
    if style:
        self._emit_all(style)
    # Only attach wiki_markup to the close-open token when there is one;
    # an absent marker must not appear as an attribute at all.
    close_open_kwargs = {"padding": padding}
    if close_open_markup:
        close_open_kwargs["wiki_markup"] = close_open_markup
    self._emit(tokens.TagCloseOpen(**close_open_kwargs))
    if contents:
        self._emit_all(contents)
    self._emit(tokens.TagOpenClose(wiki_markup=open_close_markup))
    self._emit_text(tag)
    self._emit(tokens.TagCloseClose())
def _handle_table_style(self, end_token):
    """Handle style attributes for a table until ``end_token``.

    Consumes tokens as HTML-style attributes and returns the leading
    padding string collected before the attributes, so the caller can
    round-trip the original whitespace.
    """
    data = _TagOpenData()
    data.context = _TagOpenData.CX_ATTR_READY
    while True:
        this = self._read()
        # We may only exit at end_token when not inside a quoted
        # attribute value (or when only a trailing space is pending).
        can_exit = (not data.context & data.CX_QUOTED or
                    data.context & data.CX_NOTE_SPACE)
        if this == end_token and can_exit:
            # Flush any partially-read attribute before returning.
            if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
                self._push_tag_buffer(data)
            if this.isspace():
                data.padding_buffer["first"] += this
            return data.padding_buffer["first"]
        elif this is self.END or this == end_token:
            if self._context & contexts.TAG_ATTR:
                if data.context & data.CX_QUOTED:
                    # Unclosed attribute quote: reset, don't die
                    data.context = data.CX_ATTR_VALUE
                    self._pop()
                    self._head = data.reset
                    continue
                self._pop()
            # No recovery possible: abandon this parse route.
            self._fail_route()
        else:
            self._handle_tag_data(data, this)
        self._head += 1
def _parse_table(self):
    """Parse a wikicode table by starting with the first line.

    On any failure (bad style line, missing table close) the head is
    rewound to *reset* and the literal ``{|`` is emitted as plain text.
    """
    reset = self._head + 1
    self._head += 2  # skip over the "{|" opening markup
    self._push(contexts.TABLE_OPEN)
    try:
        # The rest of the opening line is the table's style attributes.
        padding = self._handle_table_style("\n")
    except BadRoute:
        self._head = reset
        self._emit_text("{|")
        return
    style = self._pop()
    self._head += 1  # move past the newline ending the style line
    try:
        table = self._parse(contexts.TABLE_OPEN)
    except BadRoute:
        self._head = reset
        self._emit_text("{|")
        return
    self._emit_table_tag("{|", "table", style, padding, None, table, "|}")
    # Offset displacement done by _parse():
    self._head -= 1
def _handle_table_row(self):
    """Parse as style until end of the line, then continue.

    Handles a ``|-`` row marker: the remainder of its line is parsed as
    style attributes and the rest of the row as normal table content.
    """
    self._head += 2  # skip over the "|-" row markup
    if not self._can_recurse():
        # Recursion-depth guard: degrade to literal text.
        self._emit_text("|-")
        self._head -= 1
        return
    self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
    try:
        padding = self._handle_table_style("\n")
    except BadRoute:
        # Re-raise so the enclosing table parse fails as a whole.
        self._pop()
        raise
    style = self._pop()
    # Don't parse the style separator:
    self._head += 1
    row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
    # Rows have no explicit closing markup, hence "" for open_close.
    self._emit_table_tag("|-", "tr", style, padding, None, row, "")
    # Offset displacement done by parse():
    self._head -= 1
def _handle_table_cell(self, markup, tag, line_context):
    """Parse as normal syntax unless we hit a style marker, then parse
    style as HTML attributes and the remainder as normal syntax.

    *markup* is the literal cell opener ("|", "||", "!", or "!!"), *tag*
    the HTML tag to emit ("td"/"th"), and *line_context* the TD/TH line
    context flag to carry while parsing.
    """
    old_context = self._context
    padding, style = "", None
    self._head += len(markup)
    reset = self._head  # remembered in case we must re-parse for style
    if not self._can_recurse():
        # Recursion-depth guard: degrade to literal text.
        self._emit_text(markup)
        self._head -= 1
        return
    # First pass: parse assuming no style attributes; TABLE_CELL_STYLE in
    # the returned context signals that a "|" style separator was found.
    cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                       line_context | contexts.TABLE_CELL_STYLE)
    cell_context = self._context
    self._context = old_context
    reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
    if reset_for_style:
        # Second pass: rewind, read the attributes, then parse the
        # remainder of the cell (this time without the style flag).
        self._head = reset
        self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                   line_context)
        padding = self._handle_table_style("|")
        style = self._pop()
        # Don't parse the style separator:
        self._head += 1
        cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                           line_context)
        cell_context = self._context
        self._context = old_context
    close_open_markup = "|" if reset_for_style else None
    # Cells have no explicit closing markup, hence "" for open_close.
    self._emit_table_tag(markup, tag, style, padding, close_open_markup,
                         cell, "")
    # Keep header/cell line contexts:
    self._context |= cell_context & (contexts.TABLE_TH_LINE |
                                     contexts.TABLE_TD_LINE)
    # Offset displacement done by parse():
    self._head -= 1
def _handle_table_cell_end(self, reset_for_style=False):
    """Pop and return the current stack, keeping its context.

    The TABLE_CELL_STYLE flag is set in the context when the caller must
    rewind and re-parse the cell's style attributes, cleared otherwise.
    """
    without_style = self._context & ~contexts.TABLE_CELL_STYLE
    if reset_for_style:
        self._context = without_style | contexts.TABLE_CELL_STYLE
    else:
        self._context = without_style
    return self._pop(keep_context=True)
def _handle_table_row_end(self):
    """Close the current table row by popping and returning its stack."""
    row_stack = self._pop()
    return row_stack
def _handle_table_end(self):
    """Skip the closing markup and pop the table's stack for the caller."""
    self._head += 2  # consume the "|}" closing markup
    table_stack = self._pop()
    return table_stack
def _handle_end(self):
    """Handle the end of the stream of wikitext."""
    if self._context & contexts.FAIL:
        # We ended inside a construct that must be closed explicitly.
        if self._context & contexts.TAG_BODY:
            # Single-only tags (e.g. <br>) may legitimately lack a close.
            if is_single(self._stack[1].text):
                return self._handle_single_tag_end()
        if self._context & contexts.TABLE_CELL_OPEN:
            # Discard the unfinished cell before failing.
            self._pop()
        if self._context & contexts.DOUBLE:
            # DOUBLE contexts (e.g. an open table row) pushed two stacks.
            self._pop()
        self._fail_route()
@@ -1144,15 +1298,68 @@ class Tokenizer(object): | |||
result = self._parse_style() | |||
if result is not None: | |||
return result | |||
elif self._read(-1) in ("\n", self.START): | |||
if this in ("#", "*", ";", ":"): | |||
elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"): | |||
self._handle_list() | |||
elif this == next == self._read(2) == self._read(3) == "-": | |||
elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-": | |||
self._handle_hr() | |||
else: | |||
self._emit_text(this) | |||
elif this in ("\n", ":") and self._context & contexts.DL_TERM: | |||
self._handle_dl_term() | |||
if this == "\n": | |||
# Kill potential table contexts | |||
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS | |||
# Start of table parsing | |||
elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or | |||
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): | |||
if self._can_recurse(): | |||
self._parse_table() | |||
else: | |||
self._emit_text("{|") | |||
elif self._context & contexts.TABLE_OPEN: | |||
if this == next == "|" and self._context & contexts.TABLE_TD_LINE: | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE) | |||
elif this == next == "|" and self._context & contexts.TABLE_TH_LINE: | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE) | |||
elif this == next == "!" and self._context & contexts.TABLE_TH_LINE: | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE) | |||
elif this == "|" and self._context & contexts.TABLE_CELL_STYLE: | |||
return self._handle_table_cell_end(reset_for_style=True) | |||
# on newline, clear out cell line contexts | |||
elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS: | |||
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS | |||
self._emit_text(this) | |||
elif (self._read(-1) in ("\n", self.START) or | |||
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): | |||
if this == "|" and next == "}": | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
if self._context & contexts.TABLE_ROW_OPEN: | |||
return self._handle_table_row_end() | |||
return self._handle_table_end() | |||
elif this == "|" and next == "-": | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
if self._context & contexts.TABLE_ROW_OPEN: | |||
return self._handle_table_row_end() | |||
self._handle_table_row() | |||
elif this == "|": | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
self._handle_table_cell("|", "td", contexts.TABLE_TD_LINE) | |||
elif this == "!": | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
self._handle_table_cell("!", "th", contexts.TABLE_TH_LINE) | |||
else: | |||
self._emit_text(this) | |||
else: | |||
self._emit_text(this) | |||
else: | |||
self._emit_text(this) | |||
self._head += 1 | |||
@@ -1164,6 +1371,10 @@ class Tokenizer(object): | |||
self._text = [segment for segment in split if segment] | |||
self._head = self._global = self._depth = self._cycles = 0 | |||
try: | |||
return self._parse(context) | |||
tokens = self._parse(context) | |||
except BadRoute: # pragma: no cover (untestable/exceptional case) | |||
raise ParserError("Python tokenizer exited with BadRoute") | |||
if self._stacks: # pragma: no cover (untestable/exceptional case) | |||
err = "Python tokenizer exited with non-empty token stack" | |||
raise ParserError(err) | |||
return tokens |
@@ -25,8 +25,9 @@ import codecs | |||
from os import listdir, path | |||
import sys | |||
from mwparserfromhell.compat import py3k | |||
from mwparserfromhell.compat import py3k, str | |||
from mwparserfromhell.parser import tokens | |||
from mwparserfromhell.parser.builder import Builder | |||
class _TestParseError(Exception): | |||
"""Raised internally when a test could not be parsed.""" | |||
@@ -50,8 +51,12 @@ class TokenizerTestCase(object): | |||
*label* for the method's docstring. | |||
""" | |||
def inner(self): | |||
expected = data["output"] | |||
actual = self.tokenizer().tokenize(data["input"]) | |||
if hasattr(self, "roundtrip"): | |||
expected = data["input"] | |||
actual = str(Builder().build(data["output"][:])) | |||
else: | |||
expected = data["output"] | |||
actual = self.tokenizer().tokenize(data["input"]) | |||
self.assertEqual(expected, actual) | |||
if not py3k: | |||
inner.__name__ = funcname.encode("utf8") | |||
@@ -0,0 +1,41 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2014 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
try: | |||
import unittest2 as unittest | |||
except ImportError: | |||
import unittest | |||
from ._test_tokenizer import TokenizerTestCase | |||
class TestRoundtripping(TokenizerTestCase, unittest.TestCase):
    """Test cases for roundtripping tokens back to wikitext."""

    @classmethod
    def setUpClass(cls):
        # The presence of this flag makes TokenizerTestCase's generated
        # tests build wikitext from the expected tokens and compare it
        # against the original input, instead of tokenizing the input.
        cls.roundtrip = True
if __name__ == "__main__": | |||
unittest.main(verbosity=2) |
@@ -226,6 +226,38 @@ class TestTag(TreeEqualityTestCase): | |||
self.assertWikicodeEqual(parsed, node.closing_tag) | |||
self.assertEqual("<ref>foobar</ref {{ignore me}}>", node) | |||
def test_wiki_style_separator(self):
    """test getter/setter for wiki_style_separator attribute"""
    node = Tag(wraptext("table"), wraptext("\n"))
    # Defaults to None when not given to the constructor:
    self.assertIs(None, node.wiki_style_separator)
    node.wiki_style_separator = "|"
    self.assertEqual("|", node.wiki_style_separator)
    node.wiki_markup = "{"
    self.assertEqual("{|\n{", node)
    # Bug fix: the constructor-argument path was previously asserted
    # against *node* instead of *node2*, so it was never actually tested.
    node2 = Tag(wraptext("table"), wraptext("\n"), wiki_style_separator="|")
    self.assertEqual("|", node2.wiki_style_separator)
def test_closing_wiki_markup(self):
    """test getter/setter for closing_wiki_markup attribute"""
    node = Tag(wraptext("table"), wraptext("\n"))
    self.assertIs(None, node.closing_wiki_markup)
    # Setting wiki_markup mirrors it into closing_wiki_markup by default:
    node.wiki_markup = "{|"
    self.assertEqual("{|", node.closing_wiki_markup)
    # An explicit closing_wiki_markup overrides the mirrored value:
    node.closing_wiki_markup = "|}"
    self.assertEqual("|}", node.closing_wiki_markup)
    self.assertEqual("{|\n|}", node)
    # Changing wiki_markup afterwards leaves the explicit close intact:
    node.wiki_markup = "!!"
    self.assertEqual("|}", node.closing_wiki_markup)
    self.assertEqual("!!\n|}", node)
    # Clearing wiki_markup also clears the close and reverts to HTML:
    node.wiki_markup = False
    self.assertFalse(node.closing_wiki_markup)
    self.assertEqual("<table>\n</table>", node)
    node2 = Tag(wraptext("table"), wraptext("\n"),
                attrs=[agen("id", "foo")], wiki_markup="{|",
                closing_wiki_markup="|}")
    self.assertEqual("|}", node2.closing_wiki_markup)
    self.assertEqual('{| id="foo"\n|}', node2)
def test_has(self): | |||
"""test Tag.has()""" | |||
node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")]) | |||
@@ -0,0 +1,410 @@ | |||
name: empty_table | |||
label: parsing an empty table | |||
input: "{|\n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: inline_table | |||
label: tables with a close on the same line are not valid | |||
input: "{||}" | |||
output: [Text(text="{||}")] | |||
--- | |||
name: no_table_close_simple | |||
label: no table close on inline table | |||
input: "{| " | |||
output: [Text(text="{| ")] | |||
--- | |||
name: no_table_close_newline | |||
label: no table close with a newline | |||
input: "{| \n " | |||
output: [Text(text="{| \n ")] | |||
--- | |||
name: no_table_close_inside_cell | |||
label: no table close while inside of a cell | |||
input: "{| \n| " | |||
output: [Text(text="{| \n| ")] | |||
--- | |||
name: no_table_close_inside_cell_after_newline | |||
label: no table close while inside of a cell after a newline | |||
input: "{| \n| \n " | |||
output: [Text(text="{| \n| \n ")] | |||
--- | |||
name: no_table_close_inside_cell_with_attributes | |||
label: no table close while inside of a cell with attributes | |||
input: "{| \n| red | test" | |||
output: [Text(text="{| \n| red | test")] | |||
--- | |||
name: no_table_close_inside_row | |||
label: no table close while inside of a row | |||
input: "{| \n|- " | |||
output: [Text(text="{| \n|- ")] | |||
--- | |||
name: no_table_close_inside_row_after_newline | |||
label: no table close while inside of a row after a newline | |||
input: "{| \n|- \n " | |||
output: [Text(text="{| \n|- \n ")] | |||
--- | |||
name: no_table_close_row_and_cell | |||
label: no table close while inside a cell inside a row | |||
input: "{| \n|- \n|" | |||
output: [Text(text="{| \n|- \n|")] | |||
--- | |||
name: no_table_close_attributes | |||
label: don't parse attributes as attributes if the table doesn't exist | |||
input: "{| border="1"" | |||
output: [Text(text="{| border=\"1\"")] | |||
--- | |||
name: no_table_close_unclosed_attributes | |||
label: don't parse unclosed attributes if the table doesn't exist | |||
input: "{| border=" | |||
output: [Text(text="{| border=")] | |||
--- | |||
name: no_table_close_row_attributes | |||
label: don't parse row attributes as attributes if the table doesn't exist | |||
input: "{| |- border="1"" | |||
output: [Text(text="{| |- border=\"1\"")] | |||
--- | |||
name: no_table_close_cell | |||
label: don't parse cells if the table doesn't close | |||
input: "{| | border="1"| test || red | foo" | |||
output: [Text(text="{| | border=\"1\"| test || red | foo")] | |||
--- | |||
name: crazy_no_table_close | |||
label: lots of opened wiki syntax without closes | |||
input: "{{{ {{ {| <ref" | |||
output: [Text(text="{{{ {{ {| <ref")] | |||
--- | |||
name: leading_whitespace_table | |||
label: handle leading whitespace for a table | |||
input: "foo \n \t {|\n|}" | |||
output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: whitespace_after_table | |||
label: handle whitespace after a table close | |||
input: "{|\n|}\n \t " | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text="\n \t ")] | |||
--- | |||
name: different_whitespace_after_table | |||
label: handle spaces after a table close | |||
input: "{|\n|} \n " | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n ")] | |||
--- | |||
name: characters_after_table | |||
label: handle characters after a table close | |||
input: "{|\n|} tsta" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" tsta")] | |||
--- | |||
name: characters_after_inline_table | |||
label: handle characters after an inline table close | |||
input: "{| |} tsta" | |||
output: [Text(text="{| |} tsta")] | |||
--- | |||
name: leading_characters_table | |||
label: don't parse as a table when leading characters are not newline or whitespace | |||
input: "foo \n foo \t {|\n|}" | |||
output: [Text(text="foo \n foo \t {|\n|}")] | |||
--- | |||
name: table_row_simple | |||
label: simple table row | |||
input: "{|\n |- \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_row_multiple | |||
label: multiple table rows
input: "{|\n |- \n|- \n |-\n |}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_simple | |||
label: simple table cell | |||
input: "{|\n | foo \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_inline | |||
label: multiple inline table cells | |||
input: "{|\n | foo || bar || test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_multiple | |||
label: multiple table cells (non-inline) | |||
input: "{|\n| foo \n| bar \n| test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_header_simple | |||
label: simple header cell | |||
input: "{|\n ! foo \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_header_inline | |||
label: multiple inline header cells | |||
input: "{|\n ! foo || bar !! test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_header_multiple | |||
label: multiple table header cells (non-inline) | |||
input: "{|\n! foo \n! bar \n! test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: nested_cells_and_rows | |||
label: combination of cells and rows in a table | |||
input: "{|\n|- \n| foo \n|- \n| bar\n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_fake_close | |||
label: looks like a table close but is not | |||
input: "{|\n | |} \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text="} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_more_fake_close | |||
label: looks like a table close but is not, with a double-pipe cell opener
input: "{|\n || |} \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" |} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_extra_close | |||
label: process second close as text | |||
input: "{| \n |} \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n|}")] | |||
--- | |||
name: nowiki_inside_table | |||
label: nowiki handles pipe characters in tables | |||
input: "{|\n | foo <nowiki>| |- {| |} || ! !!</nowiki> bar \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_text_outside_cell | |||
label: parse text inside table but outside of a cell | |||
input: "{|\n bar \n | foo \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: no_table_cell_with_leading_characters | |||
label: fail to create a table cell when there are leading non-whitespace characters | |||
input: "{|\n bar | foo \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: no_table_row_with_leading_characters | |||
label: fail to create a table row when there are leading non-whitespace characters | |||
input: "{|\n bar |- foo \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: template_inside_table_cell | |||
label: template within table cell | |||
input: "{|\n |{{foo\n|bar=baz}} \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_attributes | |||
label: parse table cell style attributes | |||
input: "{| \n | name="foo bar"| test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_empty_attributes | |||
label: parse table cell with style markers but no attributes | |||
input: "{| \n | | test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_with_dash | |||
label: parse a situation in which a cell line looks like a row line | |||
input: "{|\n ||- \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_attributes_quote_with_pipe | |||
label: pipe inside an attribute quote should still be used as a style separator | |||
input: "{| \n | name="foo|bar"| test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_attributes_name_with_pipe | |||
label: pipe inside an attribute name should still be used as a style separator | |||
input: "{| \n | name|="foo bar" | test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_attributes_pipe_after_equals | |||
label: pipe inside an attribute should still be used as a style separator after an equals | |||
input: "{| \n | name=|"foo|bar"| test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseOpen(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_cell_attributes_templates | |||
label: pipe inside attributes shouldn't be style separator | |||
input: "{| \n | {{comment|template=baz}} | test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: header_cell_attributes | |||
label: parse header cell style attributes | |||
input: "{| \n ! name="foo bar"| test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: inline_cell_attributes | |||
label: parse cell style attributes of inline cells | |||
input: "{| \n ! name="foo bar" | test ||color="red"| markup!!foo | time \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_row_attributes | |||
label: parse table row style attributes | |||
input: "{| \n |- name="foo bar"\n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_row_attributes_crazy_whitespace | |||
label: parse table row style attributes with different whitespace | |||
input: "{| \t \n |- \t name="foo bar" \t \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=" \t \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: table_attributes | |||
label: parse table style attributes | |||
input: "{| name="foo bar"\n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: inline_table_attributes | |||
label: handle attributes in inline tables | |||
input: "{| foo="tee bar" |}" | |||
output: [Text(text='{| foo="tee bar" |}')] | |||
--- | |||
name: table_incorrect_attributes | |||
label: parse incorrect table style attributes | |||
input: "{| name="foo\n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: templates_in_table_attribute | |||
label: templates in the attributes of a table, after the start | |||
input: "{| {{class}}="{{wikitable}}"\n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="class"), TemplateClose(), TagAttrEquals(), TagAttrQuote(char="\""), TemplateOpen(), Text(text="wikitable"), TemplateClose(), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: templates_in_table_attribute_2 | |||
label: more templates in the attributes of a table, directly after the table start
input: "{|{{foo}} \n | name="foo bar" | test \n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: inappropriate_marker_at_line_start | |||
label: an inappropriate marker (a lone closing brace) at the start of a line in the table
input: "{|\n}" | |||
output: [Text(text="{|\n}")] | |||
--- | |||
name: fake_close_near_start | |||
label: a fake closing token at the end of the first line in the table | |||
input: "{| class="wikitable" style="text-align: center; width=100%;|}\n|\n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"text-align:"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="center;"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="width"), TagAttrEquals(), Text(text="100%;|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: fake_close_near_start_2 | |||
label: a fake closing token at the end of the first line in the table | |||
input: "{| class="wikitable|}"\n|\n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: junk_after_table_start | |||
label: ignore more junk on the first line of the table | |||
input: "{| class="wikitable" | foobar\n|\n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="foobar"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||
--- | |||
name: junk_after_table_row | |||
label: ignore junk on the first line of a table row | |||
input: "{|\n|- foo="bar" | baz\n|blerp\n|}" | |||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="bar"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="baz"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="blerp\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] |
@@ -447,6 +447,13 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tag | |||
--- | |||
name: dt_dd_mix4 | |||
label: another example of correct dt/dd usage, with a trigger for a specific parse route | |||
input: ";foo]:bar" | |||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo]"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")] | |||
--- | |||
name: ul_ol_dt_dd_mix | |||
label: an assortment of uls, ols, dds, and dts | |||
input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo" | |||