@@ -2,6 +2,7 @@ v0.4 (unreleased): | |||||
- The parser is now distributed with Windows binaries, fixing an issue that | - The parser is now distributed with Windows binaries, fixing an issue that | ||||
prevented Windows users from using the C tokenizer. | prevented Windows users from using the C tokenizer. | ||||
- Added support for parsing wikicode tables (patches by David Winegar). | |||||
- Added a script to test for memory leaks in scripts/memtest.py. | - Added a script to test for memory leaks in scripts/memtest.py. | ||||
- Added a script to do releases in scripts/release.sh. | - Added a script to do releases in scripts/release.sh. | ||||
- skip_style_tags can now be passed to mwparserfromhell.parse() (previously, | - skip_style_tags can now be passed to mwparserfromhell.parse() (previously, | ||||
@@ -9,6 +9,7 @@ Unreleased | |||||
- The parser is now distributed with Windows binaries, fixing an issue that | - The parser is now distributed with Windows binaries, fixing an issue that | ||||
prevented Windows users from using the C tokenizer. | prevented Windows users from using the C tokenizer. | ||||
- Added support for parsing wikicode tables (patches by David Winegar). | |||||
- Added a script to test for memory leaks in :file:`scripts/memtest.py`. | - Added a script to test for memory leaks in :file:`scripts/memtest.py`. | ||||
- Added a script to do releases in :file:`scripts/release.sh`. | - Added a script to do releases in :file:`scripts/release.sh`. | ||||
- *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse() | - *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse() | ||||
@@ -52,7 +52,7 @@ INVISIBLE_TAGS = [ | |||||
# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 | # [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 | ||||
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] | SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] | ||||
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] | |||||
SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"] | |||||
MARKUP_TO_HTML = { | MARKUP_TO_HTML = { | ||||
"#": "li", | "#": "li", | ||||
@@ -35,7 +35,8 @@ class Tag(Node): | |||||
def __init__(self, tag, contents=None, attrs=None, wiki_markup=None, | def __init__(self, tag, contents=None, attrs=None, wiki_markup=None, | ||||
self_closing=False, invalid=False, implicit=False, padding="", | self_closing=False, invalid=False, implicit=False, padding="", | ||||
closing_tag=None): | |||||
closing_tag=None, wiki_style_separator=None, | |||||
closing_wiki_markup=None): | |||||
super(Tag, self).__init__() | super(Tag, self).__init__() | ||||
self._tag = tag | self._tag = tag | ||||
if contents is None and not self_closing: | if contents is None and not self_closing: | ||||
@@ -52,13 +53,28 @@ class Tag(Node): | |||||
self._closing_tag = closing_tag | self._closing_tag = closing_tag | ||||
else: | else: | ||||
self._closing_tag = tag | self._closing_tag = tag | ||||
self._wiki_style_separator = wiki_style_separator | |||||
if closing_wiki_markup is not None: | |||||
self._closing_wiki_markup = closing_wiki_markup | |||||
elif wiki_markup and not self_closing: | |||||
self._closing_wiki_markup = wiki_markup | |||||
else: | |||||
self._closing_wiki_markup = None | |||||
def __unicode__(self): | def __unicode__(self): | ||||
if self.wiki_markup: | if self.wiki_markup: | ||||
if self.attributes: | |||||
attrs = "".join([str(attr) for attr in self.attributes]) | |||||
else: | |||||
attrs = "" | |||||
padding = self.padding or "" | |||||
separator = self.wiki_style_separator or "" | |||||
close = self.closing_wiki_markup or "" | |||||
if self.self_closing: | if self.self_closing: | ||||
return self.wiki_markup | |||||
return self.wiki_markup + attrs + padding + separator | |||||
else: | else: | ||||
return self.wiki_markup + str(self.contents) + self.wiki_markup | |||||
return self.wiki_markup + attrs + padding + separator + \ | |||||
str(self.contents) + close | |||||
result = ("</" if self.invalid else "<") + str(self.tag) | result = ("</" if self.invalid else "<") + str(self.tag) | ||||
if self.attributes: | if self.attributes: | ||||
@@ -73,10 +89,10 @@ class Tag(Node): | |||||
def __children__(self): | def __children__(self): | ||||
if not self.wiki_markup: | if not self.wiki_markup: | ||||
yield self.tag | yield self.tag | ||||
for attr in self.attributes: | |||||
yield attr.name | |||||
if attr.value is not None: | |||||
yield attr.value | |||||
for attr in self.attributes: | |||||
yield attr.name | |||||
if attr.value is not None: | |||||
yield attr.value | |||||
if self.contents: | if self.contents: | ||||
yield self.contents | yield self.contents | ||||
if not self.self_closing and not self.wiki_markup and self.closing_tag: | if not self.self_closing and not self.wiki_markup and self.closing_tag: | ||||
@@ -174,6 +190,27 @@ class Tag(Node): | |||||
""" | """ | ||||
return self._closing_tag | return self._closing_tag | ||||
@property | |||||
def wiki_style_separator(self): | |||||
"""The separator between the padding and content in a wiki markup tag. | |||||
Essentially the wiki equivalent of the TagCloseOpen. | |||||
""" | |||||
return self._wiki_style_separator | |||||
@property | |||||
def closing_wiki_markup(self): | |||||
"""The wikified version of the closing tag to show instead of HTML. | |||||
If set to a value, this will be displayed instead of the close tag | |||||
brackets. If tag is :attr:`self_closing` is ``True`` then this is not | |||||
displayed. If :attr:`wiki_markup` is set and this has not been set, this | |||||
is set to the value of :attr:`wiki_markup`. If this has been set and | |||||
:attr:`wiki_markup` is set to a ``False`` value, this is set to | |||||
``None``. | |||||
""" | |||||
return self._closing_wiki_markup | |||||
@tag.setter | @tag.setter | ||||
def tag(self, value): | def tag(self, value): | ||||
self._tag = self._closing_tag = parse_anything(value) | self._tag = self._closing_tag = parse_anything(value) | ||||
@@ -185,6 +222,8 @@ class Tag(Node): | |||||
@wiki_markup.setter | @wiki_markup.setter | ||||
def wiki_markup(self, value): | def wiki_markup(self, value): | ||||
self._wiki_markup = str(value) if value else None | self._wiki_markup = str(value) if value else None | ||||
if not value or not self.closing_wiki_markup: | |||||
self._closing_wiki_markup = self._wiki_markup | |||||
@self_closing.setter | @self_closing.setter | ||||
def self_closing(self, value): | def self_closing(self, value): | ||||
@@ -212,6 +251,14 @@ class Tag(Node): | |||||
def closing_tag(self, value): | def closing_tag(self, value): | ||||
self._closing_tag = parse_anything(value) | self._closing_tag = parse_anything(value) | ||||
@wiki_style_separator.setter | |||||
def wiki_style_separator(self, value): | |||||
self._wiki_style_separator = str(value) if value else None | |||||
@closing_wiki_markup.setter | |||||
def closing_wiki_markup(self, value): | |||||
self._closing_wiki_markup = str(value) if value else None | |||||
def has(self, name): | def has(self, name): | ||||
"""Return whether any attribute in the tag has the given *name*. | """Return whether any attribute in the tag has the given *name*. | ||||
@@ -249,20 +249,24 @@ class Builder(object): | |||||
close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose) | close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose) | ||||
implicit, attrs, contents, closing_tag = False, [], None, None | implicit, attrs, contents, closing_tag = False, [], None, None | ||||
wiki_markup, invalid = token.wiki_markup, token.invalid or False | wiki_markup, invalid = token.wiki_markup, token.invalid or False | ||||
wiki_style_separator, closing_wiki_markup = None, wiki_markup | |||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
token = self._tokens.pop() | token = self._tokens.pop() | ||||
if isinstance(token, tokens.TagAttrStart): | if isinstance(token, tokens.TagAttrStart): | ||||
attrs.append(self._handle_attribute(token)) | attrs.append(self._handle_attribute(token)) | ||||
elif isinstance(token, tokens.TagCloseOpen): | elif isinstance(token, tokens.TagCloseOpen): | ||||
wiki_style_separator = token.wiki_markup | |||||
padding = token.padding or "" | padding = token.padding or "" | ||||
tag = self._pop() | tag = self._pop() | ||||
self._push() | self._push() | ||||
elif isinstance(token, tokens.TagOpenClose): | elif isinstance(token, tokens.TagOpenClose): | ||||
closing_wiki_markup = token.wiki_markup | |||||
contents = self._pop() | contents = self._pop() | ||||
self._push() | self._push() | ||||
elif isinstance(token, close_tokens): | elif isinstance(token, close_tokens): | ||||
if isinstance(token, tokens.TagCloseSelfclose): | if isinstance(token, tokens.TagCloseSelfclose): | ||||
closing_wiki_markup = token.wiki_markup | |||||
tag = self._pop() | tag = self._pop() | ||||
self_closing = True | self_closing = True | ||||
padding = token.padding or "" | padding = token.padding or "" | ||||
@@ -271,7 +275,8 @@ class Builder(object): | |||||
self_closing = False | self_closing = False | ||||
closing_tag = self._pop() | closing_tag = self._pop() | ||||
return Tag(tag, contents, attrs, wiki_markup, self_closing, | return Tag(tag, contents, attrs, wiki_markup, self_closing, | ||||
invalid, implicit, padding, closing_tag) | |||||
invalid, implicit, padding, closing_tag, | |||||
wiki_style_separator, closing_wiki_markup) | |||||
else: | else: | ||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
raise ParserError("_handle_tag() missed a close token") | raise ParserError("_handle_tag() missed a close token") | ||||
@@ -90,6 +90,15 @@ Local (stack-specific) contexts: | |||||
* :const:`FAIL_ON_RBRACE` | * :const:`FAIL_ON_RBRACE` | ||||
* :const:`FAIL_ON_EQUALS` | * :const:`FAIL_ON_EQUALS` | ||||
* :const:`TABLE` | |||||
* :const:`TABLE_OPEN` | |||||
* :const:`TABLE_CELL_OPEN` | |||||
* :const:`TABLE_CELL_STYLE` | |||||
* :const:`TABLE_TD_LINE` | |||||
* :const:`TABLE_TH_LINE` | |||||
* :const:`TABLE_CELL_LINE_CONTEXTS` | |||||
Global contexts: | Global contexts: | ||||
* :const:`GL_HEADING` | * :const:`GL_HEADING` | ||||
@@ -155,15 +164,26 @@ FAIL_ON_EQUALS = 1 << 29 | |||||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | ||||
FAIL_ON_RBRACE + FAIL_ON_EQUALS) | FAIL_ON_RBRACE + FAIL_ON_EQUALS) | ||||
TABLE_OPEN = 1 << 30 | |||||
TABLE_CELL_OPEN = 1 << 31 | |||||
TABLE_CELL_STYLE = 1 << 32 | |||||
TABLE_ROW_OPEN = 1 << 33 | |||||
TABLE_TD_LINE = 1 << 34 | |||||
TABLE_TH_LINE = 1 << 35 | |||||
TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE | |||||
TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + | |||||
TABLE_TD_LINE + TABLE_TH_LINE) | |||||
# Global contexts: | # Global contexts: | ||||
GL_HEADING = 1 << 0 | GL_HEADING = 1 << 0 | ||||
# Aggregate contexts: | # Aggregate contexts: | ||||
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE | |||||
FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + | |||||
STYLE + TABLE) | |||||
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + | UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + | ||||
TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) | TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) | ||||
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE | |||||
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN | |||||
NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI | NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI | ||||
NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK | NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK |
@@ -241,7 +241,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) | |||||
/* | /* | ||||
Add a new token stack, context, and textbuffer to the list. | Add a new token stack, context, and textbuffer to the list. | ||||
*/ | */ | ||||
static int Tokenizer_push(Tokenizer* self, int context) | |||||
static int Tokenizer_push(Tokenizer* self, uint64_t context) | |||||
{ | { | ||||
Stack* top = malloc(sizeof(Stack)); | Stack* top = malloc(sizeof(Stack)); | ||||
@@ -333,7 +333,7 @@ static PyObject* Tokenizer_pop(Tokenizer* self) | |||||
static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) | static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) | ||||
{ | { | ||||
PyObject* stack; | PyObject* stack; | ||||
int context; | |||||
uint64_t context; | |||||
if (Tokenizer_push_textbuffer(self)) | if (Tokenizer_push_textbuffer(self)) | ||||
return NULL; | return NULL; | ||||
@@ -351,7 +351,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) | |||||
*/ | */ | ||||
static void* Tokenizer_fail_route(Tokenizer* self) | static void* Tokenizer_fail_route(Tokenizer* self) | ||||
{ | { | ||||
int context = self->topstack->context; | |||||
uint64_t context = self->topstack->context; | |||||
PyObject* stack = Tokenizer_pop(self); | PyObject* stack = Tokenizer_pop(self); | ||||
Py_XDECREF(stack); | Py_XDECREF(stack); | ||||
@@ -676,11 +676,8 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self) | |||||
RESET_ROUTE(); | RESET_ROUTE(); | ||||
for (i = 0; i < braces; i++) text[i] = '{'; | for (i = 0; i < braces; i++) text[i] = '{'; | ||||
text[braces] = '\0'; | text[braces] = '\0'; | ||||
if (Tokenizer_emit_text_then_stack(self, text)) { | |||||
Py_XDECREF(text); | |||||
if (Tokenizer_emit_text_then_stack(self, text)) | |||||
return -1; | return -1; | ||||
} | |||||
Py_XDECREF(text); | |||||
return 0; | return 0; | ||||
} | } | ||||
else | else | ||||
@@ -1034,7 +1031,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) | |||||
{ | { | ||||
// Built from Tokenizer_parse()'s end sentinels: | // Built from Tokenizer_parse()'s end sentinels: | ||||
Py_UNICODE after = Tokenizer_READ(self, 2); | Py_UNICODE after = Tokenizer_READ(self, 2); | ||||
int ctx = self->topstack->context; | |||||
uint64_t ctx = self->topstack->context; | |||||
return (!this || this == '\n' || this == '[' || this == ']' || | return (!this || this == '\n' || this == '[' || this == ']' || | ||||
this == '<' || this == '>' || (this == '\'' && next == '\'') || | this == '<' || this == '>' || (this == '\'' && next == '\'') || | ||||
@@ -1629,9 +1626,9 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data) | |||||
static int | static int | ||||
Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) | Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) | ||||
{ | { | ||||
int ctx = data->context; | |||||
int end_of_value = (ctx & TAG_ATTR_VALUE && | |||||
!(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE))); | |||||
uint64_t ctx = data->context; | |||||
uint64_t end_of_value = (ctx & TAG_ATTR_VALUE && | |||||
!(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE))); | |||||
if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) { | if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) { | ||||
if (Tokenizer_push_tag_buffer(self, data)) | if (Tokenizer_push_tag_buffer(self, data)) | ||||
@@ -2153,7 +2150,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag, | |||||
static int Tokenizer_parse_italics(Tokenizer* self) | static int Tokenizer_parse_italics(Tokenizer* self) | ||||
{ | { | ||||
Py_ssize_t reset = self->head; | Py_ssize_t reset = self->head; | ||||
int context; | |||||
uint64_t context; | |||||
PyObject *stack; | PyObject *stack; | ||||
stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1); | stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1); | ||||
@@ -2273,7 +2270,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self) | |||||
*/ | */ | ||||
static PyObject* Tokenizer_parse_style(Tokenizer* self) | static PyObject* Tokenizer_parse_style(Tokenizer* self) | ||||
{ | { | ||||
int context = self->topstack->context, ticks = 2, i; | |||||
uint64_t context = self->topstack->context, ticks = 2, i; | |||||
self->head += 2; | self->head += 2; | ||||
while (Tokenizer_READ(self, 0) == '\'') { | while (Tokenizer_READ(self, 0) == '\'') { | ||||
@@ -2426,9 +2423,363 @@ static int Tokenizer_handle_dl_term(Tokenizer* self) | |||||
} | } | ||||
/* | /* | ||||
Emit a table tag. | |||||
*/ | |||||
static int | |||||
Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup, | |||||
const char* tag, PyObject* style, PyObject* padding, | |||||
const char* close_open_markup, PyObject* contents, | |||||
const char* open_close_markup) | |||||
{ | |||||
PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs, | |||||
*close_open_markup_unicode, *open_close_kwargs, | |||||
*open_close_markup_unicode; | |||||
open_open_kwargs = PyDict_New(); | |||||
if (!open_open_kwargs) | |||||
goto fail_decref_all; | |||||
open_open_markup_unicode = PyUnicode_FromString(open_open_markup); | |||||
if (!open_open_markup_unicode) { | |||||
Py_DECREF(open_open_kwargs); | |||||
goto fail_decref_all; | |||||
} | |||||
PyDict_SetItemString(open_open_kwargs, "wiki_markup", | |||||
open_open_markup_unicode); | |||||
Py_DECREF(open_open_markup_unicode); | |||||
if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs)) | |||||
goto fail_decref_all; | |||||
if (Tokenizer_emit_text(self, tag)) | |||||
goto fail_decref_all; | |||||
if (style) { | |||||
if (Tokenizer_emit_all(self, style)) | |||||
goto fail_decref_all; | |||||
Py_DECREF(style); | |||||
} | |||||
close_open_kwargs = PyDict_New(); | |||||
if (!close_open_kwargs) | |||||
goto fail_decref_padding_contents; | |||||
if (close_open_markup && strlen(close_open_markup) != 0) { | |||||
close_open_markup_unicode = PyUnicode_FromString(close_open_markup); | |||||
if (!close_open_markup_unicode) { | |||||
Py_DECREF(close_open_kwargs); | |||||
goto fail_decref_padding_contents; | |||||
} | |||||
PyDict_SetItemString(close_open_kwargs, "wiki_markup", | |||||
close_open_markup_unicode); | |||||
Py_DECREF(close_open_markup_unicode); | |||||
} | |||||
PyDict_SetItemString(close_open_kwargs, "padding", padding); | |||||
Py_DECREF(padding); | |||||
if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs)) | |||||
goto fail_decref_contents; | |||||
if (contents) { | |||||
if (Tokenizer_emit_all(self, contents)) | |||||
goto fail_decref_contents; | |||||
Py_DECREF(contents); | |||||
} | |||||
open_close_kwargs = PyDict_New(); | |||||
if (!open_close_kwargs) | |||||
return -1; | |||||
open_close_markup_unicode = PyUnicode_FromString(open_close_markup); | |||||
if (!open_close_markup_unicode) { | |||||
Py_DECREF(open_close_kwargs); | |||||
return -1; | |||||
} | |||||
PyDict_SetItemString(open_close_kwargs, "wiki_markup", | |||||
open_close_markup_unicode); | |||||
Py_DECREF(open_close_markup_unicode); | |||||
if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs)) | |||||
return -1; | |||||
if (Tokenizer_emit_text(self, tag)) | |||||
return -1; | |||||
if (Tokenizer_emit(self, TagCloseClose)) | |||||
return -1; | |||||
return 0; | |||||
fail_decref_all: | |||||
Py_XDECREF(style); | |||||
fail_decref_padding_contents: | |||||
Py_DECREF(padding); | |||||
fail_decref_contents: | |||||
Py_DECREF(contents); | |||||
return -1; | |||||
} | |||||
/* | |||||
Handle style attributes for a table until an ending token. | |||||
*/ | |||||
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token) | |||||
{ | |||||
TagData *data = TagData_new(); | |||||
PyObject *padding, *trash; | |||||
Py_UNICODE this; | |||||
int can_exit; | |||||
if (!data) | |||||
return NULL; | |||||
data->context = TAG_ATTR_READY; | |||||
while (1) { | |||||
this = Tokenizer_READ(self, 0); | |||||
can_exit = (!(data->context & TAG_QUOTED) || data->context & TAG_NOTE_SPACE); | |||||
if (this == end_token && can_exit) { | |||||
if (data->context & (TAG_ATTR_NAME | TAG_ATTR_VALUE)) { | |||||
if (Tokenizer_push_tag_buffer(self, data)) { | |||||
TagData_dealloc(data); | |||||
return NULL; | |||||
} | |||||
} | |||||
if (Py_UNICODE_ISSPACE(this)) | |||||
Textbuffer_write(&(data->pad_first), this); | |||||
padding = Textbuffer_render(data->pad_first); | |||||
TagData_dealloc(data); | |||||
if (!padding) | |||||
return NULL; | |||||
return padding; | |||||
} | |||||
else if (!this || this == end_token) { | |||||
if (self->topstack->context & LC_TAG_ATTR) { | |||||
if (data->context & TAG_QUOTED) { | |||||
// Unclosed attribute quote: reset, don't die | |||||
data->context = TAG_ATTR_VALUE; | |||||
trash = Tokenizer_pop(self); | |||||
Py_XDECREF(trash); | |||||
self->head = data->reset; | |||||
continue; | |||||
} | |||||
trash = Tokenizer_pop(self); | |||||
Py_XDECREF(trash); | |||||
} | |||||
TagData_dealloc(data); | |||||
return Tokenizer_fail_route(self); | |||||
} | |||||
else { | |||||
if (Tokenizer_handle_tag_data(self, data, this) || BAD_ROUTE) { | |||||
TagData_dealloc(data); | |||||
return NULL; | |||||
} | |||||
} | |||||
self->head++; | |||||
} | |||||
} | |||||
/* | |||||
Parse a wikicode table by starting with the first line. | |||||
*/ | |||||
static int Tokenizer_parse_table(Tokenizer* self) | |||||
{ | |||||
Py_ssize_t reset = self->head + 1; | |||||
PyObject *style, *padding; | |||||
PyObject *table = NULL; | |||||
self->head += 2; | |||||
if(Tokenizer_push(self, LC_TABLE_OPEN)) | |||||
return -1; | |||||
padding = Tokenizer_handle_table_style(self, '\n'); | |||||
if (BAD_ROUTE) { | |||||
RESET_ROUTE(); | |||||
self->head = reset; | |||||
if (Tokenizer_emit_text(self, "{|")) | |||||
return -1; | |||||
return 0; | |||||
} | |||||
if (!padding) | |||||
return -1; | |||||
style = Tokenizer_pop(self); | |||||
if (!style) { | |||||
Py_DECREF(padding); | |||||
return -1; | |||||
} | |||||
self->head++; | |||||
table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); | |||||
if (BAD_ROUTE) { | |||||
RESET_ROUTE(); | |||||
Py_DECREF(padding); | |||||
Py_DECREF(style); | |||||
self->head = reset; | |||||
if (Tokenizer_emit_text(self, "{|")) | |||||
return -1; | |||||
return 0; | |||||
} | |||||
if (!table) { | |||||
Py_DECREF(padding); | |||||
Py_DECREF(style); | |||||
return -1; | |||||
} | |||||
if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL, | |||||
table, "|}")) | |||||
return -1; | |||||
// Offset displacement done by _parse() | |||||
self->head--; | |||||
return 0; | |||||
} | |||||
/* | |||||
Parse as style until end of the line, then continue. | |||||
*/ | |||||
static int Tokenizer_handle_table_row(Tokenizer* self) | |||||
{ | |||||
PyObject *padding, *style, *row, *trash; | |||||
self->head += 2; | |||||
if (!Tokenizer_CAN_RECURSE(self)) { | |||||
if (Tokenizer_emit_text(self, "|-")) | |||||
return -1; | |||||
self->head -= 1; | |||||
return 0; | |||||
} | |||||
if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) | |||||
return -1; | |||||
padding = Tokenizer_handle_table_style(self, '\n'); | |||||
if (BAD_ROUTE) { | |||||
trash = Tokenizer_pop(self); | |||||
Py_XDECREF(trash); | |||||
return 0; | |||||
} | |||||
if (!padding) | |||||
return -1; | |||||
style = Tokenizer_pop(self); | |||||
if (!style) { | |||||
Py_DECREF(padding); | |||||
return -1; | |||||
} | |||||
// Don't parse the style separator | |||||
self->head++; | |||||
row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1); | |||||
if (!row) { | |||||
Py_DECREF(padding); | |||||
Py_DECREF(style); | |||||
return -1; | |||||
} | |||||
if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL, row, "")) | |||||
return -1; | |||||
// Offset displacement done by _parse() | |||||
self->head--; | |||||
return 0; | |||||
} | |||||
/* | |||||
Parse as normal syntax unless we hit a style marker, then parse style | |||||
as HTML attributes and the remainder as normal syntax. | |||||
*/ | |||||
static int | |||||
Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, | |||||
const char *tag, uint64_t line_context) | |||||
{ | |||||
uint64_t old_context = self->topstack->context; | |||||
uint64_t cell_context; | |||||
Py_ssize_t reset; | |||||
PyObject *padding, *cell, *style = NULL; | |||||
const char *close_open_markup = NULL; | |||||
self->head += strlen(markup); | |||||
reset = self->head; | |||||
if (!Tokenizer_CAN_RECURSE(self)) { | |||||
if (Tokenizer_emit_text(self, markup)) | |||||
return -1; | |||||
self->head--; | |||||
return 0; | |||||
} | |||||
cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | | |||||
LC_TABLE_CELL_STYLE | line_context, 1); | |||||
if (!cell) | |||||
return -1; | |||||
cell_context = self->topstack->context; | |||||
self->topstack->context = old_context; | |||||
if (cell_context & LC_TABLE_CELL_STYLE) { | |||||
Py_DECREF(cell); | |||||
self->head = reset; | |||||
if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | | |||||
line_context)) | |||||
return -1; | |||||
padding = Tokenizer_handle_table_style(self, '|'); | |||||
if (!padding) | |||||
return -1; | |||||
style = Tokenizer_pop(self); | |||||
if (!style) { | |||||
Py_DECREF(padding); | |||||
return -1; | |||||
} | |||||
// Don't parse the style separator | |||||
self->head++; | |||||
cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | | |||||
line_context, 1); | |||||
if (!cell) { | |||||
Py_DECREF(padding); | |||||
Py_DECREF(style); | |||||
return -1; | |||||
} | |||||
cell_context = self->topstack->context; | |||||
self->topstack->context = old_context; | |||||
} | |||||
else { | |||||
padding = PyUnicode_FromString(""); | |||||
if (!padding) { | |||||
Py_DECREF(cell); | |||||
return -1; | |||||
} | |||||
} | |||||
if (style) { | |||||
close_open_markup = "|"; | |||||
} | |||||
if (Tokenizer_emit_table_tag(self, markup, tag, style, padding, | |||||
close_open_markup, cell, "")) | |||||
return -1; | |||||
// Keep header/cell line contexts | |||||
self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE); | |||||
// Offset displacement done by parse() | |||||
self->head--; | |||||
return 0; | |||||
} | |||||
/* | |||||
Returns the context, stack, and whether to reset the cell for style | |||||
in a tuple. | |||||
*/ | |||||
static PyObject* | |||||
Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style) | |||||
{ | |||||
if (reset_for_style) | |||||
self->topstack->context |= LC_TABLE_CELL_STYLE; | |||||
else | |||||
self->topstack->context &= ~LC_TABLE_CELL_STYLE; | |||||
return Tokenizer_pop_keeping_context(self); | |||||
} | |||||
/* | |||||
Return the stack in order to handle the table row end. | |||||
*/ | |||||
static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self) | |||||
{ | |||||
return Tokenizer_pop(self); | |||||
} | |||||
/* | |||||
Return the stack in order to handle the table end. | |||||
*/ | |||||
static PyObject* Tokenizer_handle_table_end(Tokenizer* self) | |||||
{ | |||||
self->head += 2; | |||||
return Tokenizer_pop(self); | |||||
} | |||||
/* | |||||
Handle the end of the stream of wikitext. | Handle the end of the stream of wikitext. | ||||
*/ | */ | ||||
static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) | |||||
static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context) | |||||
{ | { | ||||
PyObject *token, *text, *trash; | PyObject *token, *text, *trash; | ||||
int single; | int single; | ||||
@@ -2444,9 +2795,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) | |||||
if (single) | if (single) | ||||
return Tokenizer_handle_single_tag_end(self); | return Tokenizer_handle_single_tag_end(self); | ||||
} | } | ||||
else if (context & AGG_DOUBLE) { | |||||
trash = Tokenizer_pop(self); | |||||
Py_XDECREF(trash); | |||||
else { | |||||
if (context & LC_TABLE_CELL_OPEN) { | |||||
trash = Tokenizer_pop(self); | |||||
Py_XDECREF(trash); | |||||
context = self->topstack->context; | |||||
} | |||||
if (context & AGG_DOUBLE) { | |||||
trash = Tokenizer_pop(self); | |||||
Py_XDECREF(trash); | |||||
} | |||||
} | } | ||||
return Tokenizer_fail_route(self); | return Tokenizer_fail_route(self); | ||||
} | } | ||||
@@ -2457,7 +2815,8 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) | |||||
Make sure we are not trying to write an invalid character. Return 0 if | Make sure we are not trying to write an invalid character. Return 0 if | ||||
everything is safe, or -1 if the route must be failed. | everything is safe, or -1 if the route must be failed. | ||||
*/ | */ | ||||
static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||||
static int | |||||
Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data) | |||||
{ | { | ||||
if (context & LC_FAIL_NEXT) | if (context & LC_FAIL_NEXT) | ||||
return -1; | return -1; | ||||
@@ -2508,7 +2867,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||||
} | } | ||||
else if (context & LC_FAIL_ON_LBRACE) { | else if (context & LC_FAIL_ON_LBRACE) { | ||||
if (data == '{' || (Tokenizer_READ_BACKWARDS(self, 1) == '{' && | if (data == '{' || (Tokenizer_READ_BACKWARDS(self, 1) == '{' && | ||||
Tokenizer_READ_BACKWARDS(self, 2) == '{')) { | |||||
Tokenizer_READ_BACKWARDS(self, 2) == '{')) { | |||||
if (context & LC_TEMPLATE) | if (context & LC_TEMPLATE) | ||||
self->topstack->context |= LC_FAIL_ON_EQUALS; | self->topstack->context |= LC_FAIL_ON_EQUALS; | ||||
else | else | ||||
@@ -2533,12 +2892,30 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||||
} | } | ||||
/* | /* | ||||
Returns whether the current head has leading whitespace. | |||||
TODO: treat comments and templates as whitespace, allow fail on non-newline spaces. | |||||
*/ | |||||
static int Tokenizer_has_leading_whitespace(Tokenizer* self) | |||||
{ | |||||
int offset = 1; | |||||
Py_UNICODE current_character; | |||||
while (1) { | |||||
current_character = Tokenizer_READ_BACKWARDS(self, offset); | |||||
if (!current_character || current_character == '\n') | |||||
return 1; | |||||
else if (!Py_UNICODE_ISSPACE(current_character)) | |||||
return 0; | |||||
offset++; | |||||
} | |||||
} | |||||
/* | |||||
Parse the wikicode string, using context for when to stop. If push is true, | Parse the wikicode string, using context for when to stop. If push is true, | ||||
we will push a new context, otherwise we won't and context will be ignored. | we will push a new context, otherwise we won't and context will be ignored. | ||||
*/ | */ | ||||
static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||||
static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) | |||||
{ | { | ||||
int this_context; | |||||
uint64_t this_context; | |||||
Py_UNICODE this, next, next_next, last; | Py_UNICODE this, next, next_next, last; | ||||
PyObject* temp; | PyObject* temp; | ||||
@@ -2667,22 +3044,99 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||||
if (temp != Py_None) | if (temp != Py_None) | ||||
return temp; | return temp; | ||||
} | } | ||||
else if (!last || last == '\n') { | |||||
if (this == '#' || this == '*' || this == ';' || this == ':') { | |||||
if (Tokenizer_handle_list(self)) | |||||
else if ((!last || last == '\n') && (this == '#' || this == '*' || this == ';' || this == ':')) { | |||||
if (Tokenizer_handle_list(self)) | |||||
return NULL; | |||||
} | |||||
else if ((!last || last == '\n') && (this == '-' && this == next && | |||||
this == Tokenizer_READ(self, 2) && | |||||
this == Tokenizer_READ(self, 3))) { | |||||
if (Tokenizer_handle_hr(self)) | |||||
return NULL; | |||||
} | |||||
else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) { | |||||
if (Tokenizer_handle_dl_term(self)) | |||||
return NULL; | |||||
// Kill potential table contexts | |||||
if (this == '\n') | |||||
self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; | |||||
} | |||||
// Start of table parsing | |||||
else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) { | |||||
if (Tokenizer_CAN_RECURSE(self)) { | |||||
if (Tokenizer_parse_table(self)) | |||||
return NULL; | |||||
} | |||||
else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next)) | |||||
return NULL; | |||||
else | |||||
self->head++; | |||||
} | |||||
else if (this_context & LC_TABLE_OPEN) { | |||||
if (this == '|' && next == '|' && this_context & LC_TABLE_TD_LINE) { | |||||
if (this_context & LC_TABLE_CELL_OPEN) | |||||
return Tokenizer_handle_table_cell_end(self, 0); | |||||
else if (Tokenizer_handle_table_cell(self, "||", "td", LC_TABLE_TD_LINE)) | |||||
return NULL; | return NULL; | ||||
} | } | ||||
else if (this == '-' && this == next && | |||||
this == Tokenizer_READ(self, 2) && | |||||
this == Tokenizer_READ(self, 3)) { | |||||
if (Tokenizer_handle_hr(self)) | |||||
else if (this == '|' && next == '|' && this_context & LC_TABLE_TH_LINE) { | |||||
if (this_context & LC_TABLE_CELL_OPEN) | |||||
return Tokenizer_handle_table_cell_end(self, 0); | |||||
else if (Tokenizer_handle_table_cell(self, "||", "th", LC_TABLE_TH_LINE)) | |||||
return NULL; | |||||
} | |||||
else if (this == '!' && next == '!' && this_context & LC_TABLE_TH_LINE) { | |||||
if (this_context & LC_TABLE_CELL_OPEN) | |||||
return Tokenizer_handle_table_cell_end(self, 0); | |||||
else if (Tokenizer_handle_table_cell(self, "!!", "th", LC_TABLE_TH_LINE)) | |||||
return NULL; | |||||
} | |||||
else if (this == '|' && this_context & LC_TABLE_CELL_STYLE) { | |||||
return Tokenizer_handle_table_cell_end(self, 1); | |||||
} | |||||
// On newline, clear out cell line contexts | |||||
else if (this == '\n' && this_context & LC_TABLE_CELL_LINE_CONTEXTS) { | |||||
self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS; | |||||
if (Tokenizer_emit_char(self, this)) | |||||
return NULL; | |||||
} | |||||
else if (Tokenizer_has_leading_whitespace(self)) { | |||||
if (this == '|' && next == '}') { | |||||
if (this_context & LC_TABLE_CELL_OPEN) | |||||
return Tokenizer_handle_table_cell_end(self, 0); | |||||
if (this_context & LC_TABLE_ROW_OPEN) | |||||
return Tokenizer_handle_table_row_end(self); | |||||
else | |||||
return Tokenizer_handle_table_end(self); | |||||
} | |||||
else if (this == '|' && next == '-') { | |||||
if (this_context & LC_TABLE_CELL_OPEN) | |||||
return Tokenizer_handle_table_cell_end(self, 0); | |||||
if (this_context & LC_TABLE_ROW_OPEN) | |||||
return Tokenizer_handle_table_row_end(self); | |||||
else if (Tokenizer_handle_table_row(self)) | |||||
return NULL; | |||||
} | |||||
else if (this == '|') { | |||||
if (this_context & LC_TABLE_CELL_OPEN) | |||||
return Tokenizer_handle_table_cell_end(self, 0); | |||||
else if (Tokenizer_handle_table_cell(self, "|", "td", LC_TABLE_TD_LINE)) | |||||
return NULL; | |||||
} | |||||
else if (this == '!') { | |||||
if (this_context & LC_TABLE_CELL_OPEN) | |||||
return Tokenizer_handle_table_cell_end(self, 0); | |||||
else if (Tokenizer_handle_table_cell(self, "!", "th", LC_TABLE_TH_LINE)) | |||||
return NULL; | |||||
} | |||||
else if (Tokenizer_emit_char(self, this)) | |||||
return NULL; | return NULL; | ||||
} | } | ||||
else if (Tokenizer_emit_char(self, this)) | else if (Tokenizer_emit_char(self, this)) | ||||
return NULL; | return NULL; | ||||
} | |||||
else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) { | |||||
if (Tokenizer_handle_dl_term(self)) | |||||
// Raise BadRoute to table start | |||||
if (BAD_ROUTE) | |||||
return NULL; | return NULL; | ||||
} | } | ||||
else if (Tokenizer_emit_char(self, this)) | else if (Tokenizer_emit_char(self, this)) | ||||
@@ -2697,7 +3151,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||||
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | ||||
{ | { | ||||
PyObject *text, *temp, *tokens; | PyObject *text, *temp, *tokens; | ||||
int context = 0, skip_style_tags = 0; | |||||
uint64_t context = 0; | |||||
int skip_style_tags = 0; | |||||
if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { | if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) { | ||||
Py_XDECREF(self->text); | Py_XDECREF(self->text); | ||||
@@ -2725,7 +3180,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||||
self->skip_style_tags = skip_style_tags; | self->skip_style_tags = skip_style_tags; | ||||
tokens = Tokenizer_parse(self, context, 1); | tokens = Tokenizer_parse(self, context, 1); | ||||
if (!tokens && !PyErr_Occurred()) { | |||||
if ((!tokens && !PyErr_Occurred()) || self->topstack) { | |||||
if (!ParserError) { | if (!ParserError) { | ||||
if (load_exceptions()) | if (load_exceptions()) | ||||
return NULL; | return NULL; | ||||
@@ -2734,6 +3189,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||||
RESET_ROUTE(); | RESET_ROUTE(); | ||||
PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE"); | ||||
} | } | ||||
else if (self->topstack) | |||||
PyErr_SetString(ParserError, | |||||
"C tokenizer exited with non-empty token stack"); | |||||
else | else | ||||
PyErr_SetString(ParserError, "C tokenizer exited unexpectedly"); | PyErr_SetString(ParserError, "C tokenizer exited unexpectedly"); | ||||
return NULL; | return NULL; | ||||
@@ -29,6 +29,7 @@ SOFTWARE. | |||||
#include <math.h> | #include <math.h> | ||||
#include <structmember.h> | #include <structmember.h> | ||||
#include <bytesobject.h> | #include <bytesobject.h> | ||||
#include <stdint.h> | |||||
#if PY_MAJOR_VERSION >= 3 | #if PY_MAJOR_VERSION >= 3 | ||||
#define IS_PY3K | #define IS_PY3K | ||||
@@ -43,16 +44,17 @@ SOFTWARE. | |||||
static const char MARKERS[] = { | static const char MARKERS[] = { | ||||
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', | '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/', | ||||
'-', '\n', '\0'}; | |||||
'-', '!', '\n', '\0'}; | |||||
#define NUM_MARKERS 18 | |||||
#define NUM_MARKERS 19 | |||||
#define TEXTBUFFER_BLOCKSIZE 1024 | #define TEXTBUFFER_BLOCKSIZE 1024 | ||||
#define MAX_DEPTH 40 | #define MAX_DEPTH 40 | ||||
#define MAX_CYCLES 100000 | #define MAX_CYCLES 100000 | ||||
#define MAX_BRACES 255 | #define MAX_BRACES 255 | ||||
#define MAX_ENTITY_SIZE 8 | #define MAX_ENTITY_SIZE 8 | ||||
static int route_state = 0, route_context = 0; | |||||
static int route_state = 0; | |||||
static uint64_t route_context = 0; | |||||
#define BAD_ROUTE route_state | #define BAD_ROUTE route_state | ||||
#define BAD_ROUTE_CONTEXT route_context | #define BAD_ROUTE_CONTEXT route_context | ||||
#define FAIL_ROUTE(context) route_state = 1; route_context = context | #define FAIL_ROUTE(context) route_state = 1; route_context = context | ||||
@@ -109,52 +111,61 @@ static PyObject* TagCloseClose; | |||||
/* Local contexts: */ | /* Local contexts: */ | ||||
#define LC_TEMPLATE 0x00000007 | |||||
#define LC_TEMPLATE_NAME 0x00000001 | |||||
#define LC_TEMPLATE_PARAM_KEY 0x00000002 | |||||
#define LC_TEMPLATE_PARAM_VALUE 0x00000004 | |||||
#define LC_ARGUMENT 0x00000018 | |||||
#define LC_ARGUMENT_NAME 0x00000008 | |||||
#define LC_ARGUMENT_DEFAULT 0x00000010 | |||||
#define LC_WIKILINK 0x00000060 | |||||
#define LC_WIKILINK_TITLE 0x00000020 | |||||
#define LC_WIKILINK_TEXT 0x00000040 | |||||
#define LC_EXT_LINK 0x00000180 | |||||
#define LC_EXT_LINK_URI 0x00000080 | |||||
#define LC_EXT_LINK_TITLE 0x00000100 | |||||
#define LC_HEADING 0x00007E00 | |||||
#define LC_HEADING_LEVEL_1 0x00000200 | |||||
#define LC_HEADING_LEVEL_2 0x00000400 | |||||
#define LC_HEADING_LEVEL_3 0x00000800 | |||||
#define LC_HEADING_LEVEL_4 0x00001000 | |||||
#define LC_HEADING_LEVEL_5 0x00002000 | |||||
#define LC_HEADING_LEVEL_6 0x00004000 | |||||
#define LC_TAG 0x00078000 | |||||
#define LC_TAG_OPEN 0x00008000 | |||||
#define LC_TAG_ATTR 0x00010000 | |||||
#define LC_TAG_BODY 0x00020000 | |||||
#define LC_TAG_CLOSE 0x00040000 | |||||
#define LC_STYLE 0x00780000 | |||||
#define LC_STYLE_ITALICS 0x00080000 | |||||
#define LC_STYLE_BOLD 0x00100000 | |||||
#define LC_STYLE_PASS_AGAIN 0x00200000 | |||||
#define LC_STYLE_SECOND_PASS 0x00400000 | |||||
#define LC_DLTERM 0x00800000 | |||||
#define LC_SAFETY_CHECK 0x3F000000 | |||||
#define LC_HAS_TEXT 0x01000000 | |||||
#define LC_FAIL_ON_TEXT 0x02000000 | |||||
#define LC_FAIL_NEXT 0x04000000 | |||||
#define LC_FAIL_ON_LBRACE 0x08000000 | |||||
#define LC_FAIL_ON_RBRACE 0x10000000 | |||||
#define LC_FAIL_ON_EQUALS 0x20000000 | |||||
#define LC_TEMPLATE 0x0000000000000007 | |||||
#define LC_TEMPLATE_NAME 0x0000000000000001 | |||||
#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002 | |||||
#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004 | |||||
#define LC_ARGUMENT 0x0000000000000018 | |||||
#define LC_ARGUMENT_NAME 0x0000000000000008 | |||||
#define LC_ARGUMENT_DEFAULT 0x0000000000000010 | |||||
#define LC_WIKILINK 0x0000000000000060 | |||||
#define LC_WIKILINK_TITLE 0x0000000000000020 | |||||
#define LC_WIKILINK_TEXT 0x0000000000000040 | |||||
#define LC_EXT_LINK 0x0000000000000180 | |||||
#define LC_EXT_LINK_URI 0x0000000000000080 | |||||
#define LC_EXT_LINK_TITLE 0x0000000000000100 | |||||
#define LC_HEADING 0x0000000000007E00 | |||||
#define LC_HEADING_LEVEL_1 0x0000000000000200 | |||||
#define LC_HEADING_LEVEL_2 0x0000000000000400 | |||||
#define LC_HEADING_LEVEL_3 0x0000000000000800 | |||||
#define LC_HEADING_LEVEL_4 0x0000000000001000 | |||||
#define LC_HEADING_LEVEL_5 0x0000000000002000 | |||||
#define LC_HEADING_LEVEL_6 0x0000000000004000 | |||||
#define LC_TAG 0x0000000000078000 | |||||
#define LC_TAG_OPEN 0x0000000000008000 | |||||
#define LC_TAG_ATTR 0x0000000000010000 | |||||
#define LC_TAG_BODY 0x0000000000020000 | |||||
#define LC_TAG_CLOSE 0x0000000000040000 | |||||
#define LC_STYLE 0x0000000000780000 | |||||
#define LC_STYLE_ITALICS 0x0000000000080000 | |||||
#define LC_STYLE_BOLD 0x0000000000100000 | |||||
#define LC_STYLE_PASS_AGAIN 0x0000000000200000 | |||||
#define LC_STYLE_SECOND_PASS 0x0000000000400000 | |||||
#define LC_DLTERM 0x0000000000800000 | |||||
#define LC_SAFETY_CHECK 0x000000003F000000 | |||||
#define LC_HAS_TEXT 0x0000000001000000 | |||||
#define LC_FAIL_ON_TEXT 0x0000000002000000 | |||||
#define LC_FAIL_NEXT 0x0000000004000000 | |||||
#define LC_FAIL_ON_LBRACE 0x0000000008000000 | |||||
#define LC_FAIL_ON_RBRACE 0x0000000010000000 | |||||
#define LC_FAIL_ON_EQUALS 0x0000000020000000 | |||||
#define LC_TABLE 0x0000000FC0000000 | |||||
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000 | |||||
#define LC_TABLE_OPEN 0x0000000040000000 | |||||
#define LC_TABLE_CELL_OPEN 0x0000000080000000 | |||||
#define LC_TABLE_CELL_STYLE 0x0000000100000000 | |||||
#define LC_TABLE_ROW_OPEN 0x0000000200000000 | |||||
#define LC_TABLE_TD_LINE 0x0000000400000000 | |||||
#define LC_TABLE_TH_LINE 0x0000000800000000 | |||||
/* Global contexts: */ | /* Global contexts: */ | ||||
@@ -162,9 +173,9 @@ static PyObject* TagCloseClose; | |||||
/* Aggregate contexts: */ | /* Aggregate contexts: */ | ||||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) | |||||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN) | |||||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | ||||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) | |||||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN) | |||||
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) | #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) | ||||
#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) | #define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) | ||||
@@ -191,7 +202,7 @@ struct Textbuffer { | |||||
struct Stack { | struct Stack { | ||||
PyObject* stack; | PyObject* stack; | ||||
int context; | |||||
uint64_t context; | |||||
struct Textbuffer* textbuffer; | struct Textbuffer* textbuffer; | ||||
struct Stack* next; | struct Stack* next; | ||||
}; | }; | ||||
@@ -202,7 +213,7 @@ typedef struct { | |||||
} HeadingData; | } HeadingData; | ||||
typedef struct { | typedef struct { | ||||
int context; | |||||
uint64_t context; | |||||
struct Textbuffer* pad_first; | struct Textbuffer* pad_first; | ||||
struct Textbuffer* pad_before_eq; | struct Textbuffer* pad_before_eq; | ||||
struct Textbuffer* pad_after_eq; | struct Textbuffer* pad_after_eq; | ||||
@@ -267,7 +278,7 @@ static int Tokenizer_parse_entity(Tokenizer*); | |||||
static int Tokenizer_parse_comment(Tokenizer*); | static int Tokenizer_parse_comment(Tokenizer*); | ||||
static int Tokenizer_handle_dl_term(Tokenizer*); | static int Tokenizer_handle_dl_term(Tokenizer*); | ||||
static int Tokenizer_parse_tag(Tokenizer*); | static int Tokenizer_parse_tag(Tokenizer*); | ||||
static PyObject* Tokenizer_parse(Tokenizer*, int, int); | |||||
static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int); | |||||
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | ||||
static int load_exceptions(void); | static int load_exceptions(void); | ||||
@@ -63,7 +63,7 @@ class Tokenizer(object): | |||||
START = object() | START = object() | ||||
END = object() | END = object() | ||||
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", | MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", | ||||
":", "/", "-", "\n", START, END] | |||||
":", "/", "-", "!", "\n", START, END] | |||||
MAX_DEPTH = 40 | MAX_DEPTH = 40 | ||||
MAX_CYCLES = 100000 | MAX_CYCLES = 100000 | ||||
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) | regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) | ||||
@@ -991,12 +991,166 @@ class Tokenizer(object): | |||||
else: | else: | ||||
self._emit_text("\n") | self._emit_text("\n") | ||||
def _emit_table_tag(self, open_open_markup, tag, style, padding,
                    close_open_markup, contents, open_close_markup):
    """Write the full token run for a table-related tag to the stack.

    Emits, in order: the opening wiki markup and tag name, any style
    attribute tokens, the open-tag close (with or without its own wiki
    markup), the tag contents, and the closing markup/tag-name pair.
    """
    self._emit(tokens.TagOpenOpen(wiki_markup=open_open_markup))
    self._emit_text(tag)
    if style:
        self._emit_all(style)
    # Only attach wiki_markup to the close-open token when one was given.
    close_kwargs = {"padding": padding}
    if close_open_markup:
        close_kwargs["wiki_markup"] = close_open_markup
    self._emit(tokens.TagCloseOpen(**close_kwargs))
    if contents:
        self._emit_all(contents)
    self._emit(tokens.TagOpenClose(wiki_markup=open_close_markup))
    self._emit_text(tag)
    self._emit(tokens.TagCloseClose())
def _handle_table_style(self, end_token):
    """Handle style attributes for a table until ``end_token``.

    Collected attribute tokens are pushed onto the current stack via
    ``_push_tag_buffer``; the return value is the accumulated "first"
    padding string preceding *end_token*. Raises a failed route (via
    ``_fail_route``) when the stream ends or *end_token* appears in an
    invalid position.
    """
    data = _TagOpenData()
    data.context = _TagOpenData.CX_ATTR_READY
    while True:
        this = self._read()
        # Exiting at end_token is only allowed outside a quoted attribute
        # value, unless a space has been noted since the quote opened.
        can_exit = (not data.context & data.CX_QUOTED or
                    data.context & data.CX_NOTE_SPACE)
        if this == end_token and can_exit:
            # Flush a partially-read attribute before returning.
            if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
                self._push_tag_buffer(data)
            if this.isspace():
                data.padding_buffer["first"] += this
            return data.padding_buffer["first"]
        elif this is self.END or this == end_token:
            if self._context & contexts.TAG_ATTR:
                if data.context & data.CX_QUOTED:
                    # Unclosed attribute quote: reset, don't die
                    data.context = data.CX_ATTR_VALUE
                    self._pop()
                    self._head = data.reset
                    continue
                self._pop()
            self._fail_route()
        else:
            self._handle_tag_data(data, this)
        self._head += 1
def _parse_table(self):
    """Parse a wikicode table by starting with the first line."""
    # Save a restore point so the "{|" can be re-emitted as plain text if
    # the table turns out to be invalid.
    reset = self._head + 1
    self._head += 2  # skip past the "{|" opener
    self._push(contexts.TABLE_OPEN)
    try:
        padding = self._handle_table_style("\n")
    except BadRoute:
        # Invalid table-start line: fall back to literal text.
        self._head = reset
        self._emit_text("{|")
        return
    style = self._pop()
    self._head += 1
    try:
        table = self._parse(contexts.TABLE_OPEN)
    except BadRoute:
        # No valid table close was found: fall back to literal text.
        self._head = reset
        self._emit_text("{|")
        return
    self._emit_table_tag("{|", "table", style, padding, None, table, "|}")
    # Offset displacement done by _parse():
    self._head -= 1
def _handle_table_row(self):
    """Parse as style until end of the line, then continue."""
    self._head += 2  # skip past the "|-" markup
    if not self._can_recurse():
        # Recursion limit reached: emit the markup literally instead.
        self._emit_text("|-")
        self._head -= 1
        return
    self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
    try:
        padding = self._handle_table_style("\n")
    except BadRoute:
        # Discard the pushed row stack before propagating the failure.
        self._pop()
        raise
    style = self._pop()
    # Don't parse the style separator:
    self._head += 1
    row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
    # Rows have no closing wiki markup of their own ("" for open_close).
    self._emit_table_tag("|-", "tr", style, padding, None, row, "")
    # Offset displacement done by parse():
    self._head -= 1
def _handle_table_cell(self, markup, tag, line_context):
    """Parse as normal syntax unless we hit a style marker, then parse
    style as HTML attributes and the remainder as normal syntax.

    *markup* is the cell's opening wiki markup ("|", "!", "||", or "!!"),
    *tag* the HTML tag name to emit ("td" or "th"), and *line_context*
    the TABLE_TD_LINE/TABLE_TH_LINE flag for this cell type.
    """
    old_context = self._context
    padding, style = "", None
    self._head += len(markup)
    reset = self._head  # restore point for the second, post-style pass
    if not self._can_recurse():
        # Recursion limit reached: emit the markup literally instead.
        self._emit_text(markup)
        self._head -= 1
        return
    # First pass: parse assuming no style attributes; TABLE_CELL_STYLE in
    # the context signals that a "|" style separator was actually found.
    cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                       line_context | contexts.TABLE_CELL_STYLE)
    cell_context = self._context
    self._context = old_context
    reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
    if reset_for_style:
        # Second pass: re-read from the cell start, treating the text
        # before the "|" separator as HTML attributes.
        self._head = reset
        self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                   line_context)
        padding = self._handle_table_style("|")
        style = self._pop()
        # Don't parse the style separator:
        self._head += 1
        cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                           line_context)
        cell_context = self._context
        self._context = old_context
    close_open_markup = "|" if reset_for_style else None
    self._emit_table_tag(markup, tag, style, padding, close_open_markup,
                         cell, "")
    # Keep header/cell line contexts:
    self._context |= cell_context & (contexts.TABLE_TH_LINE |
                                     contexts.TABLE_TD_LINE)
    # Offset displacement done by parse():
    self._head -= 1
def _handle_table_cell_end(self, reset_for_style=False):
    """Pop and return the cell's stack, keeping its context.

    Before popping, the TABLE_CELL_STYLE flag is set on the context when
    *reset_for_style* is true (signalling the caller to re-parse the
    cell's style attributes) and cleared otherwise.
    """
    style_flag = contexts.TABLE_CELL_STYLE
    if reset_for_style:
        self._context = self._context | style_flag
    else:
        self._context = self._context & ~style_flag
    return self._pop(keep_context=True)
def _handle_table_row_end(self):
    """Pop and return the row's token stack so the caller can close it."""
    row_stack = self._pop()
    return row_stack
def _handle_table_end(self):
    """Skip the two-character "|}" close and return the table's stack."""
    self._head = self._head + 2
    return self._pop()
def _handle_end(self): | def _handle_end(self): | ||||
"""Handle the end of the stream of wikitext.""" | """Handle the end of the stream of wikitext.""" | ||||
if self._context & contexts.FAIL: | if self._context & contexts.FAIL: | ||||
if self._context & contexts.TAG_BODY: | if self._context & contexts.TAG_BODY: | ||||
if is_single(self._stack[1].text): | if is_single(self._stack[1].text): | ||||
return self._handle_single_tag_end() | return self._handle_single_tag_end() | ||||
if self._context & contexts.TABLE_CELL_OPEN: | |||||
self._pop() | |||||
if self._context & contexts.DOUBLE: | if self._context & contexts.DOUBLE: | ||||
self._pop() | self._pop() | ||||
self._fail_route() | self._fail_route() | ||||
@@ -1144,15 +1298,68 @@ class Tokenizer(object): | |||||
result = self._parse_style() | result = self._parse_style() | ||||
if result is not None: | if result is not None: | ||||
return result | return result | ||||
elif self._read(-1) in ("\n", self.START): | |||||
if this in ("#", "*", ";", ":"): | |||||
elif self._read(-1) in ("\n", self.START) and this in ("#", "*", ";", ":"): | |||||
self._handle_list() | self._handle_list() | ||||
elif this == next == self._read(2) == self._read(3) == "-": | |||||
elif self._read(-1) in ("\n", self.START) and this == next == self._read(2) == self._read(3) == "-": | |||||
self._handle_hr() | self._handle_hr() | ||||
else: | |||||
self._emit_text(this) | |||||
elif this in ("\n", ":") and self._context & contexts.DL_TERM: | elif this in ("\n", ":") and self._context & contexts.DL_TERM: | ||||
self._handle_dl_term() | self._handle_dl_term() | ||||
if this == "\n": | |||||
# Kill potential table contexts | |||||
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS | |||||
# Start of table parsing | |||||
elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or | |||||
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): | |||||
if self._can_recurse(): | |||||
self._parse_table() | |||||
else: | |||||
self._emit_text("{|") | |||||
elif self._context & contexts.TABLE_OPEN: | |||||
if this == next == "|" and self._context & contexts.TABLE_TD_LINE: | |||||
if self._context & contexts.TABLE_CELL_OPEN: | |||||
return self._handle_table_cell_end() | |||||
self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE) | |||||
elif this == next == "|" and self._context & contexts.TABLE_TH_LINE: | |||||
if self._context & contexts.TABLE_CELL_OPEN: | |||||
return self._handle_table_cell_end() | |||||
self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE) | |||||
elif this == next == "!" and self._context & contexts.TABLE_TH_LINE: | |||||
if self._context & contexts.TABLE_CELL_OPEN: | |||||
return self._handle_table_cell_end() | |||||
self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE) | |||||
elif this == "|" and self._context & contexts.TABLE_CELL_STYLE: | |||||
return self._handle_table_cell_end(reset_for_style=True) | |||||
# on newline, clear out cell line contexts | |||||
elif this == "\n" and self._context & contexts.TABLE_CELL_LINE_CONTEXTS: | |||||
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS | |||||
self._emit_text(this) | |||||
elif (self._read(-1) in ("\n", self.START) or | |||||
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): | |||||
if this == "|" and next == "}": | |||||
if self._context & contexts.TABLE_CELL_OPEN: | |||||
return self._handle_table_cell_end() | |||||
if self._context & contexts.TABLE_ROW_OPEN: | |||||
return self._handle_table_row_end() | |||||
return self._handle_table_end() | |||||
elif this == "|" and next == "-": | |||||
if self._context & contexts.TABLE_CELL_OPEN: | |||||
return self._handle_table_cell_end() | |||||
if self._context & contexts.TABLE_ROW_OPEN: | |||||
return self._handle_table_row_end() | |||||
self._handle_table_row() | |||||
elif this == "|": | |||||
if self._context & contexts.TABLE_CELL_OPEN: | |||||
return self._handle_table_cell_end() | |||||
self._handle_table_cell("|", "td", contexts.TABLE_TD_LINE) | |||||
elif this == "!": | |||||
if self._context & contexts.TABLE_CELL_OPEN: | |||||
return self._handle_table_cell_end() | |||||
self._handle_table_cell("!", "th", contexts.TABLE_TH_LINE) | |||||
else: | |||||
self._emit_text(this) | |||||
else: | |||||
self._emit_text(this) | |||||
else: | else: | ||||
self._emit_text(this) | self._emit_text(this) | ||||
self._head += 1 | self._head += 1 | ||||
@@ -1164,6 +1371,10 @@ class Tokenizer(object): | |||||
self._text = [segment for segment in split if segment] | self._text = [segment for segment in split if segment] | ||||
self._head = self._global = self._depth = self._cycles = 0 | self._head = self._global = self._depth = self._cycles = 0 | ||||
try: | try: | ||||
return self._parse(context) | |||||
tokens = self._parse(context) | |||||
except BadRoute: # pragma: no cover (untestable/exceptional case) | except BadRoute: # pragma: no cover (untestable/exceptional case) | ||||
raise ParserError("Python tokenizer exited with BadRoute") | raise ParserError("Python tokenizer exited with BadRoute") | ||||
if self._stacks: # pragma: no cover (untestable/exceptional case) | |||||
err = "Python tokenizer exited with non-empty token stack" | |||||
raise ParserError(err) | |||||
return tokens |
@@ -25,8 +25,9 @@ import codecs | |||||
from os import listdir, path | from os import listdir, path | ||||
import sys | import sys | ||||
from mwparserfromhell.compat import py3k | |||||
from mwparserfromhell.compat import py3k, str | |||||
from mwparserfromhell.parser import tokens | from mwparserfromhell.parser import tokens | ||||
from mwparserfromhell.parser.builder import Builder | |||||
class _TestParseError(Exception): | class _TestParseError(Exception): | ||||
"""Raised internally when a test could not be parsed.""" | """Raised internally when a test could not be parsed.""" | ||||
@@ -50,8 +51,12 @@ class TokenizerTestCase(object): | |||||
*label* for the method's docstring. | *label* for the method's docstring. | ||||
""" | """ | ||||
def inner(self):
    """Generated test body: compare tokenizer (or builder) output
    against the expected value from the test-case *data* closure."""
    if hasattr(self, "roundtrip"):
        # Roundtrip mode: build wikitext back from the expected tokens
        # and compare it to the original input text. The slice copy
        # protects the shared test data — presumably Builder consumes
        # or mutates the token list it is given; confirm against
        # Builder.build's implementation.
        expected = data["input"]
        actual = str(Builder().build(data["output"][:]))
    else:
        # Normal mode: tokenize the input and compare token lists.
        expected = data["output"]
        actual = self.tokenizer().tokenize(data["input"])
    self.assertEqual(expected, actual)
if not py3k: | if not py3k: | ||||
inner.__name__ = funcname.encode("utf8") | inner.__name__ = funcname.encode("utf8") | ||||
@@ -0,0 +1,41 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2012-2014 Ben Kurtovic <ben.kurtovic@gmail.com> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
from __future__ import unicode_literals | |||||
try: | |||||
import unittest2 as unittest | |||||
except ImportError: | |||||
import unittest | |||||
from ._test_tokenizer import TokenizerTestCase | |||||
class TestRoundtripping(TokenizerTestCase, unittest.TestCase):
    """Test cases for roundtripping tokens back to wikitext."""

    @classmethod
    def setUpClass(cls):
        # The presence of a "roundtrip" attribute switches the generated
        # TokenizerTestCase tests into build-and-compare mode (tokens ->
        # wikitext) instead of tokenize-and-compare.
        cls.roundtrip = True
if __name__ == "__main__": | |||||
unittest.main(verbosity=2) |
@@ -226,6 +226,38 @@ class TestTag(TreeEqualityTestCase): | |||||
self.assertWikicodeEqual(parsed, node.closing_tag) | self.assertWikicodeEqual(parsed, node.closing_tag) | ||||
self.assertEqual("<ref>foobar</ref {{ignore me}}>", node) | self.assertEqual("<ref>foobar</ref {{ignore me}}>", node) | ||||
def test_wiki_style_separator(self):
    """test getter/setter for wiki_style_separator attribute"""
    node = Tag(wraptext("table"), wraptext("\n"))
    self.assertIs(None, node.wiki_style_separator)
    node.wiki_style_separator = "|"
    self.assertEqual("|", node.wiki_style_separator)
    node.wiki_markup = "{"
    self.assertEqual("{|\n{", node)
    # Constructor path: the separator can also be passed as a keyword.
    node2 = Tag(wraptext("table"), wraptext("\n"), wiki_style_separator="|")
    # Fixed: this previously asserted on `node` instead of `node2`,
    # leaving the constructor-argument path untested.
    self.assertEqual("|", node2.wiki_style_separator)
def test_closing_wiki_markup(self):
    """test getter/setter for closing_wiki_markup attribute"""
    node = Tag(wraptext("table"), wraptext("\n"))
    self.assertIs(None, node.closing_wiki_markup)
    # Setting wiki_markup mirrors the value onto closing_wiki_markup.
    node.wiki_markup = "{|"
    self.assertEqual("{|", node.closing_wiki_markup)
    # An explicit closing_wiki_markup overrides the mirrored value.
    node.closing_wiki_markup = "|}"
    self.assertEqual("|}", node.closing_wiki_markup)
    self.assertEqual("{|\n|}", node)
    # Changing wiki_markup afterwards leaves the explicit close intact.
    node.wiki_markup = "!!"
    self.assertEqual("|}", node.closing_wiki_markup)
    self.assertEqual("!!\n|}", node)
    # Clearing wiki_markup also clears the close and falls back to HTML.
    node.wiki_markup = False
    self.assertFalse(node.closing_wiki_markup)
    self.assertEqual("<table>\n</table>", node)
    # Both markers can be supplied through the constructor.
    node2 = Tag(wraptext("table"), wraptext("\n"),
                attrs=[agen("id", "foo")], wiki_markup="{|",
                closing_wiki_markup="|}")
    self.assertEqual("|}", node2.closing_wiki_markup)
    self.assertEqual('{| id="foo"\n|}', node2)
def test_has(self): | def test_has(self): | ||||
"""test Tag.has()""" | """test Tag.has()""" | ||||
node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")]) | node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")]) | ||||
@@ -0,0 +1,410 @@ | |||||
name: empty_table | |||||
label: parsing an empty table | |||||
input: "{|\n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: inline_table | |||||
label: tables with a close on the same line are not valid | |||||
input: "{||}" | |||||
output: [Text(text="{||}")] | |||||
--- | |||||
name: no_table_close_simple | |||||
label: no table close on inline table | |||||
input: "{| " | |||||
output: [Text(text="{| ")] | |||||
--- | |||||
name: no_table_close_newline | |||||
label: no table close with a newline | |||||
input: "{| \n " | |||||
output: [Text(text="{| \n ")] | |||||
--- | |||||
name: no_table_close_inside_cell | |||||
label: no table close while inside of a cell | |||||
input: "{| \n| " | |||||
output: [Text(text="{| \n| ")] | |||||
--- | |||||
name: no_table_close_inside_cell_after_newline | |||||
label: no table close while inside of a cell after a newline | |||||
input: "{| \n| \n " | |||||
output: [Text(text="{| \n| \n ")] | |||||
--- | |||||
name: no_table_close_inside_cell_with_attributes | |||||
label: no table close while inside of a cell with attributes | |||||
input: "{| \n| red | test" | |||||
output: [Text(text="{| \n| red | test")] | |||||
--- | |||||
name: no_table_close_inside_row | |||||
label: no table close while inside of a row | |||||
input: "{| \n|- " | |||||
output: [Text(text="{| \n|- ")] | |||||
--- | |||||
name: no_table_close_inside_row_after_newline | |||||
label: no table close while inside of a row after a newline | |||||
input: "{| \n|- \n " | |||||
output: [Text(text="{| \n|- \n ")] | |||||
--- | |||||
name: no_table_close_row_and_cell | |||||
label: no table close while inside a cell inside a row | |||||
input: "{| \n|- \n|" | |||||
output: [Text(text="{| \n|- \n|")] | |||||
--- | |||||
name: no_table_close_attributes | |||||
label: don't parse attributes as attributes if the table doesn't exist | |||||
input: "{| border="1"" | |||||
output: [Text(text="{| border=\"1\"")] | |||||
--- | |||||
name: no_table_close_unclosed_attributes | |||||
label: don't parse unclosed attributes if the table doesn't exist | |||||
input: "{| border=" | |||||
output: [Text(text="{| border=")] | |||||
--- | |||||
name: no_table_close_row_attributes | |||||
label: don't parse row attributes as attributes if the table doesn't exist | |||||
input: "{| |- border="1"" | |||||
output: [Text(text="{| |- border=\"1\"")] | |||||
--- | |||||
name: no_table_close_cell | |||||
label: don't parse cells if the table doesn't close | |||||
input: "{| | border="1"| test || red | foo" | |||||
output: [Text(text="{| | border=\"1\"| test || red | foo")] | |||||
--- | |||||
name: crazy_no_table_close | |||||
label: lots of opened wiki syntax without closes | |||||
input: "{{{ {{ {| <ref" | |||||
output: [Text(text="{{{ {{ {| <ref")] | |||||
--- | |||||
name: leading_whitespace_table | |||||
label: handle leading whitespace for a table | |||||
input: "foo \n \t {|\n|}" | |||||
output: [Text(text="foo \n \t "), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: whitespace_after_table | |||||
label: handle whitespace after a table close | |||||
input: "{|\n|}\n \t " | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text="\n \t ")] | |||||
--- | |||||
name: different_whitespace_after_table | |||||
label: handle spaces after a table close | |||||
input: "{|\n|} \n " | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n ")] | |||||
--- | |||||
name: characters_after_table | |||||
label: handle characters after a table close | |||||
input: "{|\n|} tsta" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" tsta")] | |||||
--- | |||||
name: characters_after_inline_table | |||||
label: handle characters after an inline table close | |||||
input: "{| |} tsta" | |||||
output: [Text(text="{| |} tsta")] | |||||
--- | |||||
name: leading_characters_table | |||||
label: don't parse as a table when leading characters are not newline or whitespace | |||||
input: "foo \n foo \t {|\n|}" | |||||
output: [Text(text="foo \n foo \t {|\n|}")] | |||||
--- | |||||
name: table_row_simple | |||||
label: simple table row | |||||
input: "{|\n |- \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_row_multiple | |||||
label: multiple table rows
input: "{|\n |- \n|- \n |-\n |}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_simple | |||||
label: simple table cell | |||||
input: "{|\n | foo \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_inline | |||||
label: multiple inline table cells | |||||
input: "{|\n | foo || bar || test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_multiple | |||||
label: multiple table cells (non-inline) | |||||
input: "{|\n| foo \n| bar \n| test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_header_simple | |||||
label: simple header cell | |||||
input: "{|\n ! foo \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_header_inline | |||||
label: multiple inline header cells | |||||
input: "{|\n ! foo || bar !! test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_header_multiple | |||||
label: multiple table header cells (non-inline) | |||||
input: "{|\n! foo \n! bar \n! test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagCloseOpen(padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: nested_cells_and_rows | |||||
label: combination of cells and rows in a table | |||||
input: "{|\n|- \n| foo \n|- \n| bar\n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding=" \n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" bar\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_fake_close | |||||
label: looks like a table close but is not | |||||
input: "{|\n | |} \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text="} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_more_fake_close | |||||
label: looks like a table close but is not, with a double-pipe cell marker
input: "{|\n || |} \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" |} \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_extra_close | |||||
label: process second close as text | |||||
input: "{| \n |} \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose(), Text(text=" \n|}")] | |||||
--- | |||||
name: nowiki_inside_table | |||||
label: nowiki handles pipe characters in tables | |||||
input: "{|\n | foo <nowiki>| |- {| |} || ! !!</nowiki> bar \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! !!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_text_outside_cell | |||||
label: parse text inside table but outside of a cell | |||||
input: "{|\n bar \n | foo \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: no_table_cell_with_leading_characters | |||||
label: fail to create a table cell when there are leading non-whitespace characters | |||||
input: "{|\n bar | foo \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: no_table_row_with_leading_characters | |||||
label: fail to create a table row when there are leading non-whitespace characters | |||||
input: "{|\n bar |- foo \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: template_inside_table_cell | |||||
label: template within table cell | |||||
input: "{|\n |{{foo\n|bar=baz}} \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_attributes | |||||
label: parse table cell style attributes | |||||
input: "{| \n | name="foo bar"| test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_empty_attributes | |||||
label: parse table cell with style markers but no attributes | |||||
input: "{| \n | | test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_with_dash | |||||
label: parse a situation in which a cell line looks like a row line | |||||
input: "{|\n ||- \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_attributes_quote_with_pipe | |||||
label: pipe inside an attribute quote should still be used as a style separator | |||||
input: "{| \n | name="foo|bar"| test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_attributes_name_with_pipe | |||||
label: pipe inside an attribute name should still be used as a style separator | |||||
input: "{| \n | name|="foo bar" | test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_attributes_pipe_after_equals | |||||
label: pipe inside an attribute should still be used as a style separator after an equals | |||||
input: "{| \n | name=|"foo|bar"| test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseOpen(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_cell_attributes_templates | |||||
label: pipe inside attributes shouldn't be style separator | |||||
input: "{| \n | {{comment|template=baz}} | test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: header_cell_attributes | |||||
label: parse header cell style attributes | |||||
input: "{| \n ! name="foo bar"| test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: inline_cell_attributes | |||||
label: parse cell style attributes of inline cells | |||||
input: "{| \n ! name="foo bar" | test ||color="red"| markup!!foo | time \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_row_attributes | |||||
label: parse table row style attributes | |||||
input: "{| \n |- name="foo bar"\n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_row_attributes_crazy_whitespace | |||||
label: parse table row style attributes with different whitespace | |||||
input: "{| \t \n |- \t name="foo bar" \t \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=" \t \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: table_attributes | |||||
label: parse table style attributes | |||||
input: "{| name="foo bar"\n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: inline_table_attributes | |||||
label: handle attributes in inline tables | |||||
input: "{| foo="tee bar" |}" | |||||
output: [Text(text='{| foo="tee bar" |}')] | |||||
--- | |||||
name: table_incorrect_attributes | |||||
label: parse incorrect table style attributes | |||||
input: "{| name="foo\n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: templates_in_table_attribute | |||||
label: templates in the attributes of a table, after the start | |||||
input: "{| {{class}}="{{wikitable}}"\n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="class"), TemplateClose(), TagAttrEquals(), TagAttrQuote(char="\""), TemplateOpen(), Text(text="wikitable"), TemplateClose(), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: templates_in_table_attribute_2 | |||||
label: templates in the attributes of a table, after the start | |||||
input: "{|{{foo}} \n | name="foo bar" | test \n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: inappropriate_marker_at_line_start | |||||
label: an inappropriate marker (a right bracket) at the start of a line in the table | |||||
input: "{|\n}" | |||||
output: [Text(text="{|\n}")] | |||||
--- | |||||
name: fake_close_near_start | |||||
label: a fake closing token at the end of the first line in the table | |||||
input: "{| class="wikitable" style="text-align: center; width=100%;|}\n|\n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"text-align:"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="center;"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="width"), TagAttrEquals(), Text(text="100%;|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: fake_close_near_start_2 | |||||
label: a fake closing token at the end of the first line in the table | |||||
input: "{| class="wikitable|}"\n|\n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: junk_after_table_start | |||||
label: ignore more junk on the first line of the table | |||||
input: "{| class="wikitable" | foobar\n|\n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="foobar"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] | |||||
--- | |||||
name: junk_after_table_row | |||||
label: ignore junk on the first line of a table row | |||||
input: "{|\n|- foo="bar" | baz\n|blerp\n|}" | |||||
output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="bar"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="baz"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="blerp\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] |
@@ -447,6 +447,13 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tag | |||||
--- | --- | ||||
name: dt_dd_mix4 | |||||
label: another example of correct dt/dd usage, with a trigger for a specific parse route | |||||
input: ";foo]:bar" | |||||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo]"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")] | |||||
--- | |||||
name: ul_ol_dt_dd_mix | name: ul_ol_dt_dd_mix | ||||
label: an assortment of uls, ols, dds, and dts | label: an assortment of uls, ols, dds, and dts | ||||
input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo" | input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo" | ||||