diff --git a/CHANGELOG b/CHANGELOG
index b4b01d6..3471531 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,7 @@ v0.4 (unreleased):
 
 - The parser is now distributed with Windows binaries, fixing an issue that
   prevented Windows users from using the C tokenizer.
+- Added support for parsing wikicode tables (patches by David Winegar).
 - Added a script to test for memory leaks in scripts/memtest.py.
 - Added a script to do releases in scripts/release.sh.
 - skip_style_tags can now be passed to mwparserfromhell.parse() (previously,
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 9fdfef2..b3e7548 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -9,6 +9,7 @@ Unreleased
 
 - The parser is now distributed with Windows binaries, fixing an issue that
   prevented Windows users from using the C tokenizer.
+- Added support for parsing wikicode tables (patches by David Winegar).
 - Added a script to test for memory leaks in :file:`scripts/memtest.py`.
 - Added a script to do releases in :file:`scripts/release.sh`.
 - *skip_style_tags* can now be passed to :func:`mwparserfromhell.parse()
diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py
index 6020ad1..af41f49 100644
--- a/mwparserfromhell/definitions.py
+++ b/mwparserfromhell/definitions.py
@@ -52,7 +52,7 @@ INVISIBLE_TAGS = [
 
 # [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762
 SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
-SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]
+SINGLE = SINGLE_ONLY + ["li", "dt", "dd", "th", "td", "tr"]
 
 MARKUP_TO_HTML = {
     "#": "li",
diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index 7cbe78d..e3c7260 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -35,7 +35,8 @@ class Tag(Node):
 
     def __init__(self, tag, contents=None, attrs=None, wiki_markup=None,
                  self_closing=False, invalid=False, implicit=False, padding="",
-                 closing_tag=None):
+                 closing_tag=None, wiki_style_separator=None,
+                 closing_wiki_markup=None):
         super(Tag, self).__init__()
         self._tag = tag
         if contents is None and not self_closing:
@@ -52,13 +53,28 @@ class Tag(Node):
             self._closing_tag = closing_tag
         else:
             self._closing_tag = tag
+        self._wiki_style_separator = wiki_style_separator
+        if closing_wiki_markup is not None:
+            self._closing_wiki_markup = closing_wiki_markup
+        elif wiki_markup and not self_closing:
+            self._closing_wiki_markup = wiki_markup
+        else:
+            self._closing_wiki_markup = None
 
     def __unicode__(self):
         if self.wiki_markup:
+            if self.attributes:
+                attrs = "".join([str(attr) for attr in self.attributes])
+            else:
+                attrs = ""
+            padding = self.padding or ""
+            separator = self.wiki_style_separator or ""
+            close = self.closing_wiki_markup or ""
             if self.self_closing:
-                return self.wiki_markup
+                return self.wiki_markup + attrs + padding + separator
             else:
-                return self.wiki_markup + str(self.contents) + self.wiki_markup
+                return self.wiki_markup + attrs + padding + separator + \
+                       str(self.contents) + close
 
         result = ("</" if self.invalid else "<") + str(self.tag)
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ ... @@ static void* Tokenizer_fail_route(Tokenizer* self)
-    int context = self->topstack->context;
+    uint64_t context = self->topstack->context;
     PyObject* stack = Tokenizer_pop(self);
 
     Py_XDECREF(stack);
@@ -676,11 +676,8 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
         RESET_ROUTE();
         for (i = 0; i < braces; i++) text[i] = '{';
         text[braces] = '\0';
-        if (Tokenizer_emit_text_then_stack(self, text)) {
-            Py_XDECREF(text);
+        if (Tokenizer_emit_text_then_stack(self, text))
             return -1;
-        }
-        Py_XDECREF(text);
         return 0;
     }
     else
@@ -1034,7 +1031,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
 {
     // Built from Tokenizer_parse()'s end sentinels:
     Py_UNICODE after = Tokenizer_READ(self, 2);
-    int ctx = self->topstack->context;
+    uint64_t ctx = self->topstack->context;
 
     return (!this || this == '\n' || this == '[' || this == ']' ||
             this == '<' || this == '>' || (this == '\'' && next == '\'') ||
@@ -1629,9 +1626,9 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
 static int Tokenizer_handle_tag_space(Tokenizer* self, TagData* data,
                                       Py_UNICODE text)
 {
-    int ctx = data->context;
-    int end_of_value = (ctx & TAG_ATTR_VALUE &&
-                        !(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));
+    uint64_t ctx = data->context;
+    uint64_t end_of_value = (ctx & TAG_ATTR_VALUE &&
+                             !(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));
 
     if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) {
         if (Tokenizer_push_tag_buffer(self, data))
@@ -2153,7 +2150,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag,
 static int Tokenizer_parse_italics(Tokenizer* self)
 {
     Py_ssize_t reset = self->head;
-    int context;
+    uint64_t context;
     PyObject *stack;
 
     stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
@@ -2273,7 +2270,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
 */
 static PyObject* Tokenizer_parse_style(Tokenizer* self)
 {
-    int context = self->topstack->context, ticks = 2, i;
+    uint64_t context = self->topstack->context, ticks = 2, i;
 
     self->head += 2;
     while (Tokenizer_READ(self, 0) == '\'') {
@@ -2426,9 +2423,363 @@ static int Tokenizer_handle_dl_term(Tokenizer* self)
 }
 
 /*
+    Emit a table tag.
+*/
+static int
+Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
+                         const char* tag, PyObject* style, PyObject* padding,
+                         const char* close_open_markup, PyObject* contents,
+                         const char* open_close_markup)
+{
+    PyObject *open_open_kwargs, *open_open_markup_unicode, *close_open_kwargs,
+             *close_open_markup_unicode, *open_close_kwargs,
+             *open_close_markup_unicode;
+
+    open_open_kwargs = PyDict_New();
+    if (!open_open_kwargs)
+        goto fail_decref_all;
+    open_open_markup_unicode = PyUnicode_FromString(open_open_markup);
+    if (!open_open_markup_unicode) {
+        Py_DECREF(open_open_kwargs);
+        goto fail_decref_all;
+    }
+    PyDict_SetItemString(open_open_kwargs, "wiki_markup",
+                         open_open_markup_unicode);
+    Py_DECREF(open_open_markup_unicode);
+    if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs))
+        goto fail_decref_all;
+    if (Tokenizer_emit_text(self, tag))
+        goto fail_decref_all;
+
+    if (style) {
+        if (Tokenizer_emit_all(self, style))
+            goto fail_decref_all;
+        Py_DECREF(style);
+    }
+
+    close_open_kwargs = PyDict_New();
+    if (!close_open_kwargs)
+        goto fail_decref_padding_contents;
+    if (close_open_markup && strlen(close_open_markup) != 0) {
+        close_open_markup_unicode = PyUnicode_FromString(close_open_markup);
+        if (!close_open_markup_unicode) {
+            Py_DECREF(close_open_kwargs);
+            goto fail_decref_padding_contents;
+        }
+        PyDict_SetItemString(close_open_kwargs, "wiki_markup",
+                             close_open_markup_unicode);
+        Py_DECREF(close_open_markup_unicode);
+    }
+    PyDict_SetItemString(close_open_kwargs, "padding", padding);
+    Py_DECREF(padding);
+    if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs))
+        goto fail_decref_contents;
+
+    if (contents) {
+        if (Tokenizer_emit_all(self, contents))
+            goto fail_decref_contents;
+        Py_DECREF(contents);
+    }
+
+    open_close_kwargs = PyDict_New();
+    if (!open_close_kwargs)
+        return -1;
+    open_close_markup_unicode = PyUnicode_FromString(open_close_markup);
+    if (!open_close_markup_unicode) {
+        Py_DECREF(open_close_kwargs);
+        return -1;
+    }
+    PyDict_SetItemString(open_close_kwargs, "wiki_markup",
+                         open_close_markup_unicode);
+    Py_DECREF(open_close_markup_unicode);
+    if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs))
+        return -1;
+    if (Tokenizer_emit_text(self, tag))
+        return -1;
+    if (Tokenizer_emit(self, TagCloseClose))
+        return -1;
+    return 0;
+
+    fail_decref_all:
+    Py_XDECREF(style);
+    fail_decref_padding_contents:
+    Py_DECREF(padding);
+    fail_decref_contents:
+    Py_DECREF(contents);
+    return -1;
+}
+
+/*
+    Handle style attributes for a table until an ending token.
+*/
+static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
+{
+    TagData *data = TagData_new();
+    PyObject *padding, *trash;
+    Py_UNICODE this;
+    int can_exit;
+
+    if (!data)
+        return NULL;
+    data->context = TAG_ATTR_READY;
+
+    while (1) {
+        this = Tokenizer_READ(self, 0);
+        can_exit = (!(data->context & TAG_QUOTED) ||
+                    data->context & TAG_NOTE_SPACE);
+        if (this == end_token && can_exit) {
+            if (data->context & (TAG_ATTR_NAME | TAG_ATTR_VALUE)) {
+                if (Tokenizer_push_tag_buffer(self, data)) {
+                    TagData_dealloc(data);
+                    return NULL;
+                }
+            }
+            if (Py_UNICODE_ISSPACE(this))
+                Textbuffer_write(&(data->pad_first), this);
+            padding = Textbuffer_render(data->pad_first);
+            TagData_dealloc(data);
+            if (!padding)
+                return NULL;
+            return padding;
+        }
+        else if (!this || this == end_token) {
+            if (self->topstack->context & LC_TAG_ATTR) {
+                if (data->context & TAG_QUOTED) {
+                    // Unclosed attribute quote: reset, don't die
+                    data->context = TAG_ATTR_VALUE;
+                    trash = Tokenizer_pop(self);
+                    Py_XDECREF(trash);
+                    self->head = data->reset;
+                    continue;
+                }
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+            }
+            TagData_dealloc(data);
+            return Tokenizer_fail_route(self);
+        }
+        else {
+            if (Tokenizer_handle_tag_data(self, data, this) || BAD_ROUTE) {
+                TagData_dealloc(data);
+                return NULL;
+            }
+        }
+        self->head++;
+    }
+}
+
+/*
+    Parse a wikicode table by starting with the first line.
+*/
+static int Tokenizer_parse_table(Tokenizer* self)
+{
+    Py_ssize_t reset = self->head + 1;
+    PyObject *style, *padding;
+    PyObject *table = NULL;
+
+    self->head += 2;
+    if (Tokenizer_push(self, LC_TABLE_OPEN))
+        return -1;
+    padding = Tokenizer_handle_table_style(self, '\n');
+    if (BAD_ROUTE) {
+        RESET_ROUTE();
+        self->head = reset;
+        if (Tokenizer_emit_text(self, "{|"))
+            return -1;
+        return 0;
+    }
+    if (!padding)
+        return -1;
+    style = Tokenizer_pop(self);
+    if (!style) {
+        Py_DECREF(padding);
+        return -1;
+    }
+
+    self->head++;
+    table = Tokenizer_parse(self, LC_TABLE_OPEN, 1);
+    if (BAD_ROUTE) {
+        RESET_ROUTE();
+        Py_DECREF(padding);
+        Py_DECREF(style);
+        self->head = reset;
+        if (Tokenizer_emit_text(self, "{|"))
+            return -1;
+        return 0;
+    }
+    if (!table) {
+        Py_DECREF(padding);
+        Py_DECREF(style);
+        return -1;
+    }
+
+    if (Tokenizer_emit_table_tag(self, "{|", "table", style, padding, NULL,
+                                 table, "|}"))
+        return -1;
+    // Offset displacement done by _parse()
+    self->head--;
+    return 0;
+}
+
+/*
+    Parse as style until end of the line, then continue.
+*/
+static int Tokenizer_handle_table_row(Tokenizer* self)
+{
+    PyObject *padding, *style, *row, *trash;
+
+    self->head += 2;
+    if (!Tokenizer_CAN_RECURSE(self)) {
+        if (Tokenizer_emit_text(self, "|-"))
+            return -1;
+        self->head -= 1;
+        return 0;
+    }
+
+    if (Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN))
+        return -1;
+    padding = Tokenizer_handle_table_style(self, '\n');
+    if (BAD_ROUTE) {
+        trash = Tokenizer_pop(self);
+        Py_XDECREF(trash);
+        return 0;
+    }
+    if (!padding)
+        return -1;
+    style = Tokenizer_pop(self);
+    if (!style) {
+        Py_DECREF(padding);
+        return -1;
+    }
+
+    // Don't parse the style separator
+    self->head++;
+    row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1);
+    if (!row) {
+        Py_DECREF(padding);
+        Py_DECREF(style);
+        return -1;
+    }
+
+    if (Tokenizer_emit_table_tag(self, "|-", "tr", style, padding, NULL,
+                                 row, ""))
+        return -1;
+    // Offset displacement done by _parse()
+    self->head--;
+    return 0;
+}
+
+/*
+    Parse as normal syntax unless we hit a style marker, then parse style
+    as HTML attributes and the remainder as normal syntax.
+*/
+static int
+Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
+                            const char *tag, uint64_t line_context)
+{
+    uint64_t old_context = self->topstack->context;
+    uint64_t cell_context;
+    Py_ssize_t reset;
+    PyObject *padding, *cell, *style = NULL;
+    const char *close_open_markup = NULL;
+
+    self->head += strlen(markup);
+    reset = self->head;
+
+    if (!Tokenizer_CAN_RECURSE(self)) {
+        if (Tokenizer_emit_text(self, markup))
+            return -1;
+        self->head--;
+        return 0;
+    }
+
+    cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
+                                 LC_TABLE_CELL_STYLE | line_context, 1);
+    if (!cell)
+        return -1;
+    cell_context = self->topstack->context;
+    self->topstack->context = old_context;
+
+    if (cell_context & LC_TABLE_CELL_STYLE) {
+        Py_DECREF(cell);
+        self->head = reset;
+        if (Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
+                                 line_context))
+            return -1;
+        padding = Tokenizer_handle_table_style(self, '|');
+        if (!padding)
+            return -1;
+        style = Tokenizer_pop(self);
+        if (!style) {
+            Py_DECREF(padding);
+            return -1;
+        }
+        // Don't parse the style separator
+        self->head++;
+        cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
+                                     line_context, 1);
+        if (!cell) {
+            Py_DECREF(padding);
+            Py_DECREF(style);
+            return -1;
+        }
+        cell_context = self->topstack->context;
+        self->topstack->context = old_context;
+    }
+    else {
+        padding = PyUnicode_FromString("");
+        if (!padding) {
+            Py_DECREF(cell);
+            return -1;
+        }
+    }
+
+    if (style) {
+        close_open_markup = "|";
+    }
+    if (Tokenizer_emit_table_tag(self, markup, tag, style, padding,
+                                 close_open_markup, cell, ""))
+        return -1;
+    // Keep header/cell line contexts
+    self->topstack->context |= cell_context &
+                               (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE);
+    // Offset displacement done by parse()
+    self->head--;
+    return 0;
+}
+
+/*
+    Handle the end of a table cell: set or clear LC_TABLE_CELL_STYLE on the
+    current context, then return the stack while keeping that context.
+*/
+static PyObject*
+Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style)
+{
+    if (reset_for_style)
+        self->topstack->context |= LC_TABLE_CELL_STYLE;
+    else
+        self->topstack->context &= ~LC_TABLE_CELL_STYLE;
+    return Tokenizer_pop_keeping_context(self);
+}
+
+/*
+    Return the stack in order to handle the table row end.
+*/
+static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
+{
+    return Tokenizer_pop(self);
+}
+
+/*
+    Return the stack in order to handle the table end.
+*/
+static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
+{
+    self->head += 2;
+    return Tokenizer_pop(self);
+}
+
+/*
     Handle the end of the stream of wikitext.
 */
-static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
+static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
 {
     PyObject *token, *text, *trash;
     int single;
 
@@ -2444,9 +2795,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
         if (single)
             return Tokenizer_handle_single_tag_end(self);
     }
-    else if (context & AGG_DOUBLE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
+    else {
+        if (context & LC_TABLE_CELL_OPEN) {
+            trash = Tokenizer_pop(self);
+            Py_XDECREF(trash);
+            context = self->topstack->context;
+        }
+        if (context & AGG_DOUBLE) {
+            trash = Tokenizer_pop(self);
+            Py_XDECREF(trash);
+        }
     }
     return Tokenizer_fail_route(self);
 }
@@ -2457,7 +2815,8 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
     Make sure we are not trying to write an invalid character. Return 0 if
     everything is safe, or -1 if the route must be failed.
 */
-static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
+static int
+Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
 {
     if (context & LC_FAIL_NEXT)
         return -1;
@@ -2508,7 +2867,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
     }
     else if (context & LC_FAIL_ON_LBRACE) {
         if (data == '{' || (Tokenizer_READ_BACKWARDS(self, 1) == '{' &&
-                                Tokenizer_READ_BACKWARDS(self, 2) == '{')) {
+                            Tokenizer_READ_BACKWARDS(self, 2) == '{')) {
             if (context & LC_TEMPLATE)
                 self->topstack->context |= LC_FAIL_ON_EQUALS;
             else
@@ -2533,12 +2892,30 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
 }
 
 /*
+    Return whether the text immediately before the current head, back to the
+    start of the line, consists only of whitespace.
+    TODO: treat comments and templates as whitespace, allow fail on
+    non-newline spaces.
+*/
+static int Tokenizer_has_leading_whitespace(Tokenizer* self)
+{
+    int offset = 1;
+    Py_UNICODE current_character;
+
+    while (1) {
+        current_character = Tokenizer_READ_BACKWARDS(self, offset);
+        if (!current_character || current_character == '\n')
+            return 1;
+        else if (!Py_UNICODE_ISSPACE(current_character))
+            return 0;
+        offset++;
+    }
+}
+
+/*
     Parse the wikicode string, using context for when to stop. If push is true,
     we will push a new context, otherwise we won't and context will be ignored.
 */
-static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
+static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
 {
-    int this_context;
+    uint64_t this_context;
     Py_UNICODE this, next, next_next, last;
     PyObject* temp;
 
@@ -2667,22 +3044,99 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
             if (temp != Py_None)
                 return temp;
         }
-        else if (!last || last == '\n') {
-            if (this == '#' || this == '*' || this == ';' || this == ':') {
-                if (Tokenizer_handle_list(self))
-                    return NULL;
-            }
-            else if (this == '-' && this == next &&
-                     this == Tokenizer_READ(self, 2) &&
-                     this == Tokenizer_READ(self, 3)) {
-                if (Tokenizer_handle_hr(self))
-                    return NULL;
-            }
-            else if (Tokenizer_emit_char(self, this))
-                return NULL;
-        }
-        else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) {
-            if (Tokenizer_handle_dl_term(self))
-                return NULL;
-        }
+        else if ((!last || last == '\n') &&
+                 (this == '#' || this == '*' || this == ';' || this == ':')) {
+            if (Tokenizer_handle_list(self))
+                return NULL;
+        }
+        else if ((!last || last == '\n') &&
+                 (this == '-' && this == next &&
+                  this == Tokenizer_READ(self, 2) &&
+                  this == Tokenizer_READ(self, 3))) {
+            if (Tokenizer_handle_hr(self))
+                return NULL;
+        }
+        else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) {
+            if (Tokenizer_handle_dl_term(self))
+                return NULL;
+            // Kill potential table contexts
+            if (this == '\n')
+                self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS;
+        }
+
+        // Start of table parsing
+        else if (this == '{' && next == '|' &&
+                 Tokenizer_has_leading_whitespace(self)) {
+            if (Tokenizer_CAN_RECURSE(self)) {
+                if (Tokenizer_parse_table(self))
+                    return NULL;
+            }
+            else if (Tokenizer_emit_char(self, this) ||
+                     Tokenizer_emit_char(self, next))
+                return NULL;
+            else
+                self->head++;
+        }
+        else if (this_context & LC_TABLE_OPEN) {
+            if (this == '|' && next == '|' &&
+                this_context & LC_TABLE_TD_LINE) {
+                if (this_context & LC_TABLE_CELL_OPEN)
+                    return Tokenizer_handle_table_cell_end(self, 0);
+                else if (Tokenizer_handle_table_cell(self, "||", "td",
+                                                     LC_TABLE_TD_LINE))
+                    return NULL;
+            }
+            else if (this == '|' && next == '|' &&
+                     this_context & LC_TABLE_TH_LINE) {
+                if (this_context & LC_TABLE_CELL_OPEN)
+                    return Tokenizer_handle_table_cell_end(self, 0);
+                else if (Tokenizer_handle_table_cell(self, "||", "th",
+                                                     LC_TABLE_TH_LINE))
+                    return NULL;
+            }
+            else if (this == '!' && next == '!' &&
+                     this_context & LC_TABLE_TH_LINE) {
+                if (this_context & LC_TABLE_CELL_OPEN)
+                    return Tokenizer_handle_table_cell_end(self, 0);
+                else if (Tokenizer_handle_table_cell(self, "!!", "th",
+                                                     LC_TABLE_TH_LINE))
+                    return NULL;
+            }
+            else if (this == '|' && this_context & LC_TABLE_CELL_STYLE) {
+                return Tokenizer_handle_table_cell_end(self, 1);
+            }
+            // On newline, clear out cell line contexts
+            else if (this == '\n' &&
+                     this_context & LC_TABLE_CELL_LINE_CONTEXTS) {
+                self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS;
+                if (Tokenizer_emit_char(self, this))
+                    return NULL;
+            }
+            else if (Tokenizer_has_leading_whitespace(self)) {
+                if (this == '|' && next == '}') {
+                    if (this_context & LC_TABLE_CELL_OPEN)
+                        return Tokenizer_handle_table_cell_end(self, 0);
+                    if (this_context & LC_TABLE_ROW_OPEN)
+                        return Tokenizer_handle_table_row_end(self);
+                    else
+                        return Tokenizer_handle_table_end(self);
+                }
+                else if (this == '|' && next == '-') {
+                    if (this_context & LC_TABLE_CELL_OPEN)
+                        return Tokenizer_handle_table_cell_end(self, 0);
+                    if (this_context & LC_TABLE_ROW_OPEN)
+                        return Tokenizer_handle_table_row_end(self);
+                    else if (Tokenizer_handle_table_row(self))
+                        return NULL;
+                }
+                else if (this == '|') {
+                    if (this_context & LC_TABLE_CELL_OPEN)
+                        return Tokenizer_handle_table_cell_end(self, 0);
+                    else if (Tokenizer_handle_table_cell(self, "|", "td",
+                                                         LC_TABLE_TD_LINE))
+                        return NULL;
+                }
+                else if (this == '!') {
+                    if (this_context & LC_TABLE_CELL_OPEN)
+                        return Tokenizer_handle_table_cell_end(self, 0);
+                    else if (Tokenizer_handle_table_cell(self, "!", "th",
+                                                         LC_TABLE_TH_LINE))
+                        return NULL;
+                }
+                else if (Tokenizer_emit_char(self, this))
+                    return NULL;
+            }
+            else if (Tokenizer_emit_char(self, this))
+                return NULL;
+            // Raise BadRoute to table start
+            if (BAD_ROUTE)
+                return NULL;
+        }
         else if (Tokenizer_emit_char(self, this))
             return NULL;
@@ -2697,7 +3151,8 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
     PyObject *text, *temp, *tokens;
-    int context = 0, skip_style_tags = 0;
+    uint64_t context = 0;
+    int skip_style_tags = 0;
 
     if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
         Py_XDECREF(self->text);
@@ -2725,7 +3180,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     self->skip_style_tags = skip_style_tags;
     tokens = Tokenizer_parse(self, context, 1);
 
-    if (!tokens && !PyErr_Occurred()) {
+    if ((!tokens && !PyErr_Occurred()) || self->topstack) {
         if (!ParserError) {
             if (load_exceptions())
                 return NULL;
@@ -2734,6 +3189,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
         RESET_ROUTE();
         PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
     }
+    else if (self->topstack)
+        PyErr_SetString(ParserError,
+                        "C tokenizer exited with non-empty token stack");
     else
         PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
     return NULL;
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index dde6464..33ba0e1 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -29,6 +29,7 @@ SOFTWARE.
 #include <Python.h>
 #include <math.h>
 #include <structmember.h>
+#include <stdint.h>
 
 #if PY_MAJOR_VERSION >= 3
 #define IS_PY3K
@@ -43,16 +44,17 @@ SOFTWARE.
 static const char MARKERS[] = {
     '{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
-    '-', '\n', '\0'};
+    '-', '!', '\n', '\0'};
 
-#define NUM_MARKERS 18
+#define NUM_MARKERS 19
 #define TEXTBUFFER_BLOCKSIZE 1024
 #define MAX_DEPTH 40
 #define MAX_CYCLES 100000
 #define MAX_BRACES 255
 #define MAX_ENTITY_SIZE 8
 
-static int route_state = 0, route_context = 0;
+static int route_state = 0;
+static uint64_t route_context = 0;
 
 #define BAD_ROUTE route_state
 #define BAD_ROUTE_CONTEXT route_context
 #define FAIL_ROUTE(context) route_state = 1; route_context = context
@@ -109,52 +111,61 @@ static PyObject* TagCloseClose;
 
 /* Local contexts: */
 
-#define LC_TEMPLATE              0x00000007
-#define LC_TEMPLATE_NAME         0x00000001
-#define LC_TEMPLATE_PARAM_KEY    0x00000002
-#define LC_TEMPLATE_PARAM_VALUE  0x00000004
-
-#define LC_ARGUMENT              0x00000018
-#define LC_ARGUMENT_NAME         0x00000008
-#define LC_ARGUMENT_DEFAULT      0x00000010
-
-#define LC_WIKILINK              0x00000060
-#define LC_WIKILINK_TITLE        0x00000020
-#define LC_WIKILINK_TEXT         0x00000040
-
-#define LC_EXT_LINK              0x00000180
-#define LC_EXT_LINK_URI          0x00000080
-#define LC_EXT_LINK_TITLE        0x00000100
-
-#define LC_HEADING               0x00007E00
-#define LC_HEADING_LEVEL_1       0x00000200
-#define LC_HEADING_LEVEL_2       0x00000400
-#define LC_HEADING_LEVEL_3       0x00000800
-#define LC_HEADING_LEVEL_4       0x00001000
-#define LC_HEADING_LEVEL_5       0x00002000
-#define LC_HEADING_LEVEL_6       0x00004000
-
-#define LC_TAG                   0x00078000
-#define LC_TAG_OPEN              0x00008000
-#define LC_TAG_ATTR              0x00010000
-#define LC_TAG_BODY              0x00020000
-#define LC_TAG_CLOSE             0x00040000
-
-#define LC_STYLE                 0x00780000
-#define LC_STYLE_ITALICS         0x00080000
-#define LC_STYLE_BOLD            0x00100000
-#define LC_STYLE_PASS_AGAIN      0x00200000
-#define LC_STYLE_SECOND_PASS     0x00400000
-
-#define LC_DLTERM                0x00800000
-
-#define LC_SAFETY_CHECK          0x3F000000
-#define LC_HAS_TEXT              0x01000000
-#define LC_FAIL_ON_TEXT          0x02000000
-#define LC_FAIL_NEXT             0x04000000
-#define LC_FAIL_ON_LBRACE        0x08000000
-#define LC_FAIL_ON_RBRACE        0x10000000
-#define LC_FAIL_ON_EQUALS        0x20000000
+#define LC_TEMPLATE                 0x0000000000000007
+#define LC_TEMPLATE_NAME            0x0000000000000001
+#define LC_TEMPLATE_PARAM_KEY       0x0000000000000002
+#define LC_TEMPLATE_PARAM_VALUE     0x0000000000000004
+
+#define LC_ARGUMENT                 0x0000000000000018
+#define LC_ARGUMENT_NAME            0x0000000000000008
+#define LC_ARGUMENT_DEFAULT         0x0000000000000010
+
+#define LC_WIKILINK                 0x0000000000000060
+#define LC_WIKILINK_TITLE           0x0000000000000020
+#define LC_WIKILINK_TEXT            0x0000000000000040
+
+#define LC_EXT_LINK                 0x0000000000000180
+#define LC_EXT_LINK_URI             0x0000000000000080
+#define LC_EXT_LINK_TITLE           0x0000000000000100
+
+#define LC_HEADING                  0x0000000000007E00
+#define LC_HEADING_LEVEL_1          0x0000000000000200
+#define LC_HEADING_LEVEL_2          0x0000000000000400
+#define LC_HEADING_LEVEL_3          0x0000000000000800
+#define LC_HEADING_LEVEL_4          0x0000000000001000
+#define LC_HEADING_LEVEL_5          0x0000000000002000
+#define LC_HEADING_LEVEL_6          0x0000000000004000
+
+#define LC_TAG                      0x0000000000078000
+#define LC_TAG_OPEN                 0x0000000000008000
+#define LC_TAG_ATTR                 0x0000000000010000
+#define LC_TAG_BODY                 0x0000000000020000
+#define LC_TAG_CLOSE                0x0000000000040000
+
+#define LC_STYLE                    0x0000000000780000
+#define LC_STYLE_ITALICS            0x0000000000080000
+#define LC_STYLE_BOLD               0x0000000000100000
+#define LC_STYLE_PASS_AGAIN         0x0000000000200000
+#define LC_STYLE_SECOND_PASS        0x0000000000400000
+
+#define LC_DLTERM                   0x0000000000800000
+
+#define LC_SAFETY_CHECK             0x000000003F000000
+#define LC_HAS_TEXT                 0x0000000001000000
+#define LC_FAIL_ON_TEXT             0x0000000002000000
+#define LC_FAIL_NEXT                0x0000000004000000
+#define LC_FAIL_ON_LBRACE           0x0000000008000000
+#define LC_FAIL_ON_RBRACE           0x0000000010000000
+#define LC_FAIL_ON_EQUALS           0x0000000020000000
+
+#define LC_TABLE                    0x0000000FC0000000
+#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000
+#define LC_TABLE_OPEN               0x0000000040000000
+#define LC_TABLE_CELL_OPEN          0x0000000080000000
+#define LC_TABLE_CELL_STYLE         0x0000000100000000
+#define LC_TABLE_ROW_OPEN           0x0000000200000000
+#define LC_TABLE_TD_LINE            0x0000000400000000
+#define LC_TABLE_TH_LINE            0x0000000800000000
 
 /* Global contexts: */
 
@@ -162,9 +173,9 @@ static PyObject* TagCloseClose;
 
 /* Aggregate contexts: */
 
-#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE)
+#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
 #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
-#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
+#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
 #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
 #define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
 
@@ -191,7 +202,7 @@ struct Textbuffer {
 
 struct Stack {
     PyObject* stack;
-    int context;
+    uint64_t context;
     struct Textbuffer* textbuffer;
    struct Stack* next;
 };
@@ -202,7 +213,7 @@ typedef struct {
 } HeadingData;
 
 typedef struct {
-    int context;
+    uint64_t context;
     struct Textbuffer* pad_first;
     struct Textbuffer* pad_before_eq;
     struct Textbuffer* pad_after_eq;
@@ -267,7 +278,7 @@ static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
 static int Tokenizer_handle_dl_term(Tokenizer*);
 static int Tokenizer_parse_tag(Tokenizer*);
-static PyObject* Tokenizer_parse(Tokenizer*, int, int);
+static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
 static int load_exceptions(void);
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 073e64c..3ac25a5 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -63,7 +63,7 @@ class Tokenizer(object):
     START = object()
     END = object()
     MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
-               ":", "/", "-", "\n", START, END]
+               ":", "/", "-", "!", "\n", START, END]
     MAX_DEPTH = 40
     MAX_CYCLES = 100000
     regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
@@ -991,12 +991,166 @@ class Tokenizer(object):
         else:
             self._emit_text("\n")
 
+    def _emit_table_tag(self, open_open_markup, tag, style, padding,
+                        close_open_markup, contents, open_close_markup):
+        """Emit a table tag."""
+        self._emit(tokens.TagOpenOpen(wiki_markup=open_open_markup))
+        self._emit_text(tag)
+        if style:
+            self._emit_all(style)
+        if close_open_markup:
+            self._emit(tokens.TagCloseOpen(wiki_markup=close_open_markup,
+                                           padding=padding))
+        else:
+            self._emit(tokens.TagCloseOpen(padding=padding))
+        if contents:
+            self._emit_all(contents)
+        self._emit(tokens.TagOpenClose(wiki_markup=open_close_markup))
+        self._emit_text(tag)
+        self._emit(tokens.TagCloseClose())
+
+    def _handle_table_style(self, end_token):
+        """Handle style attributes for a table until ``end_token``."""
+        data = _TagOpenData()
+        data.context = _TagOpenData.CX_ATTR_READY
+        while True:
+            this = self._read()
+            can_exit = (not data.context & data.CX_QUOTED or
+                        data.context & data.CX_NOTE_SPACE)
+            if this == end_token and can_exit:
+                if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
+                    self._push_tag_buffer(data)
+                if this.isspace():
+                    data.padding_buffer["first"] += this
+                return data.padding_buffer["first"]
+            elif this is self.END or this == end_token:
+                if self._context & contexts.TAG_ATTR:
+                    if data.context & data.CX_QUOTED:
+                        # Unclosed attribute quote: reset, don't die
+                        data.context = data.CX_ATTR_VALUE
+                        self._pop()
+                        self._head = data.reset
+                        continue
+                    self._pop()
+                self._fail_route()
+            else:
+                self._handle_tag_data(data, this)
+            self._head += 1
+
+    def _parse_table(self):
+        """Parse a wikicode table by starting with the first line."""
+        reset = self._head + 1
+        self._head += 2
+        self._push(contexts.TABLE_OPEN)
+        try:
+            padding = self._handle_table_style("\n")
+        except BadRoute:
+            self._head = reset
+            self._emit_text("{|")
+            return
+        style = self._pop()
+
+        self._head += 1
+        try:
+            table = self._parse(contexts.TABLE_OPEN)
+        except BadRoute:
+            self._head = reset
+            self._emit_text("{|")
+            return
+
+        self._emit_table_tag("{|", "table", style, padding, None, table, "|}")
+        # Offset displacement done by _parse():
+        self._head -= 1
+
+    def _handle_table_row(self):
+        """Parse as style until end of the line, then continue."""
+        self._head += 2
+        if not self._can_recurse():
+            self._emit_text("|-")
+            self._head -= 1
+            return
+
+        self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
+        try:
+            padding = self._handle_table_style("\n")
+        except BadRoute:
+            self._pop()
+            raise
+        style = self._pop()
+
+        # Don't parse the style separator:
+        self._head += 1
+        row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
+
+        self._emit_table_tag("|-", "tr", style, padding, None, row, "")
+        # Offset displacement done by parse():
+        self._head -= 1
+
+    def _handle_table_cell(self, markup, tag, line_context):
+        """Parse as normal syntax unless we hit a style marker, then parse
+        style as HTML attributes and the remainder as normal syntax."""
+        old_context = self._context
+        padding, style = "", None
+        self._head += len(markup)
+        reset = self._head
+        if not self._can_recurse():
+            self._emit_text(markup)
+            self._head -= 1
+            return
+
+        cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                           line_context | contexts.TABLE_CELL_STYLE)
+        cell_context = self._context
+        self._context = old_context
+        reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
+        if reset_for_style:
+            self._head = reset
+            self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                       line_context)
+            padding = self._handle_table_style("|")
+            style = self._pop()
+            # Don't parse the style separator:
+            self._head += 1
+            cell = self._parse(contexts.TABLE_OPEN |
+                               contexts.TABLE_CELL_OPEN | line_context)
+            cell_context = self._context
+            self._context = old_context
+
+        close_open_markup = "|" if reset_for_style else None
+        self._emit_table_tag(markup, tag, style, padding, close_open_markup,
+                             cell, "")
+        # Keep header/cell line contexts:
+        self._context |= cell_context & (contexts.TABLE_TH_LINE |
+                                         contexts.TABLE_TD_LINE)
+        # Offset displacement done by parse():
+        self._head -= 1
+
+    def _handle_table_cell_end(self, reset_for_style=False):
+        """Return the stack, with TABLE_CELL_STYLE set on the current context
+        if the cell must be reset to parse style attributes."""
+        if reset_for_style:
+            self._context |= contexts.TABLE_CELL_STYLE
+        else:
+            self._context &= ~contexts.TABLE_CELL_STYLE
+        return self._pop(keep_context=True)
+
+    def _handle_table_row_end(self):
+        """Return the stack in order to handle the table row end."""
+        return self._pop()
+
+    def _handle_table_end(self):
+        """Return the stack in order to handle the table end."""
+        self._head += 2
+        return self._pop()
+
     def _handle_end(self):
         """Handle the end of the stream of wikitext."""
         if self._context & contexts.FAIL:
             if self._context & contexts.TAG_BODY:
                 if is_single(self._stack[1].text):
                     return self._handle_single_tag_end()
+            if self._context & contexts.TABLE_CELL_OPEN:
+                self._pop()
             if self._context & contexts.DOUBLE:
                 self._pop()
             self._fail_route()
@@ -1144,15 +1298,68 @@ class Tokenizer(object):
                 result = self._parse_style()
                 if result is not None:
                     return result
-            elif self._read(-1) in ("\n", self.START):
-                if this in ("#", "*", ";", ":"):
-                    self._handle_list()
-                elif this == next == self._read(2) == self._read(3) == "-":
-                    self._handle_hr()
-                else:
-                    self._emit_text(this)
+            elif (self._read(-1) in ("\n", self.START) and
+                    this in ("#", "*", ";", ":")):
+                self._handle_list()
+            elif (self._read(-1) in ("\n", self.START) and
+                    this == next == self._read(2) == self._read(3) == "-"):
+                self._handle_hr()
             elif this in ("\n", ":") and self._context & contexts.DL_TERM:
                 self._handle_dl_term()
+                if this == "\n":
+                    # Kill potential table contexts
+                    self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
+            # Start of table parsing
+            elif (this == "{" and next == "|" and
+                    (self._read(-1) in ("\n", self.START) or
+                     (self._read(-2) in ("\n", self.START) and
+                      self._read(-1).isspace()))):
+                if self._can_recurse():
+                    self._parse_table()
+                else:
+                    self._emit_text("{|")
+            elif self._context & contexts.TABLE_OPEN:
+                if (this == next == "|" and
+                        self._context & contexts.TABLE_TD_LINE):
+                    if self._context & contexts.TABLE_CELL_OPEN:
+                        return self._handle_table_cell_end()
+                    self._handle_table_cell("||", "td",
+                                            contexts.TABLE_TD_LINE)
+                elif (this == next == "|" and
+                        self._context & contexts.TABLE_TH_LINE):
+                    if self._context & contexts.TABLE_CELL_OPEN:
+                        return self._handle_table_cell_end()
+                    self._handle_table_cell("||", "th",
+                                            contexts.TABLE_TH_LINE)
+                elif (this == next == "!" and
+                        self._context & contexts.TABLE_TH_LINE):
+                    if self._context & contexts.TABLE_CELL_OPEN:
+                        return self._handle_table_cell_end()
+                    self._handle_table_cell("!!", "th",
+                                            contexts.TABLE_TH_LINE)
+                elif this == "|" and self._context & contexts.TABLE_CELL_STYLE:
+                    return self._handle_table_cell_end(reset_for_style=True)
+                # On newline, clear out cell line contexts
+                elif (this == "\n" and
+                        self._context & contexts.TABLE_CELL_LINE_CONTEXTS):
+                    self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS
+                    self._emit_text(this)
+                elif (self._read(-1) in ("\n", self.START) or
+                        (self._read(-2) in ("\n", self.START) and
+                         self._read(-1).isspace())):
+                    if this == "|" and next == "}":
+                        if self._context & contexts.TABLE_CELL_OPEN:
+                            return self._handle_table_cell_end()
+                        if self._context & contexts.TABLE_ROW_OPEN:
+                            return self._handle_table_row_end()
+                        return self._handle_table_end()
+                    elif this == "|" and next == "-":
+                        if self._context & contexts.TABLE_CELL_OPEN:
+                            return self._handle_table_cell_end()
+                        if self._context & contexts.TABLE_ROW_OPEN:
+                            return self._handle_table_row_end()
+                        self._handle_table_row()
+                    elif this == "|":
+                        if self._context & contexts.TABLE_CELL_OPEN:
+                            return self._handle_table_cell_end()
+                        self._handle_table_cell("|", "td",
+                                                contexts.TABLE_TD_LINE)
+                    elif this == "!":
+                        if self._context & contexts.TABLE_CELL_OPEN:
+                            return self._handle_table_cell_end()
+                        self._handle_table_cell("!", "th",
+                                                contexts.TABLE_TH_LINE)
+                    else:
+                        self._emit_text(this)
+                else:
+                    self._emit_text(this)
+
             else:
                 self._emit_text(this)
             self._head += 1
@@ -1164,6 +1371,10 @@ class Tokenizer(object):
         self._text = [segment for segment in split if segment]
         self._head = self._global = self._depth = self._cycles = 0
         try:
-            return self._parse(context)
+            tokens = self._parse(context)
         except BadRoute:  # pragma: no cover (untestable/exceptional case)
             raise ParserError("Python tokenizer exited with BadRoute")
+        if self._stacks:  # pragma: no cover (untestable/exceptional case)
+            err = "Python tokenizer exited with non-empty token stack"
+            raise ParserError(err)
+        return tokens
diff --git a/tests/_test_tokenizer.py b/tests/_test_tokenizer.py
index bfd4857..17d588b 100644
--- a/tests/_test_tokenizer.py
+++ b/tests/_test_tokenizer.py
@@ -25,8 +25,9 @@ import codecs
 from os import listdir, path
 import sys
 
-from mwparserfromhell.compat import py3k
+from mwparserfromhell.compat import py3k, str
 from mwparserfromhell.parser import tokens
+from mwparserfromhell.parser.builder import Builder
 
 class _TestParseError(Exception):
     """Raised internally when a test could not be parsed."""
@@ -50,8 +51,12 @@ class TokenizerTestCase(object):
         *label* for the method's docstring.
""" def inner(self): - expected = data["output"] - actual = self.tokenizer().tokenize(data["input"]) + if hasattr(self, "roundtrip"): + expected = data["input"] + actual = str(Builder().build(data["output"][:])) + else: + expected = data["output"] + actual = self.tokenizer().tokenize(data["input"]) self.assertEqual(expected, actual) if not py3k: inner.__name__ = funcname.encode("utf8") diff --git a/tests/test_roundtripping.py b/tests/test_roundtripping.py new file mode 100644 index 0000000..5360387 --- /dev/null +++ b/tests/test_roundtripping.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012-2014 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +try: + import unittest2 as unittest +except ImportError: + import unittest + +from ._test_tokenizer import TokenizerTestCase + +class TestRoundtripping(TokenizerTestCase, unittest.TestCase): + """Test cases for roundtripping tokens back to wikitext.""" + + @classmethod + def setUpClass(cls): + cls.roundtrip = True + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_tag.py b/tests/test_tag.py index 7577cce..3beea98 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -226,6 +226,38 @@ class TestTag(TreeEqualityTestCase): self.assertWikicodeEqual(parsed, node.closing_tag) self.assertEqual("foobar", node) + def test_wiki_style_separator(self): + """test getter/setter for wiki_style_separator attribute""" + node = Tag(wraptext("table"), wraptext("\n")) + self.assertIs(None, node.wiki_style_separator) + node.wiki_style_separator = "|" + self.assertEqual("|", node.wiki_style_separator) + node.wiki_markup = "{" + self.assertEqual("{|\n{", node) + node2 = Tag(wraptext("table"), wraptext("\n"), wiki_style_separator="|") + self.assertEqual("|", node.wiki_style_separator) + + def test_closing_wiki_markup(self): + """test getter/setter for closing_wiki_markup attribute""" + node = Tag(wraptext("table"), wraptext("\n")) + self.assertIs(None, node.closing_wiki_markup) + node.wiki_markup = "{|" + self.assertEqual("{|", node.closing_wiki_markup) + node.closing_wiki_markup = "|}" + self.assertEqual("|}", node.closing_wiki_markup) + self.assertEqual("{|\n|}", node) + node.wiki_markup = "!!" + self.assertEqual("|}", node.closing_wiki_markup) + self.assertEqual("!!\n|}", node) + node.wiki_markup = False + self.assertFalse(node.closing_wiki_markup) + self.assertEqual("\n
", node) + node2 = Tag(wraptext("table"), wraptext("\n"), + attrs=[agen("id", "foo")], wiki_markup="{|", + closing_wiki_markup="|}") + self.assertEqual("|}", node2.closing_wiki_markup) + self.assertEqual('{| id="foo"\n|}', node2) + def test_has(self): """test Tag.has()""" node = Tag(wraptext("ref"), wraptext("cite"), [agen("name", "foo")]) diff --git a/tests/tokenizer/tables.mwtest b/tests/tokenizer/tables.mwtest new file mode 100644 index 0000000..16012cf --- /dev/null +++ b/tests/tokenizer/tables.mwtest @@ -0,0 +1,410 @@ +name: empty_table +label: parsing an empty table +input: "{|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: inline_table +label: tables with a close on the same line are not valid +input: "{||}" +output: [Text(text="{||}")] + +--- + +name: no_table_close_simple +label: no table close on inline table +input: "{| " +output: [Text(text="{| ")] + +--- + +name: no_table_close_newline +label: no table close with a newline +input: "{| \n " +output: [Text(text="{| \n ")] + +--- + +name: no_table_close_inside_cell +label: no table close while inside of a cell +input: "{| \n| " +output: [Text(text="{| \n| ")] + +--- + +name: no_table_close_inside_cell_after_newline +label: no table close while inside of a cell after a newline +input: "{| \n| \n " +output: [Text(text="{| \n| \n ")] + +--- + +name: no_table_close_inside_cell_with_attributes +label: no table close while inside of a cell with attributes +input: "{| \n| red | test" +output: [Text(text="{| \n| red | test")] + +--- + +name: no_table_close_inside_row +label: no table close while inside of a row +input: "{| \n|- " +output: [Text(text="{| \n|- ")] + +--- + +name: no_table_close_inside_row_after_newline +label: no table close while inside of a row after a newline +input: "{| \n|- \n " +output: [Text(text="{| \n|- \n ")] + +--- + +name: no_table_close_row_and_cell +label: no table close while inside a cell inside a row +input: "{| \n|- \n|" +output: [Text(text="{| \n|- \n|")] + +--- + +name: no_table_close_attributes +label: don't parse attributes as attributes if the table doesn't exist +input: "{| border="1"" +output: [Text(text="{| border=\"1\"")] + +--- + +name: no_table_close_unclosed_attributes +label: don't parse unclosed attributes if the table doesn't exist +input: "{| border=" +output: [Text(text="{| border=")] + +--- + +name: no_table_close_row_attributes +label: don't parse row attributes as attributes if the table doesn't exist +input: "{| |- border="1"" +output: [Text(text="{| |- border=\"1\"")] + +--- + +name: no_table_close_cell +label: don't parse cells if the table doesn't close +input: "{| | border="1"| test || red | foo" +output: [Text(text="{| | border=\"1\"| test || red | foo")] + +--- + +name: crazy_no_table_close +label: lots of opened wiki syntax without closes +input: "{{{ {{ {| | |- {| |} || ! !! bar \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo "), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="| |- {| |} || ! 
!!"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), Text(text=" bar \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_text_outside_cell +label: parse text inside table but outside of a cell +input: "{|\n bar \n | foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar \n "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text=" foo \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: no_table_cell_with_leading_characters +label: fail to create a table cell when there are leading non-whitespace characters +input: "{|\n bar | foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar | foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: no_table_row_with_leading_characters +label: fail to create a table row when there are leading non-whitespace characters +input: "{|\n bar |- foo \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" bar |- foo \n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: template_inside_table_cell +label: template within table cell +input: "{|\n |{{foo\n|bar=baz}} \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), TemplateOpen(), Text(text="foo\n"), TemplateParamSeparator(), Text(text="bar"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), Text(text=" \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes +label: parse table cell style attributes +input: "{| \n | name="foo bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_empty_attributes +label: parse table cell with style markers but no attributes +input: "{| \n | | test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_with_dash +label: parse a situation in which a cell line looks like a row line +input: "{|\n ||- \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="- \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), 
TagCloseClose()] + +--- + +name: table_cell_attributes_quote_with_pipe +label: pipe inside an attribute quote should still be used as a style separator +input: "{| \n | name="foo|bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_name_with_pipe +label: pipe inside an attribute name should still be used as a style separator +input: "{| \n | name|="foo bar" | test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(wiki_markup="|", padding=""), Text(text="=\"foo bar\" | test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_pipe_after_equals +label: pipe inside an attribute should still be used as a style separator after an equals +input: "{| \n | name=|"foo|bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagCloseOpen(wiki_markup="|", padding=""), Text(text="\"foo|bar\"| test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_cell_attributes_templates +label: pipe inside attributes shouldn't be style separator +input: "{| \n | {{comment|template=baz}} | test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=" "), TemplateOpen(), Text(text="comment"), TemplateParamSeparator(), Text(text="template"), TemplateParamEquals(), Text(text="baz"), TemplateClose(), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: header_cell_attributes +label: parse header cell style attributes +input: "{| \n ! name="foo bar"| test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: inline_cell_attributes +label: parse cell style attributes of inline cells +input: "{| \n ! 
name="foo bar" | test ||color="red"| markup!!foo | time \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="!"), Text(text="th"), TagAttrStart(pad_after_eq="", pad_first=" ", pad_before_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test "), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="||"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="color"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="red"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" markup"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenOpen(wiki_markup="!!"), Text(text="th"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), Text(text="foo"), TagCloseOpen(wiki_markup="|", padding=""), Text(text=" time \n"), TagOpenClose(wiki_markup=""), Text(text="th"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_row_attributes +label: parse table row style attributes +input: "{| \n |- name="foo bar"\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_row_attributes_crazy_whitespace +label: parse table row style attributes with different whitespace +input: "{| \t \n |- \t name="foo bar" \t \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding=" \t \n"), Text(text=" "), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" \t ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding=" \t \n"), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: table_attributes +label: parse table style attributes +input: "{| name="foo bar"\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: inline_table_attributes +label: handle attributes in inline tables +input: "{| foo="tee bar" |}" +output: [Text(text='{| foo="tee bar" |}')] + +--- + +name: table_incorrect_attributes +label: parse incorrect table style attributes +input: "{| name="foo\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="\"foo"), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: templates_in_table_attribute +label: templates in the attributes of a table, after the start +input: "{| {{class}}="{{wikitable}}"\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), 
TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="class"), TemplateClose(), TagAttrEquals(), TagAttrQuote(char="\""), TemplateOpen(), Text(text="wikitable"), TemplateClose(), TagCloseOpen(padding="\n"), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: templates_in_table_attribute_2 +label: templates in the attributes of a table, after the start +input: "{|{{foo}} \n | name="foo bar" | test \n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding="\n"), Text(text=" "), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(char="\""), Text(text="foo bar"), TagCloseOpen(wiki_markup="|", padding=" "), Text(text=" test \n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: inappropriate_marker_at_line_start +label: an inappropriate marker (a right bracket) at the start of a line in the table +input: "{|\n}" +output: [Text(text="{|\n}")] + +--- + +name: fake_close_near_start +label: a fake closing token at the end of the first line in the table +input: "{| class="wikitable" style="text-align: center; width=100%;|}\n|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"text-align:"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="center;"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="width"), TagAttrEquals(), Text(text="100%;|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: fake_close_near_start_2 +label: a fake closing token at the end of the first line in the table +input: "{| class="wikitable|}"\n|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable|}"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: junk_after_table_start +label: ignore more junk on the first line of the table +input: "{| class="wikitable" | foobar\n|\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="class"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="wikitable"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="foobar"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="\n"), TagOpenClose(wiki_markup=""), 
Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: junk_after_table_row +label: ignore junk on the first line of a table row +input: "{|\n|- foo="bar" | baz\n|blerp\n|}" +output: [TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(char='"'), Text(text="bar"), TagAttrStart(pad_first=" ", pad_before_eq=" ", pad_after_eq=""), Text(text="|"), TagAttrStart(pad_first="", pad_before_eq="", pad_after_eq=""), Text(text="baz"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="blerp\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index 04f617a..c709ba7 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -447,6 +447,13 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tag --- +name: dt_dd_mix4 +label: another example of correct dt/dd usage, with a trigger for a specific parse route +input: ";foo]:bar" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo]"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")] + +--- + name: ul_ol_dt_dd_mix label: an assortment of uls, ols, dds, and dts input: ";:#*foo\n:#*;foo\n#*;:foo\n*;:#foo"