--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -171,7 +171,7 @@ TABLE_ROW_OPEN = 1 << 33
 TABLE_TD_LINE = 1 << 34
 TABLE_TH_LINE = 1 << 35
 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
-TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + + TABLE_ROW_OPEN +
+TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
          TABLE_TD_LINE + TABLE_TH_LINE)

 # Global contexts:
@@ -184,6 +184,6 @@ FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG +
         STYLE + TABLE)
 UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
           TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
-DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
+DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN
 NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
 NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK
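
The widened DOUBLE aggregate (mirrored by AGG_DOUBLE in tokenizer.h below) names the contexts in which two token stacks must be discarded when parsing dies at end of input: a table row pushes its own stack inside the table's, so TABLE_ROW_OPEN belongs in the set. A minimal sketch of the flag scheme — only TABLE_ROW_OPEN's value comes from the hunk above; the other bit positions are placeholders for illustration:

    # Contexts are single bits, so membership in an aggregate is one AND.
    TEMPLATE_PARAM_KEY = 1 << 5   # placeholder bit position
    TAG_CLOSE = 1 << 6            # placeholder bit position
    TABLE_ROW_OPEN = 1 << 33      # from the diff above
    DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN

    def stacks_to_pop(context):
        # A DOUBLE context has two stacks open (e.g. a row inside its
        # table), so both are discarded before the route fails.
        return 2 if context & DOUBLE else 1

    assert stacks_to_pop(TABLE_ROW_OPEN) == 2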
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -2510,10 +2510,9 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
 }

 /*
-    Parse until ``end_token`` as style attributes for a table.
+    Handle style attributes for a table until an ending token.
 */
-static PyObject*
-Tokenizer_parse_as_table_style(Tokenizer* self, char end_token)
+static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
 {
     TagData *data = TagData_new();
     PyObject *padding, *trash;
@@ -2569,9 +2568,9 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token)
 }

 /*
-    Handle the start of a table.
+    Parse a wikicode table by starting with the first line.
 */
-static int Tokenizer_handle_table_start(Tokenizer* self)
+static int Tokenizer_parse_table(Tokenizer* self)
 {
     Py_ssize_t reset = self->head + 1;
     PyObject *style, *padding;
@@ -2580,7 +2579,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self)
     if(Tokenizer_push(self, LC_TABLE_OPEN))
         return -1;
-    padding = Tokenizer_parse_as_table_style(self, '\n');
+    padding = Tokenizer_handle_table_style(self, '\n');
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
@@ -2622,20 +2621,10 @@ static int Tokenizer_handle_table_start(Tokenizer* self)
 }

 /*
-    Return the stack in order to handle the table end.
-*/
-static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
-{
-    self->head += 2;
-    return Tokenizer_pop(self);
-}
-
-/*
     Parse as style until end of the line, then continue.
 */
 static int Tokenizer_handle_table_row(Tokenizer* self)
 {
     Py_ssize_t reset = self->head;
     PyObject *padding, *style, *row, *trash;
     self->head += 2;
@@ -2648,11 +2637,10 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
     if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN))
         return -1;
-    padding = Tokenizer_parse_as_table_style(self, '\n');
+    padding = Tokenizer_handle_table_style(self, '\n');
     if (BAD_ROUTE) {
         trash = Tokenizer_pop(self);
         Py_XDECREF(trash);
         self->head = reset;
         return 0;
     }
     if (!padding)
@@ -2666,14 +2654,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
     // Don't parse the style separator
     self->head++;
     row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1);
-    if (BAD_ROUTE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
-        Py_DECREF(padding);
-        Py_DECREF(style);
-        self->head = reset;
-        return 0;
-    }
     if (!row) {
         Py_DECREF(padding);
         Py_DECREF(style);
@@ -2688,14 +2668,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
 }

-/*
-    Return the stack in order to handle the table row end.
-*/
-static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
-{
-    return Tokenizer_pop(self);
-}
-
 /*
     Parse as normal syntax unless we hit a style marker, then parse style
     as HTML attributes and the remainder as normal syntax.
 */
@@ -2705,11 +2677,10 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 {
     uint64_t old_context = self->topstack->context;
     uint64_t cell_context;
-    Py_ssize_t reset = self->head;
-    PyObject *padding, *cell, *trash;
-    PyObject *style = NULL;
+    PyObject *padding, *cell, *style = NULL;
     const char *close_open_markup = NULL;
     self->head += strlen(markup);
+    Py_ssize_t reset = self->head;

     if (!Tokenizer_CAN_RECURSE(self)) {
         if (Tokenizer_emit_text(self, markup))
@@ -2720,12 +2691,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
     cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                            LC_TABLE_CELL_STYLE | line_context, 1);
-    if (BAD_ROUTE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
-        self->head = reset;
-        return 0;
-    }
     if (!cell)
         return -1;
     cell_context = self->topstack->context;
@@ -2733,11 +2698,11 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
     if (cell_context & LC_TABLE_CELL_STYLE) {
         Py_DECREF(cell);
-        self->head = reset + strlen(markup);
+        self->head = reset;
         if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                           line_context))
             return -1;
-        padding = Tokenizer_parse_as_table_style(self, '|');
+        padding = Tokenizer_handle_table_style(self, '|');
         if (!padding)
             return -1;
         style = Tokenizer_pop(self);
@@ -2749,14 +2714,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
         self->head++;
         cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                                line_context, 1);
-        if (BAD_ROUTE) {
-            Py_DECREF(padding);
-            Py_DECREF(style);
-            trash = Tokenizer_pop(self);
-            Py_XDECREF(trash);
-            self->head = reset;
-            return 0;
-        }
         if (!cell) {
             Py_DECREF(padding);
             Py_DECREF(style);
@@ -2801,6 +2758,23 @@ Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style)
 }

 /*
+    Return the stack in order to handle the table row end.
+*/
+static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
+{
+    return Tokenizer_pop(self);
+}
+
+/*
+    Return the stack in order to handle the table end.
+*/
+static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
+{
+    self->head += 2;
+    return Tokenizer_pop(self);
+}
+
+/*
     Handle the end of the stream of wikitext.
 */
 static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
@@ -2819,9 +2793,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
             if (single)
                 return Tokenizer_handle_single_tag_end(self);
         }
-        else if (context & AGG_DOUBLE) {
-            trash = Tokenizer_pop(self);
-            Py_XDECREF(trash);
+        else {
+            if (context & LC_TABLE_CELL_OPEN) {
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+                context = self->topstack->context;
+            }
+            if (context & AGG_DOUBLE) {
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+            }
         }
         return Tokenizer_fail_route(self);
     }
@@ -3082,7 +3063,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
         // Start of table parsing
         else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) {
             if (Tokenizer_CAN_RECURSE(self)) {
-                if (Tokenizer_handle_table_start(self))
+                if (Tokenizer_parse_table(self))
                     return NULL;
             }
             else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next))
@@ -3197,7 +3178,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     self->skip_style_tags = skip_style_tags;
     tokens = Tokenizer_parse(self, context, 1);

-    if (!tokens && !PyErr_Occurred()) {
+    if ((!tokens && !PyErr_Occurred()) || self->topstack) {
         if (!ParserError) {
             if (load_exceptions())
                 return NULL;
@@ -3206,6 +3187,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
             RESET_ROUTE();
             PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
         }
+        else if (self->topstack)
+            PyErr_SetString(ParserError,
+                            "C tokenizer exited with non-empty token stack");
         else
             PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
         return NULL;
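
With the Tokenizer_handle_end change above, hitting end of input inside a table now unwinds in a fixed order: the open cell's stack is popped first, then (via AGG_DOUBLE, which gains LC_TABLE_ROW_OPEN below) the row's, and Tokenizer_fail_route rewinds to the table's start so the markup is re-emitted as plain text. A quick sanity check of the intended behavior from Python, assuming a build that includes these table changes:

    import mwparserfromhell

    # Input ends inside an open cell inside an open row; the failed
    # table route should degrade to plain text, not raise or leak.
    text = "{| \n|- \n| cell"
    code = mwparserfromhell.parse(text)
    assert str(code) == text       # round-trip preserved
    assert not code.filter_tags()  # no table/tr/td tags were built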
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -175,7 +175,7 @@ static PyObject* TagCloseClose;
 #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
 #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
-#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
+#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
 #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
 #define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -1009,8 +1009,8 @@ class Tokenizer(object):
         self._emit_text(tag)
         self._emit(tokens.TagCloseClose())

-    def _parse_as_table_style(self, end_token):
-        """Parse until ``end_token`` as style attributes for a table."""
+    def _handle_table_style(self, end_token):
+        """Handle style attributes for a table until ``end_token``."""
         data = _TagOpenData()
         data.context = _TagOpenData.CX_ATTR_READY
         while True:
@@ -1037,14 +1037,13 @@
                 self._handle_tag_data(data, this)
                 self._head += 1

-    def _handle_table_start(self):
-        """Handle the start of a table."""
+    def _parse_table(self):
+        """Parse a wikicode table by starting with the first line."""
         reset = self._head + 1
         self._head += 2
         self._push(contexts.TABLE_OPEN)
         try:
-            padding = self._parse_as_table_style("\n")
+            padding = self._handle_table_style("\n")
         except BadRoute:
             self._head = reset
             self._emit_text("{|")
@@ -1063,14 +1062,8 @@
         # Offset displacement done by _parse():
         self._head -= 1

-    def _handle_table_end(self):
-        """Return the stack in order to handle the table end."""
-        self._head += 2
-        return self._pop()
-
     def _handle_table_row(self):
         """Parse as style until end of the line, then continue."""
-        reset = self._head
         self._head += 2
         if not self._can_recurse():
             self._emit_text("|-")
@@ -1079,67 +1072,47 @@
         self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
         try:
-            padding = self._parse_as_table_style("\n")
+            padding = self._handle_table_style("\n")
         except BadRoute:
-            self._head = reset
             self._pop()
             raise
         style = self._pop()

         # Don't parse the style separator:
         self._head += 1
-        try:
-            row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
-        except BadRoute:
-            self._head = reset
-            self._pop()
-            raise
+        row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)

         self._emit_table_tag("|-", "tr", style, padding, None, row, "")
         # Offset displacement done by parse():
         self._head -= 1

-    def _handle_table_row_end(self):
-        """Return the stack in order to handle the table row end."""
-        return self._pop()
-
     def _handle_table_cell(self, markup, tag, line_context):
         """Parse as normal syntax unless we hit a style marker, then parse
         style as HTML attributes and the remainder as normal syntax."""
         old_context = self._context
-        reset = self._head
-        reset_for_style, padding, style = False, "", None
+        padding, style = "", None
         self._head += len(markup)
+        reset = self._head
         if not self._can_recurse():
             self._emit_text(markup)
             self._head -= 1
             return

-        try:
-            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
-                               line_context | contexts.TABLE_CELL_STYLE)
-        except BadRoute:
-            self._head = reset
-            self._pop()
-            raise
+        cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                           line_context | contexts.TABLE_CELL_STYLE)
         cell_context = self._context
         self._context = old_context
         reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
         if reset_for_style:
-            self._head = reset + len(markup)
+            self._head = reset
             self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                        line_context)
-            padding = self._parse_as_table_style("|")
+            padding = self._handle_table_style("|")
             style = self._pop()
             # Don't parse the style separator:
             self._head += 1
-            try:
-                cell = self._parse(contexts.TABLE_OPEN |
-                                   contexts.TABLE_CELL_OPEN | line_context)
-            except BadRoute:
-                self._head = reset
-                ret = self._pop()
-                raise
+            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                               line_context)
             cell_context = self._context
             self._context = old_context
@@ -1161,12 +1134,23 @@
             self._context &= ~contexts.TABLE_CELL_STYLE
         return self._pop(keep_context=True)

+    def _handle_table_row_end(self):
+        """Return the stack in order to handle the table row end."""
+        return self._pop()
+
+    def _handle_table_end(self):
+        """Return the stack in order to handle the table end."""
+        self._head += 2
+        return self._pop()
+
     def _handle_end(self):
         """Handle the end of the stream of wikitext."""
         if self._context & contexts.FAIL:
             if self._context & contexts.TAG_BODY:
                 if is_single(self._stack[1].text):
                     return self._handle_single_tag_end()
+            if self._context & contexts.TABLE_CELL_OPEN:
+                self._pop()
             if self._context & contexts.DOUBLE:
                 self._pop()
             self._fail_route()
@@ -1327,19 +1311,19 @@ class Tokenizer(object): | |||
elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or | |||
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): | |||
if self._can_recurse(): | |||
self._handle_table_start() | |||
self._parse_table() | |||
else: | |||
self._emit_text("{|") | |||
elif self._context & contexts.TABLE_OPEN: | |||
if this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE: | |||
if this == next == "|" and self._context & contexts.TABLE_TD_LINE: | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE) | |||
elif this == "|" and next == "|" and self._context & contexts.TABLE_TH_LINE: | |||
elif this == next == "|" and self._context & contexts.TABLE_TH_LINE: | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE) | |||
elif this == "!" and next == "!" and self._context & contexts.TABLE_TH_LINE: | |||
elif this == next == "!" and self._context & contexts.TABLE_TH_LINE: | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE) | |||
@@ -1387,6 +1371,10 @@
         self._text = [segment for segment in split if segment]
         self._head = self._global = self._depth = self._cycles = 0
         try:
-            return self._parse(context)
+            tokens = self._parse(context)
         except BadRoute:  # pragma: no cover (untestable/exceptional case)
             raise ParserError("Python tokenizer exited with BadRoute")
+        if self._stacks:  # pragma: no cover (untestable/exceptional case)
+            err = "Python tokenizer exited with non-empty token stack"
+            raise ParserError(err)
+        return tokens
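
The Python tokenizer now mirrors the C exit check: a non-empty stack list after _parse() returns is reported as a ParserError instead of silently yielding bad tokens. A sketch of what a caller would see — it assumes ParserError is importable from mwparserfromhell.parser, matching the exception the C code loads above:

    import mwparserfromhell
    from mwparserfromhell.parser import ParserError

    try:
        code = mwparserfromhell.parse("{| \n|- \n| text")
    except ParserError as exc:
        # Raised only for tokenizer bugs (a BadRoute leak or a leftover
        # stack); malformed wikitext normally comes back as plain text.
        raise SystemExit("tokenizer bug: %s" % exc)
    assert str(code) == "{| \n|- \n| text"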
--- a/tests/tokenizer/tables.mwtest
+++ b/tests/tokenizer/tables.mwtest
@@ -61,6 +61,13 @@ output: [Text(text="{| \n|- \n ")]
 ---

+name:   no_table_close_row_and_cell
+label:  no table close while inside a cell inside a row
+input:  "{| \n|- \n|"
+output: [Text(text="{| \n|- \n|")]
+
+---
+
 name:   no_table_close_attributes
 label:  don't parse attributes as attributes if the table doesn't exist
 input:  "{| border="1""