
Refactor a lot of table error recovery code.

tags/v0.4
Ben Kurtovic committed 9 years ago
commit 9fc4b909e1
5 changed files with 87 additions and 108 deletions
  1. mwparserfromhell/parser/contexts.py (+2 -2)
  2. mwparserfromhell/parser/tokenizer.c (+42 -58)
  3. mwparserfromhell/parser/tokenizer.h (+1 -1)
  4. mwparserfromhell/parser/tokenizer.py (+35 -47)
  5. tests/tokenizer/tables.mwtest (+7 -0)
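
The user-visible effect of these recovery changes is pinned down by the new no_table_close_row_and_cell case added to tables.mwtest below. As a quick sanity check against a build that includes this commit, something like the following should hold, using the library's public parse() entry point (a sketch, not part of the commit):

    # An unclosed table -- the input ends inside a cell inside a row --
    # degrades to plain text instead of leaving the tokenizer's stack dirty.
    import mwparserfromhell

    wikicode = mwparserfromhell.parse("{| \n|- \n|")
    assert str(wikicode) == "{| \n|- \n|"  # round-trips verbatim, per tables.mwtest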

mwparserfromhell/parser/contexts.py (+2 -2)

@@ -171,7 +171,7 @@ TABLE_ROW_OPEN = 1 << 33
 TABLE_TD_LINE = 1 << 34
 TABLE_TH_LINE = 1 << 35
 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
-TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + + TABLE_ROW_OPEN +
+TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
          TABLE_TD_LINE + TABLE_TH_LINE)
 
 # Global contexts:
@@ -184,6 +184,6 @@ FAIL = (TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG +
         STYLE + TABLE)
 UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE +
           TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE)
-DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
+DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN
 NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI
 NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK
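
A note on mechanics for readers new to these constants: each context is a distinct bit, so the aggregates above are bit masks, and adding TABLE_ROW_OPEN to DOUBLE is what lets an unclosed row qualify for the double-pop recovery in _handle_end further down. A rough sketch (TABLE_ROW_OPEN's value comes from the hunk above; the other two bits are placeholders, not the project's real values):

    TEMPLATE_PARAM_KEY = 1 << 5   # placeholder bit, for illustration only
    TAG_CLOSE = 1 << 6            # placeholder bit, for illustration only
    TABLE_ROW_OPEN = 1 << 33      # actual value, from the hunk above

    DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE + TABLE_ROW_OPEN

    context = TABLE_ROW_OPEN
    if context & DOUBLE:          # True: unclosed rows now take this branch
        print("pop two stacks before failing the route")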

mwparserfromhell/parser/tokenizer.c (+42 -58)

@@ -2510,10 +2510,9 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
 }
 
 /*
-    Parse until ``end_token`` as style attributes for a table.
+    Handle style attributes for a table until an ending token.
 */
-static PyObject*
-Tokenizer_parse_as_table_style(Tokenizer* self, char end_token)
+static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
 {
     TagData *data = TagData_new();
     PyObject *padding, *trash;
@@ -2569,9 +2568,9 @@ Tokenizer_parse_as_table_style(Tokenizer* self, char end_token)
 }
 
 /*
-    Handle the start of a table.
+    Parse a wikicode table by starting with the first line.
 */
-static int Tokenizer_handle_table_start(Tokenizer* self)
+static int Tokenizer_parse_table(Tokenizer* self)
 {
     Py_ssize_t reset = self->head + 1;
     PyObject *style, *padding;
@@ -2580,7 +2579,7 @@ static int Tokenizer_handle_table_start(Tokenizer* self)
 
     if(Tokenizer_push(self, LC_TABLE_OPEN))
         return -1;
-    padding = Tokenizer_parse_as_table_style(self, '\n');
+    padding = Tokenizer_handle_table_style(self, '\n');
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
@@ -2622,20 +2621,10 @@ static int Tokenizer_handle_table_start(Tokenizer* self)
 }
 
 /*
-    Return the stack in order to handle the table end.
-*/
-static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
-{
-    self->head += 2;
-    return Tokenizer_pop(self);
-}
-
-/*
     Parse as style until end of the line, then continue.
 */
 static int Tokenizer_handle_table_row(Tokenizer* self)
 {
-    Py_ssize_t reset = self->head;
     PyObject *padding, *style, *row, *trash;
     self->head += 2;
 
@@ -2648,11 +2637,10 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
 
     if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN))
         return -1;
-    padding = Tokenizer_parse_as_table_style(self, '\n');
+    padding = Tokenizer_handle_table_style(self, '\n');
     if (BAD_ROUTE) {
         trash = Tokenizer_pop(self);
         Py_XDECREF(trash);
-        self->head = reset;
         return 0;
     }
     if (!padding)
@@ -2666,14 +2654,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
     // Don't parse the style separator
     self->head++;
     row = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN, 1);
-    if (BAD_ROUTE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
-        Py_DECREF(padding);
-        Py_DECREF(style);
-        self->head = reset;
-        return 0;
-    }
     if (!row) {
         Py_DECREF(padding);
         Py_DECREF(style);
@@ -2688,14 +2668,6 @@ static int Tokenizer_handle_table_row(Tokenizer* self)
 }
 
 /*
-    Return the stack in order to handle the table row end.
-*/
-static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
-{
-    return Tokenizer_pop(self);
-}
-
-/*
     Parse as normal syntax unless we hit a style marker, then parse style
     as HTML attributes and the remainder as normal syntax.
 */
@@ -2705,11 +2677,10 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 {
     uint64_t old_context = self->topstack->context;
     uint64_t cell_context;
-    Py_ssize_t reset = self->head;
-    PyObject *padding, *cell, *trash;
-    PyObject *style = NULL;
+    PyObject *padding, *cell, *style = NULL;
     const char *close_open_markup = NULL;
     self->head += strlen(markup);
+    Py_ssize_t reset = self->head;
 
     if (!Tokenizer_CAN_RECURSE(self)) {
         if (Tokenizer_emit_text(self, markup))
@@ -2720,12 +2691,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 
     cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                            LC_TABLE_CELL_STYLE | line_context, 1);
-    if (BAD_ROUTE) {
-        trash = Tokenizer_pop(self);
-        Py_XDECREF(trash);
-        self->head = reset;
-        return 0;
-    }
     if (!cell)
         return -1;
     cell_context = self->topstack->context;
@@ -2733,11 +2698,11 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
 
     if (cell_context & LC_TABLE_CELL_STYLE) {
         Py_DECREF(cell);
-        self->head = reset + strlen(markup);
+        self->head = reset;
         if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                           line_context))
             return -1;
-        padding = Tokenizer_parse_as_table_style(self, '|');
+        padding = Tokenizer_handle_table_style(self, '|');
         if (!padding)
             return -1;
         style = Tokenizer_pop(self);
@@ -2749,14 +2714,6 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
         self->head++;
         cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN |
                                line_context, 1);
-        if (BAD_ROUTE) {
-            Py_DECREF(padding);
-            Py_DECREF(style);
-            trash = Tokenizer_pop(self);
-            Py_XDECREF(trash);
-            self->head = reset;
-            return 0;
-        }
         if (!cell) {
             Py_DECREF(padding);
             Py_DECREF(style);
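
Review note on the reset convention changing throughout this function: reset is now captured after self->head += strlen(markup), so the style-restart path above writes self->head = reset where it previously needed reset + strlen(markup). A trivial sketch of the equivalence (values invented):

    markup, start = "||", 10
    old_reset = start                  # old convention: before the markup
    new_reset = start + len(markup)    # new convention: after the markup
    assert old_reset + len(markup) == new_reset  # same restart position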
@@ -2801,6 +2758,23 @@ Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style)
 }
 
 /*
+    Return the stack in order to handle the table row end.
+*/
+static PyObject* Tokenizer_handle_table_row_end(Tokenizer* self)
+{
+    return Tokenizer_pop(self);
+}
+
+/*
+    Return the stack in order to handle the table end.
+*/
+static PyObject* Tokenizer_handle_table_end(Tokenizer* self)
+{
+    self->head += 2;
+    return Tokenizer_pop(self);
+}
+
+/*
     Handle the end of the stream of wikitext.
 */
 static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
@@ -2819,9 +2793,16 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
             if (single)
                 return Tokenizer_handle_single_tag_end(self);
         }
-        else if (context & AGG_DOUBLE) {
-            trash = Tokenizer_pop(self);
-            Py_XDECREF(trash);
+        else {
+            if (context & LC_TABLE_CELL_OPEN) {
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+                context = self->topstack->context;
+            }
+            if (context & AGG_DOUBLE) {
+                trash = Tokenizer_pop(self);
+                Py_XDECREF(trash);
+            }
         }
         return Tokenizer_fail_route(self);
     }
@@ -3082,7 +3063,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
         // Start of table parsing
         else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) {
             if (Tokenizer_CAN_RECURSE(self)) {
-                if (Tokenizer_handle_table_start(self))
+                if (Tokenizer_parse_table(self))
                     return NULL;
             }
             else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next))
@@ -3197,7 +3178,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
     self->skip_style_tags = skip_style_tags;
     tokens = Tokenizer_parse(self, context, 1);
 
-    if (!tokens && !PyErr_Occurred()) {
+    if ((!tokens && !PyErr_Occurred()) || self->topstack) {
         if (!ParserError) {
             if (load_exceptions())
                 return NULL;
@@ -3206,6 +3187,9 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
             RESET_ROUTE();
             PyErr_SetString(ParserError, "C tokenizer exited with BAD_ROUTE");
         }
+        else if (self->topstack)
+            PyErr_SetString(ParserError,
+                            "C tokenizer exited with non-empty token stack");
         else
             PyErr_SetString(ParserError, "C tokenizer exited unexpectedly");
         return NULL;


mwparserfromhell/parser/tokenizer.h (+1 -1)

@@ -175,7 +175,7 @@ static PyObject* TagCloseClose;
 
 #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
 #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
-#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
+#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE | LC_TABLE_ROW_OPEN)
 #define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)
 #define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK)



mwparserfromhell/parser/tokenizer.py (+35 -47)

@@ -1009,8 +1009,8 @@ class Tokenizer(object):
         self._emit_text(tag)
         self._emit(tokens.TagCloseClose())
 
-    def _parse_as_table_style(self, end_token):
-        """Parse until ``end_token`` as style attributes for a table."""
+    def _handle_table_style(self, end_token):
+        """Handle style attributes for a table until ``end_token``."""
         data = _TagOpenData()
         data.context = _TagOpenData.CX_ATTR_READY
         while True:
@@ -1037,14 +1037,13 @@ class Tokenizer(object):
                 self._handle_tag_data(data, this)
             self._head += 1
 
-    def _handle_table_start(self):
-        """Handle the start of a table."""
+    def _parse_table(self):
+        """Parse a wikicode table by starting with the first line."""
        reset = self._head + 1
        self._head += 2
-
        self._push(contexts.TABLE_OPEN)
        try:
-            padding = self._parse_as_table_style("\n")
+            padding = self._handle_table_style("\n")
         except BadRoute:
             self._head = reset
             self._emit_text("{|")
@@ -1063,14 +1062,8 @@ class Tokenizer(object):
         # Offset displacement done by _parse():
         self._head -= 1
 
-    def _handle_table_end(self):
-        """Return the stack in order to handle the table end."""
-        self._head += 2
-        return self._pop()
-
     def _handle_table_row(self):
         """Parse as style until end of the line, then continue."""
-        reset = self._head
         self._head += 2
         if not self._can_recurse():
             self._emit_text("|-")
@@ -1079,67 +1072,47 @@
 
         self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
         try:
-            padding = self._parse_as_table_style("\n")
+            padding = self._handle_table_style("\n")
         except BadRoute:
-            self._head = reset
             self._pop()
             raise
         style = self._pop()
 
         # Don't parse the style separator:
         self._head += 1
-        try:
-            row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
-        except BadRoute:
-            self._head = reset
-            self._pop()
-            raise
+        row = self._parse(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN)
 
         self._emit_table_tag("|-", "tr", style, padding, None, row, "")
         # Offset displacement done by parse():
         self._head -= 1
 
-    def _handle_table_row_end(self):
-        """Return the stack in order to handle the table row end."""
-        return self._pop()
-
     def _handle_table_cell(self, markup, tag, line_context):
         """Parse as normal syntax unless we hit a style marker, then parse
         style as HTML attributes and the remainder as normal syntax."""
         old_context = self._context
-        reset = self._head
-        reset_for_style, padding, style = False, "", None
+        padding, style = "", None
         self._head += len(markup)
+        reset = self._head
         if not self._can_recurse():
             self._emit_text(markup)
             self._head -= 1
             return
 
-        try:
-            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
-                               line_context | contexts.TABLE_CELL_STYLE)
-        except BadRoute:
-            self._head = reset
-            self._pop()
-            raise
+        cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                           line_context | contexts.TABLE_CELL_STYLE)
         cell_context = self._context
         self._context = old_context
         reset_for_style = cell_context & contexts.TABLE_CELL_STYLE
         if reset_for_style:
-            self._head = reset + len(markup)
+            self._head = reset
             self._push(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
                        line_context)
-            padding = self._parse_as_table_style("|")
+            padding = self._handle_table_style("|")
             style = self._pop()
             # Don't parse the style separator:
             self._head += 1
-            try:
-                cell = self._parse(contexts.TABLE_OPEN |
-                                   contexts.TABLE_CELL_OPEN | line_context)
-            except BadRoute:
-                self._head = reset
-                ret = self._pop()
-                raise
+            cell = self._parse(contexts.TABLE_OPEN | contexts.TABLE_CELL_OPEN |
+                               line_context)
             cell_context = self._context
             self._context = old_context
 
@@ -1161,12 +1134,23 @@ class Tokenizer(object):
             self._context &= ~contexts.TABLE_CELL_STYLE
         return self._pop(keep_context=True)
 
+    def _handle_table_row_end(self):
+        """Return the stack in order to handle the table row end."""
+        return self._pop()
+
+    def _handle_table_end(self):
+        """Return the stack in order to handle the table end."""
+        self._head += 2
+        return self._pop()
+
     def _handle_end(self):
         """Handle the end of the stream of wikitext."""
         if self._context & contexts.FAIL:
             if self._context & contexts.TAG_BODY:
                 if is_single(self._stack[1].text):
                     return self._handle_single_tag_end()
+            if self._context & contexts.TABLE_CELL_OPEN:
+                self._pop()
             if self._context & contexts.DOUBLE:
                 self._pop()
         self._fail_route()
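
The ordering here matters: popping the open cell first exposes the row context underneath, which, with TABLE_ROW_OPEN now part of DOUBLE, is popped as well before the route fails. A condensed trace for the new test input "{| \n|- \n|", with the stack mocked as a plain list (a sketch, not the project's code):

    stacks = ["table", "row", "cell"]   # innermost last when input runs out
    stacks.pop()    # TABLE_CELL_OPEN is set: discard the half-built cell
    stacks.pop()    # remaining row context is in DOUBLE: discard the row
    assert stacks == ["table"]
    # _fail_route() then fails the table route itself, which falls back
    # to emitting "{| \n|- \n|" as plain text.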
@@ -1327,19 +1311,19 @@ class Tokenizer(object):
             elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or
                                                  (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())):
                 if self._can_recurse():
-                    self._handle_table_start()
+                    self._parse_table()
                 else:
                     self._emit_text("{|")
             elif self._context & contexts.TABLE_OPEN:
-                if this == "|" and next == "|" and self._context & contexts.TABLE_TD_LINE:
+                if this == next == "|" and self._context & contexts.TABLE_TD_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("||", "td", contexts.TABLE_TD_LINE)
-                elif this == "|" and next == "|" and self._context & contexts.TABLE_TH_LINE:
+                elif this == next == "|" and self._context & contexts.TABLE_TH_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("||", "th", contexts.TABLE_TH_LINE)
-                elif this == "!" and next == "!" and self._context & contexts.TABLE_TH_LINE:
+                elif this == next == "!" and self._context & contexts.TABLE_TH_LINE:
                     if self._context & contexts.TABLE_CELL_OPEN:
                         return self._handle_table_cell_end()
                     self._handle_table_cell("!!", "th", contexts.TABLE_TH_LINE)
@@ -1387,6 +1371,10 @@ class Tokenizer(object):
         self._text = [segment for segment in split if segment]
         self._head = self._global = self._depth = self._cycles = 0
         try:
-            return self._parse(context)
+            tokens = self._parse(context)
         except BadRoute:  # pragma: no cover (untestable/exceptional case)
             raise ParserError("Python tokenizer exited with BadRoute")
+        if self._stacks:  # pragma: no cover (untestable/exceptional case)
+            err = "Python tokenizer exited with non-empty token stack"
+            raise ParserError(err)
+        return tokens
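
Both tokenizers now also fail loudly if any stack survives to the end of tokenization, rather than returning tokens built from a corrupted state. From the outside this surfaces as a ParserError, which lives in mwparserfromhell.parser; a hedged sketch:

    import mwparserfromhell
    from mwparserfromhell.parser import ParserError

    try:
        mwparserfromhell.parse("{| \n|- \n|")   # recovers cleanly; no error
    except ParserError as exc:
        # only reachable on a tokenizer bug, e.g. a stack left behind
        print("tokenizer invariant violated:", exc)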

tests/tokenizer/tables.mwtest (+7 -0)

@@ -61,6 +61,13 @@ output: [Text(text="{| \n|- \n ")]
 
 ---
 
+name: no_table_close_row_and_cell
+label: no table close while inside a cell inside a row
+input: "{| \n|- \n|"
+output: [Text(text="{| \n|- \n|")]
+
+---
+
 name: no_table_close_attributes
 label: don't parse attributes as attributes if the table doesn't exist
 input: "{| border="1""

