Переглянути джерело

Implement CTokenizer for tables

CTokenizer is completely implemented in this commit - it didn't
make much sense to me to split it up. All tests passing, memory test
shows no leaks on Linux.
tags/v0.4
David Winegar 10 роки тому
джерело
коміт
0128b1f78a
3 змінених файлів з 551 додано та 62 видалено
  1. +492
    -11
      mwparserfromhell/parser/tokenizer.c
  2. +58
    -50
      mwparserfromhell/parser/tokenizer.h
  3. +1
    -1
      mwparserfromhell/parser/tokenizer.py

+ 492
- 11
mwparserfromhell/parser/tokenizer.c Переглянути файл

@@ -2454,6 +2454,399 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
}

/*
Parse until ``end_token`` as style attributes for a table.
*/
static PyObject* Tokenizer_parse_as_table_style(Tokenizer* self, char end_token,
int break_on_table_end)
{
TagData *data = TagData_new();
PyObject *padding, *trash;
Py_UNICODE this, next;
int can_exit, table_end;

if (!data)
return NULL;
data->context = TAG_ATTR_READY;

while (1) {
this = Tokenizer_READ(self, 0);
next = Tokenizer_READ(self, 1);
can_exit = (!(data->context & TAG_QUOTED) || data->context & TAG_NOTE_SPACE);
table_end = (break_on_table_end && this == '|' && next == '}');
if ((this == end_token && can_exit) || table_end) {
if (data->context & (TAG_ATTR_NAME | TAG_ATTR_VALUE)) {
if (Tokenizer_push_tag_buffer(self, data)) {
TagData_dealloc(data);
return NULL;
}
}
if (Py_UNICODE_ISSPACE(this))
Textbuffer_write(&(data->pad_first), this);
padding = Textbuffer_render(data->pad_first);
TagData_dealloc(data);
if (!padding)
return NULL;
return padding;
}
else if (!this || table_end || this == end_token) {
if (self->topstack->context & LC_TAG_ATTR) {
if (data->context & TAG_QUOTED) {
// Unclosed attribute quote: reset, don't die
data->context = TAG_ATTR_VALUE;
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
self->head = data->reset;
continue;
}
trash = Tokenizer_pop(self);
Py_XDECREF(trash);
}
TagData_dealloc(data);
return Tokenizer_fail_route(self);
}
else {
if (Tokenizer_handle_tag_data(self, data, this) || BAD_ROUTE) {
TagData_dealloc(data);
return NULL;
}
}
self->head++;
}
}

/*
Handle the start of a table.
*/
static int Tokenizer_handle_table_start(Tokenizer* self)
{
self->head += 2;
Py_ssize_t reset = self->head;
PyObject *style, *open_open_kwargs, *close_open_kwargs, *open_close_kwargs,
*padding, *newline_character, *open_wiki_markup, *close_wiki_markup;
PyObject *table = NULL;

if(Tokenizer_push(self, LC_TABLE_OPEN))
return -1;
padding = Tokenizer_parse_as_table_style(self, '\n', 1);
if (BAD_ROUTE) {
RESET_ROUTE();
self->head = reset - 1;
if (Tokenizer_emit_text(self, "{|"))
return -1;
return 0;
}
if (!padding)
return -1;
style = Tokenizer_pop(self);
if (!style) {
Py_DECREF(padding);
return -1;
}

newline_character = PyUnicode_FromString("\n");
if (!newline_character) {
Py_DECREF(padding);
Py_DECREF(style);
return -1;
}
// continue to parse if it is NOT an inline table
if (PyUnicode_Contains(padding, newline_character)) {
Py_DECREF(newline_character);
self->head++;
table = Tokenizer_parse(self, LC_TABLE_OPEN, 1);
if (BAD_ROUTE) {
RESET_ROUTE();
// offset displacement done by parse()
self->head = reset - 1;
if (Tokenizer_emit_text(self, "{|"))
return -1;
return 0;
}
if (!table) {
Py_DECREF(padding);
Py_DECREF(style);
return -1;
}
} else {
Py_DECREF(newline_character);
// close tag
self->head += 2;
}

open_open_kwargs = PyDict_New();
if (!open_open_kwargs)
goto fail_decref_all;
open_wiki_markup = PyUnicode_FromString("{|");
if (!open_wiki_markup) {
Py_DECREF(open_open_kwargs);
goto fail_decref_all;
}
PyDict_SetItemString(open_open_kwargs, "wiki_markup", open_wiki_markup);
Py_DECREF(open_wiki_markup);
if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_open_kwargs))
goto fail_decref_all;
if (Tokenizer_emit_text(self, "table"))
goto fail_decref_all;

if (style) {
if (Tokenizer_emit_all(self, style))
goto fail_decref_padding_table;
Py_DECREF(style);
}

close_open_kwargs = PyDict_New();
if (!close_open_kwargs)
goto fail_decref_padding_table;
PyDict_SetItemString(close_open_kwargs, "padding", padding);
Py_DECREF(padding);
if (Tokenizer_emit_kwargs(self, TagCloseOpen, close_open_kwargs))
goto fail_decref_table;

if (table) {
if (Tokenizer_emit_all(self, table))
goto fail_decref_table;
Py_DECREF(table);
}

open_close_kwargs = PyDict_New();
if (!open_close_kwargs)
return -1;
close_wiki_markup = PyUnicode_FromString("|}");
if (!close_wiki_markup) {
Py_DECREF(open_close_kwargs);
return -1;
}
PyDict_SetItemString(open_close_kwargs, "wiki_markup", close_wiki_markup);
Py_DECREF(close_wiki_markup);
if (Tokenizer_emit_kwargs(self, TagOpenClose, open_close_kwargs))
return -1;
if (Tokenizer_emit_text(self, "table"))
return -1;
if (Tokenizer_emit(self, TagCloseClose))
return -1;
// offset displacement done by _parse()
self->head--;
return 0;

fail_decref_all:
Py_DECREF(style);
fail_decref_padding_table:
Py_DECREF(padding);
fail_decref_table:
Py_XDECREF(table);
return -1;
}

/*
Return the stack in order to handle the table end.
*/
static PyObject * Tokenizer_handle_table_end(Tokenizer* self)
{
self->head += 2;
return Tokenizer_pop(self);
}

/*
Parse as style until end of the line, then continue.
*/
static int Tokenizer_handle_table_row(Tokenizer* self)
{
Py_ssize_t reset = self->head;
self->head += 2;
PyObject *padding, *open_kwargs, *close_kwargs, *wiki_markup;
PyObject *style = NULL;

// If we can't recurse, still tokenize tag but parse style attrs as text
if (Tokenizer_CAN_RECURSE(self)) {
if(Tokenizer_push(self, LC_TABLE_OPEN))
return -1;
padding = Tokenizer_parse_as_table_style(self, '\n', 0);
if (BAD_ROUTE) {
self->head = reset;
return 0;
}
if (!padding)
return -1;
style = Tokenizer_pop(self);
if (!style) {
Py_DECREF(padding);
return -1;
}
} else {
padding = PyUnicode_FromString("");
if (!padding)
return -1;
}

open_kwargs = PyDict_New();
if (!open_kwargs)
goto fail_decref_all;
wiki_markup = PyUnicode_FromString("|-");
if (!wiki_markup) {
Py_DECREF(open_kwargs);
goto fail_decref_all;
}
PyDict_SetItemString(open_kwargs, "wiki_markup", wiki_markup);
Py_DECREF(wiki_markup);
if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_kwargs))
goto fail_decref_all;
if (Tokenizer_emit_text(self, "tr"))
goto fail_decref_all;

if (style) {
if (Tokenizer_emit_all(self, style))
goto fail_decref_all;
Py_DECREF(style);
}

close_kwargs = PyDict_New();
if (!close_kwargs)
goto fail_decref_all;
PyDict_SetItemString(close_kwargs, "padding", padding);
Py_DECREF(padding);
if (Tokenizer_emit_kwargs(self, TagCloseSelfclose, close_kwargs))
return -1;
return 0;

fail_decref_all:
Py_XDECREF(style);
Py_DECREF(padding);
return -1;
}

/*
Parse as normal syntax unless we hit a style marker, then parse style
as HTML attributes and the remainder as normal syntax.
*/
static int Tokenizer_handle_table_cell(Tokenizer* self, const char *markup,
const char *tag, uint64_t line_context)
{
if (!Tokenizer_CAN_RECURSE(self)) {
if (Tokenizer_emit_text(self, markup))
return -1;
self->head += strlen(markup) - 1;
return 0;
}

uint64_t old_context = self->topstack->context;
uint64_t cell_context;
Py_ssize_t reset = self->head;
self->head += strlen(markup);
PyObject *padding;
PyObject *cell, *open_kwargs, *close_kwargs, *open_wiki_markup, *close_wiki_markup;
PyObject *style = NULL;

cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | LC_TABLE_CELL_STYLE | line_context, 1);
if (BAD_ROUTE) {
self->head = reset;
return 0;
}
if (!cell)
return -1;
cell_context = self->topstack->context;
self->topstack->context = old_context;

if (cell_context & LC_TABLE_CELL_STYLE) {
Py_DECREF(cell);
self->head = reset + strlen(markup);
if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context))
return -1;
padding = Tokenizer_parse_as_table_style(self, '|', 0);
if (BAD_ROUTE) {
self->head = reset;
return 0;
}
if (!padding)
return -1;
style = Tokenizer_pop(self);
if (!style) {
Py_DECREF(padding);
return -1;
}
// Don't parse the style separator
self->head++;
cell = Tokenizer_parse(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | line_context, 1);
if (BAD_ROUTE) {
self->head = reset;
return 0;
}
if (!cell)
return -1;
cell_context = self->topstack->context;
self->topstack->context = old_context;
}
else {
padding = PyUnicode_FromString("");
if (!padding) {
Py_DECREF(cell);
return -1;
}
}

open_kwargs = PyDict_New();
if (!open_kwargs)
goto fail_decref_all;
close_kwargs = PyDict_New();
if (!close_kwargs)
goto fail_decref_all;
open_wiki_markup = PyUnicode_FromString(markup);
if (!open_wiki_markup)
goto fail_decref_all;
PyDict_SetItemString(open_kwargs, "wiki_markup", open_wiki_markup);
Py_DECREF(open_wiki_markup);
if (Tokenizer_emit_kwargs(self, TagOpenOpen, open_kwargs))
goto fail_decref_all;
if (Tokenizer_emit_text(self, tag))
goto fail_decref_all;

if (style) {
if (Tokenizer_emit_all(self, style))
goto fail_decref_all;
close_wiki_markup = PyUnicode_FromString("|");
if (!close_wiki_markup)
goto fail_decref_all;
PyDict_SetItemString(close_kwargs, "wiki_markup", close_wiki_markup);
Py_DECREF(close_wiki_markup);
Py_DECREF(style);
}

PyDict_SetItemString(close_kwargs, "padding", padding);
Py_DECREF(padding);
if (Tokenizer_emit_kwargs(self, TagCloseSelfclose, close_kwargs))
goto fail_decref_cell;
if (Tokenizer_emit_all(self, cell))
goto fail_decref_cell;
Py_DECREF(cell);
// keep header/cell line contexts
self->topstack->context |= cell_context & (LC_TABLE_TH_LINE | LC_TABLE_TD_LINE);
// offset displacement done by parse()
self->head--;
return 0;

fail_decref_all:
Py_XDECREF(style);
Py_DECREF(padding);
Py_XDECREF(open_kwargs);
Py_XDECREF(close_kwargs);
fail_decref_cell:
Py_DECREF(cell);
return -1;
}

/*
Returns the context, stack, and whether to reset the cell for style
in a tuple.
*/
static PyObject* Tokenizer_handle_table_cell_end(Tokenizer* self, int reset_for_style)
{
if (reset_for_style)
self->topstack->context |= LC_TABLE_CELL_STYLE;
else
self->topstack->context &= ~LC_TABLE_CELL_STYLE;
return Tokenizer_pop_keeping_context(self);
}

/*
Make sure we are not trying to write an invalid character. Return 0 if
everything is safe, or -1 if the route must be failed.
*/
@@ -2533,6 +2926,24 @@ static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE d
}

/*
Returns whether the current head has leading whitespace.
TODO: treat comments and templates as whitespace, allow fail on non-newline spaces.
*/
static int Tokenizer_has_leading_whitespace(Tokenizer* self)
{
int offset = 1;
Py_UNICODE current_character;
while (1) {
current_character = Tokenizer_READ_BACKWARDS(self, offset);
if (!current_character || current_character == '\n')
return 1;
else if (!Py_UNICODE_ISSPACE(current_character))
return 0;
offset++;
}
}

/*
Parse the wikicode string, using context for when to stop. If push is true,
we will push a new context, otherwise we won't and context will be ignored.
*/
@@ -2667,24 +3078,94 @@ static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
if (temp != Py_None)
return temp;
}
else if (!last || last == '\n') {
if (this == '#' || this == '*' || this == ';' || this == ':') {
if (Tokenizer_handle_list(self))
else if ((!last || last == '\n') && (this == '#' || this == '*' || this == ';' || this == ':')) {
if (Tokenizer_handle_list(self))
return NULL;
}
else if ((!last || last == '\n') && (this == '-' && this == next &&
this == Tokenizer_READ(self, 2) &&
this == Tokenizer_READ(self, 3))) {
if (Tokenizer_handle_hr(self))
return NULL;
}
else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) {
if (Tokenizer_handle_dl_term(self))
return NULL;
// kill potential table contexts
if (this == '\n')
self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS;
}

// Start of table parsing
else if (this == '{' && next == '|' && Tokenizer_has_leading_whitespace(self)) {
if (Tokenizer_CAN_RECURSE(self)) {
if (Tokenizer_handle_table_start(self))
return NULL;
}
else if (Tokenizer_emit_char(self, this) || Tokenizer_emit_char(self, next))
return NULL;
else
self->head++;
}
else if (this_context & LC_TABLE_OPEN) {
if (this == '|' && next == '|' && this_context & LC_TABLE_TD_LINE) {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "||", "td", LC_TABLE_TD_LINE))
return NULL;
}
else if (this == '|' && next == '|' && this_context & LC_TABLE_TH_LINE) {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "||", "th", LC_TABLE_TH_LINE))
return NULL;
}
else if (this == '-' && this == next &&
this == Tokenizer_READ(self, 2) &&
this == Tokenizer_READ(self, 3)) {
if (Tokenizer_handle_hr(self))
else if (this == '!' && next == '!' && this_context & LC_TABLE_TH_LINE) {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "!!", "th", LC_TABLE_TH_LINE))
return NULL;
}
else if (this == '|' && this_context & LC_TABLE_CELL_STYLE) {
return Tokenizer_handle_table_cell_end(self, 1);
}
// on newline, clear out cell line contexts
else if (this == '\n' && this_context & LC_TABLE_CELL_LINE_CONTEXTS) {
self->topstack->context &= ~LC_TABLE_CELL_LINE_CONTEXTS;
if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (Tokenizer_has_leading_whitespace(self)) {
if (this == '|' && next == '}') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else
return Tokenizer_handle_table_end(self);
}
else if (this == '|' && next == '-') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_row(self))
return NULL;
}
else if (this == '|') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "|", "td", LC_TABLE_TD_LINE))
return NULL;
}
else if (this == '!') {
if (this_context & LC_TABLE_CELL_OPEN)
return Tokenizer_handle_table_cell_end(self, 0);
else if (Tokenizer_handle_table_cell(self, "!", "th", LC_TABLE_TH_LINE))
return NULL;
}
else if (Tokenizer_emit_char(self, this))
return NULL;
}
else if (Tokenizer_emit_char(self, this))
return NULL;
}
else if ((this == '\n' || this == ':') && this_context & LC_DLTERM) {
if (Tokenizer_handle_dl_term(self))
return NULL;
}
else if (Tokenizer_emit_char(self, this))
return NULL;
self->head++;


+ 58
- 50
mwparserfromhell/parser/tokenizer.h Переглянути файл

@@ -44,9 +44,9 @@ SOFTWARE.

static const char MARKERS[] = {
'{', '}', '[', ']', '<', '>', '|', '=', '&', '\'', '#', '*', ';', ':', '/',
'-', '\n', '\0'};
'-', '!', '\n', '\0'};

#define NUM_MARKERS 18
#define NUM_MARKERS 19
#define TEXTBUFFER_BLOCKSIZE 1024
#define MAX_DEPTH 40
#define MAX_CYCLES 100000
@@ -110,60 +110,68 @@ static PyObject* TagCloseClose;

/* Local contexts: */

#define LC_TEMPLATE 0x00000007
#define LC_TEMPLATE_NAME 0x00000001
#define LC_TEMPLATE_PARAM_KEY 0x00000002
#define LC_TEMPLATE_PARAM_VALUE 0x00000004

#define LC_ARGUMENT 0x00000018
#define LC_ARGUMENT_NAME 0x00000008
#define LC_ARGUMENT_DEFAULT 0x00000010

#define LC_WIKILINK 0x00000060
#define LC_WIKILINK_TITLE 0x00000020
#define LC_WIKILINK_TEXT 0x00000040

#define LC_EXT_LINK 0x00000180
#define LC_EXT_LINK_URI 0x00000080
#define LC_EXT_LINK_TITLE 0x00000100

#define LC_HEADING 0x00007E00
#define LC_HEADING_LEVEL_1 0x00000200
#define LC_HEADING_LEVEL_2 0x00000400
#define LC_HEADING_LEVEL_3 0x00000800
#define LC_HEADING_LEVEL_4 0x00001000
#define LC_HEADING_LEVEL_5 0x00002000
#define LC_HEADING_LEVEL_6 0x00004000

#define LC_TAG 0x00078000
#define LC_TAG_OPEN 0x00008000
#define LC_TAG_ATTR 0x00010000
#define LC_TAG_BODY 0x00020000
#define LC_TAG_CLOSE 0x00040000

#define LC_STYLE 0x00780000
#define LC_STYLE_ITALICS 0x00080000
#define LC_STYLE_BOLD 0x00100000
#define LC_STYLE_PASS_AGAIN 0x00200000
#define LC_STYLE_SECOND_PASS 0x00400000

#define LC_DLTERM 0x00800000

#define LC_SAFETY_CHECK 0x3F000000
#define LC_HAS_TEXT 0x01000000
#define LC_FAIL_ON_TEXT 0x02000000
#define LC_FAIL_NEXT 0x04000000
#define LC_FAIL_ON_LBRACE 0x08000000
#define LC_FAIL_ON_RBRACE 0x10000000
#define LC_FAIL_ON_EQUALS 0x20000000

#define LC_TEMPLATE 0x0000000000000007
#define LC_TEMPLATE_NAME 0x0000000000000001
#define LC_TEMPLATE_PARAM_KEY 0x0000000000000002
#define LC_TEMPLATE_PARAM_VALUE 0x0000000000000004

#define LC_ARGUMENT 0x0000000000000018
#define LC_ARGUMENT_NAME 0x0000000000000008
#define LC_ARGUMENT_DEFAULT 0x0000000000000010

#define LC_WIKILINK 0x0000000000000060
#define LC_WIKILINK_TITLE 0x0000000000000020
#define LC_WIKILINK_TEXT 0x0000000000000040

#define LC_EXT_LINK 0x0000000000000180
#define LC_EXT_LINK_URI 0x0000000000000080
#define LC_EXT_LINK_TITLE 0x0000000000000100

#define LC_HEADING 0x0000000000007E00
#define LC_HEADING_LEVEL_1 0x0000000000000200
#define LC_HEADING_LEVEL_2 0x0000000000000400
#define LC_HEADING_LEVEL_3 0x0000000000000800
#define LC_HEADING_LEVEL_4 0x0000000000001000
#define LC_HEADING_LEVEL_5 0x0000000000002000
#define LC_HEADING_LEVEL_6 0x0000000000004000

#define LC_TAG 0x0000000000078000
#define LC_TAG_OPEN 0x0000000000008000
#define LC_TAG_ATTR 0x0000000000010000
#define LC_TAG_BODY 0x0000000000020000
#define LC_TAG_CLOSE 0x0000000000040000

#define LC_STYLE 0x0000000000780000
#define LC_STYLE_ITALICS 0x0000000000080000
#define LC_STYLE_BOLD 0x0000000000100000
#define LC_STYLE_PASS_AGAIN 0x0000000000200000
#define LC_STYLE_SECOND_PASS 0x0000000000400000

#define LC_DLTERM 0x0000000000800000

#define LC_SAFETY_CHECK 0x000000003F000000
#define LC_HAS_TEXT 0x0000000001000000
#define LC_FAIL_ON_TEXT 0x0000000002000000
#define LC_FAIL_NEXT 0x0000000004000000
#define LC_FAIL_ON_LBRACE 0x0000000008000000
#define LC_FAIL_ON_RBRACE 0x0000000010000000
#define LC_FAIL_ON_EQUALS 0x0000000020000000

// TODO realign all
#define LC_TABLE 0x00000007C0000000
#define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000700000000
#define LC_TABLE_OPEN 0x0000000040000000
#define LC_TABLE_CELL_OPEN 0x0000000080000000
#define LC_TABLE_CELL_STYLE 0x0000000100000000
#define LC_TABLE_TD_LINE 0x0000000200000000
#define LC_TABLE_TH_LINE 0x0000000400000000
/* Global contexts: */

#define GL_HEADING 0x1

/* Aggregate contexts: */

#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE)
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE | LC_TABLE_OPEN)
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI)


+ 1
- 1
mwparserfromhell/parser/tokenizer.py Переглянути файл

@@ -1134,7 +1134,7 @@ class Tokenizer(object):
self._emit_all(cell)
# keep header/cell line contexts
self._context |= cell_context & (contexts.TABLE_TH_LINE | contexts.TABLE_TD_LINE)
# offset displacement done by _parse()
# offset displacement done by parse()
self._head -= 1

def _handle_table_cell_end(self, reset_for_style=False):


Завантаження…
Відмінити
Зберегти