
Replace Python list of stacks with a singly linked list.

commit 6d73eeeab1 (tags/v0.2)
Ben Kurtovic, 11 years ago

2 changed files with 103 additions and 167 deletions:
  1. mwparserfromhell/parser/tokenizer.c (+67, -131)
  2. mwparserfromhell/parser/tokenizer.h (+36, -36)
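
What the change amounts to: each parser frame's (token stack, context,
textbuffer) triple, previously a three-element Python list appended to a
Python list of stacks, becomes a malloc'd C struct whose next pointer links
it to the frame beneath it, so the tokenizer only ever holds a pointer to the
top frame. A minimal standalone sketch of that pattern (illustrative only:
the payload is simplified to a bare context int so it compiles without
Python.h, and these helper names are not the commit's):

    #include <stdio.h>
    #include <stdlib.h>

    struct Frame {
        int context;         /* per-frame parse-context bitfield */
        struct Frame* next;  /* frame beneath this one, or NULL */
    };

    /* Push: allocate a node and prepend it, making it the new top. */
    static void push(struct Frame** top, int context)
    {
        struct Frame* node = malloc(sizeof *node);
        node->context = context;
        node->next = *top;
        *top = node;
    }

    /* Pop: unlink the top node, free it, and hand back its context. */
    static int pop(struct Frame** top)
    {
        struct Frame* node = *top;
        int context = node->context;
        *top = node->next;
        free(node);
        return context;
    }

    int main(void)
    {
        struct Frame* top = NULL;        /* empty stack, as in Tokenizer_init */
        push(&top, 0x0001);              /* e.g. LC_TEMPLATE_NAME */
        push(&top, 0x0020);              /* e.g. LC_WIKILINK_TITLE */
        printf("%#x\n", (unsigned) pop(&top));  /* 0x20: last in, first out */
        while (top)                      /* teardown walk, as in Tokenizer_dealloc */
            pop(&top);
        return 0;
    }

Both push and pop become O(1) pointer swaps: no PyList bookkeeping and no
boxed PyInt context to convert on every access. (Like the commit's
Tokenizer_push, the sketch leaves malloc unchecked.)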

mwparserfromhell/parser/tokenizer.c (+67, -131)

@@ -34,8 +34,14 @@ static void
 Tokenizer_dealloc(Tokenizer* self)
 {
     Py_XDECREF(self->text);
-    Py_XDECREF(self->stacks);
-    Py_XDECREF(self->topstack);
+    struct Stack *this = self->topstack, *next;
+    while (this) {
+        Py_DECREF(this->stack);
+        Py_DECREF(this->textbuffer);
+        next = this->next;
+        free(this);
+        this = next;
+    }
     self->ob_type->tp_free((PyObject*) self);
 }

@@ -47,57 +53,26 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
         return -1;
 
     self->text = Py_None;
-    self->topstack = Py_None;
     Py_INCREF(Py_None);
-    Py_INCREF(Py_None);
-
-    self->stacks = PyList_New(0);
-    if (!self->stacks) {
-        Py_DECREF(self);
-        return -1;
-    }
-
+    self->topstack = NULL;
     self->head = 0;
     self->length = 0;
     self->global = 0;
-
     return 0;
 }
 
-static int
-Tokenizer_set_context(Tokenizer* self, Py_ssize_t value)
-{
-    if (PyList_SetItem(self->topstack, 1, PyInt_FromSsize_t(value)))
-        return -1;
-    return 0;
-}
-
-static int
-Tokenizer_set_textbuffer(Tokenizer* self, PyObject* value)
-{
-    if (PyList_SetItem(self->topstack, 2, value))
-        return -1;
-    return 0;
-}
-
 /*
     Add a new token stack, context, and textbuffer to the list.
 */
-static int
-Tokenizer_push(Tokenizer* self, Py_ssize_t context)
+static void
+Tokenizer_push(Tokenizer* self, int context)
 {
-    PyObject* top = PyList_New(3);
-    if (!top) return -1;
-    PyList_SET_ITEM(top, 0, PyList_New(0));
-    PyList_SET_ITEM(top, 1, PyInt_FromSsize_t(context));
-    PyList_SET_ITEM(top, 2, PyList_New(0));
-
-    Py_XDECREF(self->topstack);
+    struct Stack* top = malloc(sizeof(struct Stack));
+    top->stack = PyList_New(0);
+    top->context = context;
+    top->textbuffer = PyList_New(0);
+    top->next = self->topstack;
     self->topstack = top;
-
-    if (PyList_Append(self->stacks, top))
-        return -1;
-    return 0;
 }
 
 /*
@@ -106,8 +81,8 @@ Tokenizer_push(Tokenizer* self, Py_ssize_t context)
 static int
 Tokenizer_push_textbuffer(Tokenizer* self)
 {
-    if (PyList_GET_SIZE(Tokenizer_TEXTBUFFER(self)) > 0) {
-        PyObject* text = PyUnicode_Join(EMPTY, Tokenizer_TEXTBUFFER(self));
+    if (PyList_GET_SIZE(self->topstack->textbuffer) > 0) {
+        PyObject* text = PyUnicode_Join(EMPTY, self->topstack->textbuffer);
         if (!text) return -1;
 
         PyObject* class = PyObject_GetAttrString(tokens, "Text");
@@ -129,37 +104,28 @@ Tokenizer_push_textbuffer(Tokenizer* self)
         Py_DECREF(kwargs);
         if (!token) return -1;
 
-        if (PyList_Append(Tokenizer_STACK(self), token)) {
+        if (PyList_Append(self->topstack->stack, token)) {
             Py_DECREF(token);
             return -1;
         }
 
         Py_DECREF(token);
 
-        if (Tokenizer_set_textbuffer(self, PyList_New(0)))
+        self->topstack->textbuffer = PyList_New(0);
+        if (!self->topstack->textbuffer)
             return -1;
     }
     return 0;
 }
 
-static int
+static void
 Tokenizer_delete_top_of_stack(Tokenizer* self)
 {
-    if (PySequence_DelItem(self->stacks, -1))
-        return -1;
-    Py_DECREF(self->topstack);
-
-    Py_ssize_t size = PyList_GET_SIZE(self->stacks);
-    if (size > 0) {
-        PyObject* top = PyList_GET_ITEM(self->stacks, size - 1);
-        self->topstack = top;
-        Py_INCREF(top);
-    }
-    else {
-        self->topstack = NULL;
-    }
-
-    return 0;
+    struct Stack* top = self->topstack;
+    Py_DECREF(top->stack);
+    Py_DECREF(top->textbuffer);
+    self->topstack = top->next;
+    free(top);
 }
 
 /*
@@ -171,12 +137,10 @@ Tokenizer_pop(Tokenizer* self)
     if (Tokenizer_push_textbuffer(self))
         return NULL;
 
-    PyObject* stack = Tokenizer_STACK(self);
+    PyObject* stack = self->topstack->stack;
     Py_INCREF(stack);
 
-    if (Tokenizer_delete_top_of_stack(self))
-        return NULL;
-
+    Tokenizer_delete_top_of_stack(self);
     return stack;
 }

@@ -190,17 +154,12 @@ Tokenizer_pop_keeping_context(Tokenizer* self)
     if (Tokenizer_push_textbuffer(self))
         return NULL;
 
-    PyObject* stack = Tokenizer_STACK(self);
-    PyObject* context = Tokenizer_CONTEXT(self);
+    PyObject* stack = self->topstack->stack;
     Py_INCREF(stack);
-    Py_INCREF(context);
-
-    if (Tokenizer_delete_top_of_stack(self))
-        return NULL;
-
-    if (PyList_SetItem(self->topstack, 1, context))
-        return NULL;
+    int context = self->topstack->context;
+
+    Tokenizer_delete_top_of_stack(self);
+    self->topstack->context = context;
     return stack;
 }

@@ -226,7 +185,7 @@ Tokenizer_write(Tokenizer* self, PyObject* token)
     if (Tokenizer_push_textbuffer(self))
         return -1;
 
-    if (PyList_Append(Tokenizer_STACK(self), token))
+    if (PyList_Append(self->topstack->stack, token))
         return -1;
 
     return 0;
@@ -241,7 +200,7 @@ Tokenizer_write_first(Tokenizer* self, PyObject* token)
     if (Tokenizer_push_textbuffer(self))
         return -1;
 
-    if (PyList_Insert(Tokenizer_STACK(self), 0, token))
+    if (PyList_Insert(self->topstack->stack, 0, token))
         return -1;
 
     return 0;
@@ -253,7 +212,7 @@ Tokenizer_write_first(Tokenizer* self, PyObject* token)
 static int
 Tokenizer_write_text(Tokenizer* self, PyObject* text)
 {
-    if (PyList_Append(Tokenizer_TEXTBUFFER(self), text))
+    if (PyList_Append(self->topstack->textbuffer, text))
         return -1;
 
     return 0;
@@ -302,7 +261,7 @@ Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist)
     if (Tokenizer_push_textbuffer(self))
         return -1;
 
-    PyObject* stack = Tokenizer_STACK(self);
+    PyObject* stack = self->topstack->stack;
     Py_ssize_t size = PyList_GET_SIZE(stack);
 
     if (PyList_SetSlice(stack, size, size, tokenlist))
@@ -579,7 +538,7 @@ Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[])
     if (Tokenizer_push_textbuffer(self))
         return -1;
 
-    PyObject* stack = Tokenizer_STACK(self);
+    PyObject* stack = self->topstack->stack;
     if (stack) {
         PyObject* textlist = PyList_New(0);
         if (!textlist) return -1;
@@ -673,24 +632,18 @@ Tokenizer_verify_safe(Tokenizer* self, const char* unsafes[])
 static int
 Tokenizer_handle_template_param(Tokenizer* self)
 {
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-
-    if (context & LC_TEMPLATE_NAME) {
+    if (self->topstack->context & LC_TEMPLATE_NAME) {
         const char* unsafes[] = {"\n", "{", "}", "[", "]", NULL};
         if (Tokenizer_verify_safe(self, unsafes))
             return -1;
         if (BAD_ROUTE) return -1;
-        context ^= LC_TEMPLATE_NAME;
-        if (Tokenizer_set_context(self, context))
-            return -1;
+        self->topstack->context ^= LC_TEMPLATE_NAME;
     }
-    else if (context & LC_TEMPLATE_PARAM_VALUE) {
-        context ^= LC_TEMPLATE_PARAM_VALUE;
-        if (Tokenizer_set_context(self, context))
-            return -1;
+    else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE) {
+        self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE;
     }
 
-    if (context & LC_TEMPLATE_PARAM_KEY) {
+    if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
         PyObject* stack = Tokenizer_pop_keeping_context(self);
         if (!stack) return -1;
         if (Tokenizer_write_all(self, stack)) {
@@ -700,9 +653,7 @@ Tokenizer_handle_template_param(Tokenizer* self)
         Py_DECREF(stack);
     }
     else {
-        context |= LC_TEMPLATE_PARAM_KEY;
-        if (Tokenizer_set_context(self, context))
-            return -1;
+        self->topstack->context |= LC_TEMPLATE_PARAM_KEY;
     }
 
     PyObject* class = PyObject_GetAttrString(tokens, "TemplateParamSeparator");
@@ -717,7 +668,7 @@ Tokenizer_handle_template_param(Tokenizer* self)
     }
     Py_DECREF(token);
 
-    Tokenizer_push(self, context);
+    Tokenizer_push(self, self->topstack->context);
     return 0;
 }

@@ -744,11 +695,8 @@ Tokenizer_handle_template_param_value(Tokenizer* self)
     }
     Py_DECREF(stack);
 
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-    context ^= LC_TEMPLATE_PARAM_KEY;
-    context |= LC_TEMPLATE_PARAM_VALUE;
-    if (Tokenizer_set_context(self, context))
-        return -1;
+    self->topstack->context ^= LC_TEMPLATE_PARAM_KEY;
+    self->topstack->context |= LC_TEMPLATE_PARAM_VALUE;
 
     PyObject* class = PyObject_GetAttrString(tokens, "TemplateParamEquals");
     if (!class) return -1;
@@ -771,14 +719,12 @@ static PyObject*
 Tokenizer_handle_template_end(Tokenizer* self)
 {
     PyObject* stack;
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-
-    if (context & LC_TEMPLATE_NAME) {
+    if (self->topstack->context & LC_TEMPLATE_NAME) {
         const char* unsafes[] = {"\n", "{", "}", "[", "]", NULL};
         if (Tokenizer_verify_safe(self, unsafes))
             return NULL;
     }
-    else if (context & LC_TEMPLATE_PARAM_KEY) {
+    else if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
         stack = Tokenizer_pop_keeping_context(self);
         if (!stack) return NULL;
         if (Tokenizer_write_all(self, stack)) {
@@ -803,11 +749,8 @@ Tokenizer_handle_argument_separator(Tokenizer* self)
     if (Tokenizer_verify_safe(self, unsafes))
         return -1;
 
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-    context ^= LC_ARGUMENT_NAME;
-    context |= LC_ARGUMENT_DEFAULT;
-    if (Tokenizer_set_context(self, context))
-        return -1;
+    self->topstack->context ^= LC_ARGUMENT_NAME;
+    self->topstack->context |= LC_ARGUMENT_DEFAULT;
 
     PyObject* class = PyObject_GetAttrString(tokens, "ArgumentSeparator");
     if (!class) return -1;
@@ -829,7 +772,7 @@ Tokenizer_handle_argument_separator(Tokenizer* self)
 static PyObject*
 Tokenizer_handle_argument_end(Tokenizer* self)
 {
-    if (Tokenizer_CONTEXT_VAL(self) & LC_ARGUMENT_NAME) {
+    if (self->topstack->context & LC_ARGUMENT_NAME) {
         const char* unsafes[] = {"\n", "{{", "}}", NULL};
         if (Tokenizer_verify_safe(self, unsafes))
             return NULL;
@@ -914,11 +857,8 @@ Tokenizer_handle_wikilink_separator(Tokenizer* self)
     if (Tokenizer_verify_safe(self, unsafes))
         return -1;
 
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-    context ^= LC_WIKILINK_TITLE;
-    context |= LC_WIKILINK_TEXT;
-    if (Tokenizer_set_context(self, context))
-        return -1;
+    self->topstack->context ^= LC_WIKILINK_TITLE;
+    self->topstack->context |= LC_WIKILINK_TEXT;
 
     PyObject* class = PyObject_GetAttrString(tokens, "WikilinkSeparator");
     if (!class) return -1;
@@ -940,7 +880,7 @@ Tokenizer_handle_wikilink_separator(Tokenizer* self)
 static PyObject*
 Tokenizer_handle_wikilink_end(Tokenizer* self)
 {
-    if (Tokenizer_CONTEXT_VAL(self) & LC_WIKILINK_TITLE) {
+    if (self->topstack->context & LC_WIKILINK_TITLE) {
         const char* unsafes[] = {"\n", "{", "}", "[", "]", NULL};
         if (Tokenizer_verify_safe(self, unsafes))
             return NULL;
@@ -960,7 +900,7 @@ Tokenizer_parse_heading(Tokenizer* self)
     self->global |= GL_HEADING;
     Py_ssize_t reset = self->head;
     self->head += 1;
-    Py_ssize_t best = 1;
+    int best = 1;
     PyObject* text;
     int i;

@@ -969,7 +909,7 @@ Tokenizer_parse_heading(Tokenizer* self)
         self->head++;
     }
 
-    Py_ssize_t context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1);
+    int context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1);
     HeadingData* heading = (HeadingData*) Tokenizer_parse(self, context);
 
     if (BAD_ROUTE) {
@@ -1032,7 +972,7 @@ Tokenizer_parse_heading(Tokenizer* self)
     Py_DECREF(token);
 
     if (heading->level < best) {
-        Py_ssize_t diff = best - heading->level;
+        int diff = best - heading->level;
         char diffblocks[diff];
         for (i = 0; i < diff; i++) diffblocks[i] = *"=";
         PyObject* text = PyUnicode_FromStringAndSize(diffblocks, diff);
@@ -1092,16 +1032,14 @@ Tokenizer_handle_heading_end(Tokenizer* self)
         self->head++;
     }
 
-    Py_ssize_t current = log2(Tokenizer_CONTEXT_VAL(self) / LC_HEADING_LEVEL_1) + 1;
-    Py_ssize_t level = current > best ? (best > 6 ? 6 : best) : (current > 6 ? 6 : current);
-
-    Py_ssize_t context = Tokenizer_CONTEXT_VAL(self);
-    HeadingData* after = (HeadingData*) Tokenizer_parse(self, context);
+    int current = log2(self->topstack->context / LC_HEADING_LEVEL_1) + 1;
+    int level = current > best ? (best > 6 ? 6 : best) : (current > 6 ? 6 : current);
+    HeadingData* after = (HeadingData*) Tokenizer_parse(self, self->topstack->context);
 
     if (BAD_ROUTE) {
         RESET_ROUTE();
         if (level < best) {
-            Py_ssize_t diff = best - level;
+            int diff = best - level;
             char diffblocks[diff];
             for (i = 0; i < diff; i++) diffblocks[i] = *"=";
             text = PyUnicode_FromStringAndSize(diffblocks, diff);
@@ -1174,8 +1112,7 @@ static int
 Tokenizer_parse_entity(Tokenizer* self)
 {
     Py_ssize_t reset = self->head;
-    if (Tokenizer_push(self, 0))
-        return -1;
+    Tokenizer_push(self, 0);
 
     if (Tokenizer_really_parse_entity(self))
         return -1;
@@ -1268,13 +1205,12 @@ Tokenizer_parse_comment(Tokenizer* self)
     Parse the wikicode string, using context for when to stop.
 */
 static PyObject*
-Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
+Tokenizer_parse(Tokenizer* self, int context)
 {
     PyObject *this;
     Py_UNICODE this_data, next, next_next, last;
-    Py_ssize_t this_context;
-    Py_ssize_t fail_contexts = (
-        LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_COMMENT);
+    int this_context;
+    int fail_contexts = LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_COMMENT;
     int is_marker, i;
 
     Tokenizer_push(self, context);
@@ -1297,7 +1233,7 @@ Tokenizer_parse(Tokenizer* self, Py_ssize_t context)
             continue;
         }
 
-        this_context = Tokenizer_CONTEXT_VAL(self);
+        this_context = self->topstack->context;
 
         if (this_data == *"") {
             if (this_context & LC_TEMPLATE_PARAM_KEY) {


mwparserfromhell/parser/tokenizer.h (+36, -36)

@@ -46,40 +46,47 @@ static PyObject* tokens;

 /* Local contexts: */
 
-static const Py_ssize_t LC_TEMPLATE = 0x0007;
-static const Py_ssize_t LC_TEMPLATE_NAME = 0x0001;
-static const Py_ssize_t LC_TEMPLATE_PARAM_KEY = 0x0002;
-static const Py_ssize_t LC_TEMPLATE_PARAM_VALUE = 0x0004;
+static const int LC_TEMPLATE = 0x0007;
+static const int LC_TEMPLATE_NAME = 0x0001;
+static const int LC_TEMPLATE_PARAM_KEY = 0x0002;
+static const int LC_TEMPLATE_PARAM_VALUE = 0x0004;
 
-static const Py_ssize_t LC_ARGUMENT = 0x0018;
-static const Py_ssize_t LC_ARGUMENT_NAME = 0x0008;
-static const Py_ssize_t LC_ARGUMENT_DEFAULT = 0x0010;
+static const int LC_ARGUMENT = 0x0018;
+static const int LC_ARGUMENT_NAME = 0x0008;
+static const int LC_ARGUMENT_DEFAULT = 0x0010;
 
-static const Py_ssize_t LC_WIKILINK = 0x0060;
-static const Py_ssize_t LC_WIKILINK_TITLE = 0x0020;
-static const Py_ssize_t LC_WIKILINK_TEXT = 0x0040;
+static const int LC_WIKILINK = 0x0060;
+static const int LC_WIKILINK_TITLE = 0x0020;
+static const int LC_WIKILINK_TEXT = 0x0040;
 
-static const Py_ssize_t LC_HEADING = 0x1f80;
-static const Py_ssize_t LC_HEADING_LEVEL_1 = 0x0080;
-static const Py_ssize_t LC_HEADING_LEVEL_2 = 0x0100;
-static const Py_ssize_t LC_HEADING_LEVEL_3 = 0x0200;
-static const Py_ssize_t LC_HEADING_LEVEL_4 = 0x0400;
-static const Py_ssize_t LC_HEADING_LEVEL_5 = 0x0800;
-static const Py_ssize_t LC_HEADING_LEVEL_6 = 0x1000;
+static const int LC_HEADING = 0x1f80;
+static const int LC_HEADING_LEVEL_1 = 0x0080;
+static const int LC_HEADING_LEVEL_2 = 0x0100;
+static const int LC_HEADING_LEVEL_3 = 0x0200;
+static const int LC_HEADING_LEVEL_4 = 0x0400;
+static const int LC_HEADING_LEVEL_5 = 0x0800;
+static const int LC_HEADING_LEVEL_6 = 0x1000;
 
-static const Py_ssize_t LC_COMMENT = 0x2000;
+static const int LC_COMMENT = 0x2000;
 
 
 /* Global contexts: */
 
-static const Py_ssize_t GL_HEADING = 0x1;
+static const int GL_HEADING = 0x1;
 
 
 /* Miscellaneous structs: */
 
+struct Stack {
+    PyObject* stack;
+    int context;
+    PyObject* textbuffer;
+    struct Stack* next;
+};
+
 typedef struct {
     PyObject* title;
-    Py_ssize_t level;
+    int level;
 } HeadingData;


@@ -87,22 +94,17 @@ typedef struct {

 typedef struct {
     PyObject_HEAD
-    PyObject* text;        /* text to tokenize */
-    PyObject* stacks;      /* token stacks */
-    PyObject* topstack;    /* topmost stack */
-    Py_ssize_t head;       /* current position in text */
-    Py_ssize_t length;     /* length of text */
-    Py_ssize_t global;     /* global context */
+    PyObject* text;            /* text to tokenize */
+    struct Stack* topstack;    /* topmost stack */
+    Py_ssize_t head;           /* current position in text */
+    Py_ssize_t length;         /* length of text */
+    int global;                /* global context */
 } Tokenizer;
 
 
 /* Macros for accessing Tokenizer data: */
 
-#define Tokenizer_STACK(self) PySequence_Fast_GET_ITEM(self->topstack, 0)
-#define Tokenizer_CONTEXT(self) PySequence_Fast_GET_ITEM(self->topstack, 1)
-#define Tokenizer_CONTEXT_VAL(self) PyInt_AsSsize_t(Tokenizer_CONTEXT(self))
-#define Tokenizer_TEXTBUFFER(self) PySequence_Fast_GET_ITEM(self->topstack, 2)
-#define Tokenizer_READ(self, num) PyUnicode_AS_UNICODE(Tokenizer_read(self, num))
+#define Tokenizer_READ(self, delta) PyUnicode_AS_UNICODE(Tokenizer_read(self, delta))
 
 
 /* Tokenizer function prototypes: */
@@ -110,11 +112,9 @@ typedef struct {
 static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
 static void Tokenizer_dealloc(Tokenizer*);
 static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
-static int Tokenizer_set_context(Tokenizer*, Py_ssize_t);
-static int Tokenizer_set_textbuffer(Tokenizer*, PyObject*);
-static int Tokenizer_push(Tokenizer*, Py_ssize_t);
+static void Tokenizer_push(Tokenizer*, int);
 static int Tokenizer_push_textbuffer(Tokenizer*);
-static int Tokenizer_delete_top_of_stack(Tokenizer*);
+static void Tokenizer_delete_top_of_stack(Tokenizer*);
 static PyObject* Tokenizer_pop(Tokenizer*);
 static PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
 static void* Tokenizer_fail_route(Tokenizer*);
@@ -142,7 +142,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer*);
 static int Tokenizer_really_parse_entity(Tokenizer*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
-static PyObject* Tokenizer_parse(Tokenizer*, Py_ssize_t);
+static PyObject* Tokenizer_parse(Tokenizer*, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
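
One detail from the diffs above that is easy to miss: the heading contexts
are one-hot bits (LC_HEADING_LEVEL_1 = 0x0080 through LC_HEADING_LEVEL_6 =
0x1000), which is why Tokenizer_handle_heading_end can recover the numeric
level arithmetically with log2(context / LC_HEADING_LEVEL_1) + 1. A quick
standalone check using the constants from tokenizer.h (compile with -lm):

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        const int LC_HEADING_LEVEL_1 = 0x0080;
        const int LC_HEADING_LEVEL_3 = 0x0200;
        /* 0x0200 / 0x0080 = 4, log2(4) = 2, +1 gives heading level 3 */
        int current = (int) log2(LC_HEADING_LEVEL_3 / LC_HEADING_LEVEL_1) + 1;
        printf("%d\n", current);  /* prints 3 */
        return 0;
    }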



