Replace textbuffer with our own data structure.

tags/v0.2
Ben Kurtovic, 11 years ago
commit cdef073a9b
2 changed files with 211 additions and 157 deletions
  1. mwparserfromhell/parser/tokenizer.c: +175 -132
  2. mwparserfromhell/parser/tokenizer.h: +36 -25
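
In outline: the commit replaces the per-stack textbuffer, previously a Python list of string fragments joined on flush, with a hand-rolled singly linked list of fixed-size Py_UNICODE blocks, newest block first. A minimal standalone sketch of the idea follows; it mirrors Textbuffer_new and the growth logic inside Tokenizer_write_text from the diff below, but uses wchar_t in place of Py_UNICODE and a made-up helper named Textbuffer_write, so it compiles without the Python headers.

/* Standalone sketch of the new text buffer: a singly linked list of
 * fixed-size character blocks, with the newest block at the head.
 * wchar_t stands in for Py_UNICODE; Textbuffer_write is a made-up
 * helper (the diff keeps that logic inline in Tokenizer_write_text). */
#include <stdio.h>
#include <stdlib.h>
#include <wchar.h>

#define TEXTBUFFER_BLOCKSIZE 1024   /* same constant as in tokenizer.h */

struct Textbuffer {
    size_t size;                 /* characters used in this block */
    wchar_t* data;               /* fixed-size block of characters */
    struct Textbuffer* next;     /* older, already-filled blocks */
};

static struct Textbuffer*
Textbuffer_new(void)
{
    struct Textbuffer* buffer = malloc(sizeof(struct Textbuffer));
    if (!buffer)
        return NULL;
    buffer->size = 0;
    buffer->data = malloc(sizeof(wchar_t) * TEXTBUFFER_BLOCKSIZE);
    if (!buffer->data) {
        free(buffer);
        return NULL;
    }
    buffer->next = NULL;
    return buffer;
}

static int
Textbuffer_write(struct Textbuffer** head, wchar_t ch)
{
    struct Textbuffer* buf = *head;
    if (buf->size == TEXTBUFFER_BLOCKSIZE) {
        /* head block is full: chain a fresh block onto the front */
        struct Textbuffer* fresh = Textbuffer_new();
        if (!fresh)
            return -1;
        fresh->next = buf;
        *head = fresh;
        buf = fresh;
    }
    buf->data[buf->size++] = ch;
    return 0;
}

int main(void)
{
    struct Textbuffer* head = Textbuffer_new();
    const char* sample = "{{foo|bar}}";
    int i;
    if (!head)
        return 1;
    for (i = 0; sample[i]; i++) {
        if (Textbuffer_write(&head, (wchar_t) sample[i]))
            return 1;
    }
    printf("head block holds %zu characters\n", head->size);
    while (head) {   /* free the chain, as Textbuffer_dealloc does */
        struct Textbuffer* next = head->next;
        free(head->data);
        free(head);
        head = next;
    }
    return 0;
}

Appending a character is now an array store plus an occasional malloc, instead of a PyList_Append of a Python string object.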

mwparserfromhell/parser/tokenizer.c: +175 -132

@@ -30,6 +30,25 @@ Tokenizer_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
     return (PyObject*) self;
 }
 
+static struct Textbuffer*
+Textbuffer_new(void)
+{
+    struct Textbuffer* buffer = malloc(sizeof(struct Textbuffer));
+    if (!buffer) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    buffer->size = 0;
+    buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE);
+    if (!buffer->data) {
+        free(buffer);
+        PyErr_NoMemory();
+        return NULL;
+    }
+    buffer->next = NULL;
+    return buffer;
+}
+
 static void
 Tokenizer_dealloc(Tokenizer* self)
 {
@@ -37,7 +56,7 @@ Tokenizer_dealloc(Tokenizer* self)
     struct Stack *this = self->topstack, *next;
     while (this) {
         Py_DECREF(this->stack);
-        Py_DECREF(this->textbuffer);
+        Textbuffer_dealloc(this->textbuffer);
         next = this->next;
         free(this);
         this = next;
@@ -45,6 +64,18 @@ Tokenizer_dealloc(Tokenizer* self)
     self->ob_type->tp_free((PyObject*) self);
 }
 
+static void
+Textbuffer_dealloc(struct Textbuffer* this)
+{
+    struct Textbuffer* next;
+    while (this) {
+        free(this->data);
+        next = this->next;
+        free(this);
+        this = next;
+    }
+}
+
 static int
 Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
 {
@@ -64,15 +95,32 @@ Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
 /*
     Add a new token stack, context, and textbuffer to the list.
 */
-static void
+static int
 Tokenizer_push(Tokenizer* self, int context)
 {
     struct Stack* top = malloc(sizeof(struct Stack));
+    if (!top) {
+        PyErr_NoMemory();
+        return -1;
+    }
     top->stack = PyList_New(0);
     top->context = context;
-    top->textbuffer = PyList_New(0);
+    top->textbuffer = Textbuffer_new();
+    if (!top->textbuffer) {
+        return -1;
+    }
     top->next = self->topstack;
     self->topstack = top;
+    return 0;
+}
+
+/*
+    Return the contents of the textbuffer as a Python Unicode object.
+*/
+static PyObject*
+Textbuffer_render(struct Textbuffer* self)
+{
+    return PyUnicode_FromUnicode(self->data, self->size);
 }
 
 /*
@@ -81,32 +129,35 @@ Tokenizer_push(Tokenizer* self, int context)
 static int
 Tokenizer_push_textbuffer(Tokenizer* self)
 {
-    if (PyList_GET_SIZE(self->topstack->textbuffer) > 0) {
-        PyObject* text = PyUnicode_Join(EMPTY, self->topstack->textbuffer);
-        if (!text) return -1;
-
-        PyObject* kwargs = PyDict_New();
-        if (!kwargs) {
-            Py_DECREF(text);
-            return -1;
-        }
-        PyDict_SetItemString(kwargs, "text", text);
-        Py_DECREF(text);
-
-        PyObject* token = PyObject_Call(Text, NOARGS, kwargs);
-        Py_DECREF(kwargs);
-        if (!token) return -1;
-
-        if (PyList_Append(self->topstack->stack, token)) {
-            Py_DECREF(token);
-            return -1;
-        }
-
-        Py_DECREF(token);
-
-        self->topstack->textbuffer = PyList_New(0);
-        if (!self->topstack->textbuffer)
-            return -1;
+    struct Textbuffer* buffer = self->topstack->textbuffer;
+    if (buffer->size == 0 && !buffer->next) {
+        return 0;
+    }
+    PyObject* text = Textbuffer_render(buffer);
+    if (!text) return -1;
+
+    PyObject* kwargs = PyDict_New();
+    if (!kwargs) {
+        Py_DECREF(text);
+        return -1;
+    }
+    PyDict_SetItemString(kwargs, "text", text);
+    Py_DECREF(text);
+
+    PyObject* token = PyObject_Call(Text, NOARGS, kwargs);
+    Py_DECREF(kwargs);
+    if (!token) return -1;
+
+    if (PyList_Append(self->topstack->stack, token)) {
+        Py_DECREF(token);
+        return -1;
+    }
+
+    Py_DECREF(token);
+
+    self->topstack->textbuffer = Textbuffer_new();
+    if (!self->topstack->textbuffer) {
+        return -1;
     }
     return 0;
 }
@@ -116,7 +167,7 @@ Tokenizer_delete_top_of_stack(Tokenizer* self)
 {
     struct Stack* top = self->topstack;
     Py_DECREF(top->stack);
-    Py_DECREF(top->textbuffer);
+    Textbuffer_dealloc(top->textbuffer);
     self->topstack = top->next;
     free(top);
 }
@@ -203,11 +254,20 @@ Tokenizer_write_first(Tokenizer* self, PyObject* token)
     Write text to the current textbuffer.
 */
 static int
-Tokenizer_write_text(Tokenizer* self, PyObject* text)
+Tokenizer_write_text(Tokenizer* self, Py_UNICODE text)
 {
-    if (PyList_Append(self->topstack->textbuffer, text))
-        return -1;
-
+    struct Textbuffer* buf = self->topstack->textbuffer;
+    if (buf->size == TEXTBUFFER_BLOCKSIZE) {
+        struct Textbuffer* new = Textbuffer_new();
+        if (!new) {
+            return -1;
+        }
+        new->next = buf;
+        self->topstack->textbuffer = new;
+        buf = new;
+    }
+    buf->data[buf->size] = text;
+    buf->size++;
     return 0;
 }

@@ -217,35 +277,48 @@ Tokenizer_write_text(self, PyObject* text)
 static int
 Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist)
 {
+    int pushed = 0;
     if (PyList_GET_SIZE(tokenlist) > 0) {
         PyObject* token = PyList_GET_ITEM(tokenlist, 0);
-
-        PyObject* text;
         switch (PyObject_IsInstance(token, Text)) {
             case 0:
                 break;
-            case 1:
-                text = PyObject_GetAttrString(token, "text");
-                if (!text) {
-                    return -1;
-                }
-                if (PySequence_DelItem(tokenlist, 0)) {
-                    Py_DECREF(text);
-                    return -1;
-                }
-                if (Tokenizer_write_text(self, text)) {
-                    Py_DECREF(text);
-                    return -1;
-                }
-                Py_DECREF(text);
+            case 1: {
+                pushed = 1;
+                struct Textbuffer* buffer = self->topstack->textbuffer;
+                if (buffer->size == 0 && !buffer->next) {
+                    break;
+                }
+                PyObject* left = Textbuffer_render(buffer);
+                if (!left) return -1;
+                PyObject* right = PyObject_GetAttrString(token, "text");
+                if (!right) return -1;
+
+                PyObject* text = PyUnicode_Concat(left, right);
+                Py_DECREF(left);
+                Py_DECREF(right);
+                if (!text) return -1;
+
+                if (PyObject_SetAttrString(token, "text", text)) {
+                    Py_DECREF(text);
+                    return -1;
+                }
+                Py_DECREF(text);
+
+                self->topstack->textbuffer = Textbuffer_new();
+                if (!self->topstack->textbuffer) {
+                    return -1;
+                }
                 break;
+            }
             case -1:
                 return -1;
         }
     }
-
-    if (Tokenizer_push_textbuffer(self))
-        return -1;
+    if (!pushed) {
+        if (Tokenizer_push_textbuffer(self))
+            return -1;
+    }
 
     PyObject* stack = self->topstack->stack;
     Py_ssize_t size = PyList_GET_SIZE(stack);
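
A note on the case 1 branch above: when the token list to be written starts with a Text token, the buffered characters are rendered and glued onto the front of that token's text attribute with PyUnicode_Concat, and the pushed flag keeps the buffer from also being flushed as a separate Text token. Textbuffer_render itself converts just the head block (self->data up to self->size). In terms of the standalone sketch from the introduction, a head-only render looks like:

/* Head-block render for the standalone sketch above. As in the diff's
 * Textbuffer_render, only the newest block is converted; blocks reached
 * through ->next are not included. The caller frees the result. */
static wchar_t*
Textbuffer_render_head(const struct Textbuffer* buf)
{
    wchar_t* out = malloc(sizeof(wchar_t) * (buf->size + 1));
    if (!out)
        return NULL;
    wmemcpy(out, buf->data, buf->size);
    out[buf->size] = L'\0';      /* NUL-terminate for easy printing */
    return out;
}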
@@ -257,15 +330,21 @@ Tokenizer_write_all(Tokenizer* self, PyObject* tokenlist)
 }
 
 /*
-    Pop the current stack, write text, and then write the stack.
+    Pop the current stack, write text, and then write the stack. 'text' is a
+    NULL-terminated array of chars.
 */
 static int
-Tokenizer_write_text_then_stack(Tokenizer* self, PyObject* text)
+Tokenizer_write_text_then_stack(Tokenizer* self, const char* text)
 {
     PyObject* stack = Tokenizer_pop(self);
-    if (Tokenizer_write_text(self, text)) {
-        Py_XDECREF(stack);
-        return -1;
+    int i = 0;
+    while (1) {
+        if (!text[i]) break;
+        if (Tokenizer_write_text(self, (Py_UNICODE) text[i])) {
+            Py_XDECREF(stack);
+            return -1;
+        }
+        i++;
     }
 
     if (stack) {
@@ -322,17 +401,13 @@ Tokenizer_parse_template_or_argument(Tokenizer* self)
         self->head++;
         braces++;
     }
-    Tokenizer_push(self, 0);
+    if (Tokenizer_push(self, 0))
+        return -1;
 
     while (braces) {
         if (braces == 1) {
-            PyObject* text = PyUnicode_FromString("{");
-            if (Tokenizer_write_text_then_stack(self, text)) {
-                Py_XDECREF(text);
+            if (Tokenizer_write_text_then_stack(self, "{"))
                 return -1;
-            }
-
-            Py_XDECREF(text);
             return 0;
         }

@@ -342,13 +417,8 @@ Tokenizer_parse_template_or_argument(Tokenizer* self)
 
     if (BAD_ROUTE) {
         RESET_ROUTE();
-        PyObject* text = PyUnicode_FromString("{{");
-        if (Tokenizer_write_text_then_stack(self, text)) {
-            Py_XDECREF(text);
+        if (Tokenizer_write_text_then_stack(self, "{{"))
             return -1;
-        }
-
-        Py_XDECREF(text);
         return 0;
     }
     break;
@@ -364,10 +434,9 @@ Tokenizer_parse_template_or_argument(Tokenizer* self)
 
     if (BAD_ROUTE) {
         RESET_ROUTE();
-        char bracestr[braces];
-        for (i = 0; i < braces; i++) bracestr[i] = *"{";
-        PyObject* text = PyUnicode_FromStringAndSize(bracestr, braces);
-
+        char text[braces + 1];
+        for (i = 0; i < braces; i++) text[i] = *"{";
+        text[braces] = *"";
         if (Tokenizer_write_text_then_stack(self, text)) {
-            Py_XDECREF(text);
             return -1;
@@ -623,7 +692,8 @@ Tokenizer_handle_template_param(Tokenizer* self)
     }
     Py_DECREF(token);
 
-    Tokenizer_push(self, self->topstack->context);
+    if (Tokenizer_push(self, self->topstack->context))
+        return -1;
     return 0;
 }

@@ -748,11 +818,10 @@ Tokenizer_parse_wikilink(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        PyObject* text = PyUnicode_FromString("[[");
-        if (!text) return -1;
-        if (Tokenizer_write_text(self, text)) {
-            Py_XDECREF(text);
-            return -1;
+        int i;
+        for (i = 0; i < 2; i++) {
+            if (Tokenizer_write_text(self, *"["))
+                return -1;
         }
         return 0;
     }
@@ -837,9 +906,7 @@ Tokenizer_parse_heading(Tokenizer* self)
     self->global |= GL_HEADING;
     Py_ssize_t reset = self->head;
     self->head += 1;
-    int best = 1;
-    PyObject* text;
-    int i;
+    int best = 1, i;
 
     while (*Tokenizer_READ(self, 0) == *"=") {
         best++;
@@ -852,16 +919,11 @@ Tokenizer_parse_heading(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset + best - 1;
-        char blocks[best];
-        for (i = 0; i < best; i++) blocks[i] = *"=";
-        text = PyUnicode_FromStringAndSize(blocks, best);
-        if (!text) return -1;
-
-        if (Tokenizer_write_text_then_stack(self, text)) {
-            Py_DECREF(text);
+        char text[best + 1];
+        for (i = 0; i < best; i++) text[i] = *"=";
+        text[best] = *"";
+        if (Tokenizer_write_text_then_stack(self, text))
             return -1;
-        }
-        Py_DECREF(text);
         self->global ^= GL_HEADING;
         return 0;
     }
@@ -901,22 +963,14 @@ Tokenizer_parse_heading(Tokenizer* self)
 
     if (heading->level < best) {
         int diff = best - heading->level;
-        char diffblocks[diff];
-        for (i = 0; i < diff; i++) diffblocks[i] = *"=";
-        PyObject* text = PyUnicode_FromStringAndSize(diffblocks, diff);
-        if (!text) {
+        char difftext[diff + 1];
+        for (i = 0; i < diff; i++) difftext[i] = *"=";
+        difftext[diff] = *"";
+        if (Tokenizer_write_text_then_stack(self, difftext)) {
             Py_DECREF(heading->title);
             free(heading);
             return -1;
         }
-
-        if (Tokenizer_write_text_then_stack(self, text)) {
-            Py_DECREF(text);
-            Py_DECREF(heading->title);
-            free(heading);
-            return -1;
-        }
-        Py_DECREF(text);
     }
 
     if (Tokenizer_write_all(self, heading->title)) {
@@ -949,7 +1003,6 @@ Tokenizer_handle_heading_end(Tokenizer* self)
     Py_ssize_t reset = self->head;
     self->head += 1;
     Py_ssize_t best = 1;
-    PyObject* text;
    int i;
 
     while (*Tokenizer_READ(self, 0) == *"=") {
@@ -965,39 +1018,24 @@ Tokenizer_handle_heading_end(Tokenizer* self)
         RESET_ROUTE();
         if (level < best) {
             int diff = best - level;
-            char diffblocks[diff];
-            for (i = 0; i < diff; i++) diffblocks[i] = *"=";
-            text = PyUnicode_FromStringAndSize(diffblocks, diff);
-            if (!text) return NULL;
-
-            if (Tokenizer_write_text_then_stack(self, text)) {
-                Py_DECREF(text);
+            char difftext[diff + 1];
+            for (i = 0; i < diff; i++) difftext[i] = *"=";
+            difftext[diff] = *"";
+            if (Tokenizer_write_text_then_stack(self, difftext))
                 return NULL;
-            }
-            Py_DECREF(text);
         }
 
         self->head = reset + best - 1;
     }
 
     else {
-        char blocks[best];
-        for (i = 0; i < best; i++) blocks[i] = *"=";
-        text = PyUnicode_FromStringAndSize(blocks, best);
-        if (!text) {
-            Py_DECREF(after->title);
-            free(after);
-            return NULL;
-        }
-
+        char text[best + 1];
+        for (i = 0; i < best; i++) text[i] = *"=";
+        text[best] = *"";
         if (Tokenizer_write_text_then_stack(self, text)) {
-            Py_DECREF(text);
             Py_DECREF(after->title);
             free(after);
             return NULL;
         }
-        Py_DECREF(text);
 
         if (Tokenizer_write_all(self, after->title)) {
             Py_DECREF(after->title);
             free(after);
@@ -1037,7 +1075,8 @@ static int
 Tokenizer_parse_entity(Tokenizer* self)
 {
     Py_ssize_t reset = self->head;
-    Tokenizer_push(self, 0);
+    if (Tokenizer_push(self, 0))
+        return -1;
 
     if (Tokenizer_really_parse_entity(self))
         return -1;
@@ -1045,7 +1084,7 @@ Tokenizer_parse_entity(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        if (Tokenizer_write_text(self, Tokenizer_read(self, 0)))
+        if (Tokenizer_write_text(self, *PyUnicode_AS_UNICODE(Tokenizer_read(self, 0))))
             return -1;
         return 0;
     }
@@ -1077,13 +1116,16 @@ Tokenizer_parse_comment(Tokenizer* self)
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
-        PyObject* text = PyUnicode_FromString("<!--");
-        if (!text) return -1;
-        if (Tokenizer_write_text(self, text)) {
-            Py_XDECREF(text);
-            return -1;
+        const char* text = "<!--";
+        int i = 0;
+        while (1) {
+            if (!text[i]) return 0;
+            if (Tokenizer_write_text(self, (Py_UNICODE) text[i])) {
+                Py_XDECREF(text);
+                return -1;
+            }
+            i++;
         }
-        return 0;
     }
 
     token = PyObject_CallObject(CommentStart, NULL);
@@ -1129,7 +1171,8 @@ Tokenizer_parse(Tokenizer* self, int context)
     int fail_contexts = LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_COMMENT;
     int is_marker, i;
 
-    Tokenizer_push(self, context);
+    if (Tokenizer_push(self, context))
+        return NULL;
 
     while (1) {
         this = Tokenizer_read(self, 0);
@@ -1144,7 +1187,7 @@ Tokenizer_parse(Tokenizer* self, int context)
         }
 
         if (!is_marker) {
-            Tokenizer_write_text(self, this);
+            Tokenizer_write_text(self, this_data);
            self->head++;
            continue;
        }
@@ -1170,7 +1213,7 @@ Tokenizer_parse(Tokenizer* self, int context)
                    return Tokenizer_pop(self);
                }
            }
-            Tokenizer_write_text(self, this);
+            Tokenizer_write_text(self, this_data);
        }
        else if (this_data == next && next == *"{") {
            if (Tokenizer_parse_template_or_argument(self))
@@ -1195,7 +1238,7 @@ Tokenizer_parse(Tokenizer* self, int context)
            if (*Tokenizer_READ(self, 2) == *"}") {
                return Tokenizer_handle_argument_end(self);
            }
-            Tokenizer_write_text(self, this);
+            Tokenizer_write_text(self, this_data);
        }
        else if (this_data == next && next == *"[") {
            if (!(this_context & LC_WIKILINK_TITLE)) {
@@ -1203,7 +1246,7 @@ Tokenizer_parse(Tokenizer* self, int context)
                return NULL;
            }
            else {
-                Tokenizer_write_text(self, this);
+                Tokenizer_write_text(self, this_data);
            }
        }
        else if (this_data == *"|" && this_context & LC_WIKILINK_TITLE) {
@@ -1220,7 +1263,7 @@ Tokenizer_parse(Tokenizer* self, int context)
                return NULL;
            }
            else {
-                Tokenizer_write_text(self, this);
+                Tokenizer_write_text(self, this_data);
            }
        }
        else if (this_data == *"=" && this_context & LC_HEADING) {
@@ -1240,11 +1283,11 @@ Tokenizer_parse(Tokenizer* self, int context)
                return NULL;
            }
            else {
-                Tokenizer_write_text(self, this);
+                Tokenizer_write_text(self, this_data);
            }
        }
        else {
-            Tokenizer_write_text(self, this);
+            Tokenizer_write_text(self, this_data);
        }
 
        self->head++;
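
Because Tokenizer_write_text now accepts a single Py_UNICODE, every literal that used to be wrapped in a PyUnicode object ("{", "{{", "[[", "<!--", and the runs of "=" in the heading code) is written one character at a time. Against the standalone sketch from the introduction, that loop looks like this (Textbuffer_write_cstr is a made-up name; the diff spells the loop out inline each time):

/* Hypothetical helper mirroring the loop the diff repeats inline:
 * push each character of a NUL-terminated C string into the buffer. */
static int
Textbuffer_write_cstr(struct Textbuffer** head, const char* text)
{
    int i;
    for (i = 0; text[i]; i++) {
        if (Textbuffer_write(head, (wchar_t) text[i]))
            return -1;           /* allocation failure while growing */
    }
    return 0;
}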


mwparserfromhell/parser/tokenizer.h: +36 -25

@@ -32,7 +32,9 @@ SOFTWARE.
 static const char* MARKERS[] = {
     "{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-",
     "!", "\n", ""};
-static const int NUM_MARKERS = 18;
+
+#define NUM_MARKERS 18
+#define TEXTBUFFER_BLOCKSIZE 1024
 
 static int route_state = 0;
 #define BAD_ROUTE (route_state)
@@ -83,41 +85,47 @@ static PyObject* TagCloseClose;
 
 /* Local contexts: */
 
-static const int LC_TEMPLATE = 0x0007;
-static const int LC_TEMPLATE_NAME = 0x0001;
-static const int LC_TEMPLATE_PARAM_KEY = 0x0002;
-static const int LC_TEMPLATE_PARAM_VALUE = 0x0004;
+#define LC_TEMPLATE 0x0007
+#define LC_TEMPLATE_NAME 0x0001
+#define LC_TEMPLATE_PARAM_KEY 0x0002
+#define LC_TEMPLATE_PARAM_VALUE 0x0004
 
-static const int LC_ARGUMENT = 0x0018;
-static const int LC_ARGUMENT_NAME = 0x0008;
-static const int LC_ARGUMENT_DEFAULT = 0x0010;
+#define LC_ARGUMENT 0x0018
+#define LC_ARGUMENT_NAME 0x0008
+#define LC_ARGUMENT_DEFAULT 0x0010
 
-static const int LC_WIKILINK = 0x0060;
-static const int LC_WIKILINK_TITLE = 0x0020;
-static const int LC_WIKILINK_TEXT = 0x0040;
+#define LC_WIKILINK 0x0060
+#define LC_WIKILINK_TITLE 0x0020
+#define LC_WIKILINK_TEXT 0x0040
 
-static const int LC_HEADING = 0x1f80;
-static const int LC_HEADING_LEVEL_1 = 0x0080;
-static const int LC_HEADING_LEVEL_2 = 0x0100;
-static const int LC_HEADING_LEVEL_3 = 0x0200;
-static const int LC_HEADING_LEVEL_4 = 0x0400;
-static const int LC_HEADING_LEVEL_5 = 0x0800;
-static const int LC_HEADING_LEVEL_6 = 0x1000;
+#define LC_HEADING 0x1f80
+#define LC_HEADING_LEVEL_1 0x0080
+#define LC_HEADING_LEVEL_2 0x0100
+#define LC_HEADING_LEVEL_3 0x0200
+#define LC_HEADING_LEVEL_4 0x0400
+#define LC_HEADING_LEVEL_5 0x0800
+#define LC_HEADING_LEVEL_6 0x1000
 
-static const int LC_COMMENT = 0x2000;
+#define LC_COMMENT 0x2000
 
 
 /* Global contexts: */
 
-static const int GL_HEADING = 0x1;
+#define GL_HEADING 0x1
 
 
 /* Miscellaneous structs: */
 
+struct Textbuffer {
+    Py_ssize_t size;
+    Py_UNICODE* data;
+    struct Textbuffer* next;
+};
+
 struct Stack {
     PyObject* stack;
     int context;
-    PyObject* textbuffer;
+    struct Textbuffer* textbuffer;
     struct Stack* next;
 };

@@ -144,12 +152,15 @@ typedef struct {
 #define Tokenizer_READ(self, delta) PyUnicode_AS_UNICODE(Tokenizer_read(self, delta))
 
 
-/* Tokenizer function prototypes: */
+/* Function prototypes: */
 
 static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
+static struct Textbuffer* Textbuffer_new(void);
 static void Tokenizer_dealloc(Tokenizer*);
+static void Textbuffer_dealloc(struct Textbuffer*);
 static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
-static void Tokenizer_push(Tokenizer*, int);
+static int Tokenizer_push(Tokenizer*, int);
+static PyObject* Textbuffer_render(struct Textbuffer*);
 static int Tokenizer_push_textbuffer(Tokenizer*);
 static void Tokenizer_delete_top_of_stack(Tokenizer*);
 static PyObject* Tokenizer_pop(Tokenizer*);
@@ -157,9 +168,9 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
 static void* Tokenizer_fail_route(Tokenizer*);
 static int Tokenizer_write(Tokenizer*, PyObject*);
 static int Tokenizer_write_first(Tokenizer*, PyObject*);
-static int Tokenizer_write_text(Tokenizer*, PyObject*);
+static int Tokenizer_write_text(Tokenizer*, Py_UNICODE);
 static int Tokenizer_write_all(Tokenizer*, PyObject*);
-static int Tokenizer_write_text_then_stack(Tokenizer*, PyObject*);
+static int Tokenizer_write_text_then_stack(Tokenizer*, const char*);
 static PyObject* Tokenizer_read(Tokenizer*, Py_ssize_t);
 static PyObject* Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);
 static int Tokenizer_parse_template_or_argument(Tokenizer*);
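
A quick sanity check on the context bit masks defined above: each aggregate flag is exactly the OR of its members, e.g. 0x0001 | 0x0002 | 0x0004 = 0x0007 for LC_TEMPLATE, and 0x0080 | 0x0100 | 0x0200 | 0x0400 | 0x0800 | 0x1000 = 0x1f80 for LC_HEADING. A throwaway test program, with the values copied from the diff:

#include <assert.h>

#define LC_TEMPLATE 0x0007
#define LC_TEMPLATE_NAME 0x0001
#define LC_TEMPLATE_PARAM_KEY 0x0002
#define LC_TEMPLATE_PARAM_VALUE 0x0004
#define LC_HEADING 0x1f80
#define LC_HEADING_LEVEL_1 0x0080
#define LC_HEADING_LEVEL_2 0x0100
#define LC_HEADING_LEVEL_3 0x0200
#define LC_HEADING_LEVEL_4 0x0400
#define LC_HEADING_LEVEL_5 0x0800
#define LC_HEADING_LEVEL_6 0x1000

int main(void)
{
    /* aggregates are exactly the union of their members */
    assert(LC_TEMPLATE == (LC_TEMPLATE_NAME | LC_TEMPLATE_PARAM_KEY |
                           LC_TEMPLATE_PARAM_VALUE));
    assert(LC_HEADING == (LC_HEADING_LEVEL_1 | LC_HEADING_LEVEL_2 |
                          LC_HEADING_LEVEL_3 | LC_HEADING_LEVEL_4 |
                          LC_HEADING_LEVEL_5 | LC_HEADING_LEVEL_6));
    return 0;
}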

