From 1357da119d37eb893b5de44bfa5ff79ea464cb69 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Tue, 28 Jul 2015 02:11:18 -0400
Subject: [PATCH] Finish improved Unicode support for PEP 393.

---
 mwparserfromhell/parser/ctokenizer/common.h     |   5 +-
 mwparserfromhell/parser/ctokenizer/textbuffer.c | 220 +++++++++++++++++++-----
 2 files changed, 179 insertions(+), 46 deletions(-)

diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h
index 8c6e71c..abade02 100644
--- a/mwparserfromhell/parser/ctokenizer/common.h
+++ b/mwparserfromhell/parser/ctokenizer/common.h
@@ -40,8 +40,9 @@ SOFTWARE.
 #define uint64_t unsigned PY_LONG_LONG
 #endif
 
-#define malloc PyObject_Malloc  // XXX: yuck
-#define free   PyObject_Free
+#define malloc  PyObject_Malloc  // XXX: yuck
+#define realloc PyObject_Realloc
+#define free    PyObject_Free
 
 /* Unicode support macros */
 
diff --git a/mwparserfromhell/parser/ctokenizer/textbuffer.c b/mwparserfromhell/parser/ctokenizer/textbuffer.c
index 63d45d6..e028a58 100644
--- a/mwparserfromhell/parser/ctokenizer/textbuffer.c
+++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c
@@ -22,28 +22,95 @@ SOFTWARE.
 
 #include "textbuffer.h"
 
-#define TEXTBUFFER_BLOCKSIZE 1024
+#define INITIAL_CAPACITY 32  // capacity given to a freshly allocated buffer
+#define RESIZE_FACTOR 2  // capacity multiplier applied when a write finds the buffer full
+#define CONCAT_EXTRA 32  // slack added beyond the combined length when a concat must grow
+
+/*
+    Internal allocation function for textbuffers: start empty at INITIAL_CAPACITY and allocate storage.
+*/
+static int internal_alloc(Textbuffer* self, Unicode maxchar)
+{
+    self->capacity = INITIAL_CAPACITY;
+    self->length = 0;
+
+#ifdef PEP_393
+    self->object = PyUnicode_New(self->capacity, maxchar);  // storage lives inside a str object
+    if (!self->object)
+        return -1;  // allocation failed
+    self->kind = PyUnicode_KIND(self->object);  // cached for later PyUnicode_READ/WRITE calls
+    self->data = PyUnicode_DATA(self->object);
+#else
+    (void) maxchar;  // Unused
+    self->data = malloc(sizeof(Unicode) * self->capacity);
+    if (!self->data)
+        return -1;  // allocation failed
+#endif
+
+    return 0;  // success
+}
+
+/*
+    Internal deallocation function for textbuffers.
+*/
+static void internal_dealloc(Textbuffer* self)
+{
+#ifdef PEP_393
+    Py_DECREF(self->object);
+#else
+    free(self->data);
+#endif
+}
+
+/*
+    Internal resize function: move the contents into storage sized for new_cap code points.
+*/
+static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
+{
+    void *newdata;  // committed to self->data only after the allocation succeeded
+#ifdef PEP_393
+    PyObject *newobj;
+
+    newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object));
+    if (!newobj)
+        return -1;  // self is left untouched
+    newdata = PyUnicode_DATA(newobj);
+    memcpy(newdata, self->data, self->length * self->kind);  // kind is the per-code-point byte width
+    Py_DECREF(self->object);
+    self->object = newobj;
+#else
+    if (!(newdata = realloc(self->data, sizeof(Unicode) * new_cap)))
+        return -1;  // self->data still owns the old block, so nothing leaks
+#endif
+
+    self->data = newdata;
+    self->capacity = new_cap;
+    return 0;
+}
 
 /*
     Create a new textbuffer object.
 */
-Textbuffer* Textbuffer_new(void)
+Textbuffer* Textbuffer_new(TokenizerInput* text)
 {
-    Textbuffer* buffer = malloc(sizeof(Textbuffer));
+    Textbuffer* self = malloc(sizeof(Textbuffer));
+    Unicode maxchar = 0;
 
-    if (!buffer) {
-        PyErr_NoMemory();
-        return NULL;
-    }
-    buffer->size = 0;
-    buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE);
-    if (!buffer->data) {
-        free(buffer);
-        PyErr_NoMemory();
-        return NULL;
-    }
-    buffer->prev = buffer->next = NULL;
-    return buffer;
+#ifdef PEP_393
+    maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);
+#endif
+
+    if (!self)
+        goto fail_nomem;
+    if (internal_alloc(self, maxchar) < 0)
+        goto fail_dealloc;
+    return self;
+
+    fail_dealloc:
+    free(self);
+    fail_nomem:
+    PyErr_NoMemory();
+    return NULL;
 }
 
 /*
@@ -51,50 +118,115 @@ Textbuffer* Textbuffer_new(void)
 */
 void Textbuffer_dealloc(Textbuffer* self)
 {
-    Textbuffer* next;
+    internal_dealloc(self);
+    free(self);
+}
 
-    while (self) {
-        free(self->data);
-        next = self->next;
-        free(self);
-        self = next;
-    }
+/*
+    Reset a textbuffer to its initial, empty state.
+*/
+int Textbuffer_reset(Textbuffer* self)
+{
+    Unicode maxchar = 0;
+
+#ifdef PEP_393
+    maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);  // read before the old object is released
+#endif
+
+    internal_dealloc(self);  // NOTE(review): storage is released before the new allocation succeeds
+    if (internal_alloc(self, maxchar))
+        return -1;  // NOTE(review): self holds no live storage here -- confirm callers cope
+    return 0;
 }
 
 /*
     Write a Unicode codepoint to the given textbuffer.
 */
-int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
+int Textbuffer_write(Textbuffer* self, Unicode code)
 {
-    Textbuffer* self = *this;
-
-    if (self->size == TEXTBUFFER_BLOCKSIZE) {
-        Textbuffer* new = Textbuffer_new();
-        if (!new)
+    if (self->length >= self->capacity) {  // full: grow the capacity by RESIZE_FACTOR
+        if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
             return -1;
-        new->next = self;
-        self->prev = new;
-        *this = self = new;
     }
-    self->data[self->size++] = code;
+
+#ifdef PEP_393
+    PyUnicode_WRITE(self->kind, self->data, self->length++, code);
+#else
+    self->data[self->length++] = code;
+#endif
+
     return 0;
 }
 
 /*
+    Read a Unicode codepoint from the given index of the given textbuffer.
+
+    This function does not check for bounds.
+*/
+Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
+{
+#ifdef PEP_393
+    return PyUnicode_READ(self->kind, self->data, index);
+#else
+    return self->data[index];
+#endif
+}
+
+/*
     Return the contents of the textbuffer as a Python Unicode object.
 */
 PyObject* Textbuffer_render(Textbuffer* self)
 {
-    PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
-    PyObject *left, *concat;
-
-    while (self->next) {
-        self = self->next;
-        left = PyUnicode_FromUnicode(self->data, self->size);
-        concat = PyUnicode_Concat(left, result);
-        Py_DECREF(left);
-        Py_DECREF(result);
-        result = concat;
+#ifdef PEP_393
+    return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
+#else
+    return PyUnicode_FromUnicode(self->data, self->length);
+#endif
+}
+
+/*
+    Concatenate the 'other' textbuffer onto the end of the given textbuffer.
+*/
+int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
+{
+    Py_ssize_t newlen = self->length + other->length;
+
+    if (newlen > self->capacity) {  // grow once, with CONCAT_EXTRA slack beyond the new length
+        if (internal_resize(self, newlen + CONCAT_EXTRA) < 0)
+            return -1;
+    }
+
+#ifdef PEP_393
+    assert(self->kind == other->kind);  // the raw byte copy below needs equal code unit widths
+    memcpy(((Py_UCS1*) self->data) + self->kind * self->length, other->data,
+           other->length * other->kind);
+#else
+    memcpy(self->data + self->length, other->data,
+           other->length * sizeof(Unicode));
+#endif
+
+    self->length = newlen;
+    return 0;
+}
+
+/*
+    Reverse the contents of the given textbuffer in place.
+*/
+void Textbuffer_reverse(Textbuffer* self)
+{
+    Py_ssize_t i, end = self->length - 1;  // end: index of the last code point
+    Unicode tmp;
+
+    for (i = 0; i < self->length / 2; i++) {  // swap i with its mirror, end - i
+#ifdef PEP_393
+        tmp = PyUnicode_READ(self->kind, self->data, i);
+        PyUnicode_WRITE(self->kind, self->data, i,
+                        PyUnicode_READ(self->kind, self->data, end - i));
+        PyUnicode_WRITE(self->kind, self->data, end - i, tmp);
+#else
+        tmp = self->data[i];
+        self->data[i] = self->data[end - i];
+        self->data[end - i] = tmp;
+#endif
     }
-    return result;
 }