Finish improved Unicode support for PEP 393.

8 years ago · 1357da119d
--- a/mwparserfromhell/parser/ctokenizer/common.h
+++ b/mwparserfromhell/parser/ctokenizer/common.h
@@ -40,8 +40,9 @@ SOFTWARE.
 #define uint64_t unsigned PY_LONG_LONG
 #endif

 /*
    Redirect the C allocator names to the Python object allocator functions,
    so plain malloc()/realloc()/free() calls in code that sees these macros
    expand to PyObject_Malloc()/PyObject_Realloc()/PyObject_Free().

    NOTE(review): malloc and free are each defined twice below with the same
    replacement; presumably the first pair is the pre-change text of this
    hunk and the aligned trio its replacement -- confirm against the patch.
 */
 #define malloc PyObject_Malloc  // XXX: yuck
 #define free   PyObject_Free
 #define malloc  PyObject_Malloc  // XXX: yuck
 #define realloc PyObject_Realloc
 #define free    PyObject_Free

 /* Unicode support macros */

--- a/mwparserfromhell/parser/ctokenizer/textbuffer.c
+++ b/mwparserfromhell/parser/ctokenizer/textbuffer.c
@@ -22,28 +22,95 @@ SOFTWARE.

 #include "textbuffer.h"

 /* Element count of one block in the code that chains blocks through
    prev/next.  NOTE(review): only referenced by the lines that use
    buffer->size / self->size; presumably pre-change text -- confirm. */
 #define TEXTBUFFER_BLOCKSIZE 1024
 /* Capacity given to a textbuffer by internal_alloc(). */
 #define INITIAL_CAPACITY 32
 /* Factor the capacity is multiplied by when a write finds the buffer full. */
 #define RESIZE_FACTOR 2
 /* Spare slots requested beyond the combined length when a concat must grow. */
 #define CONCAT_EXTRA 32

 /*
    Give a textbuffer fresh, empty storage of INITIAL_CAPACITY elements.

    With PEP 393 the storage is a new Python string object created for
    characters up to 'maxchar', and the buffer caches that object's kind and
    data pointer. Otherwise the storage is a plain array of Unicode elements
    and 'maxchar' is ignored.

    Returns 0 on success and -1 if the storage could not be allocated. The
    length and capacity fields are set in either case.
 */
 static int internal_alloc(Textbuffer* self, Unicode maxchar)
 {
    self->length = 0;
    self->capacity = INITIAL_CAPACITY;

 #ifdef PEP_393
    self->object = PyUnicode_New(self->capacity, maxchar);
    if (self->object == NULL)
        return -1;
    self->data = PyUnicode_DATA(self->object);
    self->kind = PyUnicode_KIND(self->object);
    return 0;
 #else
    (void) maxchar;  // Unused
    self->data = malloc(sizeof(Unicode) * self->capacity);
    return self->data ? 0 : -1;
 #endif
 }

 /*
    Release the storage held by a textbuffer (but not the struct itself).

    With PEP 393 this drops the buffer's reference to its string object;
    otherwise it frees the raw character array.
 */
 static void internal_dealloc(Textbuffer* self)
 {
 #ifndef PEP_393
    free(self->data);
 #else
    Py_DECREF(self->object);
 #endif
 }

 /*
    Internal resize function.

    Changes the capacity of the textbuffer to 'new_cap' elements while keeping
    the first 'length' elements. The callers in this file only ever grow the
    buffer; nothing here checks that 'new_cap' is at least the current length.

    Returns 0 on success. Returns -1 if the new storage cannot be allocated,
    in which case the textbuffer is left exactly as it was (same storage, same
    capacity) and remains safe to use and to deallocate.
 */
 static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
 {
 #ifdef PEP_393
    PyObject *newobj;
    void *newdata;

    /* Reusing the old object's max char value keeps the same kind, so the
       raw byte copy below stays valid. */
    newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object));
    if (!newobj)
        return -1;
    newdata = PyUnicode_DATA(newobj);
    memcpy(newdata, self->data, self->length * self->kind);
    Py_DECREF(self->object);
    self->object = newobj;
    self->data = newdata;
 #else
    void *newdata;

    /* Go through a temporary: on failure realloc() returns NULL and leaves
       the old block allocated, so assigning straight to self->data would
       both leak that block and leave the buffer pointing at NULL. */
    newdata = realloc(self->data, sizeof(Unicode) * new_cap);
    if (!newdata)
        return -1;
    self->data = newdata;
 #endif

    self->capacity = new_cap;
    return 0;
 }

 /*
    Create a new textbuffer object.

    Returns the new textbuffer, or NULL after calling PyErr_NoMemory() when an
    allocation fails.

    NOTE(review): two signatures and two allocation paths appear back to back
    in this span. The lines working on 'buffer' (size, TEXTBUFFER_BLOCKSIZE,
    prev/next) look like the pre-change text of this patch, and the lines
    working on 'self' (maxchar, internal_alloc) like its replacement --
    confirm against the patch before relying on either reading.
 */
 Textbuffer* Textbuffer_new(void)
 Textbuffer* Textbuffer_new(TokenizerInput* text)
 {
    Textbuffer* buffer = malloc(sizeof(Textbuffer));
    Textbuffer* self = malloc(sizeof(Textbuffer));
    Unicode maxchar = 0;

    if (!buffer) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer->size = 0;
    buffer->data = malloc(sizeof(Py_UNICODE) * TEXTBUFFER_BLOCKSIZE);
    if (!buffer->data) {
        /* The struct was allocated but its block was not: free the struct. */
        free(buffer);
        PyErr_NoMemory();
        return NULL;
    }
    buffer->prev = buffer->next = NULL;
    return buffer;
 #ifdef PEP_393
    /* Take the max char value of the input text's string object; it is
       handed to internal_alloc() below. Without PEP 393 maxchar stays 0. */
    maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);
 #endif

    if (!self)
        goto fail_nomem;
    if (internal_alloc(self, maxchar) < 0)
        goto fail_dealloc;
    return self;

    /* internal_alloc() failed, so only the struct itself needs freeing. */
    fail_dealloc:
    free(self);
    /* Both failure paths end here: report the error and return NULL. */
    fail_nomem:
    PyErr_NoMemory();
    return NULL;
 }

 /*
@@ -51,50 +118,115 @@ Textbuffer* Textbuffer_new(void)
 */
 /* Release a textbuffer: first its storage via internal_dealloc(), then the
    struct itself. */
 void Textbuffer_dealloc(Textbuffer* self)
 {
    /* NOTE(review): 'next' is only used by the loop that follows this
       function's closing brace; it is unused inside the function body. */
    Textbuffer* next;
    internal_dealloc(self);
    free(self);
 }

    /* NOTE(review): this loop sits outside any function body. It walks a
       chain through self->next, freeing each node's data and the node itself;
       presumably it is the pre-change body of Textbuffer_dealloc shown by the
       patch -- confirm. */
    while (self) {
        free(self->data);
        next = self->next;
        free(self);
        self = next;
    }
 /*
    Return a textbuffer to the empty state it has right after creation.

    The current storage is released and replaced by a fresh allocation made by
    internal_alloc(); with PEP 393 the new storage is created for the same max
    char value as the old one.

    Returns 0 on success and -1 if the new storage could not be allocated. In
    the failure case the old storage has already been released.
 */
 int Textbuffer_reset(Textbuffer* self)
 {
 #ifdef PEP_393
    /* Must be read before the old string object is released below. */
    Unicode widest = PyUnicode_MAX_CHAR_VALUE(self->object);
 #else
    Unicode widest = 0;
 #endif

    internal_dealloc(self);
    return internal_alloc(self, widest) ? -1 : 0;
 }

 /*
    Write a Unicode codepoint to the given textbuffer.

    Returns 0 on success and -1 if the buffer needed to grow and could not.

    NOTE(review): two signatures and two growth strategies are interleaved in
    this span, and its braces do not balance as written. The lines using
    'this', self->size, TEXTBUFFER_BLOCKSIZE and prev/next look like the
    pre-change text of this patch, and the lines using self->length,
    self->capacity and internal_resize() like its replacement -- confirm
    against the patch before relying on either reading.
 */
 int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
 int Textbuffer_write(Textbuffer* self, Unicode code)
 {
    Textbuffer* self = *this;

    if (self->size == TEXTBUFFER_BLOCKSIZE) {
        Textbuffer* new = Textbuffer_new();
        if (!new)
    /* Buffer is full: ask for RESIZE_FACTOR times the current capacity; a
       failed resize aborts the write. */
    if (self->length >= self->capacity) {
        if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
            return -1;
        new->next = self;
        self->prev = new;
        *this = self = new;
    }
    self->data[self->size++] = code;

    /* Store the codepoint at index 'length', then advance the length. */
 #ifdef PEP_393
    PyUnicode_WRITE(self->kind, self->data, self->length++, code);
 #else
    self->data[self->length++] = code;
 #endif

    return 0;
 }

 /*
    Fetch the Unicode codepoint stored at 'index' in the given textbuffer.

    No bounds checking is done: the caller is responsible for passing an index
    that lies inside the buffer.
 */
 Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
 {
    Unicode ch;

 #ifndef PEP_393
    ch = self->data[index];
 #else
    ch = PyUnicode_READ(self->kind, self->data, index);
 #endif

    return ch;
 }

 /*
    Return the contents of the textbuffer as a Python Unicode object.

    NOTE(review): two implementations are interleaved in this span and the
    'while' block below is never closed inside it. The lines that walk
    self->next and join pieces with PyUnicode_Concat() look like the
    pre-change text of this patch, and the #ifdef PEP_393 section like its
    replacement -- confirm against the patch before relying on either reading.
 */
 PyObject* Textbuffer_render(Textbuffer* self)
 {
    PyObject *result = PyUnicode_FromUnicode(self->data, self->size);
    PyObject *left, *concat;

    while (self->next) {
        self = self->next;
        left = PyUnicode_FromUnicode(self->data, self->size);
        /* The block reached through 'next' goes on the left of what has been
           built so far. */
        concat = PyUnicode_Concat(left, result);
        Py_DECREF(left);
        Py_DECREF(result);
        result = concat;
    /* Build the string from the first 'length' elements of the buffer; the
       PEP 393 form also passes the buffer's kind. */
 #ifdef PEP_393
    return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
 #else
    return PyUnicode_FromUnicode(self->data, self->length);
 #endif
 }

 /*
    Append the contents of 'other' to the end of 'self'.

    If the combined length does not fit in the current capacity, 'self' is
    first resized to the combined length plus CONCAT_EXTRA spare slots. With
    PEP 393 both buffers are asserted to have the same kind, since the
    elements are copied as raw bytes.

    Returns 0 on success and -1 if the resize failed. 'other' is not modified.
 */
 int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
 {
    Py_ssize_t total = self->length + other->length;

    if (total > self->capacity &&
            internal_resize(self, total + CONCAT_EXTRA) < 0)
        return -1;

 #ifdef PEP_393
    assert(self->kind == other->kind);
    {
        /* Byte address of the first free slot in self. */
        Py_UCS1 *tail = ((Py_UCS1*) self->data) + self->kind * self->length;

        memcpy(tail, other->data, other->length * other->kind);
    }
 #else
    memcpy(self->data + self->length, other->data,
           other->length * sizeof(Unicode));
 #endif

    self->length = total;
    return 0;
 }

 /*
    Reverse the contents of the given textbuffer in place.

    Element i is exchanged with element (length - 1 - i) for every i in the
    first half of the buffer. With an odd length the middle element stays
    where it is; an empty or one-element buffer is left untouched.
 */
 void Textbuffer_reverse(Textbuffer* self)
 {
    Py_ssize_t i, mid = self->length / 2;
    Py_ssize_t end = self->length - 1;  /* index of the last element */
    Unicode tmp;

    /* Pair each element with its mirror image counted from the end. Pairing
       i with (mid + i) would only exchange the two halves, not reverse. */
    for (i = 0; i < mid; i++) {
 #ifdef PEP_393
        tmp = PyUnicode_READ(self->kind, self->data, i);
        PyUnicode_WRITE(self->kind, self->data, i,
                        PyUnicode_READ(self->kind, self->data, end - i));
        PyUnicode_WRITE(self->kind, self->data, end - i, tmp);
 #else
        tmp = self->data[i];
        self->data[i] = self->data[end - i];
        self->data[end - i] = tmp;
 #endif
    }
 }