Browse Source

Finish improved Unicode support for PEP 393.

tags/v0.4.1
Ben Kurtovic 8 years ago
parent
commit
1357da119d
2 changed files with 179 additions and 46 deletions
  1. +3
    -2
      mwparserfromhell/parser/ctokenizer/common.h
  2. +176
    -44
      mwparserfromhell/parser/ctokenizer/textbuffer.c

+ 3
- 2
mwparserfromhell/parser/ctokenizer/common.h View File

@@ -40,8 +40,9 @@ SOFTWARE.
#define uint64_t unsigned PY_LONG_LONG
#endif

/* Redirect the C allocator names to the CPython object allocator.
   Each name is defined exactly once. */
#define malloc PyObject_Malloc // XXX: yuck
#define realloc PyObject_Realloc
#define free PyObject_Free

/* Unicode support macros */



+ 176
- 44
mwparserfromhell/parser/ctokenizer/textbuffer.c View File

@@ -22,28 +22,95 @@ SOFTWARE.

#include "textbuffer.h"

/* Element count of one block in the block-chained buffer code paths.
   NOTE(review): only the older linked-block code references this; confirm
   whether it is still needed. */
#define TEXTBUFFER_BLOCKSIZE 1024
/* Capacity assigned to a buffer by internal_alloc(). */
#define INITIAL_CAPACITY 32
/* Multiplier applied to the capacity when a write finds the buffer full. */
#define RESIZE_FACTOR 2
/* Slack added on top of the required length when a concat has to grow. */
#define CONCAT_EXTRA 32

/*
    Give a textbuffer fresh, empty storage of INITIAL_CAPACITY elements.

    The length and capacity fields are reset before the allocation is
    attempted. With PEP 393 the storage is a new Python string created for
    'maxchar', and the buffer's kind and data pointer are taken from it;
    otherwise the storage is a raw array of Unicode and 'maxchar' is ignored.

    Returns 0 on success and -1 if the allocation fails.
*/
static int internal_alloc(Textbuffer* self, Unicode maxchar)
{
    self->length = 0;
    self->capacity = INITIAL_CAPACITY;

#ifndef PEP_393
    (void) maxchar;  // Unused
    self->data = malloc(self->capacity * sizeof(Unicode));
    return self->data ? 0 : -1;
#else
    self->object = PyUnicode_New(self->capacity, maxchar);
    if (self->object == NULL)
        return -1;
    self->kind = PyUnicode_KIND(self->object);
    self->data = PyUnicode_DATA(self->object);
    return 0;
#endif
}

/*
    Release the character storage held by a textbuffer.

    The Textbuffer struct itself is not freed here.
*/
static void internal_dealloc(Textbuffer* self)
{
#ifndef PEP_393
    free(self->data);
#else
    Py_DECREF(self->object);
#endif
}

/*
    Internal resize function: change the buffer's capacity to 'new_cap'
    elements, keeping the first 'length' elements.

    With PEP 393, a new string object with the same maximum character value
    is created, the existing data is copied into it, and the old object is
    released. Otherwise the raw array is reallocated.

    'new_cap' must be at least the current length, since 'length' elements
    are copied into the new storage.

    Returns 0 on success. On failure returns -1 and leaves the buffer
    (storage, data pointer and capacity) exactly as it was.
*/
static int internal_resize(Textbuffer* self, Py_ssize_t new_cap)
{
#ifdef PEP_393
    PyObject *newobj;
    void *newdata;

    newobj = PyUnicode_New(new_cap, PyUnicode_MAX_CHAR_VALUE(self->object));
    if (!newobj)
        return -1;
    newdata = PyUnicode_DATA(newobj);
    memcpy(newdata, self->data, self->length * self->kind);
    Py_DECREF(self->object);
    self->object = newobj;
    self->data = newdata;
#else
    // Keep the old pointer until realloc succeeds; assigning the result
    // directly would leak the block and leave data NULL on failure.
    void *newdata = realloc(self->data, sizeof(Unicode) * new_cap);

    if (!newdata)
        return -1;
    self->data = newdata;
#endif

    self->capacity = new_cap;
    return 0;
}

/*
    Create a new textbuffer object.

    With PEP 393, the buffer's storage is created for the maximum character
    value of the input text's string object; otherwise 'text' is not used.

    Returns the new buffer, or NULL with a Python MemoryError set if either
    the struct or its storage cannot be allocated.
*/
Textbuffer* Textbuffer_new(TokenizerInput* text)
{
    Textbuffer* self = malloc(sizeof(Textbuffer));
    Unicode maxchar = 0;

#ifdef PEP_393
    maxchar = PyUnicode_MAX_CHAR_VALUE(text->object);
#else
    (void) text;  // Unused
#endif

    if (!self)
        goto fail_nomem;
    if (internal_alloc(self, maxchar) < 0)
        goto fail_dealloc;
    return self;

    // Only the struct exists at this point, so only the struct is freed.
    fail_dealloc:
    free(self);
    fail_nomem:
    PyErr_NoMemory();
    return NULL;
}

/*
    Deallocate the given textbuffer.
*/
void Textbuffer_dealloc(Textbuffer* self)
{
    // Release the character storage first, then the struct that owns it.
    internal_dealloc(self);
    free(self);
}

/*
    Reset a textbuffer to its initial, empty state.

    Returns 0 on success. If the replacement storage cannot be allocated,
    returns -1 and leaves the buffer untouched, so it can still be used or
    deallocated safely.
*/
int Textbuffer_reset(Textbuffer* self)
{
    // Work on a copy so every field internal_alloc() does not set is kept.
    Textbuffer fresh = *self;
    Unicode maxchar = 0;

#ifdef PEP_393
    maxchar = PyUnicode_MAX_CHAR_VALUE(self->object);
#endif

    // Allocate the new storage before releasing the old one. Releasing
    // first would leave 'self' with no valid storage when the allocation
    // fails, and a later dealloc would then operate on a NULL object.
    if (internal_alloc(&fresh, maxchar) < 0)
        return -1;
    internal_dealloc(self);
    *self = fresh;
    return 0;
}

/*
Write a Unicode codepoint to the given textbuffer.
*/
int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
int Textbuffer_write(Textbuffer* self, Unicode code)
{
Textbuffer* self = *this;

if (self->size == TEXTBUFFER_BLOCKSIZE) {
Textbuffer* new = Textbuffer_new();
if (!new)
if (self->length >= self->capacity) {
if (internal_resize(self, self->capacity * RESIZE_FACTOR) < 0)
return -1;
new->next = self;
self->prev = new;
*this = self = new;
}
self->data[self->size++] = code;

#ifdef PEP_393
PyUnicode_WRITE(self->kind, self->data, self->length++, code);
#else
self->data[self->length++] = code;
#endif

return 0;
}

/*
    Fetch the Unicode codepoint stored at 'index' in the given textbuffer.

    No bounds checking is performed; the caller must pass a valid index.
*/
Unicode Textbuffer_read(Textbuffer* self, Py_ssize_t index)
{
    Unicode code;

#ifndef PEP_393
    code = self->data[index];
#else
    code = PyUnicode_READ(self->kind, self->data, index);
#endif

    return code;
}

/*
    Return the contents of the textbuffer as a Python Unicode object.

    Only the first 'length' elements of the storage are passed on, not the
    whole capacity. The result is whatever the underlying PyUnicode_From*
    constructor returns.
*/
PyObject* Textbuffer_render(Textbuffer* self)
{
#ifdef PEP_393
    return PyUnicode_FromKindAndData(self->kind, self->data, self->length);
#else
    return PyUnicode_FromUnicode(self->data, self->length);
#endif
}

/*
    Append the contents of 'other' to the end of 'self'.

    If the combined length exceeds self's capacity, self is first grown to
    the combined length plus CONCAT_EXTRA. With PEP 393, both buffers are
    asserted to have the same kind. 'other' is not modified.

    Returns 0 on success, or -1 if growing the buffer fails.
*/
int Textbuffer_concat(Textbuffer* self, Textbuffer* other)
{
    Py_ssize_t total = self->length + other->length;

    if (total > self->capacity &&
            internal_resize(self, total + CONCAT_EXTRA) < 0)
        return -1;

#ifdef PEP_393
    assert(self->kind == other->kind);
    // The destination offset is in bytes: one element occupies 'kind' bytes.
    memcpy((Py_UCS1*) self->data + self->length * self->kind, other->data,
           other->kind * other->length);
#else
    memcpy(self->data + self->length, other->data,
           sizeof(Unicode) * other->length);
#endif

    self->length = total;
    return 0;
}

/*
    Reverse the contents of the given textbuffer in place.

    Elements are swapped pairwise from the two ends towards the middle, so
    "abcd" becomes "dcba". Pairing element i with element (length / 2 + i)
    would only exchange the two halves ("cdab"). Buffers of length 0 or 1
    are left unchanged.
*/
void Textbuffer_reverse(Textbuffer* self)
{
    Py_ssize_t lo = 0, hi = self->length - 1;
    Unicode tmp;

    while (lo < hi) {
#ifdef PEP_393
        tmp = PyUnicode_READ(self->kind, self->data, lo);
        PyUnicode_WRITE(self->kind, self->data, lo,
                        PyUnicode_READ(self->kind, self->data, hi));
        PyUnicode_WRITE(self->kind, self->data, hi, tmp);
#else
        tmp = self->data[lo];
        self->data[lo] = self->data[hi];
        self->data[hi] = tmp;
#endif
        lo++;
        hi--;
    }
}

Loading…
Cancel
Save