Browse Source

More PEP 393 work; update Textbuffer interface and usage.

tags/v0.4.1
Ben Kurtovic 8 years ago
parent
commit
5eac0ab16f
9 changed files with 112 additions and 163 deletions
  1. +1
    -3
      mwparserfromhell/definitions.py
  2. +10
    -3
      mwparserfromhell/parser/ctokenizer/common.h
  3. +15
    -25
      mwparserfromhell/parser/ctokenizer/tag_data.c
  4. +2
    -2
      mwparserfromhell/parser/ctokenizer/tag_data.h
  5. +6
    -2
      mwparserfromhell/parser/ctokenizer/textbuffer.h
  6. +62
    -81
      mwparserfromhell/parser/ctokenizer/tok_parse.c
  7. +12
    -41
      mwparserfromhell/parser/ctokenizer/tok_support.c
  8. +1
    -1
      mwparserfromhell/parser/ctokenizer/tok_support.h
  9. +3
    -5
      mwparserfromhell/parser/ctokenizer/tokenizer.c

+ 1
- 3
mwparserfromhell/definitions.py View File

@@ -81,10 +81,8 @@ def is_single_only(tag):
"""Return whether or not the given *tag* must exist without a close tag."""
return tag.lower() in SINGLE_ONLY

def is_scheme(scheme, slashes=True, reverse=False):
def is_scheme(scheme, slashes=True):
"""Return whether *scheme* is valid for external links."""
if reverse: # Convenience for C
scheme = scheme[::-1]
scheme = scheme.lower()
if slashes:
return scheme in URI_SCHEMES


+ 10
- 3
mwparserfromhell/parser/ctokenizer/common.h View File

@@ -57,6 +57,7 @@ SOFTWARE.
#define Unicode Py_UNICODE
#define PyUnicode_FROM_SINGLE(chr) \
PyUnicode_FromUnicode(&(chr), 1)
#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE
#endif

/* Error handling macros */
@@ -73,15 +74,21 @@ SOFTWARE.

extern char** entitydefs;

extern PyObject* EMPTY;
extern PyObject* NOARGS;
extern PyObject* definitions;

/* Structs */

typedef struct {
Py_ssize_t size;
Unicode* data;
Py_ssize_t capacity;
Py_ssize_t length;
#ifdef PEP_393
PyObject* object;
int kind;
void* data;
#else
Py_UNICODE* data;
#endif
} Textbuffer;

struct Stack {


+ 15
- 25
mwparserfromhell/parser/ctokenizer/tag_data.c View File

@@ -26,13 +26,13 @@ SOFTWARE.
/*
Initialize a new TagData object.
*/
TagData* TagData_new(void)
TagData* TagData_new(TokenizerInput* text)
{
#define ALLOC_BUFFER(name) \
name = Textbuffer_new(); \
if (!name) { \
TagData_dealloc(self); \
return NULL; \
#define ALLOC_BUFFER(name) \
name = Textbuffer_new(text); \
if (!name) { \
TagData_dealloc(self); \
return NULL; \
}

TagData *self = malloc(sizeof(TagData));
@@ -56,16 +56,13 @@ TagData* TagData_new(void)
*/
void TagData_dealloc(TagData* self)
{
#define DEALLOC_BUFFER(name) \
if (name) \
Textbuffer_dealloc(name);

DEALLOC_BUFFER(self->pad_first);
DEALLOC_BUFFER(self->pad_before_eq);
DEALLOC_BUFFER(self->pad_after_eq);
if (self->pad_first)
Textbuffer_dealloc(self->pad_first);
if (self->pad_before_eq)
Textbuffer_dealloc(self->pad_before_eq);
if (self->pad_after_eq)
Textbuffer_dealloc(self->pad_after_eq);
free(self);

#undef DEALLOC_BUFFER
}

/*
@@ -73,16 +70,9 @@ void TagData_dealloc(TagData* self)
*/
int TagData_reset_buffers(TagData* self)
{
#define RESET_BUFFER(name) \
Textbuffer_dealloc(name); \
name = Textbuffer_new(); \
if (!name) \
if (Textbuffer_reset(self->pad_first) ||
Textbuffer_reset(self->pad_before_eq) ||
Textbuffer_reset(self->pad_after_eq))
return -1;

RESET_BUFFER(self->pad_first)
RESET_BUFFER(self->pad_before_eq)
RESET_BUFFER(self->pad_after_eq)
return 0;

#undef RESET_BUFFER
}

+ 2
- 2
mwparserfromhell/parser/ctokenizer/tag_data.h View File

@@ -32,12 +32,12 @@ typedef struct {
Textbuffer* pad_first;
Textbuffer* pad_before_eq;
Textbuffer* pad_after_eq;
Py_UNICODE quoter;
Unicode quoter;
Py_ssize_t reset;
} TagData;

/* Functions */

TagData* TagData_new(void);
TagData* TagData_new(TokenizerInput*);
void TagData_dealloc(TagData*);
int TagData_reset_buffers(TagData*);

+ 6
- 2
mwparserfromhell/parser/ctokenizer/textbuffer.h View File

@@ -26,7 +26,11 @@ SOFTWARE.

/* Functions */

Textbuffer* Textbuffer_new(void);
Textbuffer* Textbuffer_new(TokenizerInput*);
void Textbuffer_dealloc(Textbuffer*);
int Textbuffer_write(Textbuffer**, Py_UNICODE);
int Textbuffer_reset(Textbuffer*);
int Textbuffer_write(Textbuffer*, Unicode);
Unicode Textbuffer_read(Textbuffer*, Py_ssize_t);
PyObject* Textbuffer_render(Textbuffer*);
int Textbuffer_concat(Textbuffer*, Textbuffer*);
void Textbuffer_reverse(Textbuffer*);

+ 62
- 81
mwparserfromhell/parser/ctokenizer/tok_parse.c View File

@@ -34,11 +34,11 @@ SOFTWARE.
#define MAX_ENTITY_SIZE 8

#define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL))
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL))
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL))
#define IS_SCHEME(scheme, slashes, reverse) \
(call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False))
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL))
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL))
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL))
#define IS_SCHEME(scheme, slashes) \
(call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False))

typedef struct {
PyObject* title;
@@ -80,14 +80,13 @@ static int heading_level_from_context(uint64_t n)
}

/*
Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as
Call the given function in definitions.py, using 'in1' and 'in2' as
parameters, and return its output as a bool.
*/
static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2,
PyObject* in3)
static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2)
{
PyObject* func = PyObject_GetAttrString(definitions, funcname);
PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL);
PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL);
int ans = (result == Py_True) ? 1 : 0;

Py_DECREF(func);
@@ -432,7 +431,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
self->head += 2;
}
else {
buffer = Textbuffer_new();
buffer = Textbuffer_new(&self->text);
if (!buffer)
return -1;
while ((this = Tokenizer_read(self, 0))) {
@@ -444,7 +443,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
break;
i++;
}
Textbuffer_write(&buffer, this);
Textbuffer_write(buffer, this);
if (Tokenizer_emit_char(self, this)) {
Textbuffer_dealloc(buffer);
return -1;
@@ -475,7 +474,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
Textbuffer_dealloc(buffer);
if (!scheme)
return -1;
if (!IS_SCHEME(scheme, slashes, 0)) {
if (!IS_SCHEME(scheme, slashes)) {
Py_DECREF(scheme);
Tokenizer_fail_route(self);
return 0;
@@ -491,7 +490,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
{
static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-";
Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer;
Textbuffer *scheme_buffer = Textbuffer_new(&self->text);
PyObject *scheme;
Py_UNICODE chunk;
Py_ssize_t i;
@@ -501,28 +500,22 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
return -1;
// We have to backtrack through the textbuffer looking for our scheme since
// it was just parsed as text:
temp_buffer = self->topstack->textbuffer;
while (temp_buffer) {
for (i = temp_buffer->size - 1; i >= 0; i--) {
chunk = temp_buffer->data[i];
if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
goto end_of_loop;
j = 0;
while (1) {
if (!valid[j]) {
Textbuffer_dealloc(scheme_buffer);
FAIL_ROUTE(0);
return 0;
}
if (chunk == valid[j])
break;
j++;
for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
chunk = Textbuffer_read(self->topstack->textbuffer, i);
if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
goto end_of_loop;
j = 0;
do {
if (!valid[j]) {
Textbuffer_dealloc(scheme_buffer);
FAIL_ROUTE(0);
return 0;
}
Textbuffer_write(&scheme_buffer, chunk);
}
temp_buffer = temp_buffer->next;
} while (chunk != valid[j++]);
Textbuffer_write(scheme_buffer, chunk);
}
end_of_loop:
Textbuffer_reverse(scheme_buffer);
scheme = Textbuffer_render(scheme_buffer);
if (!scheme) {
Textbuffer_dealloc(scheme_buffer);
@@ -530,7 +523,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
}
slashes = (Tokenizer_read(self, 0) == '/' &&
Tokenizer_read(self, 1) == '/');
if (!IS_SCHEME(scheme, slashes, 1)) {
if (!IS_SCHEME(scheme, slashes)) {
Py_DECREF(scheme);
Textbuffer_dealloc(scheme_buffer);
FAIL_ROUTE(0);
@@ -541,7 +534,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
Textbuffer_dealloc(scheme_buffer);
return -1;
}
if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1))
if (Tokenizer_emit_textbuffer(self, scheme_buffer))
return -1;
if (Tokenizer_emit_char(self, ':'))
return -1;
@@ -558,27 +551,26 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
*/
static int
Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
Textbuffer** tail, Py_UNICODE this)
Textbuffer* tail, Py_UNICODE this)
{
#define PUSH_TAIL_BUFFER(tail, error) \
if ((tail)->size || (tail)->next) { \
if (Tokenizer_emit_textbuffer(self, tail, 0)) \
return error; \
tail = Textbuffer_new(); \
if (!(tail)) \
return error; \
#define PUSH_TAIL_BUFFER(tail, error) \
if (tail->length > 0) { \
if (Textbuffer_concat(self->topstack->textbuffer, tail)) \
return error; \
if (Textbuffer_reset(tail)) \
return error; \
}

if (this == '(' && !(*parens)) {
*parens = 1;
PUSH_TAIL_BUFFER(*tail, -1)
PUSH_TAIL_BUFFER(tail, -1)
}
else if (this == ',' || this == ';' || this == '\\' || this == '.' ||
this == ':' || this == '!' || this == '?' ||
(!(*parens) && this == ')'))
return Textbuffer_write(tail, this);
else
PUSH_TAIL_BUFFER(*tail, -1)
PUSH_TAIL_BUFFER(tail, -1)
return Tokenizer_emit_char(self, this);
}

@@ -605,7 +597,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
*/
static PyObject*
Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
Textbuffer** extra)
Textbuffer* extra)
{
Py_UNICODE this, next;
int parens = 0;
@@ -624,14 +616,14 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
this = Tokenizer_read(self, 0);
next = Tokenizer_read(self, 1);
if (this == '&') {
PUSH_TAIL_BUFFER(*extra, NULL)
PUSH_TAIL_BUFFER(extra, NULL)
if (Tokenizer_parse_entity(self))
return NULL;
}
else if (this == '<' && next == '!'
&& Tokenizer_read(self, 2) == '-'
&& Tokenizer_read(self, 3) == '-') {
PUSH_TAIL_BUFFER(*extra, NULL)
PUSH_TAIL_BUFFER(extra, NULL)
if (Tokenizer_parse_comment(self))
return NULL;
}
@@ -642,7 +634,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
else if (!this || this == '\n')
return Tokenizer_fail_route(self);
else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
PUSH_TAIL_BUFFER(*extra, NULL)
PUSH_TAIL_BUFFER(extra, NULL)
if (Tokenizer_parse_template_or_argument(self))
return NULL;
}
@@ -682,7 +674,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"),
*split, *scheme;
Py_ssize_t length;
Textbuffer* temp;

if (!text)
return -1;
@@ -691,19 +682,9 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
if (!split)
return -1;
scheme = PyList_GET_ITEM(split, 0);
length = PyUnicode_GET_SIZE(scheme);
while (length) {
temp = self->topstack->textbuffer;
if (length <= temp->size) {
temp->size -= length;
break;
}
length -= temp->size;
self->topstack->textbuffer = temp->next;
free(temp->data);
free(temp);
}
length = PyUnicode_GET_LENGTH(scheme);
Py_DECREF(split);
self->topstack->textbuffer->length -= length;
return 0;
}

@@ -720,16 +701,16 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)

Py_ssize_t reset = self->head;
PyObject *link, *kwargs;
Textbuffer *extra = 0;
Textbuffer *extra;

if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
NOT_A_LINK;
}
extra = Textbuffer_new();
extra = Textbuffer_new(&self->text);
if (!extra)
return -1;
self->head++;
link = Tokenizer_really_parse_external_link(self, brackets, &extra);
link = Tokenizer_really_parse_external_link(self, brackets, extra);
if (BAD_ROUTE) {
RESET_ROUTE();
self->head = reset;
@@ -769,8 +750,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
Textbuffer_dealloc(extra);
return -1;
}
if (extra->size || extra->next)
return Tokenizer_emit_textbuffer(self, extra, 0);
if (extra->length > 0)
return Tokenizer_emit_textbuffer(self, extra);
Textbuffer_dealloc(extra);
return 0;
}
@@ -1143,7 +1124,7 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
kwargs = PyDict_New();
if (!kwargs)
return -1;
tmp = PyUnicode_FromUnicode(&data->quoter, 1);
tmp = PyUnicode_FROM_SINGLE(data->quoter);
if (!tmp)
return -1;
PyDict_SetItemString(kwargs, "char", tmp);
@@ -1207,7 +1188,7 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
data->context = TAG_ATTR_READY;
else if (ctx & TAG_ATTR_NAME) {
data->context |= TAG_NOTE_EQUALS;
if (Textbuffer_write(&(data->pad_before_eq), text))
if (Textbuffer_write(data->pad_before_eq, text))
return -1;
}
if (ctx & TAG_QUOTED && !(ctx & TAG_NOTE_SPACE)) {
@@ -1215,9 +1196,9 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
return -1;
}
else if (data->context & TAG_ATTR_READY)
return Textbuffer_write(&(data->pad_first), text);
return Textbuffer_write(data->pad_first, text);
else if (data->context & TAG_ATTR_VALUE)
return Textbuffer_write(&(data->pad_after_eq), text);
return Textbuffer_write(data->pad_after_eq, text);
return 0;
}

@@ -1431,7 +1412,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
else if (this == '<' && next == '/') {
self->head += 2;
reset = self->head - 1;
buffer = Textbuffer_new();
buffer = Textbuffer_new(&self->text);
if (!buffer)
return NULL;
while ((this = Tokenizer_read(self, 0)), 1) {
@@ -1454,7 +1435,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
goto no_matching_end;
if (Tokenizer_emit(self, TagOpenClose))
return NULL;
if (Tokenizer_emit_textbuffer(self, buffer, 0))
if (Tokenizer_emit_textbuffer(self, buffer))
return NULL;
if (Tokenizer_emit(self, TagCloseClose))
return NULL;
@@ -1468,7 +1449,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
return NULL;
break;
}
Textbuffer_write(&buffer, this);
Textbuffer_write(buffer, this);
self->head++;
}
}
@@ -1565,7 +1546,7 @@ static PyObject* Tokenizer_handle_single_tag_end(Tokenizer* self)
*/
static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
{
TagData *data = TagData_new();
TagData *data = TagData_new(&self->text);
PyObject *token, *text, *trash;
Py_UNICODE this, next;
int can_exit;
@@ -1653,7 +1634,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
Py_UNICODE this;

self->head += 2;
buf = Textbuffer_new();
buf = Textbuffer_new(&self->text);
if (!buf)
return -1;
while (1) {
@@ -1669,7 +1650,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
Py_DECREF(name);
break;
}
Textbuffer_write(&buf, this);
Textbuffer_write(buf, this);
pos++;
}
Textbuffer_dealloc(buf);
@@ -1994,18 +1975,18 @@ static int Tokenizer_handle_list(Tokenizer* self)
static int Tokenizer_handle_hr(Tokenizer* self)
{
PyObject *markup, *kwargs;
Textbuffer *buffer = Textbuffer_new();
Textbuffer *buffer = Textbuffer_new(&self->text);
int i;

if (!buffer)
return -1;
self->head += 3;
for (i = 0; i < 4; i++) {
if (Textbuffer_write(&buffer, '-'))
if (Textbuffer_write(buffer, '-'))
return -1;
}
while (Tokenizer_read(self, 1) == '-') {
if (Textbuffer_write(&buffer, '-'))
if (Textbuffer_write(buffer, '-'))
return -1;
self->head++;
}
@@ -2130,7 +2111,7 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
*/
static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
{
TagData *data = TagData_new();
TagData *data = TagData_new(&self->text);
PyObject *padding, *trash;
Py_UNICODE this;
int can_exit;
@@ -2150,7 +2131,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
}
}
if (Py_UNICODE_ISSPACE(this))
Textbuffer_write(&(data->pad_first), this);
Textbuffer_write(data->pad_first, this);
padding = Textbuffer_render(data->pad_first);
TagData_dealloc(data);
if (!padding)


+ 12
- 41
mwparserfromhell/parser/ctokenizer/tok_support.c View File

@@ -37,7 +37,7 @@ int Tokenizer_push(Tokenizer* self, uint64_t context)
}
top->stack = PyList_New(0);
top->context = context;
top->textbuffer = Textbuffer_new();
top->textbuffer = Textbuffer_new(&self->text);
if (!top->textbuffer)
return -1;
top->next = self->topstack;
@@ -55,7 +55,7 @@ int Tokenizer_push_textbuffer(Tokenizer* self)
PyObject *text, *kwargs, *token;
Textbuffer* buffer = self->topstack->textbuffer;

if (buffer->size == 0 && !buffer->next)
if (buffer->length == 0)
return 0;
text = Textbuffer_render(buffer);
if (!text)
@@ -76,9 +76,7 @@ int Tokenizer_push_textbuffer(Tokenizer* self)
return -1;
}
Py_DECREF(token);
Textbuffer_dealloc(buffer);
self->topstack->textbuffer = Textbuffer_new();
if (!self->topstack->textbuffer)
if (Textbuffer_reset(buffer))
return -1;
return 0;
}
@@ -200,7 +198,7 @@ int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
*/
int Tokenizer_emit_char(Tokenizer* self, Unicode code)
{
return Textbuffer_write(&(self->topstack->textbuffer), code);
return Textbuffer_write(self->topstack->textbuffer, code);
}

/*
@@ -222,36 +220,11 @@ int Tokenizer_emit_text(Tokenizer* self, const char* text)
Write the contents of another textbuffer to the current textbuffer,
deallocating it in the process.
*/
int
Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse)
int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer)
{
Textbuffer *original = buffer;
Py_ssize_t i;

if (reverse) {
do {
for (i = buffer->size - 1; i >= 0; i--) {
if (Tokenizer_emit_char(self, buffer->data[i])) {
Textbuffer_dealloc(original);
return -1;
}
}
} while ((buffer = buffer->next));
}
else {
while (buffer->next)
buffer = buffer->next;
do {
for (i = 0; i < buffer->size; i++) {
if (Tokenizer_emit_char(self, buffer->data[i])) {
Textbuffer_dealloc(original);
return -1;
}
}
} while ((buffer = buffer->prev));
}
Textbuffer_dealloc(original);
return 0;
int retval = Textbuffer_concat(self->topstack->textbuffer, buffer);
Textbuffer_dealloc(buffer);
return retval;
}

/*
@@ -272,7 +245,7 @@ int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
case 1: {
pushed = 1;
buffer = self->topstack->textbuffer;
if (buffer->size == 0 && !buffer->next)
if (buffer->length == 0)
break;
left = Textbuffer_render(buffer);
if (!left)
@@ -290,9 +263,7 @@ int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
return -1;
}
Py_DECREF(text);
Textbuffer_dealloc(buffer);
self->topstack->textbuffer = Textbuffer_new();
if (!self->topstack->textbuffer)
if (Textbuffer_reset(buffer))
return -1;
break;
}
@@ -356,7 +327,7 @@ Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
Py_ssize_t index = self->head + delta;

if (index >= self->text.length)
return EMPTY;
return '\0';
return read_codepoint(&self->text, index);
}

@@ -368,7 +339,7 @@ Unicode Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
Py_ssize_t index;

if (delta > self->head)
return EMPTY;
return '\0';
index = self->head - delta;
return read_codepoint(&self->text, index);
}

+ 1
- 1
mwparserfromhell/parser/ctokenizer/tok_support.h View File

@@ -37,7 +37,7 @@ int Tokenizer_emit_token(Tokenizer*, PyObject*, int);
int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int);
int Tokenizer_emit_char(Tokenizer*, Unicode);
int Tokenizer_emit_text(Tokenizer*, const char*);
int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*, int);
int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*);
int Tokenizer_emit_all(Tokenizer*, PyObject*);
int Tokenizer_emit_text_then_stack(Tokenizer*, const char*);



+ 3
- 5
mwparserfromhell/parser/ctokenizer/tokenizer.c View File

@@ -31,7 +31,6 @@ uint64_t route_context;

char** entitydefs;

PyObject* EMPTY;
PyObject* NOARGS;
PyObject* definitions;

@@ -121,13 +120,13 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
#ifdef PEP_393
if (PyUnicode_READY(input) < 0)
return -1;
text->length = PyUnicode_GET_LENGTH(input);
text->kind = PyUnicode_KIND(input);
text->data = PyUnicode_DATA(input);
#else
text->length = PyUnicode_GET_SIZE(input);
text->buf = PyUnicode_AS_UNICODE(input);
#endif
text->length = PyUnicode_GET_LENGTH(input);
return 0;
}

/*
@@ -301,9 +300,8 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void)
PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
Py_INCREF(Py_True);
PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
EMPTY = PyUnicode_FromString("");
NOARGS = PyTuple_New(0);
if (!EMPTY || !NOARGS || load_entities() || load_tokens() || load_defs())
if (!NOARGS || load_entities() || load_tokens() || load_defs())
INIT_ERROR;
#ifdef IS_PY3K
return module;


Loading…
Cancel
Save