
More PEP 393 work; update Textbuffer interface and usage.

tags/v0.4.1
Ben Kurtovic committed 9 years ago
commit 5eac0ab16f
9 changed files with 112 additions and 163 deletions
 1. mwparserfromhell/definitions.py  (+1 / -3)
 2. mwparserfromhell/parser/ctokenizer/common.h  (+10 / -3)
 3. mwparserfromhell/parser/ctokenizer/tag_data.c  (+15 / -25)
 4. mwparserfromhell/parser/ctokenizer/tag_data.h  (+2 / -2)
 5. mwparserfromhell/parser/ctokenizer/textbuffer.h  (+6 / -2)
 6. mwparserfromhell/parser/ctokenizer/tok_parse.c  (+62 / -81)
 7. mwparserfromhell/parser/ctokenizer/tok_support.c  (+12 / -41)
 8. mwparserfromhell/parser/ctokenizer/tok_support.h  (+1 / -1)
 9. mwparserfromhell/parser/ctokenizer/tokenizer.c  (+3 / -5)
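Most of the churn below follows from PEP 393, which (since Python 3.3) stores a string's code points in 1-, 2-, or 4-byte units and has C code read them through a kind/data pair instead of a flat Py_UNICODE array. As background only, a minimal sketch of that access pattern using standard CPython C-API calls; the helper name is hypothetical:

    /* Read the i-th code point of a (possibly compact) unicode object. */
    static Py_UCS4 nth_codepoint(PyObject* str, Py_ssize_t i)
    {
        if (PyUnicode_READY(str) < 0)      /* materialize the canonical form */
            return (Py_UCS4) -1;
        return PyUnicode_READ(PyUnicode_KIND(str),  /* 1, 2, or 4 bytes */
                              PyUnicode_DATA(str), i);
    }

This is why load_tokenizer_text() below caches kind and data once per input, and why the Textbuffer struct grows matching fields.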

mwparserfromhell/definitions.py  (+1 / -3)

@@ -81,10 +81,8 @@ def is_single_only(tag):
     """Return whether or not the given *tag* must exist without a close tag."""
     return tag.lower() in SINGLE_ONLY


-def is_scheme(scheme, slashes=True, reverse=False):
+def is_scheme(scheme, slashes=True):
     """Return whether *scheme* is valid for external links."""
-    if reverse:  # Convenience for C
-        scheme = scheme[::-1]
     scheme = scheme.lower()
     if slashes:
         return scheme in URI_SCHEMES


mwparserfromhell/parser/ctokenizer/common.h  (+10 / -3)

@@ -57,6 +57,7 @@ SOFTWARE.
 #define Unicode Py_UNICODE
 #define PyUnicode_FROM_SINGLE(chr) \
     PyUnicode_FromUnicode(&(chr), 1)
+#define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE
 #endif

 /* Error handling macros */
@@ -73,15 +74,21 @@ SOFTWARE.

 extern char** entitydefs;

-extern PyObject* EMPTY;
 extern PyObject* NOARGS;
 extern PyObject* definitions;

 /* Structs */

 typedef struct {
-    Py_ssize_t size;
-    Unicode* data;
+    Py_ssize_t capacity;
+    Py_ssize_t length;
+#ifdef PEP_393
+    PyObject* object;
+    int kind;
+    void* data;
+#else
+    Py_UNICODE* data;
+#endif
 } Textbuffer;

 struct Stack {
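textbuffer.c itself is not part of this diff, so the following is only a plausible write path implied by the new fields (capacity/length plus a kind/data pair under PEP 393); grow_buffer() is a hypothetical helper:

    static int Textbuffer_write_sketch(Textbuffer* self, Unicode code)
    {
        if (self->length >= self->capacity) {
            if (grow_buffer(self))   /* hypothetical: reallocate storage */
                return -1;
        }
    #ifdef PEP_393
        PyUnicode_WRITE(self->kind, self->data, self->length++, code);
    #else
        self->data[self->length++] = code;
    #endif
        return 0;
    }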


mwparserfromhell/parser/ctokenizer/tag_data.c  (+15 / -25)

@@ -26,13 +26,13 @@ SOFTWARE.
 /*
     Initialize a new TagData object.
 */
-TagData* TagData_new(void)
+TagData* TagData_new(TokenizerInput* text)
 {
     #define ALLOC_BUFFER(name) \
-        name = Textbuffer_new(); \
+        name = Textbuffer_new(text); \
         if (!name) { \
             TagData_dealloc(self); \
             return NULL; \
         }

     TagData *self = malloc(sizeof(TagData));
@@ -56,16 +56,13 @@ TagData* TagData_new(void)
 */
 void TagData_dealloc(TagData* self)
 {
-    #define DEALLOC_BUFFER(name) \
-        if (name) \
-            Textbuffer_dealloc(name);
-
-    DEALLOC_BUFFER(self->pad_first);
-    DEALLOC_BUFFER(self->pad_before_eq);
-    DEALLOC_BUFFER(self->pad_after_eq);
+    if (self->pad_first)
+        Textbuffer_dealloc(self->pad_first);
+    if (self->pad_before_eq)
+        Textbuffer_dealloc(self->pad_before_eq);
+    if (self->pad_after_eq)
+        Textbuffer_dealloc(self->pad_after_eq);
     free(self);
-
-    #undef DEALLOC_BUFFER
 }

 /*
@@ -73,16 +70,9 @@ void TagData_dealloc(TagData* self)
 */
 int TagData_reset_buffers(TagData* self)
 {
-    #define RESET_BUFFER(name) \
-        Textbuffer_dealloc(name); \
-        name = Textbuffer_new(); \
-        if (!name) \
+    if (Textbuffer_reset(self->pad_first) ||
+        Textbuffer_reset(self->pad_before_eq) ||
+        Textbuffer_reset(self->pad_after_eq))
         return -1;
-
-    RESET_BUFFER(self->pad_first)
-    RESET_BUFFER(self->pad_before_eq)
-    RESET_BUFFER(self->pad_after_eq)
     return 0;
-
-    #undef RESET_BUFFER
 }

mwparserfromhell/parser/ctokenizer/tag_data.h  (+2 / -2)

@@ -32,12 +32,12 @@ typedef struct {
     Textbuffer* pad_first;
     Textbuffer* pad_before_eq;
     Textbuffer* pad_after_eq;
-    Py_UNICODE quoter;
+    Unicode quoter;
     Py_ssize_t reset;
 } TagData;

 /* Functions */

-TagData* TagData_new(void);
+TagData* TagData_new(TokenizerInput*);
 void TagData_dealloc(TagData*);
 int TagData_reset_buffers(TagData*);

mwparserfromhell/parser/ctokenizer/textbuffer.h  (+6 / -2)

@@ -26,7 +26,11 @@ SOFTWARE.

 /* Functions */

-Textbuffer* Textbuffer_new(void);
+Textbuffer* Textbuffer_new(TokenizerInput*);
 void Textbuffer_dealloc(Textbuffer*);
-int Textbuffer_write(Textbuffer**, Py_UNICODE);
+int Textbuffer_reset(Textbuffer*);
+int Textbuffer_write(Textbuffer*, Unicode);
+Unicode Textbuffer_read(Textbuffer*, Py_ssize_t);
 PyObject* Textbuffer_render(Textbuffer*);
+int Textbuffer_concat(Textbuffer*, Textbuffer*);
+void Textbuffer_reverse(Textbuffer*);
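The interface change in one view: buffers are now flat, indexable, and resettable, so callers hold a plain Textbuffer* instead of a Textbuffer** that could be swapped out underneath them. A hypothetical call sequence (error handling abbreviated), assuming a Tokenizer* self as at the call sites below:

    Textbuffer* buf = Textbuffer_new(&self->text);  /* sized for this input */
    if (!buf)
        return -1;
    Textbuffer_write(buf, 'x');               /* append one code point */
    Unicode first = Textbuffer_read(buf, 0);  /* random access by index */
    PyObject* str = Textbuffer_render(buf);   /* build a PyUnicode */
    Textbuffer_reset(buf);                    /* empty it for reuse */
    Textbuffer_dealloc(buf);
    Py_XDECREF(str);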

mwparserfromhell/parser/ctokenizer/tok_parse.c  (+62 / -81)

@@ -34,11 +34,11 @@ SOFTWARE.
 #define MAX_ENTITY_SIZE 8

 #define GET_HTML_TAG(markup) (markup == ':' ? "dd" : markup == ';' ? "dt" : "li")
-#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL))
-#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL))
-#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL))
-#define IS_SCHEME(scheme, slashes, reverse) \
-    (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False))
+#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL))
+#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL))
+#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL))
+#define IS_SCHEME(scheme, slashes) \
+    (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False))

 typedef struct {
     PyObject* title;
@@ -80,14 +80,13 @@ static int heading_level_from_context(uint64_t n)
 }

 /*
-    Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as
+    Call the given function in definitions.py, using 'in1' and 'in2' as
     parameters, and return its output as a bool.
 */
-static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2,
-                         PyObject* in3)
+static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2)
 {
     PyObject* func = PyObject_GetAttrString(definitions, funcname);
-    PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL);
+    PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL);
     int ans = (result == Py_True) ? 1 : 0;

     Py_DECREF(func);
@@ -432,7 +431,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
         self->head += 2;
     }
     else {
-        buffer = Textbuffer_new();
+        buffer = Textbuffer_new(&self->text);
         if (!buffer)
             return -1;
         while ((this = Tokenizer_read(self, 0))) {
@@ -444,7 +443,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
                 break;
             i++;
         }
-        Textbuffer_write(&buffer, this);
+        Textbuffer_write(buffer, this);
         if (Tokenizer_emit_char(self, this)) {
             Textbuffer_dealloc(buffer);
             return -1;
@@ -475,7 +474,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
     Textbuffer_dealloc(buffer);
     if (!scheme)
         return -1;
-    if (!IS_SCHEME(scheme, slashes, 0)) {
+    if (!IS_SCHEME(scheme, slashes)) {
         Py_DECREF(scheme);
         Tokenizer_fail_route(self);
         return 0;
@@ -491,7 +490,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
 static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
 {
     static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-";
-    Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer;
+    Textbuffer *scheme_buffer = Textbuffer_new(&self->text);
     PyObject *scheme;
     Py_UNICODE chunk;
     Py_ssize_t i;
@@ -501,28 +500,22 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
         return -1;
     // We have to backtrack through the textbuffer looking for our scheme since
     // it was just parsed as text:
-    temp_buffer = self->topstack->textbuffer;
-    while (temp_buffer) {
-        for (i = temp_buffer->size - 1; i >= 0; i--) {
-            chunk = temp_buffer->data[i];
-            if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
-                goto end_of_loop;
-            j = 0;
-            while (1) {
-                if (!valid[j]) {
-                    Textbuffer_dealloc(scheme_buffer);
-                    FAIL_ROUTE(0);
-                    return 0;
-                }
-                if (chunk == valid[j])
-                    break;
-                j++;
-            }
-            Textbuffer_write(&scheme_buffer, chunk);
-        }
-        temp_buffer = temp_buffer->next;
+    for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
+        chunk = Textbuffer_read(self->topstack->textbuffer, i);
+        if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
+            goto end_of_loop;
+        j = 0;
+        do {
+            if (!valid[j]) {
+                Textbuffer_dealloc(scheme_buffer);
+                FAIL_ROUTE(0);
+                return 0;
+            }
+        } while (chunk != valid[j++]);
+        Textbuffer_write(scheme_buffer, chunk);
     }
 end_of_loop:
+    Textbuffer_reverse(scheme_buffer);
     scheme = Textbuffer_render(scheme_buffer);
     if (!scheme) {
         Textbuffer_dealloc(scheme_buffer);
@@ -530,7 +523,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
     }
     slashes = (Tokenizer_read(self, 0) == '/' &&
                Tokenizer_read(self, 1) == '/');
-    if (!IS_SCHEME(scheme, slashes, 1)) {
+    if (!IS_SCHEME(scheme, slashes)) {
         Py_DECREF(scheme);
         Textbuffer_dealloc(scheme_buffer);
         FAIL_ROUTE(0);
@@ -541,7 +534,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
         Textbuffer_dealloc(scheme_buffer);
         return -1;
     }
-    if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1))
+    if (Tokenizer_emit_textbuffer(self, scheme_buffer))
         return -1;
     if (Tokenizer_emit_char(self, ':'))
         return -1;
@@ -558,27 +551,26 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
 */
 static int
 Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
-                                Textbuffer** tail, Py_UNICODE this)
+                                Textbuffer* tail, Py_UNICODE this)
 {
     #define PUSH_TAIL_BUFFER(tail, error) \
-        if ((tail)->size || (tail)->next) { \
-            if (Tokenizer_emit_textbuffer(self, tail, 0)) \
-                return error; \
-            tail = Textbuffer_new(); \
-            if (!(tail)) \
-                return error; \
+        if (tail->length > 0) { \
+            if (Textbuffer_concat(self->topstack->textbuffer, tail)) \
+                return error; \
+            if (Textbuffer_reset(tail)) \
+                return error; \
         }

     if (this == '(' && !(*parens)) {
         *parens = 1;
-        PUSH_TAIL_BUFFER(*tail, -1)
+        PUSH_TAIL_BUFFER(tail, -1)
     }
     else if (this == ',' || this == ';' || this == '\\' || this == '.' ||
              this == ':' || this == '!' || this == '?' ||
              (!(*parens) && this == ')'))
         return Textbuffer_write(tail, this);
     else
-        PUSH_TAIL_BUFFER(*tail, -1)
+        PUSH_TAIL_BUFFER(tail, -1)
     return Tokenizer_emit_char(self, this);
 }
@@ -605,7 +597,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
 */
 static PyObject*
 Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
-                                     Textbuffer** extra)
+                                     Textbuffer* extra)
 {
     Py_UNICODE this, next;
     int parens = 0;
@@ -624,14 +616,14 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
         this = Tokenizer_read(self, 0);
         next = Tokenizer_read(self, 1);
         if (this == '&') {
-            PUSH_TAIL_BUFFER(*extra, NULL)
+            PUSH_TAIL_BUFFER(extra, NULL)
             if (Tokenizer_parse_entity(self))
                 return NULL;
         }
         else if (this == '<' && next == '!'
                  && Tokenizer_read(self, 2) == '-'
                  && Tokenizer_read(self, 3) == '-') {
-            PUSH_TAIL_BUFFER(*extra, NULL)
+            PUSH_TAIL_BUFFER(extra, NULL)
             if (Tokenizer_parse_comment(self))
                 return NULL;
         }
@@ -642,7 +634,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
         else if (!this || this == '\n')
            return Tokenizer_fail_route(self);
         else if (this == '{' && next == '{' && Tokenizer_CAN_RECURSE(self)) {
-            PUSH_TAIL_BUFFER(*extra, NULL)
+            PUSH_TAIL_BUFFER(extra, NULL)
             if (Tokenizer_parse_template_or_argument(self))
                 return NULL;
         }
@@ -682,7 +674,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
     PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"),
              *split, *scheme;
     Py_ssize_t length;
-    Textbuffer* temp;

     if (!text)
         return -1;
@@ -691,19 +682,9 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
     if (!split)
         return -1;
     scheme = PyList_GET_ITEM(split, 0);
-    length = PyUnicode_GET_SIZE(scheme);
-    while (length) {
-        temp = self->topstack->textbuffer;
-        if (length <= temp->size) {
-            temp->size -= length;
-            break;
-        }
-        length -= temp->size;
-        self->topstack->textbuffer = temp->next;
-        free(temp->data);
-        free(temp);
-    }
+    length = PyUnicode_GET_LENGTH(scheme);
     Py_DECREF(split);
+    self->topstack->textbuffer->length -= length;
     return 0;
 }
@@ -720,16 +701,16 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)

     Py_ssize_t reset = self->head;
     PyObject *link, *kwargs;
-    Textbuffer *extra = 0;
+    Textbuffer *extra;

     if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
         NOT_A_LINK;
     }
-    extra = Textbuffer_new();
+    extra = Textbuffer_new(&self->text);
     if (!extra)
         return -1;
     self->head++;
-    link = Tokenizer_really_parse_external_link(self, brackets, &extra);
+    link = Tokenizer_really_parse_external_link(self, brackets, extra);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
@@ -769,8 +750,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
         Textbuffer_dealloc(extra);
         return -1;
     }
-    if (extra->size || extra->next)
-        return Tokenizer_emit_textbuffer(self, extra, 0);
+    if (extra->length > 0)
+        return Tokenizer_emit_textbuffer(self, extra);
     Textbuffer_dealloc(extra);
     return 0;
 }
@@ -1143,7 +1124,7 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
     kwargs = PyDict_New();
     if (!kwargs)
         return -1;
-    tmp = PyUnicode_FromUnicode(&data->quoter, 1);
+    tmp = PyUnicode_FROM_SINGLE(data->quoter);
     if (!tmp)
         return -1;
     PyDict_SetItemString(kwargs, "char", tmp);
@@ -1207,7 +1188,7 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
         data->context = TAG_ATTR_READY;
     else if (ctx & TAG_ATTR_NAME) {
         data->context |= TAG_NOTE_EQUALS;
-        if (Textbuffer_write(&(data->pad_before_eq), text))
+        if (Textbuffer_write(data->pad_before_eq, text))
             return -1;
     }
     if (ctx & TAG_QUOTED && !(ctx & TAG_NOTE_SPACE)) {
@@ -1215,9 +1196,9 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
             return -1;
     }
     else if (data->context & TAG_ATTR_READY)
-        return Textbuffer_write(&(data->pad_first), text);
+        return Textbuffer_write(data->pad_first, text);
     else if (data->context & TAG_ATTR_VALUE)
-        return Textbuffer_write(&(data->pad_after_eq), text);
+        return Textbuffer_write(data->pad_after_eq, text);
     return 0;
 }
@@ -1431,7 +1412,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
         else if (this == '<' && next == '/') {
             self->head += 2;
             reset = self->head - 1;
-            buffer = Textbuffer_new();
+            buffer = Textbuffer_new(&self->text);
             if (!buffer)
                 return NULL;
             while ((this = Tokenizer_read(self, 0)), 1) {
@@ -1454,7 +1435,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
                     goto no_matching_end;
                 if (Tokenizer_emit(self, TagOpenClose))
                     return NULL;
-                if (Tokenizer_emit_textbuffer(self, buffer, 0))
+                if (Tokenizer_emit_textbuffer(self, buffer))
                     return NULL;
                 if (Tokenizer_emit(self, TagCloseClose))
                     return NULL;
@@ -1468,7 +1449,7 @@ static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer* self)
                     return NULL;
                 break;
             }
-            Textbuffer_write(&buffer, this);
+            Textbuffer_write(buffer, this);
             self->head++;
         }
     }
@@ -1565,7 +1546,7 @@ static PyObject* Tokenizer_handle_single_tag_end(Tokenizer* self)
 */
 static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
 {
-    TagData *data = TagData_new();
+    TagData *data = TagData_new(&self->text);
     PyObject *token, *text, *trash;
     Py_UNICODE this, next;
     int can_exit;
@@ -1653,7 +1634,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
     Py_UNICODE this;

     self->head += 2;
-    buf = Textbuffer_new();
+    buf = Textbuffer_new(&self->text);
     if (!buf)
         return -1;
     while (1) {
@@ -1669,7 +1650,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
             Py_DECREF(name);
             break;
         }
-        Textbuffer_write(&buf, this);
+        Textbuffer_write(buf, this);
         pos++;
     }
     Textbuffer_dealloc(buf);
@@ -1994,18 +1975,18 @@ static int Tokenizer_handle_list(Tokenizer* self)
 static int Tokenizer_handle_hr(Tokenizer* self)
 {
     PyObject *markup, *kwargs;
-    Textbuffer *buffer = Textbuffer_new();
+    Textbuffer *buffer = Textbuffer_new(&self->text);
     int i;

     if (!buffer)
         return -1;
     self->head += 3;
     for (i = 0; i < 4; i++) {
-        if (Textbuffer_write(&buffer, '-'))
+        if (Textbuffer_write(buffer, '-'))
             return -1;
     }
     while (Tokenizer_read(self, 1) == '-') {
-        if (Textbuffer_write(&buffer, '-'))
+        if (Textbuffer_write(buffer, '-'))
             return -1;
         self->head++;
     }
@@ -2130,7 +2111,7 @@ Tokenizer_emit_table_tag(Tokenizer* self, const char* open_open_markup,
 */
 static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
 {
-    TagData *data = TagData_new();
+    TagData *data = TagData_new(&self->text);
     PyObject *padding, *trash;
     Py_UNICODE this;
     int can_exit;
@@ -2150,7 +2131,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, char end_token)
         }
     }
     if (Py_UNICODE_ISSPACE(this))
-        Textbuffer_write(&(data->pad_first), this);
+        Textbuffer_write(data->pad_first, this);
     padding = Textbuffer_render(data->pad_first);
     TagData_dealloc(data);
     if (!padding)
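In the free-URI-scheme backtrack above, characters are collected back-to-front and the buffer is reversed once in C, which is why is_scheme() lost its reverse= convenience parameter. The real Textbuffer_reverse is in textbuffer.c, outside this diff; one plausible in-place version given the new struct:

    void Textbuffer_reverse_sketch(Textbuffer* self)
    {
        Py_ssize_t i, j;
        Unicode a, b;
        for (i = 0, j = self->length - 1; i < j; i++, j--) {
            a = Textbuffer_read(self, i);
            b = Textbuffer_read(self, j);
    #ifdef PEP_393
            PyUnicode_WRITE(self->kind, self->data, i, b);
            PyUnicode_WRITE(self->kind, self->data, j, a);
    #else
            self->data[i] = b;
            self->data[j] = a;
    #endif
        }
    }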


mwparserfromhell/parser/ctokenizer/tok_support.c  (+12 / -41)

@@ -37,7 +37,7 @@ int Tokenizer_push(Tokenizer* self, uint64_t context)
     }
     top->stack = PyList_New(0);
     top->context = context;
-    top->textbuffer = Textbuffer_new();
+    top->textbuffer = Textbuffer_new(&self->text);
     if (!top->textbuffer)
         return -1;
     top->next = self->topstack;
@@ -55,7 +55,7 @@ int Tokenizer_push_textbuffer(Tokenizer* self)
     PyObject *text, *kwargs, *token;
     Textbuffer* buffer = self->topstack->textbuffer;

-    if (buffer->size == 0 && !buffer->next)
+    if (buffer->length == 0)
         return 0;
     text = Textbuffer_render(buffer);
     if (!text)
@@ -76,9 +76,7 @@ int Tokenizer_push_textbuffer(Tokenizer* self)
         return -1;
     }
     Py_DECREF(token);
-    Textbuffer_dealloc(buffer);
-    self->topstack->textbuffer = Textbuffer_new();
-    if (!self->topstack->textbuffer)
+    if (Textbuffer_reset(buffer))
         return -1;
     return 0;
 }
@@ -200,7 +198,7 @@ int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
 */
 int Tokenizer_emit_char(Tokenizer* self, Unicode code)
 {
-    return Textbuffer_write(&(self->topstack->textbuffer), code);
+    return Textbuffer_write(self->topstack->textbuffer, code);
 }

 /*
@@ -222,36 +220,11 @@ int Tokenizer_emit_text(Tokenizer* self, const char* text)
     Write the contents of another textbuffer to the current textbuffer,
     deallocating it in the process.
 */
-int
-Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse)
+int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer)
 {
-    Textbuffer *original = buffer;
-    Py_ssize_t i;
-
-    if (reverse) {
-        do {
-            for (i = buffer->size - 1; i >= 0; i--) {
-                if (Tokenizer_emit_char(self, buffer->data[i])) {
-                    Textbuffer_dealloc(original);
-                    return -1;
-                }
-            }
-        } while ((buffer = buffer->next));
-    }
-    else {
-        while (buffer->next)
-            buffer = buffer->next;
-        do {
-            for (i = 0; i < buffer->size; i++) {
-                if (Tokenizer_emit_char(self, buffer->data[i])) {
-                    Textbuffer_dealloc(original);
-                    return -1;
-                }
-            }
-        } while ((buffer = buffer->prev));
-    }
-    Textbuffer_dealloc(original);
-    return 0;
+    int retval = Textbuffer_concat(self->topstack->textbuffer, buffer);
+    Textbuffer_dealloc(buffer);
+    return retval;
 }

 /*
@@ -272,7 +245,7 @@ int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
             case 1: {
                 pushed = 1;
                 buffer = self->topstack->textbuffer;
-                if (buffer->size == 0 && !buffer->next)
+                if (buffer->length == 0)
                     break;
                 left = Textbuffer_render(buffer);
                 if (!left)
@@ -290,9 +263,7 @@ int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
                     return -1;
                 }
                 Py_DECREF(text);
-                Textbuffer_dealloc(buffer);
-                self->topstack->textbuffer = Textbuffer_new();
-                if (!self->topstack->textbuffer)
+                if (Textbuffer_reset(buffer))
                     return -1;
                 break;
             }
@@ -356,7 +327,7 @@ Unicode Tokenizer_read(Tokenizer* self, Py_ssize_t delta)
     Py_ssize_t index = self->head + delta;

     if (index >= self->text.length)
-        return EMPTY;
+        return '\0';
     return read_codepoint(&self->text, index);
 }
@@ -368,7 +339,7 @@ Unicode Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
     Py_ssize_t index;

     if (delta > self->head)
-        return EMPTY;
+        return '\0';
     index = self->head - delta;
     return read_codepoint(&self->text, index);
 }
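Tokenizer_emit_textbuffer() collapses to a concat-and-free over the flat buffer, and out-of-range reads now return the NUL code point instead of the module-level EMPTY string (deleted in tokenizer.c below). The real Textbuffer_concat lives in textbuffer.c, outside this diff; a plausible version expressed purely in terms of the public interface:

    static int Textbuffer_concat_sketch(Textbuffer* self, Textbuffer* other)
    {
        Py_ssize_t i;
        for (i = 0; i < other->length; i++) {
            if (Textbuffer_write(self, Textbuffer_read(other, i)))
                return -1;
        }
        return 0;
    }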

mwparserfromhell/parser/ctokenizer/tok_support.h  (+1 / -1)

@@ -37,7 +37,7 @@ int Tokenizer_emit_token(Tokenizer*, PyObject*, int);
 int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int);
 int Tokenizer_emit_char(Tokenizer*, Unicode);
 int Tokenizer_emit_text(Tokenizer*, const char*);
-int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*, int);
+int Tokenizer_emit_textbuffer(Tokenizer*, Textbuffer*);
 int Tokenizer_emit_all(Tokenizer*, PyObject*);
 int Tokenizer_emit_text_then_stack(Tokenizer*, const char*);




mwparserfromhell/parser/ctokenizer/tokenizer.c  (+3 / -5)

@@ -31,7 +31,6 @@ uint64_t route_context;

 char** entitydefs;

-PyObject* EMPTY;
 PyObject* NOARGS;
 PyObject* definitions;

@@ -121,13 +120,13 @@ static int load_tokenizer_text(TokenizerInput* text, PyObject *input)
 #ifdef PEP_393
     if (PyUnicode_READY(input) < 0)
         return -1;
-    text->length = PyUnicode_GET_LENGTH(input);
     text->kind = PyUnicode_KIND(input);
     text->data = PyUnicode_DATA(input);
 #else
-    text->length = PyUnicode_GET_SIZE(input);
     text->buf = PyUnicode_AS_UNICODE(input);
 #endif
+    text->length = PyUnicode_GET_LENGTH(input);
     return 0;
 }

@@ -301,9 +300,8 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void)
     PyModule_AddObject(module, "CTokenizer", (PyObject*) &TokenizerType);
     Py_INCREF(Py_True);
     PyDict_SetItemString(TokenizerType.tp_dict, "USES_C", Py_True);
-    EMPTY = PyUnicode_FromString("");
     NOARGS = PyTuple_New(0);
-    if (!EMPTY || !NOARGS || load_entities() || load_tokens() || load_defs())
+    if (!NOARGS || load_entities() || load_tokens() || load_defs())
         INIT_ERROR;
 #ifdef IS_PY3K
     return module;
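With the PyUnicode_GET_LENGTH alias added in common.h, both branches of load_tokenizer_text() can set length the same way. A hypothetical read_codepoint() consistent with the fields loaded here (its real definition is elsewhere in the tokenizer, not part of this diff):

    static Unicode read_codepoint(TokenizerInput* text, Py_ssize_t index)
    {
    #ifdef PEP_393
        return PyUnicode_READ(text->kind, text->data, index);
    #else
        return text->buf[index];
    #endif
    }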

