diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h
index 92a41ca..55d3906 100644
--- a/mwparserfromhell/parser/ctokenizer/common.h
+++ b/mwparserfromhell/parser/ctokenizer/common.h
@@ -51,8 +51,12 @@ SOFTWARE.
 
 #ifdef PEP_393
 #define Unicode Py_UCS4
+#define PyUnicode_FROM_SINGLE(chr) \
+    PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &(chr), 1)
 #else
 #define Unicode Py_UNICODE
+#define PyUnicode_FROM_SINGLE(chr) \
+    PyUnicode_FromUnicode(&(chr), 1)
 #endif
 
 /* Error handling macros */
@@ -77,7 +81,7 @@ extern PyObject* definitions;
 
 typedef struct {
     Py_ssize_t size;
-    Py_UNICODE* data;
+    Unicode* data;
 } Textbuffer;
 
 struct Stack {
@@ -89,11 +93,21 @@ struct Stack {
 typedef struct Stack Stack;
 
 typedef struct {
+    PyObject* object;       /* base PyUnicodeObject object */
+    Py_ssize_t length;      /* length of object, in code points */
+#ifdef PEP_393
+    int kind;               /* object's kind value */
+    void* data;             /* object's raw unicode buffer */
+#else
+    Py_UNICODE* buf;        /* object's internal buffer */
+#endif
+} TokenizerInput;
+
+typedef struct {
     PyObject_HEAD
-    PyObject* text;         /* text to tokenize */
+    TokenizerInput text;    /* text to tokenize */
     Stack* topstack;        /* topmost stack */
     Py_ssize_t head;        /* current position in text */
-    Py_ssize_t length;      /* length of text */
     int global;             /* global context */
     int depth;              /* stack recursion depth */
     int cycles;             /* total number of stack recursions */
diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c
index d761e27..712e248 100644
--- a/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -190,7 +190,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
     PyObject *tokenlist;
 
     self->head += 2;
-    while (Tokenizer_READ(self, 0) == '{' && braces < MAX_BRACES) {
+    while (Tokenizer_read(self, 0) == '{' && braces < MAX_BRACES) {
         self->head++;
         braces++;
     }
@@ -426,7 +426,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
 
     if (Tokenizer_push(self, LC_EXT_LINK_URI))
         return -1;
-    if (Tokenizer_READ(self, 0) == '/' && Tokenizer_READ(self, 1) == '/') {
+    if (Tokenizer_read(self, 0) == '/' && Tokenizer_read(self, 1) == '/') {
         if (Tokenizer_emit_text(self, "//"))
             return -1;
         self->head += 2;
@@ -435,7 +435,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
         buffer = Textbuffer_new();
         if (!buffer)
             return -1;
-        while ((this = Tokenizer_READ(self, 0))) {
+        while ((this = Tokenizer_read(self, 0))) {
             i = 0;
             while (1) {
                 if (!valid[i])
@@ -462,8 +462,8 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
             return -1;
         }
         self->head++;
-        slashes = (Tokenizer_READ(self, 0) == '/' &&
-                   Tokenizer_READ(self, 1) == '/');
+        slashes = (Tokenizer_read(self, 0) == '/' &&
+                   Tokenizer_read(self, 1) == '/');
         if (slashes) {
             if (Tokenizer_emit_text(self, "//")) {
                 Textbuffer_dealloc(buffer);
@@ -528,8 +528,8 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
         Textbuffer_dealloc(scheme_buffer);
         return -1;
     }
-    slashes = (Tokenizer_READ(self, 0) == '/' &&
-               Tokenizer_READ(self, 1) == '/');
+    slashes = (Tokenizer_read(self, 0) == '/' &&
+               Tokenizer_read(self, 1) == '/');
    if (!IS_SCHEME(scheme, slashes, 1)) {
         Py_DECREF(scheme);
         Textbuffer_dealloc(scheme_buffer);
@@ -589,7 +589,7 @@ static int
 Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
 {
     // Built from Tokenizer_parse()'s end sentinels:
-    Py_UNICODE after = Tokenizer_READ(self, 2);
+    Py_UNICODE after = Tokenizer_read(self, 2);
     uint64_t ctx = self->topstack->context;
 
     return (!this || this == '\n' || this == '[' || this == ']' ||
@@ -615,22 +615,22 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
         return NULL;
     if (BAD_ROUTE)
         return NULL;
-    this = Tokenizer_READ(self, 0);
+    this = Tokenizer_read(self, 0);
     if (!this || this == '\n' || this == ' ' || this == ']')
         return Tokenizer_fail_route(self);
     if (!brackets && this == '[')
         return Tokenizer_fail_route(self);
     while (1) {
-        this = Tokenizer_READ(self, 0);
-        next = Tokenizer_READ(self, 1);
+        this = Tokenizer_read(self, 0);
+        next = Tokenizer_read(self, 1);
         if (this == '&') {
             PUSH_TAIL_BUFFER(*extra, NULL)
             if (Tokenizer_parse_entity(self))
                 return NULL;
         }
         else if (this == '<' && next == '!'
-                 && Tokenizer_READ(self, 2) == '-'
-                 && Tokenizer_READ(self, 3) == '-') {
+                 && Tokenizer_read(self, 2) == '-'
+                 && Tokenizer_read(self, 3) == '-') {
             PUSH_TAIL_BUFFER(*extra, NULL)
             if (Tokenizer_parse_comment(self))
                 return NULL;
@@ -716,7 +716,7 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
 #define NOT_A_LINK                                        \
     if (!brackets && self->topstack->context & LC_DLTERM) \
         return Tokenizer_handle_dl_term(self);            \
-    return Tokenizer_emit_char(self, Tokenizer_READ(self, 0))
+    return Tokenizer_emit_char(self, Tokenizer_read(self, 0))
 
     Py_ssize_t reset = self->head;
     PyObject *link, *kwargs;
@@ -787,7 +787,7 @@ static int Tokenizer_parse_heading(Tokenizer* self)
 
     self->global |= GL_HEADING;
     self->head += 1;
-    while (Tokenizer_READ(self, 0) == '=') {
+    while (Tokenizer_read(self, 0) == '=') {
         best++;
         self->head++;
     }
@@ -862,7 +862,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
 
     self->head += 1;
     best = 1;
-    while (Tokenizer_READ(self, 0) == '=') {
+    while (Tokenizer_read(self, 0) == '=') {
         best++;
         self->head++;
     }
@@ -916,7 +916,7 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer* self)
 */
 static int Tokenizer_really_parse_entity(Tokenizer* self)
 {
-    PyObject *kwargs, *textobj;
+    PyObject *kwargs, *charobj, *textobj;
     Py_UNICODE this;
     int numeric, hexadecimal, i, j, zeroes, test;
     char *valid, *text, *buffer, *def;
@@ -930,7 +930,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
     if (Tokenizer_emit(self, HTMLEntityStart))
         return -1;
     self->head++;
-    this = Tokenizer_READ(self, 0);
+    this = Tokenizer_read(self, 0);
     if (!this) {
         Tokenizer_fail_route(self);
         return 0;
@@ -940,7 +940,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
         if (Tokenizer_emit(self, HTMLEntityNumeric))
             return -1;
         self->head++;
-        this = Tokenizer_READ(self, 0);
+        this = Tokenizer_read(self, 0);
         if (!this) {
             Tokenizer_fail_route(self);
             return 0;
@@ -950,7 +950,12 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
             kwargs = PyDict_New();
             if (!kwargs)
                 return -1;
-            PyDict_SetItemString(kwargs, "char", Tokenizer_read(self, 0));
+            if (!(charobj = PyUnicode_FROM_SINGLE(this))) {
+                Py_DECREF(kwargs);
+                return -1;
+            }
+            PyDict_SetItemString(kwargs, "char", charobj);
+            Py_DECREF(charobj);
             if (Tokenizer_emit_kwargs(self, HTMLEntityHex, kwargs))
                 return -1;
             self->head++;
@@ -974,7 +979,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self)
     i = 0;
     zeroes = 0;
     while (1) {
-        this = Tokenizer_READ(self, 0);
+        this = Tokenizer_read(self, 0);
         if (this == ';') {
             if (i == 0)
                 FAIL_ROUTE_AND_EXIT()
@@ -1093,15 +1098,15 @@ static int Tokenizer_parse_comment(Tokenizer* self)
     if (Tokenizer_push(self, 0))
         return -1;
     while (1) {
-        this = Tokenizer_READ(self, 0);
+        this = Tokenizer_read(self, 0);
         if (!this) {
             comment = Tokenizer_pop(self);
             Py_XDECREF(comment);
             self->head = reset;
             return Tokenizer_emit_text(self, "
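
Note: the old Tokenizer_READ was a macro that dereferenced the Py_UNICODE*
buffer directly; it becomes the function Tokenizer_read() because under
PEP 393 (Python 3.3+) a code point must be fetched through the string's
kind/data pair rather than through a fixed-width pointer. A minimal sketch
of such an accessor, built only from the TokenizerInput fields declared in
common.h above; the function name and body are illustrative assumptions,
not lines added by this diff:

    /* Sketch: return the code point `delta` places past self->head, or
       '\0' once past the end of input. Assumes self->text was filled in
       from a ready unicode object: kind = PyUnicode_KIND(obj),
       data = PyUnicode_DATA(obj), length = PyUnicode_GET_LENGTH(obj) on
       PEP 393 builds, buf = PyUnicode_AS_UNICODE(obj) on legacy builds. */
    static Unicode Tokenizer_read_sketch(Tokenizer* self, Py_ssize_t delta)
    {
        Py_ssize_t index = self->head + delta;

        if (index >= self->text.length)
            return '\0';                 /* end-of-input sentinel */
    #ifdef PEP_393
        /* PyUnicode_READ(kind, data, index) yields a Py_UCS4 whether the
           string is stored as 1-, 2-, or 4-byte units. */
        return PyUnicode_READ(self->text.kind, self->text.data, index);
    #else
        return self->text.buf[index];    /* fixed-width legacy buffer */
    #endif
    }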