From 2d945b30e53d41b0a4d448ddee56d1580274b7c6 Mon Sep 17 00:00:00 2001
From: David Winegar
Date: Thu, 17 Jul 2014 16:21:20 -0700
Subject: [PATCH] Use uint64_t for context

For the C tokenizer, include `<stdint.h>` and use `uint64_t` instead of
`int` for context. Changes to tables mean that context can be larger
than 32 bits, and it is possible for `int` to have only 16 bits anyway
(though this is very unlikely).
---
 mwparserfromhell/parser/tokenizer.c | 29 +++++++++++++++--------------
 mwparserfromhell/parser/tokenizer.h |  7 ++++---
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 814ad50..90f51b0 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -241,7 +241,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
 /*
     Add a new token stack, context, and textbuffer to the list.
 */
-static int Tokenizer_push(Tokenizer* self, int context)
+static int Tokenizer_push(Tokenizer* self, uint64_t context)
 {
     Stack* top = malloc(sizeof(Stack));
 
@@ -333,7 +333,7 @@ static PyObject* Tokenizer_pop(Tokenizer* self)
 static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
 {
     PyObject* stack;
-    int context;
+    uint64_t context;
 
     if (Tokenizer_push_textbuffer(self))
         return NULL;
@@ -351,7 +351,7 @@
 */
 static void* Tokenizer_fail_route(Tokenizer* self)
 {
-    int context = self->topstack->context;
+    uint64_t context = self->topstack->context;
     PyObject* stack = Tokenizer_pop(self);
 
     Py_XDECREF(stack);
@@ -1034,7 +1034,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
 {
     // Built from Tokenizer_parse()'s end sentinels:
     Py_UNICODE after = Tokenizer_READ(self, 2);
-    int ctx = self->topstack->context;
+    uint64_t ctx = self->topstack->context;
 
     return (!this || this == '\n' || this == '[' || this == ']' ||
             this == '<' || this == '>' || (this == '\'' && next == '\'') ||
@@ -1629,9 +1629,9 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
 static int Tokenizer_handle_tag_space(Tokenizer* self, TagData* data,
                                       Py_UNICODE text)
 {
-    int ctx = data->context;
-    int end_of_value = (ctx & TAG_ATTR_VALUE &&
-                       !(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));
+    uint64_t ctx = data->context;
+    uint64_t end_of_value = (ctx & TAG_ATTR_VALUE &&
+                            !(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));
 
     if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) {
         if (Tokenizer_push_tag_buffer(self, data))
@@ -2153,7 +2153,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag,
 static int Tokenizer_parse_italics(Tokenizer* self)
 {
     Py_ssize_t reset = self->head;
-    int context;
+    uint64_t context;
     PyObject *stack;
 
     stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
@@ -2273,7 +2273,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
 */
 static PyObject* Tokenizer_parse_style(Tokenizer* self)
 {
-    int context = self->topstack->context, ticks = 2, i;
+    uint64_t context = self->topstack->context, ticks = 2, i;
 
     self->head += 2;
     while (Tokenizer_READ(self, 0) == '\'') {
@@ -2428,7 +2428,7 @@ static int Tokenizer_handle_dl_term(Tokenizer* self)
 /*
     Handle the end of the stream of wikitext.
 */
-static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
+static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
 {
     PyObject *token, *text, *trash;
     int single;
@@ -2457,7 +2457,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
     Make sure we are not trying to write an invalid character. Return 0 if
     everything is safe, or -1 if the route must be failed.
 */
-static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
+static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
 {
     if (context & LC_FAIL_NEXT)
         return -1;
@@ -2536,9 +2536,9 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
     Parse the wikicode string, using context for when to stop. If push is
     true, we will push a new context, otherwise we won't and context will
     be ignored.
 */
-static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
+static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
 {
-    int this_context;
+    uint64_t this_context;
     Py_UNICODE this, next, next_next, last;
     PyObject* temp;
@@ -2697,7 +2697,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
 static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
     PyObject *text, *temp, *tokens;
-    int context = 0, skip_style_tags = 0;
+    uint64_t context = 0;
+    int skip_style_tags = 0;
 
     if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
         Py_XDECREF(self->text);
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index dde6464..e9b1a92 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -29,6 +29,7 @@ SOFTWARE.
 #include <Python.h>
 #include <math.h>
 #include <structmember.h>
+#include <stdint.h>
 
 #if PY_MAJOR_VERSION >= 3
 #define IS_PY3K
@@ -191,7 +192,7 @@ struct Textbuffer {
 
 struct Stack {
     PyObject* stack;
-    int context;
+    uint64_t context;
     struct Textbuffer* textbuffer;
     struct Stack* next;
 };
@@ -202,7 +203,7 @@ typedef struct {
 } HeadingData;
 
 typedef struct {
-    int context;
+    uint64_t context;
     struct Textbuffer* pad_first;
     struct Textbuffer* pad_before_eq;
     struct Textbuffer* pad_after_eq;
@@ -267,7 +268,7 @@ static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
 static int Tokenizer_handle_dl_term(Tokenizer*);
 static int Tokenizer_parse_tag(Tokenizer*);
-static PyObject* Tokenizer_parse(Tokenizer*, int, int);
+static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
 static int load_exceptions(void);
 
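
Note (not part of the patch): two standalone sketches of the reasoning above;
every name here that does not appear in the diff is hypothetical.

First, why `int` is not wide enough: C guarantees `int` only 16 bits, and even
a 32-bit `int` cannot hold a flag past bit 31, so the new table contexts need a
fixed-width 64-bit type. A minimal sketch, with a made-up flag in the style of
the LC_* constants (the real values live in tokenizer.h):

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical context flag past bit 31; real LC_* values are defined
       in tokenizer.h. */
    #define LC_EXAMPLE_HIGH ((uint64_t) 1 << 35)

    int main(void)
    {
        uint64_t context = LC_EXAMPLE_HIGH;

        /* With a 32-bit int, 1 << 35 is undefined behavior and the flag
           would be lost; with uint64_t the bit test works as intended. */
        if (context & LC_EXAMPLE_HIGH)
            printf("high flag set: 0x%016llx\n", (unsigned long long) context);
        return 0;
    }

Second, Tokenizer_tokenize still parses its arguments with "U|ii", and the "i"
format writes exactly a C int, so aiming it at a uint64_t depends on object
layout and byte order. One portable pattern is to parse into a temporary int
and widen afterwards; this is a sketch with a hypothetical helper, not the
patch's approach:

    #include <Python.h>
    #include <stdint.h>

    /* Hypothetical helper: parse (text, context, skip_style_tags) portably
       by letting "i" target a real int, then widening to uint64_t. */
    static int parse_tokenize_args(PyObject* args, PyObject** text,
                                   uint64_t* context, int* skip_style_tags)
    {
        int raw_context = 0;

        if (!PyArg_ParseTuple(args, "U|ii", text, &raw_context,
                              skip_style_tags))
            return -1;                      /* exception already set */
        *context = (uint64_t) raw_context;  /* widen after parsing */
        return 0;
    }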