
Use uint64_t for context

For the C tokenizer, include `<stdint.h>` and use `uint64_t` instead
of `int` for context. Changes to the context tables mean that a
context value can now be wider than 32 bits, and the C standard
permits `int` to be as narrow as 16 bits anyway (though a 16-bit
`int` is very unlikely in practice).
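
Below is a minimal sketch of the width problem the message describes; the flag names and bit positions are illustrative, not the real `LC_*` values from `tokenizer.h`:

```c
#include <stdint.h>
#include <stdio.h>

/* Hypothetical context flags: once there are more than 32 of them,
   the high bits no longer fit in a 32-bit int, and the C standard
   only guarantees int at least 16 bits in the first place. */
#define LC_EXAMPLE_LOW  (1ULL << 0)
#define LC_EXAMPLE_HIGH (1ULL << 35)  /* needs more than 32 bits */

int main(void)
{
    uint64_t context = LC_EXAMPLE_LOW | LC_EXAMPLE_HIGH;

    /* Bit tests work exactly as in the tokenizer: context & FLAG. */
    if (context & LC_EXAMPLE_HIGH)
        printf("context = 0x%016llx\n", (unsigned long long) context);
    return 0;
}
```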
David Winegar committed 10 years ago · commit 2d945b30e5 · included in v0.4
2 changed files with 19 additions and 17 deletions:
  1. mwparserfromhell/parser/tokenizer.c (+15, -14)
  2. mwparserfromhell/parser/tokenizer.h (+4, -3)

mwparserfromhell/parser/tokenizer.c (+15, -14)

@@ -241,7 +241,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
 /*
     Add a new token stack, context, and textbuffer to the list.
 */
-static int Tokenizer_push(Tokenizer* self, int context)
+static int Tokenizer_push(Tokenizer* self, uint64_t context)
 {
     Stack* top = malloc(sizeof(Stack));

@@ -333,7 +333,7 @@ static PyObject* Tokenizer_pop(Tokenizer* self)
 static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
 {
     PyObject* stack;
-    int context;
+    uint64_t context;

     if (Tokenizer_push_textbuffer(self))
         return NULL;

@@ -351,7 +351,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
 */
 static void* Tokenizer_fail_route(Tokenizer* self)
 {
-    int context = self->topstack->context;
+    uint64_t context = self->topstack->context;
     PyObject* stack = Tokenizer_pop(self);

     Py_XDECREF(stack);

@@ -1034,7 +1034,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
 {
     // Built from Tokenizer_parse()'s end sentinels:
     Py_UNICODE after = Tokenizer_READ(self, 2);
-    int ctx = self->topstack->context;
+    uint64_t ctx = self->topstack->context;

     return (!this || this == '\n' || this == '[' || this == ']' ||
             this == '<' || this == '>' || (this == '\'' && next == '\'') ||

@@ -1629,9 +1629,9 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
 static int
 Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
 {
-    int ctx = data->context;
-    int end_of_value = (ctx & TAG_ATTR_VALUE &&
-                        !(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));
+    uint64_t ctx = data->context;
+    uint64_t end_of_value = (ctx & TAG_ATTR_VALUE &&
+                             !(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));

     if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) {
         if (Tokenizer_push_tag_buffer(self, data))

@@ -2153,7 +2153,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag,
 static int Tokenizer_parse_italics(Tokenizer* self)
 {
     Py_ssize_t reset = self->head;
-    int context;
+    uint64_t context;
     PyObject *stack;

     stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);

@@ -2273,7 +2273,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
 */
 static PyObject* Tokenizer_parse_style(Tokenizer* self)
 {
-    int context = self->topstack->context, ticks = 2, i;
+    uint64_t context = self->topstack->context, ticks = 2, i;

     self->head += 2;
     while (Tokenizer_READ(self, 0) == '\'') {

@@ -2428,7 +2428,7 @@ static int Tokenizer_handle_dl_term(Tokenizer* self)
 /*
     Handle the end of the stream of wikitext.
 */
-static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
+static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
 {
     PyObject *token, *text, *trash;
     int single;

@@ -2457,7 +2457,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
     Make sure we are not trying to write an invalid character. Return 0 if
     everything is safe, or -1 if the route must be failed.
 */
-static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
+static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
 {
     if (context & LC_FAIL_NEXT)
         return -1;

@@ -2536,9 +2536,9 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
     Parse the wikicode string, using context for when to stop. If push is true,
     we will push a new context, otherwise we won't and context will be ignored.
 */
-static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
+static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
 {
-    int this_context;
+    uint64_t this_context;
     Py_UNICODE this, next, next_next, last;
     PyObject* temp;

@@ -2697,7 +2697,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
 static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
 {
     PyObject *text, *temp, *tokens;
-    int context = 0, skip_style_tags = 0;
+    uint64_t context = 0;
+    int skip_style_tags = 0;

     if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
         Py_XDECREF(self->text);


mwparserfromhell/parser/tokenizer.h (+4, -3)

@@ -29,6 +29,7 @@ SOFTWARE.
 #include <math.h>
 #include <structmember.h>
 #include <bytesobject.h>
+#include <stdint.h>

 #if PY_MAJOR_VERSION >= 3
 #define IS_PY3K

@@ -191,7 +192,7 @@ struct Textbuffer {

 struct Stack {
     PyObject* stack;
-    int context;
+    uint64_t context;
     struct Textbuffer* textbuffer;
     struct Stack* next;
 };

@@ -202,7 +203,7 @@ typedef struct {
 } HeadingData;

 typedef struct {
-    int context;
+    uint64_t context;
     struct Textbuffer* pad_first;
     struct Textbuffer* pad_before_eq;
     struct Textbuffer* pad_after_eq;

@@ -267,7 +268,7 @@ static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
 static int Tokenizer_handle_dl_term(Tokenizer*);
 static int Tokenizer_parse_tag(Tokenizer*);
-static PyObject* Tokenizer_parse(Tokenizer*, int, int);
+static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

 static int load_exceptions(void);

