Browse Source

Use uint64_t for context

For the C tokenizer, include `<stdint.h>` and use `uint64_t` instead
of `int` for context. Changes to tables mean that context can be
larger than 32 bits, and it is possible for `int` to only have 16
bits anyways (though this is very unlikely).
tags/v0.4
David Winegar 9 years ago
parent
commit
2d945b30e5
2 changed files with 19 additions and 17 deletions
  1. +15
    -14
      mwparserfromhell/parser/tokenizer.c
  2. +4
    -3
      mwparserfromhell/parser/tokenizer.h

+ 15
- 14
mwparserfromhell/parser/tokenizer.c View File

@@ -241,7 +241,7 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds)
/*
Add a new token stack, context, and textbuffer to the list.
*/
static int Tokenizer_push(Tokenizer* self, int context)
static int Tokenizer_push(Tokenizer* self, uint64_t context)
{
Stack* top = malloc(sizeof(Stack));

@@ -333,7 +333,7 @@ static PyObject* Tokenizer_pop(Tokenizer* self)
static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
{
PyObject* stack;
int context;
uint64_t context;

if (Tokenizer_push_textbuffer(self))
return NULL;
@@ -351,7 +351,7 @@ static PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
*/
static void* Tokenizer_fail_route(Tokenizer* self)
{
int context = self->topstack->context;
uint64_t context = self->topstack->context;
PyObject* stack = Tokenizer_pop(self);

Py_XDECREF(stack);
@@ -1034,7 +1034,7 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next)
{
// Built from Tokenizer_parse()'s end sentinels:
Py_UNICODE after = Tokenizer_READ(self, 2);
int ctx = self->topstack->context;
uint64_t ctx = self->topstack->context;

return (!this || this == '\n' || this == '[' || this == ']' ||
this == '<' || this == '>' || (this == '\'' && next == '\'') ||
@@ -1629,9 +1629,9 @@ static int Tokenizer_push_tag_buffer(Tokenizer* self, TagData* data)
static int
Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
{
int ctx = data->context;
int end_of_value = (ctx & TAG_ATTR_VALUE &&
!(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));
uint64_t ctx = data->context;
uint64_t end_of_value = (ctx & TAG_ATTR_VALUE &&
!(ctx & (TAG_QUOTED | TAG_NOTE_QUOTE)));

if (end_of_value || (ctx & TAG_QUOTED && ctx & TAG_NOTE_SPACE)) {
if (Tokenizer_push_tag_buffer(self, data))
@@ -2153,7 +2153,7 @@ static int Tokenizer_emit_style_tag(Tokenizer* self, const char* tag,
static int Tokenizer_parse_italics(Tokenizer* self)
{
Py_ssize_t reset = self->head;
int context;
uint64_t context;
PyObject *stack;

stack = Tokenizer_parse(self, LC_STYLE_ITALICS, 1);
@@ -2273,7 +2273,7 @@ static int Tokenizer_parse_italics_and_bold(Tokenizer* self)
*/
static PyObject* Tokenizer_parse_style(Tokenizer* self)
{
int context = self->topstack->context, ticks = 2, i;
uint64_t context = self->topstack->context, ticks = 2, i;

self->head += 2;
while (Tokenizer_READ(self, 0) == '\'') {
@@ -2428,7 +2428,7 @@ static int Tokenizer_handle_dl_term(Tokenizer* self)
/*
Handle the end of the stream of wikitext.
*/
static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
static PyObject* Tokenizer_handle_end(Tokenizer* self, uint64_t context)
{
PyObject *token, *text, *trash;
int single;
@@ -2457,7 +2457,7 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
Make sure we are not trying to write an invalid character. Return 0 if
everything is safe, or -1 if the route must be failed.
*/
static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
static int Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
{
if (context & LC_FAIL_NEXT)
return -1;
@@ -2536,9 +2536,9 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
Parse the wikicode string, using context for when to stop. If push is true,
we will push a new context, otherwise we won't and context will be ignored.
*/
static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push)
{
int this_context;
uint64_t this_context;
Py_UNICODE this, next, next_next, last;
PyObject* temp;

@@ -2697,7 +2697,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
PyObject *text, *temp, *tokens;
int context = 0, skip_style_tags = 0;
uint64_t context = 0;
int skip_style_tags = 0;

if (PyArg_ParseTuple(args, "U|ii", &text, &context, &skip_style_tags)) {
Py_XDECREF(self->text);


+ 4
- 3
mwparserfromhell/parser/tokenizer.h View File

@@ -29,6 +29,7 @@ SOFTWARE.
#include <math.h>
#include <structmember.h>
#include <bytesobject.h>
#include <stdint.h>

#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
@@ -191,7 +192,7 @@ struct Textbuffer {

struct Stack {
PyObject* stack;
int context;
uint64_t context;
struct Textbuffer* textbuffer;
struct Stack* next;
};
@@ -202,7 +203,7 @@ typedef struct {
} HeadingData;

typedef struct {
int context;
uint64_t context;
struct Textbuffer* pad_first;
struct Textbuffer* pad_before_eq;
struct Textbuffer* pad_after_eq;
@@ -267,7 +268,7 @@ static int Tokenizer_parse_entity(Tokenizer*);
static int Tokenizer_parse_comment(Tokenizer*);
static int Tokenizer_handle_dl_term(Tokenizer*);
static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int, int);
static PyObject* Tokenizer_parse(Tokenizer*, uint64_t, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

static int load_exceptions(void);


Loading…
Cancel
Save