
Finish porting misc changes; add prototypes for remaining functions.

tags/v0.3
Ben Kurtovic committed 11 years ago · commit 653071379b
2 changed files with 217 additions and 32 deletions:

  1. mwparserfromhell/parser/tokenizer.c (+201, -30)
  2. mwparserfromhell/parser/tokenizer.h (+16, -2)

mwparserfromhell/parser/tokenizer.c (+201, -30)

@@ -487,6 +487,8 @@ Tokenizer_parse_template_or_argument(Tokenizer* self)
         return -1;
     }
     Py_DECREF(tokenlist);
+    if (self->topstack->context & LC_FAIL_NEXT)
+        self->topstack->context ^= LC_FAIL_NEXT;
     return 0;
 }

@@ -499,7 +501,7 @@ Tokenizer_parse_template(Tokenizer* self)
     PyObject *template, *token;
     Py_ssize_t reset = self->head;
 
-    template = Tokenizer_parse(self, LC_TEMPLATE_NAME);
+    template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1);
     if (BAD_ROUTE) {
         self->head = reset;
         return 0;
@@ -542,7 +544,7 @@ Tokenizer_parse_argument(Tokenizer* self)
     PyObject *argument, *token;
     Py_ssize_t reset = self->head;
 
-    argument = Tokenizer_parse(self, LC_ARGUMENT_NAME);
+    argument = Tokenizer_parse(self, LC_ARGUMENT_NAME, 1);
     if (BAD_ROUTE) {
         self->head = reset;
         return 0;
@@ -709,7 +711,7 @@ Tokenizer_parse_wikilink(Tokenizer* self)
 
     self->head += 2;
     reset = self->head - 1;
-    wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE);
+    wikilink = Tokenizer_parse(self, LC_WIKILINK_TITLE, 1);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset;
@@ -745,6 +747,8 @@ Tokenizer_parse_wikilink(Tokenizer* self)
         return -1;
     }
     Py_DECREF(token);
+    if (self->topstack->context & LC_FAIL_NEXT)
+        self->topstack->context ^= LC_FAIL_NEXT;
     return 0;
 }

@@ -797,7 +801,7 @@ Tokenizer_parse_heading(Tokenizer* self)
         self->head++;
     }
     context = LC_HEADING_LEVEL_1 << (best > 5 ? 5 : best - 1);
-    heading = (HeadingData*) Tokenizer_parse(self, context);
+    heading = (HeadingData*) Tokenizer_parse(self, context, 1);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         self->head = reset + best - 1;
@@ -886,7 +890,7 @@ Tokenizer_handle_heading_end(Tokenizer* self)
     current = heading_level_from_context(self->topstack->context);
     level = current > best ? (best > 6 ? 6 : best) :
                             (current > 6 ? 6 : current);
-    after = (HeadingData*) Tokenizer_parse(self, self->topstack->context);
+    after = (HeadingData*) Tokenizer_parse(self, self->topstack->context, 1);
     if (BAD_ROUTE) {
         RESET_ROUTE();
         if (level < best) {
@@ -1144,7 +1148,7 @@ Tokenizer_parse_comment(Tokenizer* self)
     int i;
 
     self->head += 4;
-    comment = Tokenizer_parse(self, LC_COMMENT);
+    comment = Tokenizer_parse(self, LC_COMMENT, 1);
     if (BAD_ROUTE) {
         const char* text = "<!--";
         RESET_ROUTE();
@@ -1192,6 +1196,156 @@ Tokenizer_parse_comment(Tokenizer* self)
 }
 
+/*
+    Parse an HTML tag at the head of the wikicode string.
+*/
+static int
+Tokenizer_parse_tag(Tokenizer* self)
+{
+    return 0;
+}
+
+/*
+    Actually parse an HTML tag, starting with the open (<foo>).
+*/
+static PyObject*
+Tokenizer_really_parse_tag(Tokenizer* self)
+{
+    return NULL;
+}
+
+/*
+    Write a pending tag attribute from data to the stack.
+*/
+static int
+Tokenizer_push_tag_buffer(Tokenizer* self, TagOpenData* data)
+{
+    return 0;
+}
+
+/*
+    Handle all sorts of text data inside of an HTML open tag.
+*/
+static int
+Tokenizer_handle_tag_data(Tokenizer* self, TagOpenData* data, Py_UNICODE text)
+{
+    return 0;
+}
+
+/*
+    Handle whitespace inside of an HTML open tag.
+*/
+static int
+Tokenizer_handle_tag_space(Tokenizer* self, TagOpenData* data, Py_UNICODE text)
+{
+    return 0;
+}
+
+/*
+    Handle regular text inside of an HTML open tag.
+*/
+static int
+Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text)
+{
+    return 0;
+}
+
+/*
+    Handle the body of an HTML tag that is parser-blacklisted.
+*/
+static PyObject*
+Tokenizer_handle_blacklisted_tag(Tokenizer* self)
+{
+    return NULL;
+}
+
+/*
+    Handle the closing of an open tag (<foo>).
+*/
+static int
+Tokenizer_handle_tag_close_open(Tokenizer* self, TagOpenData* data,
+                                PyObject* token)
+{
+    return 0;
+}
+
+/*
+    Handle the opening of a closing tag (</foo>).
+*/
+static int
+Tokenizer_handle_tag_open_close(Tokenizer* self)
+{
+    return 0;
+}
+
+/*
+    Handle the ending of a closing tag (</foo>).
+*/
+static PyObject*
+Tokenizer_handle_tag_close_close(Tokenizer* self)
+{
+    return NULL;
+}
+
+/*
+    Handle the (possible) start of an implicitly closing single tag.
+*/
+static int
+Tokenizer_handle_invalid_tag_start(Tokenizer* self)
+{
+    return 0;
+}
+
+/*
+    Handle the end of an implicitly closing single-only HTML tag.
+*/
+static PyObject*
+Tokenizer_handle_single_only_tag_end(Tokenizer* self)
+{
+    return NULL;
+}
+
+/*
+    Handle the stream end when inside a single-supporting HTML tag.
+*/
+static PyObject*
+Tokenizer_handle_single_tag_end(Tokenizer* self)
+{
+    return NULL;
+}
+
+/*
+    Handle the end of the stream of wikitext.
+*/
+static PyObject*
+Tokenizer_handle_end(Tokenizer* self, int context)
+{
+    static int fail_contexts = (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK |
+                                LC_HEADING | LC_COMMENT);
+    static int double_fail = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
+    PyObject *token, *text, *trash;
+    int single;
+
+    if (context & fail_contexts) {
+        if (context & LC_TAG_BODY) {
+            token = PyList_GET_ITEM(self->topstack->stack, 1);
+            text = PyObject_GetAttrString(token, "text");
+            if (!text)
+                return NULL;
+            single = IS_SINGLE(text);
+            Py_DECREF(text);
+            if (single)
+                return Tokenizer_handle_single_tag_end(self);
+        }
+        else if (context & double_fail) {
+            trash = Tokenizer_pop(self);
+            Py_XDECREF(trash);
+        }
+        return Tokenizer_fail_route(self);
+    }
+    return Tokenizer_pop(self);
+}
+
 /*
     Make sure we are not trying to write an invalid character. Return 0 if
     everything is safe, or -1 if the route must be failed.
 */
@@ -1208,6 +1362,11 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
             return -1;
         return 0;
     }
+    if (context & LC_TAG_CLOSE) {
+        if (data == *"<")
+            return -1;
+        return 0;
+    }
     if (context & LC_TEMPLATE_NAME) {
         if (data == *"{" || data == *"}" || data == *"[") {
             self->topstack->context |= LC_FAIL_NEXT;
@@ -1267,32 +1426,33 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
 }
 
 /*
-    Parse the wikicode string, using context for when to stop.
+    Parse the wikicode string, using context for when to stop. If push is true,
+    we will push a new context, otherwise we won't and context will be ignored.
 */
 static PyObject*
-Tokenizer_parse(Tokenizer* self, int context)
+Tokenizer_parse(Tokenizer* self, int context, int push)
 {
-    static int fail_contexts = (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK |
-                                LC_HEADING | LC_COMMENT);
     static int unsafe_contexts = (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE |
                                   LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME);
+    static int double_unsafe = (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE);
     int this_context, is_marker, i;
     Py_UNICODE this, next, next_next, last;
-    PyObject *trash;
+    PyObject* trash;
 
-    if (Tokenizer_push(self, context))
-        return NULL;
+    if (push) {
+        if (Tokenizer_push(self, context))
+            return NULL;
+    }
     while (1) {
         this = Tokenizer_READ(self, 0);
         this_context = self->topstack->context;
         if (this_context & unsafe_contexts) {
             if (Tokenizer_verify_safe(self, this_context, this) < 0) {
-                if (this_context & LC_TEMPLATE_PARAM_KEY) {
+                if (this_context & double_unsafe) {
                     trash = Tokenizer_pop(self);
                     Py_XDECREF(trash);
                 }
-                Tokenizer_fail_route(self);
-                return NULL;
+                return Tokenizer_fail_route(self);
             }
         }
         is_marker = 0;
@@ -1307,15 +1467,8 @@ Tokenizer_parse(Tokenizer* self, int context)
             self->head++;
             continue;
         }
-        if (this == *"") {
-            if (this_context & LC_TEMPLATE_PARAM_KEY) {
-                trash = Tokenizer_pop(self);
-                Py_XDECREF(trash);
-            }
-            if (this_context & fail_contexts)
-                return Tokenizer_fail_route(self);
-            return Tokenizer_pop(self);
-        }
+        if (this == *"")
+            return Tokenizer_handle_end(self, this_context);
         next = Tokenizer_READ(self, 1);
         if (this_context & LC_COMMENT) {
             if (this == next && next == *"-") {
@@ -1328,8 +1481,6 @@ Tokenizer_parse(Tokenizer* self, int context)
             if (Tokenizer_CAN_RECURSE(self)) {
                 if (Tokenizer_parse_template_or_argument(self))
                     return NULL;
-                if (self->topstack->context & LC_FAIL_NEXT)
-                    self->topstack->context ^= LC_FAIL_NEXT;
             }
             else
                 Tokenizer_emit_text(self, this);
@@ -1359,8 +1510,6 @@ Tokenizer_parse(Tokenizer* self, int context)
                     Tokenizer_CAN_RECURSE(self)) {
                 if (Tokenizer_parse_wikilink(self))
                     return NULL;
-                if (self->topstack->context & LC_FAIL_NEXT)
-                    self->topstack->context ^= LC_FAIL_NEXT;
             }
             else
                 Tokenizer_emit_text(self, this);
@@ -1397,6 +1546,28 @@ Tokenizer_parse(Tokenizer* self, int context)
             else
                 Tokenizer_emit_text(self, this);
         }
+        else if (this == *"<" && next == *"/" &&
+                 Tokenizer_READ(self, 2) != *"") {
+            if (this_context & LC_TAG_BODY) {
+                if (Tokenizer_handle_tag_open_close(self))
+                    return NULL;
+            }
+            else {
+                if (Tokenizer_handle_invalid_tag_start(self))
+                    return NULL;
+            }
+        }
+        else if (this == *"<") {
+            if (!(this_context & LC_TAG_CLOSE) &&
+                    Tokenizer_CAN_RECURSE(self)) {
+                if (Tokenizer_parse_tag(self))
+                    return NULL;
+            }
+            else
+                Tokenizer_emit_text(self, this);
+        }
+        else if (this == *">" && this_context & LC_TAG_CLOSE)
+            return Tokenizer_handle_tag_close_close(self);
         else
             Tokenizer_emit_text(self, this);
         self->head++;
@@ -1431,7 +1602,7 @@ Tokenizer_tokenize(Tokenizer* self, PyObject* args)
         self->text = PySequence_Fast(text, "expected a sequence");
     }
     self->length = PyList_GET_SIZE(self->text);
-    return Tokenizer_parse(self, 0);
+    return Tokenizer_parse(self, 0, 1);
 }
 
 static void
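
Every call site updated in this diff passes 1 for the new push argument, so existing behavior is preserved; per the updated doc comment, a push of 0 skips Tokenizer_push and ignores the context argument entirely, presumably for the tag-handling routines stubbed out above. A toy model of that contract (hypothetical names, not code from this commit):

#include <stdio.h>

/* Toy model of the push flag: parse() optionally pushes a fresh context
   before scanning, mirroring Tokenizer_parse(self, context, push). */
static int stack[16];
static int depth = 0;

static void parse(int context, int push)
{
    if (push)
        stack[depth++] = context;  /* push == 1: new context, old behavior */
    /* ... scan text using stack[depth - 1] as the active context ... */
    printf("parsing with context %d at depth %d\n", stack[depth - 1], depth);
    if (push)
        depth--;                   /* pop what we pushed */
}

int main(void)
{
    stack[depth++] = 0;  /* root context */
    parse(42, 1);        /* recursive parse: pushes context 42 */
    parse(99, 0);        /* push == 0: context argument is ignored */
    return 0;
}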


mwparserfromhell/parser/tokenizer.h (+16, -2)

@@ -192,7 +192,7 @@ typedef struct Stack Stack;
 typedef struct {
     PyObject_HEAD
     PyObject* text;         /* text to tokenize */
-    struct Stack* topstack; /* topmost stack */
+    Stack* topstack;        /* topmost stack */
     Py_ssize_t head;        /* current position in text */
     Py_ssize_t length;      /* length of text */
     int global;             /* global context */
@@ -251,8 +251,22 @@ static HeadingData* Tokenizer_handle_heading_end(Tokenizer*);
 static int Tokenizer_really_parse_entity(Tokenizer*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
+static int Tokenizer_parse_tag(Tokenizer*);
+static PyObject* Tokenizer_really_parse_tag(Tokenizer*);
+static int Tokenizer_push_tag_buffer(Tokenizer*, TagOpenData*);
+static int Tokenizer_handle_tag_data(Tokenizer*, TagOpenData*, Py_UNICODE);
+static int Tokenizer_handle_tag_space(Tokenizer*, TagOpenData*, Py_UNICODE);
+static int Tokenizer_handle_tag_text(Tokenizer*, Py_UNICODE);
+static PyObject* Tokenizer_handle_blacklisted_tag(Tokenizer*);
+static int Tokenizer_handle_tag_close_open(Tokenizer*, TagOpenData*, PyObject*);
+static int Tokenizer_handle_tag_open_close(Tokenizer*);
+static PyObject* Tokenizer_handle_tag_close_close(Tokenizer*);
+static int Tokenizer_handle_invalid_tag_start(Tokenizer*);
+static PyObject* Tokenizer_handle_single_only_tag_end(Tokenizer*);
+static PyObject* Tokenizer_handle_single_tag_end(Tokenizer*);
+static PyObject* Tokenizer_handle_end(Tokenizer*, int);
 static int Tokenizer_verify_safe(Tokenizer*, int, Py_UNICODE);
-static PyObject* Tokenizer_parse(Tokenizer*, int);
+static PyObject* Tokenizer_parse(Tokenizer*, int, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);



