Refactor out C's is_marker(); hooks for ext links.

10 years ago · 7b84b3f0df
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -24,6 +24,20 @@ SOFTWARE.
 #include "tokenizer.h"

 /*
    Report whether the given Py_UNICODE is one of the marker characters.

    The character is compared against the first character of each of the
    NUM_MARKERS entries in MARKERS. Returns 1 as soon as a match is found,
    or 0 if no entry matches.
 */
 static int is_marker(Py_UNICODE this)
 {
    int idx = 0;

    while (idx < NUM_MARKERS) {
        if (this == *MARKERS[idx])
            return 1;
        idx++;
    }
    return 0;
 }

 /*
    Given a context, return the heading level encoded within it.
 */
 static int heading_level_from_context(int n)
@@ -37,13 +51,13 @@ static int heading_level_from_context(int n)
 }

 /*
    Call the given function in definitions.py, using 'tag' as a parameter, and
    return its output as a bool.
    Call the given function in definitions.py, using 'input' as a parameter,
    and return its output as a bool.
 */
 static int call_def_func(const char* funcname, PyObject* tag)
 static int call_def_func(const char* funcname, PyObject* input)
 {
    PyObject* func = PyObject_GetAttrString(definitions, funcname);
    PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL);
    PyObject* result = PyObject_CallFunctionObjArgs(func, input, NULL);
    int ans = (result == Py_True) ? 1 : 0;

    Py_DECREF(func);
@@ -1238,15 +1252,8 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
 static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text)
 {
    Py_UNICODE next = Tokenizer_READ(self, 1);
    int i, is_marker = 0;

    for (i = 0; i < NUM_MARKERS; i++) {
        if (*MARKERS[i] == text) {
            is_marker = 1;
            break;
        }
    }
    if (!is_marker || !Tokenizer_CAN_RECURSE(self))
    if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self))
        return Tokenizer_emit_char(self, text);
    else if (text == next && next == *"{")
        return Tokenizer_parse_template_or_argument(self);
@@ -1264,17 +1271,11 @@ static int
 Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk)
 {
    PyObject *trash;
    int first_time, i, is_marker = 0, escaped;
    int first_time, escaped;

    if (data->context & TAG_NAME) {
        first_time = !(data->context & TAG_NOTE_SPACE);
        for (i = 0; i < NUM_MARKERS; i++) {
            if (*MARKERS[i] == chunk) {
                is_marker = 1;
                break;
            }
        }
        if (is_marker || (Py_UNICODE_ISSPACE(chunk) && first_time)) {
        if (is_marker(chunk) || (Py_UNICODE_ISSPACE(chunk) && first_time)) {
            // Tags must start with text, not spaces
            Tokenizer_fail_route(self);
            return 0;
@@ -1623,7 +1624,6 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
    Textbuffer* buf;
    PyObject *name, *tag;
    Py_UNICODE this;
    int is_marker, i;

    self->head += 2;
    buf = Textbuffer_new();
@@ -1631,14 +1631,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
        return -1;
    while (1) {
        this = Tokenizer_READ(self, pos);
        is_marker = 0;
        for (i = 0; i < NUM_MARKERS; i++) {
            if (*MARKERS[i] == this) {
                is_marker = 1;
                break;
            }
        }
        if (is_marker) {
        if (is_marker(this)) {
            name = Textbuffer_render(buf);
            if (!name) {
                Textbuffer_dealloc(buf);
@@ -2047,9 +2040,8 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
 */
 static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
 {
    if (context & LC_FAIL_NEXT) {
    if (context & LC_FAIL_NEXT)
        return -1;
    }
    if (context & LC_WIKILINK) {
        if (context & LC_WIKILINK_TEXT)
            return (data == *"[" && Tokenizer_READ(self, 1) == *"[") ? -1 : 0;
@@ -2059,6 +2051,8 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
            return -1;
        return 0;
    }
    if (context & LC_EXT_LINK_TITLE)
        return (data == *"\n") ? -1 : 0;
    if (context & LC_TAG_CLOSE)
        return (data == *"<") ? -1 : 0;
    if (context & LC_TEMPLATE_NAME) {
@@ -2125,7 +2119,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
 */
 static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
 {
    int this_context, is_marker, i;
    int this_context;
    Py_UNICODE this, next, next_next, last;
    PyObject* temp;

@@ -2145,14 +2139,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
                return Tokenizer_fail_route(self);
            }
        }
        is_marker = 0;
        for (i = 0; i < NUM_MARKERS; i++) {
            if (*MARKERS[i] == this) {
                is_marker = 1;
                break;
            }
        }
        if (!is_marker) {
        if (!is_marker(this)) {
            if (Tokenizer_emit_char(self, this))
                return NULL;
            self->head++;
@@ -2205,6 +2192,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
        }
        else if (this == next && next == *"]" && this_context & LC_WIKILINK)
            return Tokenizer_handle_wikilink_end(self);
        // else if (this == *"[") {
        //     if (Tokenizer_parse_external_link(self, 1))
        //         return NULL;
        // }
        // else if (this == *":" && !is_marker(last)) {
        //     if (Tokenizer_parse_external_link(self, 0))
        //         return NULL;
        // }
        // else if (this == *"]" && this_context & LC_EXT_LINK_TITLE)
        //     return Tokenizer_pop(self);
        else if (this == *"=" && !(self->global & GL_HEADING)) {
            if (last == *"\n" || last == *"") {
                if (Tokenizer_parse_heading(self))
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -238,12 +238,13 @@ typedef struct {
 #define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1)


 /* Macros for accessing HTML tag definitions: */
 /* Macros for accessing definitions: */

 /* Map a markup character to an HTML tag name: ':' yields "dd", ';' yields
    "dt", and any other character yields "li". */
 #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li")
 /* Each macro below passes its argument to call_def_func() along with the
    name of the definitions function that should be invoked on it. */
 #define IS_PARSABLE(tag) (call_def_func("is_parsable", tag))
 #define IS_SINGLE(tag) (call_def_func("is_single", tag))
 #define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag))
 #define IS_SCHEME(scheme) (call_def_func("is_scheme", scheme))


 /* Function prototypes: */