Port the fix for #197 to the C tokenizer

5 years ago · df3a9054cb
--- a/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -100,6 +100,66 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
 }

 /*
    Check if the given character is a non-word character.

    Equivalent to this Python code:

    def is_non_word_character(ch):
        if re.fullmatch(r"\W", chunk):
            return True
        return False
 */
 static int is_non_word_character(Py_UCS4 ch)
 {
    int ret = 0;
    PyObject* modname = NULL;
    PyObject* module = NULL;
    PyObject* fmatch = NULL;
    PyObject* pattern = NULL;
    PyObject* str = NULL;
    PyObject* posArgs = NULL;
    PyObject* match = NULL;

    modname = PyUnicode_FromString("re");
    if (modname == NULL)
        goto error;
    module = PyImport_Import(modname);
    if (module == NULL)
        goto error;
    fmatch = PyObject_GetAttrString(module, "fullmatch");
    if (fmatch == NULL)
        goto error;
    pattern = PyUnicode_FromString("\\W");
    if (pattern == NULL)
        goto error;
    str = PyUnicode_FROM_SINGLE(ch);
    if (str == NULL)
        goto error;
    posArgs = PyTuple_Pack(2, pattern, str);
    if (posArgs == NULL)
        goto error;
    match = PyObject_Call(fmatch, posArgs, NULL);
    if (match == NULL)
        goto error;

    if (match != Py_None)
        ret = 1;
    goto end;

    error:
    ret = -1;
    end:
    Py_XDECREF(match);
    Py_XDECREF(posArgs);
    Py_XDECREF(str);
    Py_XDECREF(pattern);
    Py_XDECREF(fmatch);
    Py_XDECREF(module);
    Py_XDECREF(modname);
    return ret;
 }

 /*
    Parse a template at the head of the wikicode string.
 */
 static int Tokenizer_parse_template(Tokenizer* self, int has_content)
@@ -527,7 +587,13 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
    // it was just parsed as text:
    for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
        chunk = Textbuffer_read(self->topstack->textbuffer, i);
        if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
        // stop at the first non-word character
        int is_non_word = is_non_word_character(chunk);
        if (is_non_word < 0) {
            Textbuffer_dealloc(scheme_buffer);
            return -1;
        }
        else if (is_non_word == 1)
            goto end_of_loop;
        j = 0;
        do {