From df3a9054cbf94d33945d3ab46380d2c1e0e5339c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?=
Date: Sat, 4 Jan 2020 13:41:14 +0100
Subject: [PATCH] Port the fix for #197 to the C tokenizer

---
 mwparserfromhell/parser/ctokenizer/tok_parse.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c
index e73b3ef..6529ff5 100644
--- a/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -100,6 +100,33 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
 }
 
 /*
+    Check if the given character is a non-word character.
+
+    Equivalent to this Python code:
+
+    def is_non_word_character(ch):
+        if re.fullmatch(r"\W", ch):
+            return True
+        return False
+
+    For str patterns, "\w" matches every character for which str.isalnum()
+    is true plus the underscore, and "\W" is its exact complement. The same
+    classification is available in C via Py_UNICODE_ISALNUM(), so the check
+    is done directly instead of importing "re" and calling into the regex
+    engine once per character.
+
+    Returns 1 if ch is a non-word character and 0 otherwise. Unlike a call
+    into Python, this check cannot fail, so -1 is never returned; callers
+    that test for a negative result remain correct.
+*/
+static int is_non_word_character(Py_UCS4 ch)
+{
+    if (ch == '_')
+        return 0;
+    return Py_UNICODE_ISALNUM(ch) ? 0 : 1;
+}
+
+/*
     Parse a template at the head of the wikicode string.
*/ static int Tokenizer_parse_template(Tokenizer* self, int has_content) @@ -527,7 +587,13 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) // it was just parsed as text: for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) { chunk = Textbuffer_read(self->topstack->textbuffer, i); - if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + // stop at the first non-word character + int is_non_word = is_non_word_character(chunk); + if (is_non_word < 0) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + else if (is_non_word == 1) goto end_of_loop; j = 0; do {