From df3a9054cbf94d33945d3ab46380d2c1e0e5339c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <j.l.k@gmx.com>
Date: Sat, 4 Jan 2020 13:41:14 +0100
Subject: [PATCH] Port the fix for #197 to the C tokenizer

---
 mwparserfromhell/parser/ctokenizer/tok_parse.c | 68 +++++++++++++++++++++++++-
 1 file changed, 67 insertions(+), 1 deletion(-)

diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c
index e73b3ef..6529ff5 100644
--- a/mwparserfromhell/parser/ctokenizer/tok_parse.c
+++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c
@@ -100,6 +100,66 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr)
 }
 
 /*
+    Check if the given character is a non-word character.
+
+    Equivalent to this Python code:
+
+    def is_non_word_character(ch):
+        if re.fullmatch(r"\W", chunk):
+            return True
+        return False
+*/
+static int is_non_word_character(Py_UCS4 ch)
+{
+    int ret = 0;
+    PyObject* modname = NULL;
+    PyObject* module = NULL;
+    PyObject* fmatch = NULL;
+    PyObject* pattern = NULL;
+    PyObject* str = NULL;
+    PyObject* posArgs = NULL;
+    PyObject* match = NULL;
+
+    modname = PyUnicode_FromString("re");
+    if (modname == NULL)
+        goto error;
+    module = PyImport_Import(modname);
+    if (module == NULL)
+        goto error;
+    fmatch = PyObject_GetAttrString(module, "fullmatch");
+    if (fmatch == NULL)
+        goto error;
+    pattern = PyUnicode_FromString("\\W");
+    if (pattern == NULL)
+        goto error;
+    str = PyUnicode_FROM_SINGLE(ch);
+    if (str == NULL)
+        goto error;
+    posArgs = PyTuple_Pack(2, pattern, str);
+    if (posArgs == NULL)
+        goto error;
+    match = PyObject_Call(fmatch, posArgs, NULL);
+    if (match == NULL)
+        goto error;
+
+    if (match != Py_None)
+        ret = 1;
+    goto end;
+
+    error:
+    ret = -1;
+    end:
+    Py_XDECREF(match);
+    Py_XDECREF(posArgs);
+    Py_XDECREF(str);
+    Py_XDECREF(pattern);
+    Py_XDECREF(fmatch);
+    Py_XDECREF(module);
+    Py_XDECREF(modname);
+    return ret;
+}
+
+/*
     Parse a template at the head of the wikicode string.
 */
 static int Tokenizer_parse_template(Tokenizer* self, int has_content)
@@ -527,7 +587,13 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
     // it was just parsed as text:
     for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) {
         chunk = Textbuffer_read(self->topstack->textbuffer, i);
-        if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
+        // stop at the first non-word character
+        int is_non_word = is_non_word_character(chunk);
+        if (is_non_word < 0) {
+            Textbuffer_dealloc(scheme_buffer);
+            return -1;
+        }
+        else if (is_non_word == 1)
             goto end_of_loop;
         j = 0;
         do {