From bb51e8f282c304a0a2b479d3bd1f325cb760ba66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 21 Dec 2020 15:50:27 +0100 Subject: [PATCH] Some fixes for the parsing of external links (#232) * Proposed fix for https://github.com/earwig/mwparserfromhell/issues/197 * Port the fix for #197 to the C tokenizer * Fix parsing of external links where the URL is terminated by some special character - One existing test case has been found wrong -- current MediaWiki version always terminates the URL when an opening bracket is encountered. - Other test cases added: double quote, two single quotes and angles always terminate the URL (regardless if it is a free link or external link inside brackets). One single quote does not terminate the URL. * Fix case-insensitive parsing of URI schemes --- mwparserfromhell/nodes/external_link.py | 5 +- mwparserfromhell/parser/builder.py | 9 ++- mwparserfromhell/parser/ctokenizer/tok_parse.c | 99 +++++++++++++++++++++--- mwparserfromhell/parser/tokenizer.py | 30 ++++++-- tests/tokenizer/external_links.mwtest | 102 ++++++++++++++++++++++++- 5 files changed, 221 insertions(+), 24 deletions(-) diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index 0423e2a..6dafe71 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -27,15 +27,18 @@ __all__ = ["ExternalLink"] class ExternalLink(Node): """Represents an external link, like ``[http://example.com/ Example]``.""" - def __init__(self, url, title=None, brackets=True): + def __init__(self, url, title=None, brackets=True, suppress_space=False): super().__init__() self.url = url self.title = title self.brackets = brackets + self.suppress_space = suppress_space def __str__(self): if self.brackets: if self.title is not None: + if self.suppress_space is True: + return "[" + str(self.url) + str(self.title) + "]" return "[" + str(self.url) + " " + str(self.title) + "]" return "[" + 
str(self.url) + "]" return str(self.url) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 2f58455..b1556fc 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -157,17 +157,20 @@ class Builder: @_add_handler(tokens.ExternalLinkOpen) def _handle_external_link(self, token): """Handle when an external link is at the head of the tokens.""" - brackets, url = token.brackets, None + brackets, url, suppress_space = token.brackets, None, None self._push() while self._tokens: token = self._tokens.pop() if isinstance(token, tokens.ExternalLinkSeparator): url = self._pop() + suppress_space = token.suppress_space self._push() elif isinstance(token, tokens.ExternalLinkClose): if url is not None: - return ExternalLink(url, self._pop(), brackets) - return ExternalLink(self._pop(), brackets=brackets) + return ExternalLink(url, self._pop(), brackets=brackets, + suppress_space=suppress_space is True) + return ExternalLink(self._pop(), brackets=brackets, + suppress_space=suppress_space is True) else: self._write(self._handle_token(token)) raise ParserError("_handle_external_link() missed a close token") diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index e73b3ef..d36ce56 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -30,7 +30,7 @@ SOFTWARE. #define DIGITS "0123456789" #define HEXDIGITS "0123456789abcdefABCDEF" #define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" -#define URISCHEME "abcdefghijklmnopqrstuvwxyz0123456789+.-" +#define URISCHEME "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" #define MAX_BRACES 255 #define MAX_ENTITY_SIZE 8 @@ -100,6 +100,66 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr) } /* + Check if the given character is a non-word character. 
+ + Equivalent to this Python code: + + def is_non_word_character(ch): + if re.fullmatch(r"\W", ch): + return True + return False +*/ +static int is_non_word_character(Py_UCS4 ch) +{ + int ret = 0; + PyObject* modname = NULL; + PyObject* module = NULL; + PyObject* fmatch = NULL; + PyObject* pattern = NULL; + PyObject* str = NULL; + PyObject* posArgs = NULL; + PyObject* match = NULL; + + modname = PyUnicode_FromString("re"); + if (modname == NULL) + goto error; + module = PyImport_Import(modname); + if (module == NULL) + goto error; + fmatch = PyObject_GetAttrString(module, "fullmatch"); + if (fmatch == NULL) + goto error; + pattern = PyUnicode_FromString("\\W"); + if (pattern == NULL) + goto error; + str = PyUnicode_FROM_SINGLE(ch); + if (str == NULL) + goto error; + posArgs = PyTuple_Pack(2, pattern, str); + if (posArgs == NULL) + goto error; + match = PyObject_Call(fmatch, posArgs, NULL); + if (match == NULL) + goto error; + + if (match != Py_None) + ret = 1; + goto end; + + error: + ret = -1; + end: + Py_XDECREF(match); + Py_XDECREF(posArgs); + Py_XDECREF(str); + Py_XDECREF(pattern); + Py_XDECREF(fmatch); + Py_XDECREF(module); + Py_XDECREF(modname); + return ret; +} + +/* Parse a template at the head of the wikicode string. */ static int Tokenizer_parse_template(Tokenizer* self, int has_content) @@ -527,7 +587,13 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) // it was just parsed as text: for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) { chunk = Textbuffer_read(self->topstack->textbuffer, i); - if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + // stop at the first non-word character + int is_non_word = is_non_word_character(chunk); + if (is_non_word < 0) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + else if (is_non_word == 1) goto end_of_loop; j = 0; do { @@ -607,14 +673,15 @@ static int Tokenizer_handle_free_link_text( Return whether the current head is the end of a free link. 
*/ static int -Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) +Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) { // Built from Tokenizer_parse()'s end sentinels: Py_UCS4 after = Tokenizer_read(self, 2); uint64_t ctx = self->topstack->context; return (!this || this == '\n' || this == '[' || this == ']' || - this == '<' || this == '>' || (this == '\'' && next == '\'') || + this == '<' || this == '>' || this == '"' || + (this == '\'' && next == '\'') || (this == '|' && ctx & LC_TEMPLATE) || (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || (this == '}' && next == '}' && @@ -656,7 +723,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, if (Tokenizer_parse_comment(self)) return NULL; } - else if (!brackets && Tokenizer_is_free_link(self, this, next)) { + else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) { self->head--; return Tokenizer_pop(self); } @@ -669,16 +736,28 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, } else if (this == ']') return Tokenizer_pop(self); - else if (this == ' ') { + else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) { if (brackets) { - if (Tokenizer_emit(self, ExternalLinkSeparator)) - return NULL; + if (this == ' ') { + if (Tokenizer_emit(self, ExternalLinkSeparator)) + return NULL; + } + else { + PyObject* kwargs = PyDict_New(); + if (!kwargs) + return NULL; + if (this != ' ') + PyDict_SetItemString(kwargs, "suppress_space", Py_True); + if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs)) + return NULL; + } self->topstack->context ^= LC_EXT_LINK_URI; self->topstack->context |= LC_EXT_LINK_TITLE; - self->head++; + if (this == ' ') + self->head++; return Tokenizer_parse(self, 0, 0); } - if (Textbuffer_write(extra, ' ')) + if (Textbuffer_write(extra, this)) return NULL; return Tokenizer_pop(self); } diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 
ab61f92..c48e180 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -366,7 +366,7 @@ class Tokenizer: self._emit_text("//") self._head += 2 else: - valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" all_valid = lambda: all(char in valid for char in self._read()) scheme = "" while self._read() is not self.END and all_valid(): @@ -386,14 +386,15 @@ class Tokenizer: def _parse_free_uri_scheme(self): """Parse the URI scheme of a free (no brackets) external link.""" - valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" scheme = [] try: # We have to backtrack through the textbuffer looking for our # scheme since it was just parsed as text: for chunk in reversed(self._textbuffer): for char in reversed(chunk): - if char.isspace() or char in self.MARKERS: + # stop at the first non-word character + if re.fullmatch(r"\W", char): raise StopIteration() if char not in valid: raise BadRoute() @@ -438,7 +439,7 @@ class Tokenizer: # Built from _parse()'s end sentinels: after, ctx = self._read(2), self._context equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING - return (this in (self.END, "\n", "[", "]", "<", ">") or + return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or this == nxt == "'" or (this == "|" and ctx & contexts.TEMPLATE) or (this == "=" and ctx & equal_sign_contexts) or @@ -481,16 +482,29 @@ class Tokenizer: self._parse_template_or_argument() elif this == "]": return self._pop(), tail, 0 - elif " " in this: - before, after = this.split(" ", 1) + elif this == "'" and nxt == "'": + separator = tokens.ExternalLinkSeparator() + separator.suppress_space = True + self._emit(separator) + self._context ^= contexts.EXT_LINK_URI + self._context |= contexts.EXT_LINK_TITLE + return self._parse(push=False), None, 0 + elif any(ch in this for ch in (" ", "\n", 
"[", "]", "<", ">", + "\"")): + before, after = re.split("[ \n\[\]<>\"]", this, maxsplit=1) + delimiter = this[len(before)] if brackets: self._emit_text(before) - self._emit(tokens.ExternalLinkSeparator()) + separator = tokens.ExternalLinkSeparator() + if delimiter != " ": + separator.suppress_space = True + self._emit(separator) if after: self._emit_text(after) self._context ^= contexts.EXT_LINK_URI self._context |= contexts.EXT_LINK_TITLE - self._head += 1 + if delimiter == " ": + self._head += 1 return self._parse(push=False), None, 0 punct, tail = self._handle_free_link_text(punct, tail, before) return self._pop(), tail + " " + after, 0 diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index d2efdfc..ca64fd0 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -153,9 +153,9 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), Ext --- name: brackets_open_bracket_inside -label: an open bracket inside a bracket-enclosed link that is also included +label: an open bracket inside a bracket-enclosed link that is not included input: "[http://foobar[baz.com Example]" -output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] +output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(suppress_space=True), Text(text="[baz.com Example"), ExternalLinkClose()] --- @@ -478,3 +478,101 @@ name: brackets_scheme_title_but_no_url label: brackets around a scheme, colon, and slashes, with a title input: "[http:// Example]" output: [Text(text="[http:// Example]")] + +--- + +name: url_preceded_by_non_word_character +label: non-word character immediately before a valid URL +input: "svn+ssh://server.domain.com:/reponame" +output: [Text(text="svn+"), ExternalLinkOpen(brackets=False), Text(text="ssh://server.domain.com:/reponame"), ExternalLinkClose()] + 
+--- + +name: url_preceded_by_underscore +label: underscore immediately before a valid URL +input: "svn_ssh://server.domain.com:/reponame" +output: [Text(text="svn_ssh://server.domain.com:/reponame")] + +--- + +name: url_terminated_by_double_quote +label: a free link terminated by a double quote +input: "http://foo\"bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="\"bar")] + +--- + +name: url_not_terminated_by_single_quote +label: a free link not terminated by a single quote +input: "http://foo'bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo'bar"), ExternalLinkClose()] + +--- + +name: url_terminated_by_two_single_quotes +label: a free link terminated by two single quotes +input: "http://foo''bar''" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()] + +--- + +name: url_terminated_by_left_angle +label: a free link terminated by a left angle +input: "http://foo<bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="<bar")] + +--- + +name: url_terminated_by_right_angle +label: a free link terminated by a right angle +input: "http://foo>bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text=">bar")] + +--- + +name: brackets_terminated_by_double_quote +label: an external link terminated by a double quote +input: "[http://foo\"bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="\"bar"), ExternalLinkClose()] + +--- + +name: brackets_not_terminated_by_single_quote +label: an external link not terminated by a single quote +input: "[http://foo'bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo'bar"), ExternalLinkClose()] + +--- + +name: brackets_terminated_by_two_single_quotes +label: an external link terminated by two single quotes +input: "[http://foo''bar'']" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), 
ExternalLinkSeparator(suppress_space=True), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()] + +--- + +name: brackets_terminated_by_left_angle +label: an external link terminated by a left angle +input: "[http://foo<bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="<bar"), ExternalLinkClose()] + +--- + +name: brackets_terminated_by_right_angle +label: an external link terminated by a right angle +input: "[http://foo>bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text=">bar"), ExternalLinkClose()] + +--- + +name: scheme_case +label: a free link with uppercase letters in the URL scheme +input: "HtTp://example.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="HtTp://example.com/"), ExternalLinkClose()] + +--- + +name: bracket_scheme_case +label: an external link with uppercase letters in the URL scheme +input: "[HtTp://example.com/]" +output: [ExternalLinkOpen(brackets=True), Text(text="HtTp://example.com/"), ExternalLinkClose()]