From bb51e8f282c304a0a2b479d3bd1f325cb760ba66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 21 Dec 2020 15:50:27 +0100 Subject: [PATCH] Some fixes for the parsing of external links (#232) * Proposed fix for https://github.com/earwig/mwparserfromhell/issues/197 * Port the fix for #197 to the C tokenizer * Fix parsing of external links where the URL is terminated by some special character - One existing test case has been found wrong -- current MediaWiki version always terminates the URL when an opening bracket is encountered. - Other test cases added: double quote, two single quotes and angles always terminate the URL (regardless if it is a free link or external link inside brackets). One single quote does not terminate the URL. * Fix case-insensitive parsing of URI schemes --- mwparserfromhell/nodes/external_link.py | 5 +- mwparserfromhell/parser/builder.py | 9 ++- mwparserfromhell/parser/ctokenizer/tok_parse.c | 99 +++++++++++++++++++++--- mwparserfromhell/parser/tokenizer.py | 30 ++++++-- tests/tokenizer/external_links.mwtest | 102 ++++++++++++++++++++++++- 5 files changed, 221 insertions(+), 24 deletions(-) diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index 0423e2a..6dafe71 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -27,15 +27,18 @@ __all__ = ["ExternalLink"] class ExternalLink(Node): """Represents an external link, like ``[http://example.com/ Example]``.""" - def __init__(self, url, title=None, brackets=True): + def __init__(self, url, title=None, brackets=True, suppress_space=False): super().__init__() self.url = url self.title = title self.brackets = brackets + self.suppress_space = suppress_space def __str__(self): if self.brackets: if self.title is not None: + if self.suppress_space is True: + return "[" + str(self.url) + str(self.title) + "]" return "[" + str(self.url) + " " + str(self.title) + "]" return "[" + 
str(self.url) + "]" return str(self.url) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 2f58455..b1556fc 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -157,17 +157,20 @@ class Builder: @_add_handler(tokens.ExternalLinkOpen) def _handle_external_link(self, token): """Handle when an external link is at the head of the tokens.""" - brackets, url = token.brackets, None + brackets, url, suppress_space = token.brackets, None, None self._push() while self._tokens: token = self._tokens.pop() if isinstance(token, tokens.ExternalLinkSeparator): url = self._pop() + suppress_space = token.suppress_space self._push() elif isinstance(token, tokens.ExternalLinkClose): if url is not None: - return ExternalLink(url, self._pop(), brackets) - return ExternalLink(self._pop(), brackets=brackets) + return ExternalLink(url, self._pop(), brackets=brackets, + suppress_space=suppress_space is True) + return ExternalLink(self._pop(), brackets=brackets, + suppress_space=suppress_space is True) else: self._write(self._handle_token(token)) raise ParserError("_handle_external_link() missed a close token") diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index e73b3ef..d36ce56 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -30,7 +30,7 @@ SOFTWARE. #define DIGITS "0123456789" #define HEXDIGITS "0123456789abcdefABCDEF" #define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" -#define URISCHEME "abcdefghijklmnopqrstuvwxyz0123456789+.-" +#define URISCHEME "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" #define MAX_BRACES 255 #define MAX_ENTITY_SIZE 8 @@ -100,6 +100,66 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr) } /* + Check if the given character is a non-word character. 
+ + Equivalent to this Python code: + + def is_non_word_character(ch): + if re.fullmatch(r"\W", ch): + return True + return False +*/ +static int is_non_word_character(Py_UCS4 ch) +{ + int ret = 0; + PyObject* modname = NULL; + PyObject* module = NULL; + PyObject* fmatch = NULL; + PyObject* pattern = NULL; + PyObject* str = NULL; + PyObject* posArgs = NULL; + PyObject* match = NULL; + + modname = PyUnicode_FromString("re"); + if (modname == NULL) + goto error; + module = PyImport_Import(modname); + if (module == NULL) + goto error; + fmatch = PyObject_GetAttrString(module, "fullmatch"); + if (fmatch == NULL) + goto error; + pattern = PyUnicode_FromString("\\W"); + if (pattern == NULL) + goto error; + str = PyUnicode_FROM_SINGLE(ch); + if (str == NULL) + goto error; + posArgs = PyTuple_Pack(2, pattern, str); + if (posArgs == NULL) + goto error; + match = PyObject_Call(fmatch, posArgs, NULL); + if (match == NULL) + goto error; + + if (match != Py_None) + ret = 1; + goto end; + + error: + ret = -1; + end: + Py_XDECREF(match); + Py_XDECREF(posArgs); + Py_XDECREF(str); + Py_XDECREF(pattern); + Py_XDECREF(fmatch); + Py_XDECREF(module); + Py_XDECREF(modname); + return ret; +} + +/* Parse a template at the head of the wikicode string. */ static int Tokenizer_parse_template(Tokenizer* self, int has_content) @@ -527,7 +587,13 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) // it was just parsed as text: for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) { chunk = Textbuffer_read(self->topstack->textbuffer, i); - if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + // stop at the first non-word character + int is_non_word = is_non_word_character(chunk); + if (is_non_word < 0) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + else if (is_non_word == 1) goto end_of_loop; j = 0; do { @@ -607,14 +673,15 @@ static int Tokenizer_handle_free_link_text( Return whether the current head is the end of a free link. 
*/ static int -Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) +Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) { // Built from Tokenizer_parse()'s end sentinels: Py_UCS4 after = Tokenizer_read(self, 2); uint64_t ctx = self->topstack->context; return (!this || this == '\n' || this == '[' || this == ']' || - this == '<' || this == '>' || (this == '\'' && next == '\'') || + this == '<' || this == '>' || this == '"' || + (this == '\'' && next == '\'') || (this == '|' && ctx & LC_TEMPLATE) || (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || (this == '}' && next == '}' && @@ -656,7 +723,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, if (Tokenizer_parse_comment(self)) return NULL; } - else if (!brackets && Tokenizer_is_free_link(self, this, next)) { + else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) { self->head--; return Tokenizer_pop(self); } @@ -669,16 +736,28 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, } else if (this == ']') return Tokenizer_pop(self); - else if (this == ' ') { + else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) { if (brackets) { - if (Tokenizer_emit(self, ExternalLinkSeparator)) - return NULL; + if (this == ' ') { + if (Tokenizer_emit(self, ExternalLinkSeparator)) + return NULL; + } + else { + PyObject* kwargs = PyDict_New(); + if (!kwargs) + return NULL; + if (this != ' ') + PyDict_SetItemString(kwargs, "suppress_space", Py_True); + if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs)) + return NULL; + } self->topstack->context ^= LC_EXT_LINK_URI; self->topstack->context |= LC_EXT_LINK_TITLE; - self->head++; + if (this == ' ') + self->head++; return Tokenizer_parse(self, 0, 0); } - if (Textbuffer_write(extra, ' ')) + if (Textbuffer_write(extra, this)) return NULL; return Tokenizer_pop(self); } diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 
ab61f92..c48e180 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -366,7 +366,7 @@ class Tokenizer: self._emit_text("//") self._head += 2 else: - valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" all_valid = lambda: all(char in valid for char in self._read()) scheme = "" while self._read() is not self.END and all_valid(): @@ -386,14 +386,15 @@ class Tokenizer: def _parse_free_uri_scheme(self): """Parse the URI scheme of a free (no brackets) external link.""" - valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" scheme = [] try: # We have to backtrack through the textbuffer looking for our # scheme since it was just parsed as text: for chunk in reversed(self._textbuffer): for char in reversed(chunk): - if char.isspace() or char in self.MARKERS: + # stop at the first non-word character + if re.fullmatch(r"\W", char): raise StopIteration() if char not in valid: raise BadRoute() @@ -438,7 +439,7 @@ class Tokenizer: # Built from _parse()'s end sentinels: after, ctx = self._read(2), self._context equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING - return (this in (self.END, "\n", "[", "]", "<", ">") or + return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or this == nxt == "'" or (this == "|" and ctx & contexts.TEMPLATE) or (this == "=" and ctx & equal_sign_contexts) or @@ -481,16 +482,29 @@ class Tokenizer: self._parse_template_or_argument() elif this == "]": return self._pop(), tail, 0 - elif " " in this: - before, after = this.split(" ", 1) + elif this == "'" and nxt == "'": + separator = tokens.ExternalLinkSeparator() + separator.suppress_space = True + self._emit(separator) + self._context ^= contexts.EXT_LINK_URI + self._context |= contexts.EXT_LINK_TITLE + return self._parse(push=False), None, 0 + elif any(ch in this for ch in (" ", "\n", 
"[", "]", "<", ">", + "\"")): + before, after = re.split("[ \n\[\]<>\"]", this, maxsplit=1) + delimiter = this[len(before)] if brackets: self._emit_text(before) - self._emit(tokens.ExternalLinkSeparator()) + separator = tokens.ExternalLinkSeparator() + if delimiter != " ": + separator.suppress_space = True + self._emit(separator) if after: self._emit_text(after) self._context ^= contexts.EXT_LINK_URI self._context |= contexts.EXT_LINK_TITLE - self._head += 1 + if delimiter == " ": + self._head += 1 return self._parse(push=False), None, 0 punct, tail = self._handle_free_link_text(punct, tail, before) return self._pop(), tail + " " + after, 0 diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index d2efdfc..ca64fd0 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -153,9 +153,9 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), Ext --- name: brackets_open_bracket_inside -label: an open bracket inside a bracket-enclosed link that is also included +label: an open bracket inside a bracket-enclosed link that is not included input: "[http://foobar[baz.com Example]" -output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] +output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(suppress_space=True), Text(text="[baz.com Example"), ExternalLinkClose()] --- @@ -478,3 +478,101 @@ name: brackets_scheme_title_but_no_url label: brackets around a scheme, colon, and slashes, with a title input: "[http:// Example]" output: [Text(text="[http:// Example]")] + +--- + +name: url_preceded_by_non_word_character +label: non-word character immediately before a valid URL +input: "svn+ssh://server.domain.com:/reponame" +output: [Text(text="svn+"), ExternalLinkOpen(brackets=False), Text(text="ssh://server.domain.com:/reponame"), ExternalLinkClose()] + 
+--- + +name: url_preceded_by_underscore +label: underscore immediately before a valid URL +input: "svn_ssh://server.domain.com:/reponame" +output: [Text(text="svn_ssh://server.domain.com:/reponame")] + +--- + +name: url_terminated_by_double_quote +label: a free link terminated by a double quote +input: "http://foo\"bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="\"bar")] + +--- + +name: url_not_terminated_by_single_quote +label: a free link not terminated by a single quote +input: "http://foo'bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo'bar"), ExternalLinkClose()] + +--- + +name: url_terminated_by_two_single_quotes +label: a free link terminated by two single quotes +input: "http://foo''bar''" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()] + +--- + +name: url_terminated_by_left_angle +label: a free link terminated by a left angle +input: "http://foo<bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="<bar")] + +--- + +name: url_terminated_by_right_angle +label: a free link terminated by a right angle +input: "http://foo>bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text=">bar")] + +--- + +name: brackets_terminated_by_double_quote +label: an external link terminated by a double quote +input: "[http://foo\"bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="\"bar"), ExternalLinkClose()] + +--- + +name: brackets_not_terminated_by_single_quote +label: an external link not terminated by a single quote +input: "[http://foo'bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo'bar"), ExternalLinkClose()] + +--- + +name: brackets_terminated_by_two_single_quotes +label: an external link terminated by two single quotes +input: "[http://foo''bar'']" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), 
ExternalLinkSeparator(suppress_space=True), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()] + +--- + +name: brackets_terminated_by_left_angle +label: an external link terminated by a left angle +input: "[http://foo<bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="<bar"), ExternalLinkClose()] + +--- + +name: brackets_terminated_by_right_angle +label: an external link terminated by a right angle +input: "[http://foo>bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text=">bar"), ExternalLinkClose()] + +--- + +name: scheme_case +label: a free link with uppercase letters in the URL scheme +input: "HtTp://example.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="HtTp://example.com/"), ExternalLinkClose()] + +--- + +name: bracket_scheme_case +label: an external link with uppercase letters in the URL scheme +input: "[HtTp://example.com/]" +output: [ExternalLinkOpen(brackets=True), Text(text="HtTp://example.com/"), ExternalLinkClose()]