diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index 0423e2a..6dafe71 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -27,15 +27,18 @@ __all__ = ["ExternalLink"] class ExternalLink(Node): """Represents an external link, like ``[http://example.com/ Example]``.""" - def __init__(self, url, title=None, brackets=True): + def __init__(self, url, title=None, brackets=True, suppress_space=False): super().__init__() self.url = url self.title = title self.brackets = brackets + self.suppress_space = suppress_space def __str__(self): if self.brackets: if self.title is not None: + if self.suppress_space is True: + return "[" + str(self.url) + str(self.title) + "]" return "[" + str(self.url) + " " + str(self.title) + "]" return "[" + str(self.url) + "]" return str(self.url) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 2f58455..b1556fc 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -157,17 +157,20 @@ class Builder: @_add_handler(tokens.ExternalLinkOpen) def _handle_external_link(self, token): """Handle when an external link is at the head of the tokens.""" - brackets, url = token.brackets, None + brackets, url, suppress_space = token.brackets, None, None self._push() while self._tokens: token = self._tokens.pop() if isinstance(token, tokens.ExternalLinkSeparator): url = self._pop() + suppress_space = token.suppress_space self._push() elif isinstance(token, tokens.ExternalLinkClose): if url is not None: - return ExternalLink(url, self._pop(), brackets) - return ExternalLink(self._pop(), brackets=brackets) + return ExternalLink(url, self._pop(), brackets=brackets, + suppress_space=suppress_space is True) + return ExternalLink(self._pop(), brackets=brackets, + suppress_space=suppress_space is True) else: self._write(self._handle_token(token)) raise ParserError("_handle_external_link() missed 
a close token") diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index 6529ff5..879a5db 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -673,14 +673,15 @@ static int Tokenizer_handle_free_link_text( Return whether the current head is the end of a free link. */ static int -Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) +Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) { // Built from Tokenizer_parse()'s end sentinels: Py_UCS4 after = Tokenizer_read(self, 2); uint64_t ctx = self->topstack->context; return (!this || this == '\n' || this == '[' || this == ']' || - this == '<' || this == '>' || (this == '\'' && next == '\'') || + this == '<' || this == '>' || this == '"' || + (this == '\'' && next == '\'') || (this == '|' && ctx & LC_TEMPLATE) || (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || (this == '}' && next == '}' && @@ -722,7 +723,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, if (Tokenizer_parse_comment(self)) return NULL; } - else if (!brackets && Tokenizer_is_free_link(self, this, next)) { + else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) { self->head--; return Tokenizer_pop(self); } @@ -735,16 +736,28 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, } else if (this == ']') return Tokenizer_pop(self); - else if (this == ' ') { + else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) { if (brackets) { - if (Tokenizer_emit(self, ExternalLinkSeparator)) - return NULL; + if (this == ' ') { + if (Tokenizer_emit(self, ExternalLinkSeparator)) + return NULL; + } + else { + PyObject* kwargs = PyDict_New(); + if (!kwargs) + return NULL; + if (this != ' ') + PyDict_SetItemString(kwargs, "suppress_space", Py_True); + if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs)) + return NULL; + } 
self->topstack->context ^= LC_EXT_LINK_URI; self->topstack->context |= LC_EXT_LINK_TITLE; - self->head++; + if (this == ' ') + self->head++; return Tokenizer_parse(self, 0, 0); } - if (Textbuffer_write(extra, ' ')) + if (Textbuffer_write(extra, this)) return NULL; return Tokenizer_pop(self); } diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 8e4b79d..6acfb8d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -439,7 +439,7 @@ class Tokenizer: # Built from _parse()'s end sentinels: after, ctx = self._read(2), self._context equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING - return (this in (self.END, "\n", "[", "]", "<", ">") or + return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or this == nxt == "'" or (this == "|" and ctx & contexts.TEMPLATE) or (this == "=" and ctx & equal_sign_contexts) or @@ -482,16 +482,29 @@ class Tokenizer: self._parse_template_or_argument() elif this == "]": return self._pop(), tail, 0 - elif " " in this: - before, after = this.split(" ", 1) + elif this == "'" and nxt == "'": + separator = tokens.ExternalLinkSeparator() + separator.suppress_space = True + self._emit(separator) + self._context ^= contexts.EXT_LINK_URI + self._context |= contexts.EXT_LINK_TITLE + return self._parse(push=False), None, 0 + elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">", + "\"")): + before, after = re.split("[ \n\[\]<>\"]", this, maxsplit=1) + delimiter = this[len(before)] if brackets: self._emit_text(before) - self._emit(tokens.ExternalLinkSeparator()) + separator = tokens.ExternalLinkSeparator() + if delimiter != " ": + separator.suppress_space = True + self._emit(separator) if after: self._emit_text(after) self._context ^= contexts.EXT_LINK_URI self._context |= contexts.EXT_LINK_TITLE - self._head += 1 + if delimiter == " ": + self._head += 1 return self._parse(push=False), None, 0 punct, tail = 
self._handle_free_link_text(punct, tail, before) return self._pop(), tail + " " + after, 0 diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index 5e581f9..7fed7b6 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -153,9 +153,9 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), Ext --- name: brackets_open_bracket_inside -label: an open bracket inside a bracket-enclosed link that is also included +label: an open bracket inside a bracket-enclosed link that is not included input: "[http://foobar[baz.com Example]" -output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] +output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(suppress_space=True), Text(text="[baz.com Example"), ExternalLinkClose()] --- @@ -492,3 +492,73 @@ name: url_preceded_by_underscore label: underscore immediately before a valid URL input: "svn_ssh://server.domain.com:/reponame" output: [Text(text="svn_ssh://server.domain.com:/reponame")] + +--- + +name: url_terminated_by_double_quote +label: a free link terminated by a double quote +input: "http://foo\"bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="\"bar")] + +--- + +name: url_not_terminated_by_single_quote +label: a free link not terminated by a single quote +input: "http://foo'bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo'bar"), ExternalLinkClose()] + +--- + +name: url_terminated_by_two_single_quotes +label: a free link terminated by two single quotes +input: "http://foo''bar''" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()] + +--- + +name: 
url_terminated_by_left_angle +label: a free link terminated by a left angle +input: "http://foo<bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="<bar")] + +--- + +name: url_terminated_by_right_angle +label: a free link terminated by a right angle +input: "http://foo>bar" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text=">bar")] + +--- + +name: brackets_terminated_by_double_quote +label: an external link terminated by a double quote +input: "[http://foo\"bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="\"bar"), ExternalLinkClose()] + +--- + +name: brackets_not_terminated_by_single_quote +label: an external link not terminated by a single quote +input: "[http://foo'bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo'bar"), ExternalLinkClose()] + +--- + +name: brackets_terminated_by_two_single_quotes +label: an external link terminated by two single quotes +input: "[http://foo''bar'']" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()] + +--- + +name: brackets_terminated_by_left_angle +label: an external link terminated by a left angle +input: "[http://foo<bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="<bar"), ExternalLinkClose()] + +--- + +name: brackets_terminated_by_right_angle +label: an external link terminated by a right angle +input: "[http://foo>bar]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text=">bar"), ExternalLinkClose()]