- One existing test case was found to be wrong: the current MediaWiki version always terminates the URL when an opening bracket is encountered. - Other test cases added: a double quote, two consecutive single quotes, and angle brackets always terminate the URL (regardless of whether it is a free link or an external link inside brackets). A lone single quote does not terminate the URL. pull/232/head
@@ -27,15 +27,18 @@ __all__ = ["ExternalLink"] | |||||
class ExternalLink(Node): | class ExternalLink(Node): | ||||
"""Represents an external link, like ``[http://example.com/ Example]``.""" | """Represents an external link, like ``[http://example.com/ Example]``.""" | ||||
def __init__(self, url, title=None, brackets=True): | |||||
def __init__(self, url, title=None, brackets=True, suppress_space=False): | |||||
super().__init__() | super().__init__() | ||||
self.url = url | self.url = url | ||||
self.title = title | self.title = title | ||||
self.brackets = brackets | self.brackets = brackets | ||||
self.suppress_space = suppress_space | |||||
def __str__(self): | def __str__(self): | ||||
if self.brackets: | if self.brackets: | ||||
if self.title is not None: | if self.title is not None: | ||||
if self.suppress_space is True: | |||||
return "[" + str(self.url) + str(self.title) + "]" | |||||
return "[" + str(self.url) + " " + str(self.title) + "]" | return "[" + str(self.url) + " " + str(self.title) + "]" | ||||
return "[" + str(self.url) + "]" | return "[" + str(self.url) + "]" | ||||
return str(self.url) | return str(self.url) | ||||
@@ -157,17 +157,20 @@ class Builder: | |||||
@_add_handler(tokens.ExternalLinkOpen) | @_add_handler(tokens.ExternalLinkOpen) | ||||
def _handle_external_link(self, token): | def _handle_external_link(self, token): | ||||
"""Handle when an external link is at the head of the tokens.""" | """Handle when an external link is at the head of the tokens.""" | ||||
brackets, url = token.brackets, None | |||||
brackets, url, suppress_space = token.brackets, None, None | |||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
token = self._tokens.pop() | token = self._tokens.pop() | ||||
if isinstance(token, tokens.ExternalLinkSeparator): | if isinstance(token, tokens.ExternalLinkSeparator): | ||||
url = self._pop() | url = self._pop() | ||||
suppress_space = token.suppress_space | |||||
self._push() | self._push() | ||||
elif isinstance(token, tokens.ExternalLinkClose): | elif isinstance(token, tokens.ExternalLinkClose): | ||||
if url is not None: | if url is not None: | ||||
return ExternalLink(url, self._pop(), brackets) | |||||
return ExternalLink(self._pop(), brackets=brackets) | |||||
return ExternalLink(url, self._pop(), brackets=brackets, | |||||
suppress_space=suppress_space is True) | |||||
return ExternalLink(self._pop(), brackets=brackets, | |||||
suppress_space=suppress_space is True) | |||||
else: | else: | ||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
raise ParserError("_handle_external_link() missed a close token") | raise ParserError("_handle_external_link() missed a close token") | ||||
@@ -673,14 +673,15 @@ static int Tokenizer_handle_free_link_text( | |||||
Return whether the current head is the end of a free link. | Return whether the current head is the end of a free link. | ||||
*/ | */ | ||||
static int | static int | ||||
Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) | |||||
Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) | |||||
{ | { | ||||
// Built from Tokenizer_parse()'s end sentinels: | // Built from Tokenizer_parse()'s end sentinels: | ||||
Py_UCS4 after = Tokenizer_read(self, 2); | Py_UCS4 after = Tokenizer_read(self, 2); | ||||
uint64_t ctx = self->topstack->context; | uint64_t ctx = self->topstack->context; | ||||
return (!this || this == '\n' || this == '[' || this == ']' || | return (!this || this == '\n' || this == '[' || this == ']' || | ||||
this == '<' || this == '>' || (this == '\'' && next == '\'') || | |||||
this == '<' || this == '>' || this == '"' || | |||||
(this == '\'' && next == '\'') || | |||||
(this == '|' && ctx & LC_TEMPLATE) || | (this == '|' && ctx & LC_TEMPLATE) || | ||||
(this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || | (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || | ||||
(this == '}' && next == '}' && | (this == '}' && next == '}' && | ||||
@@ -722,7 +723,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||||
if (Tokenizer_parse_comment(self)) | if (Tokenizer_parse_comment(self)) | ||||
return NULL; | return NULL; | ||||
} | } | ||||
else if (!brackets && Tokenizer_is_free_link(self, this, next)) { | |||||
else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) { | |||||
self->head--; | self->head--; | ||||
return Tokenizer_pop(self); | return Tokenizer_pop(self); | ||||
} | } | ||||
@@ -735,16 +736,28 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||||
} | } | ||||
else if (this == ']') | else if (this == ']') | ||||
return Tokenizer_pop(self); | return Tokenizer_pop(self); | ||||
else if (this == ' ') { | |||||
else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) { | |||||
if (brackets) { | if (brackets) { | ||||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||||
return NULL; | |||||
if (this == ' ') { | |||||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||||
return NULL; | |||||
} | |||||
else { | |||||
PyObject* kwargs = PyDict_New(); | |||||
if (!kwargs) | |||||
return NULL; | |||||
if (this != ' ') | |||||
PyDict_SetItemString(kwargs, "suppress_space", Py_True); | |||||
if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs)) | |||||
return NULL; | |||||
} | |||||
self->topstack->context ^= LC_EXT_LINK_URI; | self->topstack->context ^= LC_EXT_LINK_URI; | ||||
self->topstack->context |= LC_EXT_LINK_TITLE; | self->topstack->context |= LC_EXT_LINK_TITLE; | ||||
self->head++; | |||||
if (this == ' ') | |||||
self->head++; | |||||
return Tokenizer_parse(self, 0, 0); | return Tokenizer_parse(self, 0, 0); | ||||
} | } | ||||
if (Textbuffer_write(extra, ' ')) | |||||
if (Textbuffer_write(extra, this)) | |||||
return NULL; | return NULL; | ||||
return Tokenizer_pop(self); | return Tokenizer_pop(self); | ||||
} | } | ||||
@@ -439,7 +439,7 @@ class Tokenizer: | |||||
# Built from _parse()'s end sentinels: | # Built from _parse()'s end sentinels: | ||||
after, ctx = self._read(2), self._context | after, ctx = self._read(2), self._context | ||||
equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING | equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING | ||||
return (this in (self.END, "\n", "[", "]", "<", ">") or | |||||
return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or | |||||
this == nxt == "'" or | this == nxt == "'" or | ||||
(this == "|" and ctx & contexts.TEMPLATE) or | (this == "|" and ctx & contexts.TEMPLATE) or | ||||
(this == "=" and ctx & equal_sign_contexts) or | (this == "=" and ctx & equal_sign_contexts) or | ||||
@@ -482,16 +482,29 @@ class Tokenizer: | |||||
self._parse_template_or_argument() | self._parse_template_or_argument() | ||||
elif this == "]": | elif this == "]": | ||||
return self._pop(), tail, 0 | return self._pop(), tail, 0 | ||||
elif " " in this: | |||||
before, after = this.split(" ", 1) | |||||
elif this == "'" and nxt == "'": | |||||
separator = tokens.ExternalLinkSeparator() | |||||
separator.suppress_space = True | |||||
self._emit(separator) | |||||
self._context ^= contexts.EXT_LINK_URI | |||||
self._context |= contexts.EXT_LINK_TITLE | |||||
return self._parse(push=False), None, 0 | |||||
elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">", | |||||
"\"")): | |||||
before, after = re.split("[ \n\[\]<>\"]", this, maxsplit=1) | |||||
delimiter = this[len(before)] | |||||
if brackets: | if brackets: | ||||
self._emit_text(before) | self._emit_text(before) | ||||
self._emit(tokens.ExternalLinkSeparator()) | |||||
separator = tokens.ExternalLinkSeparator() | |||||
if delimiter != " ": | |||||
separator.suppress_space = True | |||||
self._emit(separator) | |||||
if after: | if after: | ||||
self._emit_text(after) | self._emit_text(after) | ||||
self._context ^= contexts.EXT_LINK_URI | self._context ^= contexts.EXT_LINK_URI | ||||
self._context |= contexts.EXT_LINK_TITLE | self._context |= contexts.EXT_LINK_TITLE | ||||
self._head += 1 | |||||
if delimiter == " ": | |||||
self._head += 1 | |||||
return self._parse(push=False), None, 0 | return self._parse(push=False), None, 0 | ||||
punct, tail = self._handle_free_link_text(punct, tail, before) | punct, tail = self._handle_free_link_text(punct, tail, before) | ||||
return self._pop(), tail + " " + after, 0 | return self._pop(), tail + " " + after, 0 | ||||
@@ -153,9 +153,9 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), Ext | |||||
--- | --- | ||||
name: brackets_open_bracket_inside | name: brackets_open_bracket_inside | ||||
label: an open bracket inside a bracket-enclosed link that is also included | |||||
label: an open bracket inside a bracket-enclosed link that is not included | |||||
input: "[http://foobar[baz.com Example]" | input: "[http://foobar[baz.com Example]" | ||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(suppress_space=True), Text(text="[baz.com Example"), ExternalLinkClose()] | |||||
--- | --- | ||||
@@ -492,3 +492,73 @@ name: url_preceded_by_underscore | |||||
label: underscore immediately before a valid URL | label: underscore immediately before a valid URL | ||||
input: "svn_ssh://server.domain.com:/reponame" | input: "svn_ssh://server.domain.com:/reponame" | ||||
output: [Text(text="svn_ssh://server.domain.com:/reponame")] | output: [Text(text="svn_ssh://server.domain.com:/reponame")] | ||||
--- | |||||
name: url_terminated_by_double_quote | |||||
label: a free link terminated by a double quote | |||||
input: "http://foo\"bar" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="\"bar")] | |||||
--- | |||||
name: url_not_terminated_by_single_quote | |||||
label: a free link not terminated by a single quote | |||||
input: "http://foo'bar" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo'bar"), ExternalLinkClose()] | |||||
--- | |||||
name: url_terminated_by_two_single_quotes | |||||
label: a free link terminated by two single quotes | |||||
input: "http://foo''bar''" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()] | |||||
--- | |||||
name: url_terminated_by_left_angle | |||||
label: a free link terminated by a left angle | |||||
input: "http://foo<bar" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="<bar")] | |||||
--- | |||||
name: url_terminated_by_right_angle | |||||
label: a free link terminated by a right angle | |||||
input: "http://foo>bar" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text=">bar")] | |||||
--- | |||||
name: brackets_terminated_by_double_quote | |||||
label: an external link terminated by a double quote | |||||
input: "[http://foo\"bar]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="\"bar"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_not_terminated_by_single_quote | |||||
label: an external link not terminated by a single quote | |||||
input: "[http://foo'bar]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo'bar"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_terminated_by_two_single_quotes | |||||
label: an external link terminated by two single quotes | |||||
input: "[http://foo''bar'']" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_terminated_by_left_angle | |||||
label: an external link terminated by a left angle | |||||
input: "[http://foo<bar]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="<bar"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_terminated_by_right_angle | |||||
label: an external link terminated by a right angle | |||||
input: "[http://foo>bar]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text=">bar"), ExternalLinkClose()] |