- One existing test case was found to be wrong -- the current MediaWiki version always terminates the URL when an opening bracket is encountered. - Other test cases added: a double quote, two consecutive single quotes, and angle brackets always terminate the URL (regardless of whether it is a free link or an external link inside brackets). A lone single quote does not terminate the URL. pull/232/head
@@ -27,15 +27,18 @@ __all__ = ["ExternalLink"] | |||
class ExternalLink(Node): | |||
"""Represents an external link, like ``[http://example.com/ Example]``.""" | |||
def __init__(self, url, title=None, brackets=True): | |||
def __init__(self, url, title=None, brackets=True, suppress_space=False): | |||
super().__init__() | |||
self.url = url | |||
self.title = title | |||
self.brackets = brackets | |||
self.suppress_space = suppress_space | |||
def __str__(self): | |||
if self.brackets: | |||
if self.title is not None: | |||
if self.suppress_space is True: | |||
return "[" + str(self.url) + str(self.title) + "]" | |||
return "[" + str(self.url) + " " + str(self.title) + "]" | |||
return "[" + str(self.url) + "]" | |||
return str(self.url) | |||
@@ -157,17 +157,20 @@ class Builder: | |||
@_add_handler(tokens.ExternalLinkOpen) | |||
def _handle_external_link(self, token): | |||
"""Handle when an external link is at the head of the tokens.""" | |||
brackets, url = token.brackets, None | |||
brackets, url, suppress_space = token.brackets, None, None | |||
self._push() | |||
while self._tokens: | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.ExternalLinkSeparator): | |||
url = self._pop() | |||
suppress_space = token.suppress_space | |||
self._push() | |||
elif isinstance(token, tokens.ExternalLinkClose): | |||
if url is not None: | |||
return ExternalLink(url, self._pop(), brackets) | |||
return ExternalLink(self._pop(), brackets=brackets) | |||
return ExternalLink(url, self._pop(), brackets=brackets, | |||
suppress_space=suppress_space is True) | |||
return ExternalLink(self._pop(), brackets=brackets, | |||
suppress_space=suppress_space is True) | |||
else: | |||
self._write(self._handle_token(token)) | |||
raise ParserError("_handle_external_link() missed a close token") | |||
@@ -673,14 +673,15 @@ static int Tokenizer_handle_free_link_text( | |||
Return whether the current head is the end of a free link. | |||
*/ | |||
static int | |||
Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) | |||
Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) | |||
{ | |||
// Built from Tokenizer_parse()'s end sentinels: | |||
Py_UCS4 after = Tokenizer_read(self, 2); | |||
uint64_t ctx = self->topstack->context; | |||
return (!this || this == '\n' || this == '[' || this == ']' || | |||
this == '<' || this == '>' || (this == '\'' && next == '\'') || | |||
this == '<' || this == '>' || this == '"' || | |||
(this == '\'' && next == '\'') || | |||
(this == '|' && ctx & LC_TEMPLATE) || | |||
(this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || | |||
(this == '}' && next == '}' && | |||
@@ -722,7 +723,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||
if (Tokenizer_parse_comment(self)) | |||
return NULL; | |||
} | |||
else if (!brackets && Tokenizer_is_free_link(self, this, next)) { | |||
else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) { | |||
self->head--; | |||
return Tokenizer_pop(self); | |||
} | |||
@@ -735,16 +736,28 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||
} | |||
else if (this == ']') | |||
return Tokenizer_pop(self); | |||
else if (this == ' ') { | |||
else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) { | |||
if (brackets) { | |||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||
return NULL; | |||
if (this == ' ') { | |||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||
return NULL; | |||
} | |||
else { | |||
PyObject* kwargs = PyDict_New(); | |||
if (!kwargs) | |||
return NULL; | |||
if (this != ' ') | |||
PyDict_SetItemString(kwargs, "suppress_space", Py_True); | |||
if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs)) | |||
return NULL; | |||
} | |||
self->topstack->context ^= LC_EXT_LINK_URI; | |||
self->topstack->context |= LC_EXT_LINK_TITLE; | |||
self->head++; | |||
if (this == ' ') | |||
self->head++; | |||
return Tokenizer_parse(self, 0, 0); | |||
} | |||
if (Textbuffer_write(extra, ' ')) | |||
if (Textbuffer_write(extra, this)) | |||
return NULL; | |||
return Tokenizer_pop(self); | |||
} | |||
@@ -439,7 +439,7 @@ class Tokenizer: | |||
# Built from _parse()'s end sentinels: | |||
after, ctx = self._read(2), self._context | |||
equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING | |||
return (this in (self.END, "\n", "[", "]", "<", ">") or | |||
return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or | |||
this == nxt == "'" or | |||
(this == "|" and ctx & contexts.TEMPLATE) or | |||
(this == "=" and ctx & equal_sign_contexts) or | |||
@@ -482,16 +482,29 @@ class Tokenizer: | |||
self._parse_template_or_argument() | |||
elif this == "]": | |||
return self._pop(), tail, 0 | |||
elif " " in this: | |||
before, after = this.split(" ", 1) | |||
elif this == "'" and nxt == "'": | |||
separator = tokens.ExternalLinkSeparator() | |||
separator.suppress_space = True | |||
self._emit(separator) | |||
self._context ^= contexts.EXT_LINK_URI | |||
self._context |= contexts.EXT_LINK_TITLE | |||
return self._parse(push=False), None, 0 | |||
elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">", | |||
"\"")): | |||
before, after = re.split("[ \n\[\]<>\"]", this, maxsplit=1) | |||
delimiter = this[len(before)] | |||
if brackets: | |||
self._emit_text(before) | |||
self._emit(tokens.ExternalLinkSeparator()) | |||
separator = tokens.ExternalLinkSeparator() | |||
if delimiter != " ": | |||
separator.suppress_space = True | |||
self._emit(separator) | |||
if after: | |||
self._emit_text(after) | |||
self._context ^= contexts.EXT_LINK_URI | |||
self._context |= contexts.EXT_LINK_TITLE | |||
self._head += 1 | |||
if delimiter == " ": | |||
self._head += 1 | |||
return self._parse(push=False), None, 0 | |||
punct, tail = self._handle_free_link_text(punct, tail, before) | |||
return self._pop(), tail + " " + after, 0 | |||
@@ -153,9 +153,9 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), Ext | |||
--- | |||
name: brackets_open_bracket_inside | |||
label: an open bracket inside a bracket-enclosed link that is also included | |||
label: an open bracket inside a bracket-enclosed link that is not included | |||
input: "[http://foobar[baz.com Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(suppress_space=True), Text(text="[baz.com Example"), ExternalLinkClose()] | |||
--- | |||
@@ -492,3 +492,73 @@ name: url_preceded_by_underscore | |||
label: underscore immediately before a valid URL | |||
input: "svn_ssh://server.domain.com:/reponame" | |||
output: [Text(text="svn_ssh://server.domain.com:/reponame")] | |||
--- | |||
name: url_terminated_by_double_quote | |||
label: a free link terminated by a double quote | |||
input: "http://foo\"bar" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="\"bar")] | |||
--- | |||
name: url_not_terminated_by_single_quote | |||
label: a free link not terminated by a single quote | |||
input: "http://foo'bar" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo'bar"), ExternalLinkClose()] | |||
--- | |||
name: url_terminated_by_two_single_quotes | |||
label: a free link terminated by two single quotes | |||
input: "http://foo''bar''" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()] | |||
--- | |||
name: url_terminated_by_left_angle | |||
label: a free link terminated by a left angle | |||
input: "http://foo<bar" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="<bar")] | |||
--- | |||
name: url_terminated_by_right_angle | |||
label: a free link terminated by a right angle | |||
input: "http://foo>bar" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text=">bar")] | |||
--- | |||
name: brackets_terminated_by_double_quote | |||
label: an external link terminated by a double quote | |||
input: "[http://foo\"bar]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="\"bar"), ExternalLinkClose()] | |||
--- | |||
name: brackets_not_terminated_by_single_quote | |||
label: an external link not terminated by a single quote | |||
input: "[http://foo'bar]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo'bar"), ExternalLinkClose()] | |||
--- | |||
name: brackets_terminated_by_two_single_quotes | |||
label: an external link terminated by two single quotes | |||
input: "[http://foo''bar'']" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()] | |||
--- | |||
name: brackets_terminated_by_left_angle | |||
label: an external link terminated by a left angle | |||
input: "[http://foo<bar]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="<bar"), ExternalLinkClose()] | |||
--- | |||
name: brackets_terminated_by_right_angle | |||
label: an external link terminated by a right angle | |||
input: "[http://foo>bar]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text=">bar"), ExternalLinkClose()] |