* Proposed fix for https://github.com/earwig/mwparserfromhell/issues/197 * Port the fix for #197 to the C tokenizer * Fix parsing of external links where the URL is terminated by some special character - One existing test case has been found wrong -- the current MediaWiki version always terminates the URL when an opening bracket is encountered. - Other test cases added: a double quote, two single quotes, and angle brackets always terminate the URL (regardless of whether it is a free link or an external link inside brackets). A single quote does not terminate the URL. * Fix case-insensitive parsing of URI schemes (tags/v0.6.1)
@@ -27,15 +27,18 @@ __all__ = ["ExternalLink"] | |||||
class ExternalLink(Node): | class ExternalLink(Node): | ||||
"""Represents an external link, like ``[http://example.com/ Example]``.""" | """Represents an external link, like ``[http://example.com/ Example]``.""" | ||||
def __init__(self, url, title=None, brackets=True): | |||||
def __init__(self, url, title=None, brackets=True, suppress_space=False): | |||||
super().__init__() | super().__init__() | ||||
self.url = url | self.url = url | ||||
self.title = title | self.title = title | ||||
self.brackets = brackets | self.brackets = brackets | ||||
self.suppress_space = suppress_space | |||||
def __str__(self): | def __str__(self): | ||||
if self.brackets: | if self.brackets: | ||||
if self.title is not None: | if self.title is not None: | ||||
if self.suppress_space is True: | |||||
return "[" + str(self.url) + str(self.title) + "]" | |||||
return "[" + str(self.url) + " " + str(self.title) + "]" | return "[" + str(self.url) + " " + str(self.title) + "]" | ||||
return "[" + str(self.url) + "]" | return "[" + str(self.url) + "]" | ||||
return str(self.url) | return str(self.url) | ||||
@@ -157,17 +157,20 @@ class Builder: | |||||
@_add_handler(tokens.ExternalLinkOpen) | @_add_handler(tokens.ExternalLinkOpen) | ||||
def _handle_external_link(self, token): | def _handle_external_link(self, token): | ||||
"""Handle when an external link is at the head of the tokens.""" | """Handle when an external link is at the head of the tokens.""" | ||||
brackets, url = token.brackets, None | |||||
brackets, url, suppress_space = token.brackets, None, None | |||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
token = self._tokens.pop() | token = self._tokens.pop() | ||||
if isinstance(token, tokens.ExternalLinkSeparator): | if isinstance(token, tokens.ExternalLinkSeparator): | ||||
url = self._pop() | url = self._pop() | ||||
suppress_space = token.suppress_space | |||||
self._push() | self._push() | ||||
elif isinstance(token, tokens.ExternalLinkClose): | elif isinstance(token, tokens.ExternalLinkClose): | ||||
if url is not None: | if url is not None: | ||||
return ExternalLink(url, self._pop(), brackets) | |||||
return ExternalLink(self._pop(), brackets=brackets) | |||||
return ExternalLink(url, self._pop(), brackets=brackets, | |||||
suppress_space=suppress_space is True) | |||||
return ExternalLink(self._pop(), brackets=brackets, | |||||
suppress_space=suppress_space is True) | |||||
else: | else: | ||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
raise ParserError("_handle_external_link() missed a close token") | raise ParserError("_handle_external_link() missed a close token") | ||||
@@ -30,7 +30,7 @@ SOFTWARE. | |||||
#define DIGITS "0123456789" | #define DIGITS "0123456789" | ||||
#define HEXDIGITS "0123456789abcdefABCDEF" | #define HEXDIGITS "0123456789abcdefABCDEF" | ||||
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" | #define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" | ||||
#define URISCHEME "abcdefghijklmnopqrstuvwxyz0123456789+.-" | |||||
#define URISCHEME "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" | |||||
#define MAX_BRACES 255 | #define MAX_BRACES 255 | ||||
#define MAX_ENTITY_SIZE 8 | #define MAX_ENTITY_SIZE 8 | ||||
@@ -100,6 +100,66 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr) | |||||
} | } | ||||
/* | /* | ||||
Check if the given character is a non-word character. | |||||
Equivalent to this Python code: | |||||
def is_non_word_character(ch): | |||||
if re.fullmatch(r"\W", ch): | |||||
return True | |||||
return False | |||||
*/ | |||||
static int is_non_word_character(Py_UCS4 ch) | |||||
{ | |||||
int ret = 0; | |||||
PyObject* modname = NULL; | |||||
PyObject* module = NULL; | |||||
PyObject* fmatch = NULL; | |||||
PyObject* pattern = NULL; | |||||
PyObject* str = NULL; | |||||
PyObject* posArgs = NULL; | |||||
PyObject* match = NULL; | |||||
modname = PyUnicode_FromString("re"); | |||||
if (modname == NULL) | |||||
goto error; | |||||
module = PyImport_Import(modname); | |||||
if (module == NULL) | |||||
goto error; | |||||
fmatch = PyObject_GetAttrString(module, "fullmatch"); | |||||
if (fmatch == NULL) | |||||
goto error; | |||||
pattern = PyUnicode_FromString("\\W"); | |||||
if (pattern == NULL) | |||||
goto error; | |||||
str = PyUnicode_FROM_SINGLE(ch); | |||||
if (str == NULL) | |||||
goto error; | |||||
posArgs = PyTuple_Pack(2, pattern, str); | |||||
if (posArgs == NULL) | |||||
goto error; | |||||
match = PyObject_Call(fmatch, posArgs, NULL); | |||||
if (match == NULL) | |||||
goto error; | |||||
if (match != Py_None) | |||||
ret = 1; | |||||
goto end; | |||||
error: | |||||
ret = -1; | |||||
end: | |||||
Py_XDECREF(match); | |||||
Py_XDECREF(posArgs); | |||||
Py_XDECREF(str); | |||||
Py_XDECREF(pattern); | |||||
Py_XDECREF(fmatch); | |||||
Py_XDECREF(module); | |||||
Py_XDECREF(modname); | |||||
return ret; | |||||
} | |||||
/* | |||||
Parse a template at the head of the wikicode string. | Parse a template at the head of the wikicode string. | ||||
*/ | */ | ||||
static int Tokenizer_parse_template(Tokenizer* self, int has_content) | static int Tokenizer_parse_template(Tokenizer* self, int has_content) | ||||
@@ -527,7 +587,13 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||||
// it was just parsed as text: | // it was just parsed as text: | ||||
for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) { | for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) { | ||||
chunk = Textbuffer_read(self->topstack->textbuffer, i); | chunk = Textbuffer_read(self->topstack->textbuffer, i); | ||||
if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) | |||||
// stop at the first non-word character | |||||
int is_non_word = is_non_word_character(chunk); | |||||
if (is_non_word < 0) { | |||||
Textbuffer_dealloc(scheme_buffer); | |||||
return -1; | |||||
} | |||||
else if (is_non_word == 1) | |||||
goto end_of_loop; | goto end_of_loop; | ||||
j = 0; | j = 0; | ||||
do { | do { | ||||
@@ -607,14 +673,15 @@ static int Tokenizer_handle_free_link_text( | |||||
Return whether the current head is the end of a free link. | Return whether the current head is the end of a free link. | ||||
*/ | */ | ||||
static int | static int | ||||
Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) | |||||
Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) | |||||
{ | { | ||||
// Built from Tokenizer_parse()'s end sentinels: | // Built from Tokenizer_parse()'s end sentinels: | ||||
Py_UCS4 after = Tokenizer_read(self, 2); | Py_UCS4 after = Tokenizer_read(self, 2); | ||||
uint64_t ctx = self->topstack->context; | uint64_t ctx = self->topstack->context; | ||||
return (!this || this == '\n' || this == '[' || this == ']' || | return (!this || this == '\n' || this == '[' || this == ']' || | ||||
this == '<' || this == '>' || (this == '\'' && next == '\'') || | |||||
this == '<' || this == '>' || this == '"' || | |||||
(this == '\'' && next == '\'') || | |||||
(this == '|' && ctx & LC_TEMPLATE) || | (this == '|' && ctx & LC_TEMPLATE) || | ||||
(this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || | (this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || | ||||
(this == '}' && next == '}' && | (this == '}' && next == '}' && | ||||
@@ -656,7 +723,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||||
if (Tokenizer_parse_comment(self)) | if (Tokenizer_parse_comment(self)) | ||||
return NULL; | return NULL; | ||||
} | } | ||||
else if (!brackets && Tokenizer_is_free_link(self, this, next)) { | |||||
else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) { | |||||
self->head--; | self->head--; | ||||
return Tokenizer_pop(self); | return Tokenizer_pop(self); | ||||
} | } | ||||
@@ -669,16 +736,28 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||||
} | } | ||||
else if (this == ']') | else if (this == ']') | ||||
return Tokenizer_pop(self); | return Tokenizer_pop(self); | ||||
else if (this == ' ') { | |||||
else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) { | |||||
if (brackets) { | if (brackets) { | ||||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||||
return NULL; | |||||
if (this == ' ') { | |||||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||||
return NULL; | |||||
} | |||||
else { | |||||
PyObject* kwargs = PyDict_New(); | |||||
if (!kwargs) | |||||
return NULL; | |||||
if (this != ' ') | |||||
PyDict_SetItemString(kwargs, "suppress_space", Py_True); | |||||
if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs)) | |||||
return NULL; | |||||
} | |||||
self->topstack->context ^= LC_EXT_LINK_URI; | self->topstack->context ^= LC_EXT_LINK_URI; | ||||
self->topstack->context |= LC_EXT_LINK_TITLE; | self->topstack->context |= LC_EXT_LINK_TITLE; | ||||
self->head++; | |||||
if (this == ' ') | |||||
self->head++; | |||||
return Tokenizer_parse(self, 0, 0); | return Tokenizer_parse(self, 0, 0); | ||||
} | } | ||||
if (Textbuffer_write(extra, ' ')) | |||||
if (Textbuffer_write(extra, this)) | |||||
return NULL; | return NULL; | ||||
return Tokenizer_pop(self); | return Tokenizer_pop(self); | ||||
} | } | ||||
@@ -366,7 +366,7 @@ class Tokenizer: | |||||
self._emit_text("//") | self._emit_text("//") | ||||
self._head += 2 | self._head += 2 | ||||
else: | else: | ||||
valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" | |||||
valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" | |||||
all_valid = lambda: all(char in valid for char in self._read()) | all_valid = lambda: all(char in valid for char in self._read()) | ||||
scheme = "" | scheme = "" | ||||
while self._read() is not self.END and all_valid(): | while self._read() is not self.END and all_valid(): | ||||
@@ -386,14 +386,15 @@ class Tokenizer: | |||||
def _parse_free_uri_scheme(self): | def _parse_free_uri_scheme(self): | ||||
"""Parse the URI scheme of a free (no brackets) external link.""" | """Parse the URI scheme of a free (no brackets) external link.""" | ||||
valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" | |||||
valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" | |||||
scheme = [] | scheme = [] | ||||
try: | try: | ||||
# We have to backtrack through the textbuffer looking for our | # We have to backtrack through the textbuffer looking for our | ||||
# scheme since it was just parsed as text: | # scheme since it was just parsed as text: | ||||
for chunk in reversed(self._textbuffer): | for chunk in reversed(self._textbuffer): | ||||
for char in reversed(chunk): | for char in reversed(chunk): | ||||
if char.isspace() or char in self.MARKERS: | |||||
# stop at the first non-word character | |||||
if re.fullmatch(r"\W", char): | |||||
raise StopIteration() | raise StopIteration() | ||||
if char not in valid: | if char not in valid: | ||||
raise BadRoute() | raise BadRoute() | ||||
@@ -438,7 +439,7 @@ class Tokenizer: | |||||
# Built from _parse()'s end sentinels: | # Built from _parse()'s end sentinels: | ||||
after, ctx = self._read(2), self._context | after, ctx = self._read(2), self._context | ||||
equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING | equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING | ||||
return (this in (self.END, "\n", "[", "]", "<", ">") or | |||||
return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or | |||||
this == nxt == "'" or | this == nxt == "'" or | ||||
(this == "|" and ctx & contexts.TEMPLATE) or | (this == "|" and ctx & contexts.TEMPLATE) or | ||||
(this == "=" and ctx & equal_sign_contexts) or | (this == "=" and ctx & equal_sign_contexts) or | ||||
@@ -481,16 +482,29 @@ class Tokenizer: | |||||
self._parse_template_or_argument() | self._parse_template_or_argument() | ||||
elif this == "]": | elif this == "]": | ||||
return self._pop(), tail, 0 | return self._pop(), tail, 0 | ||||
elif " " in this: | |||||
before, after = this.split(" ", 1) | |||||
elif this == "'" and nxt == "'": | |||||
separator = tokens.ExternalLinkSeparator() | |||||
separator.suppress_space = True | |||||
self._emit(separator) | |||||
self._context ^= contexts.EXT_LINK_URI | |||||
self._context |= contexts.EXT_LINK_TITLE | |||||
return self._parse(push=False), None, 0 | |||||
elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">", | |||||
"\"")): | |||||
before, after = re.split("[ \n\[\]<>\"]", this, maxsplit=1) | |||||
delimiter = this[len(before)] | |||||
if brackets: | if brackets: | ||||
self._emit_text(before) | self._emit_text(before) | ||||
self._emit(tokens.ExternalLinkSeparator()) | |||||
separator = tokens.ExternalLinkSeparator() | |||||
if delimiter != " ": | |||||
separator.suppress_space = True | |||||
self._emit(separator) | |||||
if after: | if after: | ||||
self._emit_text(after) | self._emit_text(after) | ||||
self._context ^= contexts.EXT_LINK_URI | self._context ^= contexts.EXT_LINK_URI | ||||
self._context |= contexts.EXT_LINK_TITLE | self._context |= contexts.EXT_LINK_TITLE | ||||
self._head += 1 | |||||
if delimiter == " ": | |||||
self._head += 1 | |||||
return self._parse(push=False), None, 0 | return self._parse(push=False), None, 0 | ||||
punct, tail = self._handle_free_link_text(punct, tail, before) | punct, tail = self._handle_free_link_text(punct, tail, before) | ||||
return self._pop(), tail + " " + after, 0 | return self._pop(), tail + " " + after, 0 | ||||
@@ -153,9 +153,9 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), Ext | |||||
--- | --- | ||||
name: brackets_open_bracket_inside | name: brackets_open_bracket_inside | ||||
label: an open bracket inside a bracket-enclosed link that is also included | |||||
label: an open bracket inside a bracket-enclosed link that is not included | |||||
input: "[http://foobar[baz.com Example]" | input: "[http://foobar[baz.com Example]" | ||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(suppress_space=True), Text(text="[baz.com Example"), ExternalLinkClose()] | |||||
--- | --- | ||||
@@ -478,3 +478,101 @@ name: brackets_scheme_title_but_no_url | |||||
label: brackets around a scheme, colon, and slashes, with a title | label: brackets around a scheme, colon, and slashes, with a title | ||||
input: "[http:// Example]" | input: "[http:// Example]" | ||||
output: [Text(text="[http:// Example]")] | output: [Text(text="[http:// Example]")] | ||||
--- | |||||
name: url_preceded_by_non_word_character | |||||
label: non-word character immediately before a valid URL | |||||
input: "svn+ssh://server.domain.com:/reponame" | |||||
output: [Text(text="svn+"), ExternalLinkOpen(brackets=False), Text(text="ssh://server.domain.com:/reponame"), ExternalLinkClose()] | |||||
--- | |||||
name: url_preceded_by_underscore | |||||
label: underscore immediately before a valid URL | |||||
input: "svn_ssh://server.domain.com:/reponame" | |||||
output: [Text(text="svn_ssh://server.domain.com:/reponame")] | |||||
--- | |||||
name: url_terminated_by_double_quote | |||||
label: a free link terminated by a double quote | |||||
input: "http://foo\"bar" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="\"bar")] | |||||
--- | |||||
name: url_not_terminated_by_single_quote | |||||
label: a free link not terminated by a single quote | |||||
input: "http://foo'bar" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo'bar"), ExternalLinkClose()] | |||||
--- | |||||
name: url_terminated_by_two_single_quotes | |||||
label: a free link terminated by two single quotes | |||||
input: "http://foo''bar''" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()] | |||||
--- | |||||
name: url_terminated_by_left_angle | |||||
label: a free link terminated by a left angle | |||||
input: "http://foo<bar" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="<bar")] | |||||
--- | |||||
name: url_terminated_by_right_angle | |||||
label: a free link terminated by a right angle | |||||
input: "http://foo>bar" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text=">bar")] | |||||
--- | |||||
name: brackets_terminated_by_double_quote | |||||
label: an external link terminated by a double quote | |||||
input: "[http://foo\"bar]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="\"bar"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_not_terminated_by_single_quote | |||||
label: an external link not terminated by a single quote | |||||
input: "[http://foo'bar]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo'bar"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_terminated_by_two_single_quotes | |||||
label: an external link terminated by two single quotes | |||||
input: "[http://foo''bar'']" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_terminated_by_left_angle | |||||
label: an external link terminated by a left angle | |||||
input: "[http://foo<bar]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="<bar"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_terminated_by_right_angle | |||||
label: an external link terminated by a right angle | |||||
input: "[http://foo>bar]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text=">bar"), ExternalLinkClose()] | |||||
--- | |||||
name: scheme_case | |||||
label: a free link with uppercase letters in the URL scheme | |||||
input: "HtTp://example.com/" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="HtTp://example.com/"), ExternalLinkClose()] | |||||
--- | |||||
name: bracket_scheme_case | |||||
label: an external link with uppercase letters in the URL scheme | |||||
input: "[HtTp://example.com/]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="HtTp://example.com/"), ExternalLinkClose()] |