* Proposed fix for https://github.com/earwig/mwparserfromhell/issues/197
* Port the fix for #197 to the C tokenizer
* Fix parsing of external links where the URL is terminated by some special character
  - One existing test case was found to be wrong -- the current MediaWiki version always terminates the URL when an opening bracket is encountered.
  - Other test cases added: a double quote, two single quotes, and angle brackets always terminate the URL (regardless of whether it is a free link or an external link inside brackets). A single quote on its own does not terminate the URL.
* Fix case-insensitive parsing of URI schemes

tags/v0.6.1
@@ -27,15 +27,18 @@ __all__ = ["ExternalLink"] | |||
class ExternalLink(Node): | |||
"""Represents an external link, like ``[http://example.com/ Example]``.""" | |||
def __init__(self, url, title=None, brackets=True): | |||
def __init__(self, url, title=None, brackets=True, suppress_space=False):
    """Create an external link node.

    *url* is the link target and *title* is the optional display text
    (``None`` when the link has no title). *brackets* and *suppress_space*
    are stored as given; they are consulted when the node is converted
    back to a string.
    """
    super().__init__()
    # Assignment order matches the parameter order: url, title, brackets,
    # suppress_space.
    self.url, self.title = url, title
    self.brackets, self.suppress_space = brackets, suppress_space
def __str__(self):
    """Render the link back to wikitext.

    Without brackets only the URL is produced. With brackets the result is
    ``[url]``, or ``[url title]`` when a title is set; the space before the
    title is omitted when ``suppress_space`` is exactly ``True``.
    """
    if not self.brackets:
        return str(self.url)
    if self.title is None:
        return "[" + str(self.url) + "]"
    # Identity check on purpose: only the literal True drops the separator,
    # any other value keeps the single space.
    gap = "" if self.suppress_space is True else " "
    return "".join(("[", str(self.url), gap, str(self.title), "]"))
@@ -157,17 +157,20 @@ class Builder: | |||
@_add_handler(tokens.ExternalLinkOpen) | |||
def _handle_external_link(self, token): | |||
"""Handle when an external link is at the head of the tokens.""" | |||
brackets, url = token.brackets, None | |||
brackets, url, suppress_space = token.brackets, None, None | |||
self._push() | |||
while self._tokens: | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.ExternalLinkSeparator): | |||
url = self._pop() | |||
suppress_space = token.suppress_space | |||
self._push() | |||
elif isinstance(token, tokens.ExternalLinkClose): | |||
if url is not None: | |||
return ExternalLink(url, self._pop(), brackets) | |||
return ExternalLink(self._pop(), brackets=brackets) | |||
return ExternalLink(url, self._pop(), brackets=brackets, | |||
suppress_space=suppress_space is True) | |||
return ExternalLink(self._pop(), brackets=brackets, | |||
suppress_space=suppress_space is True) | |||
else: | |||
self._write(self._handle_token(token)) | |||
raise ParserError("_handle_external_link() missed a close token") | |||
@@ -30,7 +30,7 @@ SOFTWARE. | |||
#define DIGITS "0123456789" | |||
#define HEXDIGITS "0123456789abcdefABCDEF" | |||
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" | |||
#define URISCHEME "abcdefghijklmnopqrstuvwxyz0123456789+.-" | |||
#define URISCHEME "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" | |||
#define MAX_BRACES 255 | |||
#define MAX_ENTITY_SIZE 8 | |||
@@ -100,6 +100,66 @@ static PyObject* strip_tag_name(PyObject* token, int take_attr) | |||
} | |||
/* | |||
Check if the given character is a non-word character. | |||
Equivalent to this Python code: | |||
def is_non_word_character(ch): | |||
if re.fullmatch(r"\W", chunk): | |||
return True | |||
return False | |||
*/ | |||
static int is_non_word_character(Py_UCS4 ch) | |||
{ | |||
int ret = 0; | |||
PyObject* modname = NULL; | |||
PyObject* module = NULL; | |||
PyObject* fmatch = NULL; | |||
PyObject* pattern = NULL; | |||
PyObject* str = NULL; | |||
PyObject* posArgs = NULL; | |||
PyObject* match = NULL; | |||
modname = PyUnicode_FromString("re"); | |||
if (modname == NULL) | |||
goto error; | |||
module = PyImport_Import(modname); | |||
if (module == NULL) | |||
goto error; | |||
fmatch = PyObject_GetAttrString(module, "fullmatch"); | |||
if (fmatch == NULL) | |||
goto error; | |||
pattern = PyUnicode_FromString("\\W"); | |||
if (pattern == NULL) | |||
goto error; | |||
str = PyUnicode_FROM_SINGLE(ch); | |||
if (str == NULL) | |||
goto error; | |||
posArgs = PyTuple_Pack(2, pattern, str); | |||
if (posArgs == NULL) | |||
goto error; | |||
match = PyObject_Call(fmatch, posArgs, NULL); | |||
if (match == NULL) | |||
goto error; | |||
if (match != Py_None) | |||
ret = 1; | |||
goto end; | |||
error: | |||
ret = -1; | |||
end: | |||
Py_XDECREF(match); | |||
Py_XDECREF(posArgs); | |||
Py_XDECREF(str); | |||
Py_XDECREF(pattern); | |||
Py_XDECREF(fmatch); | |||
Py_XDECREF(module); | |||
Py_XDECREF(modname); | |||
return ret; | |||
} | |||
/* | |||
Parse a template at the head of the wikicode string. | |||
*/ | |||
static int Tokenizer_parse_template(Tokenizer* self, int has_content) | |||
@@ -527,7 +587,13 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||
// it was just parsed as text: | |||
for (i = self->topstack->textbuffer->length - 1; i >= 0; i--) { | |||
chunk = Textbuffer_read(self->topstack->textbuffer, i); | |||
if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) | |||
// stop at the first non-word character | |||
int is_non_word = is_non_word_character(chunk); | |||
if (is_non_word < 0) { | |||
Textbuffer_dealloc(scheme_buffer); | |||
return -1; | |||
} | |||
else if (is_non_word == 1) | |||
goto end_of_loop; | |||
j = 0; | |||
do { | |||
@@ -607,14 +673,15 @@ static int Tokenizer_handle_free_link_text( | |||
Return whether the current head is the end of a free link. | |||
*/ | |||
static int | |||
Tokenizer_is_free_link(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) | |||
Tokenizer_is_free_link_end(Tokenizer* self, Py_UCS4 this, Py_UCS4 next) | |||
{ | |||
// Built from Tokenizer_parse()'s end sentinels: | |||
Py_UCS4 after = Tokenizer_read(self, 2); | |||
uint64_t ctx = self->topstack->context; | |||
return (!this || this == '\n' || this == '[' || this == ']' || | |||
this == '<' || this == '>' || (this == '\'' && next == '\'') || | |||
this == '<' || this == '>' || this == '"' || | |||
(this == '\'' && next == '\'') || | |||
(this == '|' && ctx & LC_TEMPLATE) || | |||
(this == '=' && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || | |||
(this == '}' && next == '}' && | |||
@@ -656,7 +723,7 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||
if (Tokenizer_parse_comment(self)) | |||
return NULL; | |||
} | |||
else if (!brackets && Tokenizer_is_free_link(self, this, next)) { | |||
else if (!brackets && Tokenizer_is_free_link_end(self, this, next)) { | |||
self->head--; | |||
return Tokenizer_pop(self); | |||
} | |||
@@ -669,16 +736,28 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||
} | |||
else if (this == ']') | |||
return Tokenizer_pop(self); | |||
else if (this == ' ') { | |||
else if (this == ' ' || Tokenizer_is_free_link_end(self, this, next)) { | |||
if (brackets) { | |||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||
return NULL; | |||
if (this == ' ') { | |||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||
return NULL; | |||
} | |||
else { | |||
PyObject* kwargs = PyDict_New(); | |||
if (!kwargs) | |||
return NULL; | |||
if (this != ' ') | |||
PyDict_SetItemString(kwargs, "suppress_space", Py_True); | |||
if (Tokenizer_emit_kwargs(self, ExternalLinkSeparator, kwargs)) | |||
return NULL; | |||
} | |||
self->topstack->context ^= LC_EXT_LINK_URI; | |||
self->topstack->context |= LC_EXT_LINK_TITLE; | |||
self->head++; | |||
if (this == ' ') | |||
self->head++; | |||
return Tokenizer_parse(self, 0, 0); | |||
} | |||
if (Textbuffer_write(extra, ' ')) | |||
if (Textbuffer_write(extra, this)) | |||
return NULL; | |||
return Tokenizer_pop(self); | |||
} | |||
@@ -366,7 +366,7 @@ class Tokenizer: | |||
self._emit_text("//") | |||
self._head += 2 | |||
else: | |||
valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" | |||
valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" | |||
all_valid = lambda: all(char in valid for char in self._read()) | |||
scheme = "" | |||
while self._read() is not self.END and all_valid(): | |||
@@ -386,14 +386,15 @@ class Tokenizer: | |||
def _parse_free_uri_scheme(self): | |||
"""Parse the URI scheme of a free (no brackets) external link.""" | |||
valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" | |||
valid = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-" | |||
scheme = [] | |||
try: | |||
# We have to backtrack through the textbuffer looking for our | |||
# scheme since it was just parsed as text: | |||
for chunk in reversed(self._textbuffer): | |||
for char in reversed(chunk): | |||
if char.isspace() or char in self.MARKERS: | |||
# stop at the first non-word character | |||
if re.fullmatch(r"\W", char): | |||
raise StopIteration() | |||
if char not in valid: | |||
raise BadRoute() | |||
@@ -438,7 +439,7 @@ class Tokenizer: | |||
# Built from _parse()'s end sentinels: | |||
after, ctx = self._read(2), self._context | |||
equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING | |||
return (this in (self.END, "\n", "[", "]", "<", ">") or | |||
return (this in (self.END, "\n", "[", "]", "<", ">", "\"") or | |||
this == nxt == "'" or | |||
(this == "|" and ctx & contexts.TEMPLATE) or | |||
(this == "=" and ctx & equal_sign_contexts) or | |||
@@ -481,16 +482,29 @@ class Tokenizer: | |||
self._parse_template_or_argument() | |||
elif this == "]": | |||
return self._pop(), tail, 0 | |||
elif " " in this: | |||
before, after = this.split(" ", 1) | |||
elif this == "'" and nxt == "'": | |||
separator = tokens.ExternalLinkSeparator() | |||
separator.suppress_space = True | |||
self._emit(separator) | |||
self._context ^= contexts.EXT_LINK_URI | |||
self._context |= contexts.EXT_LINK_TITLE | |||
return self._parse(push=False), None, 0 | |||
elif any(ch in this for ch in (" ", "\n", "[", "]", "<", ">", | |||
"\"")): | |||
before, after = re.split("[ \n\[\]<>\"]", this, maxsplit=1) | |||
delimiter = this[len(before)] | |||
if brackets: | |||
self._emit_text(before) | |||
self._emit(tokens.ExternalLinkSeparator()) | |||
separator = tokens.ExternalLinkSeparator() | |||
if delimiter != " ": | |||
separator.suppress_space = True | |||
self._emit(separator) | |||
if after: | |||
self._emit_text(after) | |||
self._context ^= contexts.EXT_LINK_URI | |||
self._context |= contexts.EXT_LINK_TITLE | |||
self._head += 1 | |||
if delimiter == " ": | |||
self._head += 1 | |||
return self._parse(push=False), None, 0 | |||
punct, tail = self._handle_free_link_text(punct, tail, before) | |||
return self._pop(), tail + " " + after, 0 | |||
@@ -153,9 +153,9 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), Ext | |||
--- | |||
name: brackets_open_bracket_inside | |||
label: an open bracket inside a bracket-enclosed link that is also included | |||
label: an open bracket inside a bracket-enclosed link that is not included | |||
input: "[http://foobar[baz.com Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(suppress_space=True), Text(text="[baz.com Example"), ExternalLinkClose()] | |||
--- | |||
@@ -478,3 +478,101 @@ name: brackets_scheme_title_but_no_url | |||
label: brackets around a scheme, colon, and slashes, with a title | |||
input: "[http:// Example]" | |||
output: [Text(text="[http:// Example]")] | |||
--- | |||
name: url_preceded_by_non_word_character | |||
label: non-word character immediately before a valid URL | |||
input: "svn+ssh://server.domain.com:/reponame" | |||
output: [Text(text="svn+"), ExternalLinkOpen(brackets=False), Text(text="ssh://server.domain.com:/reponame"), ExternalLinkClose()] | |||
--- | |||
name: url_preceded_by_underscore | |||
label: underscore immediately before a valid URL | |||
input: "svn_ssh://server.domain.com:/reponame" | |||
output: [Text(text="svn_ssh://server.domain.com:/reponame")] | |||
--- | |||
name: url_terminated_by_double_quote | |||
label: a free link terminated by a double quote | |||
input: "http://foo\"bar" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="\"bar")] | |||
--- | |||
name: url_not_terminated_by_single_quote | |||
label: a free link not terminated by a single quote | |||
input: "http://foo'bar" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo'bar"), ExternalLinkClose()] | |||
--- | |||
name: url_terminated_by_two_single_quotes | |||
label: a free link terminated by two single quotes | |||
input: "http://foo''bar''" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose()] | |||
--- | |||
name: url_terminated_by_left_angle | |||
label: a free link terminated by a left angle | |||
input: "http://foo<bar" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text="<bar")] | |||
--- | |||
name: url_terminated_by_right_angle | |||
label: a free link terminated by a right angle | |||
input: "http://foo>bar" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foo"), ExternalLinkClose(), Text(text=">bar")] | |||
--- | |||
name: brackets_terminated_by_double_quote | |||
label: an external link terminated by a double quote | |||
input: "[http://foo\"bar]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="\"bar"), ExternalLinkClose()] | |||
--- | |||
name: brackets_not_terminated_by_single_quote | |||
label: an external link not terminated by a single quote | |||
input: "[http://foo'bar]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo'bar"), ExternalLinkClose()] | |||
--- | |||
name: brackets_terminated_by_two_single_quotes | |||
label: an external link terminated by two single quotes | |||
input: "[http://foo''bar'']" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="bar"), TagOpenClose(), Text(text="i"), TagCloseClose(), ExternalLinkClose()] | |||
--- | |||
name: brackets_terminated_by_left_angle | |||
label: an external link terminated by a left angle | |||
input: "[http://foo<bar]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text="<bar"), ExternalLinkClose()] | |||
--- | |||
name: brackets_terminated_by_right_angle | |||
label: an external link terminated by a right angle | |||
input: "[http://foo>bar]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo"), ExternalLinkSeparator(suppress_space=True), Text(text=">bar"), ExternalLinkClose()] | |||
--- | |||
name: scheme_case | |||
label: a free link with uppercase letters in the URL scheme | |||
input: "HtTp://example.com/" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="HtTp://example.com/"), ExternalLinkClose()] | |||
--- | |||
name: bracket_scheme_case | |||
label: an external link with uppercase letters in the URL scheme | |||
input: "[HtTp://example.com/]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="HtTp://example.com/"), ExternalLinkClose()] |