Also, fix parsing of wikilinks in both tokenizers such that newlines in any location within the title are an automatic failure.
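The hunks below touch three places: the C tokenizer's safety checks and parse loop, its forward declarations, and the pure-Python tokenizer's _verify_safe(). As a minimal before/after illustration (a hypothetical helper, not code from this diff): under the new rules a newline at any position in a wikilink title invalidates the link, including at the edges, where the old unconditional .strip() used to hide it.

# Hypothetical helper, not part of the diff: the new rule in miniature.
def title_survives(title):
    return "\n" not in title

assert title_survives("foo bar")
assert not title_survives("foo\nbar")   # newline mid-title: fails
assert not title_survives("foo bar\n")  # edge newline: no longer rescued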
@@ -1144,17 +1144,24 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
         Tokenizer_fail_route(self);
         return;
     }
-    if (context & (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE)) {
-        if (data == *"{" || data == *"}" || data == *"[" || data == *"]") {
+    if (context & LC_WIKILINK_TITLE) {
+        if (data == *"]" || data == *"{")
+            self->topstack->context |= LC_FAIL_NEXT;
+        else if (data == *"\n" || data == *"[" || data == *"}")
+            Tokenizer_fail_route(self);
+        return;
+    }
+    if (context & LC_TEMPLATE_NAME) {
+        if (data == *"{" || data == *"}" || data == *"[") {
             self->topstack->context |= LC_FAIL_NEXT;
             return;
         }
-        if (data == *"|") {
-            if (context & LC_FAIL_ON_TEXT) {
-                self->topstack->context ^= LC_FAIL_ON_TEXT;
-                return;
-            }
+        if (data == *"]") {
+            Tokenizer_fail_route(self);
+            return;
         }
+        if (data == *"|")
+            return;
     }
     else if (context & (LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)) {
         if (context & LC_FAIL_ON_EQUALS) {
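To make the branch logic above easier to audit, here is an illustrative Python rendering of the two new rule sets (the names and structure are mine, not the project's). A "fail now" character aborts the route immediately; a "fail next" character arms LC_FAIL_NEXT, which defers the failure to the next character that gets verified.

# Illustrative rendering of the C branches above; names are mine.
RULES = {
    "wikilink_title": {"fail_next": "]{", "fail_now": "\n[}"},
    "template_name":  {"fail_next": "{}[", "fail_now": "]"},
}

def verdict(ctx, char):
    if char in RULES[ctx]["fail_next"]:
        return "fail_next"  # arm LC_FAIL_NEXT: fail on the next check
    if char in RULES[ctx]["fail_now"]:
        return "fail_now"   # Tokenizer_fail_route() immediately
    return "safe"

assert verdict("wikilink_title", "\n") == "fail_now"
assert verdict("wikilink_title", "{") == "fail_next"
assert verdict("template_name", "|") == "safe"  # '|' just ends the name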
@@ -1210,6 +1217,28 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
 }
 
 /*
+    Unset any safety-checking contexts set by Tokenizer_verify_safe(). Used
+    when we preserve a context but previous data becomes invalid, like when
+    moving between template parameters.
+*/
+static void
+Tokenizer_reset_safety_checks(Tokenizer* self)
+{
+    static int checks[] = {
+        LC_HAS_TEXT, LC_FAIL_ON_TEXT, LC_FAIL_NEXT, LC_FAIL_ON_LBRACE,
+        LC_FAIL_ON_RBRACE, LC_FAIL_ON_EQUALS, 0};
+    int context = self->topstack->context, i = 0, this;
+    while (1) {
+        this = checks[i];
+        if (!this)
+            return;
+        if (context & this)
+            self->topstack->context ^= this;
+        i++;
+    }
+}
+
+/*
     Parse the wikicode string, using context for when to stop.
 */
 static PyObject*
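A rough Python equivalent of Tokenizer_reset_safety_checks(), for illustration only (the bit values here are placeholders; the real LC_* constants are defined in the C header): walk a zero-terminated list of flag bits and clear each one that is set, which is safe because XOR on a bit known to be set clears it.

# Placeholder bit values; the real LC_* constants live in the C code.
LC_HAS_TEXT, LC_FAIL_ON_TEXT, LC_FAIL_NEXT = 1 << 0, 1 << 1, 1 << 2
LC_FAIL_ON_LBRACE, LC_FAIL_ON_RBRACE, LC_FAIL_ON_EQUALS = 1 << 3, 1 << 4, 1 << 5
CHECKS = (LC_HAS_TEXT, LC_FAIL_ON_TEXT, LC_FAIL_NEXT,
          LC_FAIL_ON_LBRACE, LC_FAIL_ON_RBRACE, LC_FAIL_ON_EQUALS, 0)

def reset_safety_checks(context):
    i = 0
    while CHECKS[i]:              # the 0 sentinel ends the loop
        if context & CHECKS[i]:
            context ^= CHECKS[i]  # XOR clears a bit known to be set
        i += 1
    return context

assert reset_safety_checks(LC_HAS_TEXT | LC_FAIL_NEXT | 0x100) == 0x100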
@@ -1274,6 +1303,7 @@ Tokenizer_parse(Tokenizer* self, int context)
                 self->topstack->context ^= LC_FAIL_NEXT;
         }
         else if (this == *"|" && this_context & LC_TEMPLATE) {
+            Tokenizer_reset_safety_checks(self);
             if (Tokenizer_handle_template_param(self))
                 return NULL;
         }
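The call site matters: at each '|' the tokenizer keeps the template context but starts a fresh parameter, so safety state accumulated while checking the previous name or key must not leak forward. A tiny sketch of the invariant (bit values again placeholders):

LC_HAS_TEXT, LC_FAIL_ON_EQUALS = 1 << 0, 1 << 5  # placeholders
ALL_CHECKS = LC_HAS_TEXT | LC_FAIL_ON_EQUALS     # ...plus the other flags

context = LC_HAS_TEXT | LC_FAIL_ON_EQUALS  # state after scanning "a=b"
context &= ~ALL_CHECKS                     # what the reset amounts to
assert context == 0                        # clean slate for "c" in {{foo|a=b|c}}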
@@ -1294,15 +1324,10 @@ Tokenizer_parse(Tokenizer* self, int context)
                 Tokenizer_write_text(self, this);
         }
         else if (this == next && next == *"[") {
-            if (!(this_context & LC_WIKILINK_TITLE)) {
-                if (Tokenizer_parse_wikilink(self))
-                    return NULL;
-                if (self->topstack->context & LC_FAIL_NEXT)
-                    self->topstack->context ^= LC_FAIL_NEXT;
-            }
-            else {
-                Tokenizer_write_text(self, this);
-            }
+            if (Tokenizer_parse_wikilink(self))
+                return NULL;
+            if (self->topstack->context & LC_FAIL_NEXT)
+                self->topstack->context ^= LC_FAIL_NEXT;
         }
         else if (this == *"|" && this_context & LC_WIKILINK_TITLE) {
             if (Tokenizer_handle_wikilink_separator(self))
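This second parse-loop change removes the special case for '[[' inside a title: a nested wikilink is now always attempted, and the stricter verify-safe rules fail that inner route, so the plain-text fallback is reached through the normal bad-route machinery rather than an up-front check. A sketch of the control flow (the names here are illustrative, not the project's):

class BadRoute(Exception):
    """Raised when a speculative parse turns out to be invalid."""

def parse_title(title):
    if any(c in title for c in "[]\n"):
        raise BadRoute()              # verify-safe failure
    return "<wikilink:%s>" % title

def parse(text):
    try:
        return parse_title(text)      # always try the nested parse...
    except BadRoute:
        return text                   # ...and fall back to plain text

assert parse("foo") == "<wikilink:foo>"
assert parse("foo[bar") == "foo[bar"  # illegal title char: kept as text

The remaining hunks add the forward declaration for the new helper and port the same policy to the Python tokenizer.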
@@ -206,6 +206,7 @@ static int Tokenizer_really_parse_entity(Tokenizer*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
 static void Tokenizer_verify_safe(Tokenizer*, int, Py_UNICODE);
+static void Tokenizer_reset_safety_checks(Tokenizer*);
 static PyObject* Tokenizer_parse(Tokenizer*, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
 
@@ -213,17 +213,21 @@ class Tokenizer(object):
         self._write_all(argument)
         self._write(tokens.ArgumentClose())
 
-    def _verify_safe(self, unsafes):
+    def _verify_safe(self, unsafes, strip=True):
         """Verify that there are no unsafe characters in the current stack.
 
         The route will be failed if the name contains any element of *unsafes*
-        in it (not merely at the beginning or end). This is used when parsing a
-        template name or parameter key, which cannot contain newlines.
+        in it. This is used when parsing template names, parameter keys, and so
+        on, which cannot contain newlines and some other characters. If *strip*
+        is ``True``, the text will be stripped of whitespace, since this is
+        allowed at the ends of certain elements but not between text.
         """
         self._push_textbuffer()
         if self._stack:
             text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
-            text = "".join([token.text for token in text]).strip()
+            text = "".join([token.text for token in text])
+            if strip:
+                text = text.strip()
             if text and any([unsafe in text for unsafe in unsafes]):
                 self._fail_route()
 
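The strip keyword exists because whitespace is legal at the ends of some elements (a template name may be written as "{{ foo }}") but a newline inside a wikilink title must fail even at the edges. A hypothetical condensation of the check's core test:

# Hypothetical condensation of _verify_safe()'s core test.
def is_safe(text, unsafes, strip=True):
    if strip:
        text = text.strip()
    return not (text and any(u in text for u in unsafes))

assert is_safe(" foo \n", ["\n"], strip=True)       # template name: ok
assert not is_safe(" foo \n", ["\n"], strip=False)  # wikilink title: fails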
@@ -291,7 +295,7 @@ class Tokenizer(object):
 
     def _handle_wikilink_separator(self):
         """Handle the separator between a wikilink's title and its text."""
-        self._verify_safe(["\n", "{", "}", "[", "]"])
+        self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._context ^= contexts.WIKILINK_TITLE
         self._context |= contexts.WIKILINK_TEXT
         self._write(tokens.WikilinkSeparator())
@@ -299,7 +303,7 @@ class Tokenizer(object):
 
     def _handle_wikilink_end(self):
         """Handle the end of a wikilink at the head of the string."""
         if self._context & contexts.WIKILINK_TITLE:
-            self._verify_safe(["\n", "{", "}", "[", "]"])
+            self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
         self._head += 1
         return self._pop()
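Taken together, both tokenizers should now agree on the end-to-end behavior. Written as a hypothetical expectation (the real tokenize entry point is not shown in this diff):

def expected_is_wikilink(wikitext):
    """What both tokenizers should now agree on for [[...]] input."""
    if not (wikitext.startswith("[[") and wikitext.endswith("]]")):
        return False
    title = wikitext[2:-2]
    return bool(title) and "\n" not in title

assert expected_is_wikilink("[[foo bar]]")
assert not expected_is_wikilink("[[foo\nbar]]")   # mid-title newline
assert not expected_is_wikilink("[[foo bar\n]]")  # edge newline fails too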