- Stop parsing new templates if the template depth gets above MAX_DEPTH (40) or if we've already tried to parse more than MAX_CYCLES (100,000) templates.
- Add two tests to ensure recursion works somewhat correctly.
- Fix parsing of the string "{{" with the Python tokenizer; add a test.
@@ -109,6 +109,8 @@ Tokenizer_push(Tokenizer* self, int context)
         return -1;
     top->next = self->topstack;
     self->topstack = top;
+    self->depth++;
+    self->cycles++;
     return 0;
 }
@@ -174,6 +176,7 @@ Tokenizer_delete_top_of_stack(Tokenizer* self)
     Textbuffer_dealloc(top->textbuffer);
     self->topstack = top->next;
     free(top);
+    self->depth--;
 }

 /*
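
The bookkeeping in these two hunks is deliberately asymmetric: depth tracks the current nesting and comes back down whenever a stack is deleted, while cycles only ever grows, so it bounds the total number of parse attempts even when failed routes keep rewinding the depth. A minimal sketch of the idea in Python (toy code, not the library's; all names are illustrative):

    MAX_DEPTH, MAX_CYCLES = 40, 100000  # mirror the constants added below

    class RecursionBudget:
        """Toy model of the depth/cycles counters."""
        def __init__(self):
            self.depth = 0    # current stack nesting
            self.cycles = 0   # stacks ever pushed; never decremented

        def push(self):
            self.depth += 1
            self.cycles += 1

        def pop(self):
            self.depth -= 1   # cycles stays put

        def can_recurse(self):
            return self.depth < MAX_DEPTH and self.cycles < MAX_CYCLES

    budget = RecursionBudget()
    for _ in range(50000):    # many shallow parse attempts that each back out
        budget.push()
        budget.pop()
    assert budget.depth == 0       # the nesting limit is never threatened...
    assert budget.cycles == 50000  # ...but the cycle budget counted every try

Without the second counter, an adversarial document could stay under forty levels deep forever while forcing the tokenizer to open and abandon stacks indefinitely.
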
@@ -1269,10 +1272,14 @@ Tokenizer_parse(Tokenizer* self, int context)
             Tokenizer_write_text(self, this);
         }
         else if (this == next && next == *"{") {
-            if (Tokenizer_parse_template_or_argument(self))
-                return NULL;
-            if (self->topstack->context & LC_FAIL_NEXT)
-                self->topstack->context ^= LC_FAIL_NEXT;
+            if (Tokenizer_CAN_RECURSE(self)) {
+                if (Tokenizer_parse_template_or_argument(self))
+                    return NULL;
+                if (self->topstack->context & LC_FAIL_NEXT)
+                    self->topstack->context ^= LC_FAIL_NEXT;
+            }
+            else
+                Tokenizer_write_text(self, this);
         }
         else if (this == *"|" && this_context & LC_TEMPLATE) {
             if (Tokenizer_handle_template_param(self))
@@ -1295,7 +1302,8 @@ Tokenizer_parse(Tokenizer* self, int context)
             Tokenizer_write_text(self, this);
         }
         else if (this == next && next == *"[") {
-            if (!(this_context & LC_WIKILINK_TITLE)) {
+            if (!(this_context & LC_WIKILINK_TITLE) &&
+                Tokenizer_CAN_RECURSE(self)) {
                 if (Tokenizer_parse_wikilink(self))
                     return NULL;
                 if (self->topstack->context & LC_FAIL_NEXT)
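
Both guarded branches now share one shape: recurse only while the budget holds, otherwise emit the marker as plain text. A hedged Python distillation of that pattern (every name here is a stand-in, not the library's API):

    FAIL_NEXT = 1 << 0  # illustrative context flag

    def handle_open_marker(tok, marker, parse_fn):
        if tok.can_recurse():
            parse_fn()                    # attempt the recursive parse
            if tok.context & FAIL_NEXT:   # clear the one-shot failure flag
                tok.context ^= FAIL_NEXT
        else:
            tok.write_text(marker)        # budget exhausted: keep it literal

    class Tok:
        """Just enough state to exercise the sketch."""
        def __init__(self, allow):
            self.context, self.out, self._allow = 0, [], allow
        def can_recurse(self):
            return self._allow
        def write_text(self, ch):
            self.out.append(ch)

    tok = Tok(allow=False)
    handle_open_marker(tok, "{", lambda: None)
    assert tok.out == ["{"]  # degraded gracefully to literal text

Writing the text fallback instead of failing the route is what lets the new recursion tests round-trip their input unchanged.
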
@@ -46,6 +46,8 @@ static const char* MARKERS[] = {
 #define NUM_MARKERS 18
 #define TEXTBUFFER_BLOCKSIZE 1024
+#define MAX_DEPTH 40
+#define MAX_CYCLES 100000
 #define MAX_ENTITY_SIZE 8

 static int route_state = 0;
@@ -165,12 +167,15 @@ typedef struct {
     Py_ssize_t head;    /* current position in text */
     Py_ssize_t length;  /* length of text */
     int global;         /* global context */
+    int depth;          /* stack recursion depth */
+    int cycles;         /* total number of stack recursions */
 } Tokenizer;

 /* Macros for accessing Tokenizer data: */
 #define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta)))
+#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES)

 /* Function prototypes: */
@@ -42,6 +42,8 @@ class Tokenizer(object):
     END = object()
     MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
                "/", "-", "!", "\n", END]
+    MAX_DEPTH = 40
+    MAX_CYCLES = 100000
     regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)

     def __init__(self):
@@ -49,6 +51,8 @@ class Tokenizer(object):
         self._head = 0
         self._stacks = []
         self._global = 0
+        self._depth = 0
+        self._cycles = 0

     @property
     def _stack(self):
@@ -76,6 +80,8 @@ class Tokenizer(object):
     def _push(self, context=0):
         """Add a new token stack, context, and textbuffer to the list."""
         self._stacks.append([[], context, []])
+        self._depth += 1
+        self._cycles += 1

     def _push_textbuffer(self):
         """Push the textbuffer onto the stack as a Text node and clear it."""
@@ -90,6 +96,7 @@ class Tokenizer(object):
         stack's context with the current stack's.
         """
         self._push_textbuffer()
+        self._depth -= 1
         if keep_context:
             context = self._context
             stack = self._stacks.pop()[0]
@@ -97,6 +104,10 @@ class Tokenizer(object):
             return stack
         return self._stacks.pop()[0]

+    def _can_recurse(self):
+        """Return whether or not our max recursion depth has been exceeded."""
+        return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES
+
     def _fail_route(self):
         """Fail the current tokenization route.
@@ -418,7 +429,7 @@ class Tokenizer(object):
         else:
             if this == "\n":
                 self._context |= contexts.FAIL_ON_TEXT
-            elif this is not self.END or not this.isspace():
+            elif this is self.END or not this.isspace():
                 self._context |= contexts.HAS_TEXT
             return True
         else:
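
This one-character change is the "{{" fix from the commit message. With the old operand order, reading past the end of input yields the END sentinel: `this is not self.END` evaluates to False, so Python goes on to call .isspace() on a bare object() and raises AttributeError. Swapping the test short-circuits before the method call. A standalone repro of the principle:

    END = object()   # sentinel, as in the Tokenizer class
    this = END

    # Old order: the first operand is False, so the second one is
    # evaluated and explodes on the sentinel.
    try:
        this is not END or not this.isspace()
    except AttributeError as exc:
        print(exc)   # 'object' object has no attribute 'isspace'

    # New order: `this is END` is True, so isspace() is never reached.
    assert this is END or not this.isspace()
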
@@ -479,9 +490,12 @@ class Tokenizer(object):
                 else:
                     self._write_text(this)
             elif this == next == "{":
-                self._parse_template_or_argument()
-                if self._context & contexts.FAIL_NEXT:
-                    self._context ^= contexts.FAIL_NEXT
+                if self._can_recurse():
+                    self._parse_template_or_argument()
+                    if self._context & contexts.FAIL_NEXT:
+                        self._context ^= contexts.FAIL_NEXT
+                else:
+                    self._write_text("{")
             elif this == "|" and self._context & contexts.TEMPLATE:
                 self._handle_template_param()
             elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
@@ -496,7 +510,7 @@ class Tokenizer(object):
                 else:
                     self._write_text("}")
             elif this == next == "[":
-                if not self._context & contexts.WIKILINK_TITLE:
+                if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
                     self._parse_wikilink()
                     if self._context & contexts.FAIL_NEXT:
                         self._context ^= contexts.FAIL_NEXT
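
Taken together, these changes make truncated input degrade instead of crash. A usage sketch via the package-level entry point (behavior matches the incomplete_stub test added below; assumes the pure-Python tokenizer is the one in use rather than the C extension):

    import mwparserfromhell

    # Previously "{{" drove the Python tokenizer into the END-sentinel
    # crash; now the unmatched braces come back as literal text.
    code = mwparserfromhell.parse("{{")
    assert str(code) == "{{"
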
@@ -481,6 +481,13 @@ output: [TemplateOpen(), Text(text="foo"), TemplateParamSeparator(), Text(text="
 ---

+name: incomplete_stub
+label: incomplete templates that should fail gracefully: just an opening
+input: "{{"
+output: [Text(text="{{")]
+
+---
+
 name: incomplete_plain
 label: incomplete templates that should fail gracefully: no close whatsoever
 input: "{{stuff}} {{foobar"
@@ -597,3 +604,17 @@ name: incomplete_nested_template_as_param_value
 label: incomplete templates that should fail gracefully: a valid nested template as a parameter value
 input: "{{stuff}} {{foo|bar={{baz}}"
 output: [TemplateOpen(), Text(text="stuff"), TemplateClose(), Text(text=" {{foo|bar="), TemplateOpen(), Text(text="baz"), TemplateClose()]
+
+---
+
+name: recursion_one_hundred_opens
+label: test potentially dangerous recursion: one hundred template openings
+input: "{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{"
+output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{")]
+
+---
+
+name: recursion_opens_and_closes
+label: test potentially dangerous recursion: template openings and closings
+input: "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}"
+output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")]
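
For local experimentation, the hundred-opens case is easy to regenerate programmatically; a small harness in the spirit of the two tests above (import path assumed as before):

    from mwparserfromhell.parser.tokenizer import Tokenizer

    text = "{{ " * 99 + "{{"   # one hundred template openings
    tokens = Tokenizer().tokenize(text)

    # MAX_DEPTH/MAX_CYCLES stop the recursion long before one hundred
    # levels, so every token is plain Text and the input round-trips.
    assert "".join(token.text for token in tokens) == text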