- Stop parsing new templates if the template depth gets above MAX_DEPTH (40) or if we've already tried to parse over MAX_CYCLES (100,000) templates. - Add two tests to ensure recursion works somewhat correctly. - Fix parsing the string "{{" with the Python tokenizer; add a test. (tags/v0.2)
@@ -109,6 +109,8 @@ Tokenizer_push(Tokenizer* self, int context) | |||||
return -1; | return -1; | ||||
top->next = self->topstack; | top->next = self->topstack; | ||||
self->topstack = top; | self->topstack = top; | ||||
self->depth++; | |||||
self->cycles++; | |||||
return 0; | return 0; | ||||
} | } | ||||
@@ -174,6 +176,7 @@ Tokenizer_delete_top_of_stack(Tokenizer* self) | |||||
Textbuffer_dealloc(top->textbuffer); | Textbuffer_dealloc(top->textbuffer); | ||||
self->topstack = top->next; | self->topstack = top->next; | ||||
free(top); | free(top); | ||||
self->depth--; | |||||
} | } | ||||
/* | /* | ||||
@@ -1269,10 +1272,14 @@ Tokenizer_parse(Tokenizer* self, int context) | |||||
Tokenizer_write_text(self, this); | Tokenizer_write_text(self, this); | ||||
} | } | ||||
else if (this == next && next == *"{") { | else if (this == next && next == *"{") { | ||||
if (Tokenizer_parse_template_or_argument(self)) | |||||
return NULL; | |||||
if (self->topstack->context & LC_FAIL_NEXT) | |||||
self->topstack->context ^= LC_FAIL_NEXT; | |||||
if (Tokenizer_CAN_RECURSE(self)) { | |||||
if (Tokenizer_parse_template_or_argument(self)) | |||||
return NULL; | |||||
if (self->topstack->context & LC_FAIL_NEXT) | |||||
self->topstack->context ^= LC_FAIL_NEXT; | |||||
} | |||||
else | |||||
Tokenizer_write_text(self, this); | |||||
} | } | ||||
else if (this == *"|" && this_context & LC_TEMPLATE) { | else if (this == *"|" && this_context & LC_TEMPLATE) { | ||||
if (Tokenizer_handle_template_param(self)) | if (Tokenizer_handle_template_param(self)) | ||||
@@ -1295,7 +1302,8 @@ Tokenizer_parse(Tokenizer* self, int context) | |||||
Tokenizer_write_text(self, this); | Tokenizer_write_text(self, this); | ||||
} | } | ||||
else if (this == next && next == *"[") { | else if (this == next && next == *"[") { | ||||
if (!(this_context & LC_WIKILINK_TITLE)) { | |||||
if (!(this_context & LC_WIKILINK_TITLE) && | |||||
Tokenizer_CAN_RECURSE(self)) { | |||||
if (Tokenizer_parse_wikilink(self)) | if (Tokenizer_parse_wikilink(self)) | ||||
return NULL; | return NULL; | ||||
if (self->topstack->context & LC_FAIL_NEXT) | if (self->topstack->context & LC_FAIL_NEXT) | ||||
@@ -46,6 +46,8 @@ static const char* MARKERS[] = { | |||||
#define NUM_MARKERS 18 | #define NUM_MARKERS 18 | ||||
#define TEXTBUFFER_BLOCKSIZE 1024 | #define TEXTBUFFER_BLOCKSIZE 1024 | ||||
#define MAX_DEPTH 40 | |||||
#define MAX_CYCLES 100000 | |||||
#define MAX_ENTITY_SIZE 8 | #define MAX_ENTITY_SIZE 8 | ||||
static int route_state = 0; | static int route_state = 0; | ||||
@@ -165,12 +167,15 @@ typedef struct { | |||||
Py_ssize_t head; /* current position in text */ | Py_ssize_t head; /* current position in text */ | ||||
Py_ssize_t length; /* length of text */ | Py_ssize_t length; /* length of text */ | ||||
int global; /* global context */ | int global; /* global context */ | ||||
int depth; /* stack recursion depth */ | |||||
int cycles; /* total number of stack recursions */ | |||||
} Tokenizer; | } Tokenizer; | ||||
/* Macros for accessing Tokenizer data: */ | /* Macros for accessing Tokenizer data: */ | ||||
#define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta))) | #define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta))) | ||||
#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES) | |||||
/* Function prototypes: */ | /* Function prototypes: */ | ||||
@@ -42,6 +42,8 @@ class Tokenizer(object): | |||||
END = object() | END = object() | ||||
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", | MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", | ||||
"/", "-", "!", "\n", END] | "/", "-", "!", "\n", END] | ||||
MAX_DEPTH = 40 | |||||
MAX_CYCLES = 100000 | |||||
regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE) | regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE) | ||||
def __init__(self): | def __init__(self): | ||||
@@ -49,6 +51,8 @@ class Tokenizer(object): | |||||
self._head = 0 | self._head = 0 | ||||
self._stacks = [] | self._stacks = [] | ||||
self._global = 0 | self._global = 0 | ||||
self._depth = 0 | |||||
self._cycles = 0 | |||||
@property | @property | ||||
def _stack(self): | def _stack(self): | ||||
@@ -76,6 +80,8 @@ class Tokenizer(object): | |||||
def _push(self, context=0): | def _push(self, context=0): | ||||
"""Add a new token stack, context, and textbuffer to the list.""" | """Add a new token stack, context, and textbuffer to the list.""" | ||||
self._stacks.append([[], context, []]) | self._stacks.append([[], context, []]) | ||||
self._depth += 1 | |||||
self._cycles += 1 | |||||
def _push_textbuffer(self): | def _push_textbuffer(self): | ||||
"""Push the textbuffer onto the stack as a Text node and clear it.""" | """Push the textbuffer onto the stack as a Text node and clear it.""" | ||||
@@ -90,6 +96,7 @@ class Tokenizer(object): | |||||
stack's context with the current stack's. | stack's context with the current stack's. | ||||
""" | """ | ||||
self._push_textbuffer() | self._push_textbuffer() | ||||
self._depth -= 1 | |||||
if keep_context: | if keep_context: | ||||
context = self._context | context = self._context | ||||
stack = self._stacks.pop()[0] | stack = self._stacks.pop()[0] | ||||
@@ -97,6 +104,10 @@ class Tokenizer(object): | |||||
return stack | return stack | ||||
return self._stacks.pop()[0] | return self._stacks.pop()[0] | ||||
def _can_recurse(self): | |||||
"""Return whether or not our max recursion depth has been exceeded.""" | |||||
return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES | |||||
def _fail_route(self): | def _fail_route(self): | ||||
"""Fail the current tokenization route. | """Fail the current tokenization route. | ||||
@@ -418,7 +429,7 @@ class Tokenizer(object): | |||||
else: | else: | ||||
if this == "\n": | if this == "\n": | ||||
self._context |= contexts.FAIL_ON_TEXT | self._context |= contexts.FAIL_ON_TEXT | ||||
elif this is not self.END or not this.isspace(): | |||||
elif this is self.END or not this.isspace(): | |||||
self._context |= contexts.HAS_TEXT | self._context |= contexts.HAS_TEXT | ||||
return True | return True | ||||
else: | else: | ||||
@@ -479,9 +490,12 @@ class Tokenizer(object): | |||||
else: | else: | ||||
self._write_text(this) | self._write_text(this) | ||||
elif this == next == "{": | elif this == next == "{": | ||||
self._parse_template_or_argument() | |||||
if self._context & contexts.FAIL_NEXT: | |||||
self._context ^= contexts.FAIL_NEXT | |||||
if self._can_recurse(): | |||||
self._parse_template_or_argument() | |||||
if self._context & contexts.FAIL_NEXT: | |||||
self._context ^= contexts.FAIL_NEXT | |||||
else: | |||||
self._write_text("{") | |||||
elif this == "|" and self._context & contexts.TEMPLATE: | elif this == "|" and self._context & contexts.TEMPLATE: | ||||
self._handle_template_param() | self._handle_template_param() | ||||
elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY: | elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY: | ||||
@@ -496,7 +510,7 @@ class Tokenizer(object): | |||||
else: | else: | ||||
self._write_text("}") | self._write_text("}") | ||||
elif this == next == "[": | elif this == next == "[": | ||||
if not self._context & contexts.WIKILINK_TITLE: | |||||
if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): | |||||
self._parse_wikilink() | self._parse_wikilink() | ||||
if self._context & contexts.FAIL_NEXT: | if self._context & contexts.FAIL_NEXT: | ||||
self._context ^= contexts.FAIL_NEXT | self._context ^= contexts.FAIL_NEXT | ||||
@@ -481,6 +481,13 @@ output: [TemplateOpen(), Text(text="foo"), TemplateParamSeparator(), Text(text=" | |||||
--- | --- | ||||
name: incomplete_stub | |||||
label: incomplete templates that should fail gracefully: just an opening | |||||
input: "{{" | |||||
output: [Text(text="{{")] | |||||
--- | |||||
name: incomplete_plain | name: incomplete_plain | ||||
label: incomplete templates that should fail gracefully: no close whatsoever | label: incomplete templates that should fail gracefully: no close whatsoever | ||||
input: "{{stuff}} {{foobar" | input: "{{stuff}} {{foobar" | ||||
@@ -597,3 +604,17 @@ name: incomplete_nested_template_as_param_value | |||||
label: incomplete templates that should fail gracefully: a valid nested template as a parameter value | label: incomplete templates that should fail gracefully: a valid nested template as a parameter value | ||||
input: "{{stuff}} {{foo|bar={{baz}}" | input: "{{stuff}} {{foo|bar={{baz}}" | ||||
output: [TemplateOpen(), Text(text="stuff"), TemplateClose(), Text(text=" {{foo|bar="), TemplateOpen(), Text(text="baz"), TemplateClose()] | output: [TemplateOpen(), Text(text="stuff"), TemplateClose(), Text(text=" {{foo|bar="), TemplateOpen(), Text(text="baz"), TemplateClose()] | ||||
--- | |||||
name: recursion_one_hundred_opens | |||||
label: test potentially dangerous recursion: one hundred template openings | |||||
input: "{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{" | |||||
output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{")] | |||||
--- | |||||
name: recursion_opens_and_closes | |||||
label: test potentially dangerous recursion: template openings and closings | |||||
input: "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}" | |||||
output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")] |