diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index d3abb22..875263c 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -109,6 +109,8 @@ Tokenizer_push(Tokenizer* self, int context)
         return -1;
     top->next = self->topstack;
     self->topstack = top;
+    self->depth++;
+    self->cycles++;
     return 0;
 }
 
@@ -174,6 +176,7 @@ Tokenizer_delete_top_of_stack(Tokenizer* self)
     Textbuffer_dealloc(top->textbuffer);
     self->topstack = top->next;
     free(top);
+    self->depth--;
 }
 
 /*
@@ -1269,10 +1272,14 @@ Tokenizer_parse(Tokenizer* self, int context)
             Tokenizer_write_text(self, this);
         }
         else if (this == next && next == *"{") {
-            if (Tokenizer_parse_template_or_argument(self))
-                return NULL;
-            if (self->topstack->context & LC_FAIL_NEXT)
-                self->topstack->context ^= LC_FAIL_NEXT;
+            if (Tokenizer_CAN_RECURSE(self)) {
+                if (Tokenizer_parse_template_or_argument(self))
+                    return NULL;
+                if (self->topstack->context & LC_FAIL_NEXT)
+                    self->topstack->context ^= LC_FAIL_NEXT;
+            }
+            else
+                Tokenizer_write_text(self, this);
         }
         else if (this == *"|" && this_context & LC_TEMPLATE) {
             if (Tokenizer_handle_template_param(self))
@@ -1295,7 +1302,8 @@ Tokenizer_parse(Tokenizer* self, int context)
             Tokenizer_write_text(self, this);
         }
         else if (this == next && next == *"[") {
-            if (!(this_context & LC_WIKILINK_TITLE)) {
+            if (!(this_context & LC_WIKILINK_TITLE) &&
+                Tokenizer_CAN_RECURSE(self)) {
                 if (Tokenizer_parse_wikilink(self))
                     return NULL;
                 if (self->topstack->context & LC_FAIL_NEXT)
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 693538c..0730ea8 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -46,6 +46,8 @@ static const char* MARKERS[] = {
 
 #define NUM_MARKERS 18
 #define TEXTBUFFER_BLOCKSIZE 1024
+#define MAX_DEPTH 40
+#define MAX_CYCLES 100000
 #define MAX_ENTITY_SIZE 8
 
 static int route_state = 0;
@@ -165,12 +167,15 @@ typedef struct {
     Py_ssize_t head;        /* current position in text */
     Py_ssize_t length;      /* length of text */
     int global;             /* global context */
+    int depth;              /* stack recursion depth */
+    int cycles;             /* total number of stack recursions */
 } Tokenizer;
 
 
 /* Macros for accessing Tokenizer data: */
 
 #define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta)))
+#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES)
 
 
 /* Function prototypes: */
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index f995937..24eb9db 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -42,6 +42,8 @@ class Tokenizer(object):
     END = object()
     MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";",
                ":", "/", "-", "!", "\n", END]
+    MAX_DEPTH = 40
+    MAX_CYCLES = 100000
     regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)
 
     def __init__(self):
@@ -49,6 +51,8 @@
         self._head = 0
         self._stacks = []
         self._global = 0
+        self._depth = 0
+        self._cycles = 0
 
     @property
     def _stack(self):
@@ -76,6 +80,8 @@
     def _push(self, context=0):
         """Add a new token stack, context, and textbuffer to the list."""
         self._stacks.append([[], context, []])
+        self._depth += 1
+        self._cycles += 1
 
     def _push_textbuffer(self):
         """Push the textbuffer onto the stack as a Text node and clear it."""
@@ -90,6 +96,7 @@
         stack's context with the current stack's.
""" self._push_textbuffer() + self._depth -= 1 if keep_context: context = self._context stack = self._stacks.pop()[0] @@ -97,6 +104,10 @@ class Tokenizer(object): return stack return self._stacks.pop()[0] + def _can_recurse(self): + """Return whether or not our max recursion depth has been exceeded.""" + return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES + def _fail_route(self): """Fail the current tokenization route. @@ -418,7 +429,7 @@ class Tokenizer(object): else: if this == "\n": self._context |= contexts.FAIL_ON_TEXT - elif this is not self.END or not this.isspace(): + elif this is self.END or not this.isspace(): self._context |= contexts.HAS_TEXT return True else: @@ -479,9 +490,12 @@ class Tokenizer(object): else: self._write_text(this) elif this == next == "{": - self._parse_template_or_argument() - if self._context & contexts.FAIL_NEXT: - self._context ^= contexts.FAIL_NEXT + if self._can_recurse(): + self._parse_template_or_argument() + if self._context & contexts.FAIL_NEXT: + self._context ^= contexts.FAIL_NEXT + else: + self._write_text("{") elif this == "|" and self._context & contexts.TEMPLATE: self._handle_template_param() elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY: @@ -496,7 +510,7 @@ class Tokenizer(object): else: self._write_text("}") elif this == next == "[": - if not self._context & contexts.WIKILINK_TITLE: + if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): self._parse_wikilink() if self._context & contexts.FAIL_NEXT: self._context ^= contexts.FAIL_NEXT diff --git a/tests/tokenizer/templates.mwtest b/tests/tokenizer/templates.mwtest index fa3c0a4..cf41bb3 100644 --- a/tests/tokenizer/templates.mwtest +++ b/tests/tokenizer/templates.mwtest @@ -481,6 +481,13 @@ output: [TemplateOpen(), Text(text="foo"), TemplateParamSeparator(), Text(text=" --- +name: incomplete_stub +label: incomplete templates that should fail gracefully: just an opening +input: "{{" +output: [Text(text="{{")] + +--- + name: incomplete_plain label: incomplete templates that should fail gracefully: no close whatsoever input: "{{stuff}} {{foobar" @@ -597,3 +604,17 @@ name: incomplete_nested_template_as_param_value label: incomplete templates that should fail gracefully: a valid nested template as a parameter value input: "{{stuff}} {{foo|bar={{baz}}" output: [TemplateOpen(), Text(text="stuff"), TemplateClose(), Text(text=" {{foo|bar="), TemplateOpen(), Text(text="baz"), TemplateClose()] + +--- + +name: recursion_one_hundred_opens +label: test potentially dangerous recursion: one hundred template openings +input: "{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{" +output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{")] + +--- + +name: recursion_opens_and_closes +label: test potentially dangerous recursion: template openings and closings +input: "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}" +output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), 
TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")]
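
Not part of the patch, but a quick way to exercise the new limits from the
public API: a minimal sketch, assuming this patch is applied and that the
package exposes its usual mwparserfromhell.parse() entry point. The expected
result mirrors the recursion_one_hundred_opens test case above.

    import mwparserfromhell

    # 100 unclosed "{{" openings: before this patch, every "{{" pushed a new
    # stack and recursed, so crafted input could exhaust the C stack or hit
    # Python's recursion limit. With MAX_DEPTH (40) and MAX_CYCLES (100000)
    # in place, the tokenizer refuses to recurse past the cap and writes the
    # braces out as plain text instead.
    text = "{{ " * 100

    wikicode = mwparserfromhell.parse(text)

    # None of the openings survives as a template, so this should print an
    # empty list, matching the single Text token the test case expects.
    print(wikicode.filter_templates())  # []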