Bladeren bron

Fix recursion issues by giving up at a certain point (closes #16).

- Stop parsing new templates if the template depth gets above
  MAX_DEPTH (40) or if we've already tried to parse over MAX_CYCLES
  (100,000) templates.
- Add two tests to ensure recursion works somewhat correctly.
- Fix parsing the string "{{" with the Python tokenizer; add a test.
tags/v0.2
Ben Kurtovic 11 jaar geleden
bovenliggende
commit
debcb6577e
4 gewijzigde bestanden met 58 toevoegingen en 10 verwijderingen
  1. +13
    -5
      mwparserfromhell/parser/tokenizer.c
  2. +5
    -0
      mwparserfromhell/parser/tokenizer.h
  3. +19
    -5
      mwparserfromhell/parser/tokenizer.py
  4. +21
    -0
      tests/tokenizer/templates.mwtest

+ 13
- 5
mwparserfromhell/parser/tokenizer.c Bestand weergeven

@@ -109,6 +109,8 @@ Tokenizer_push(Tokenizer* self, int context)
return -1;
top->next = self->topstack;
self->topstack = top;
self->depth++;
self->cycles++;
return 0;
}

@@ -174,6 +176,7 @@ Tokenizer_delete_top_of_stack(Tokenizer* self)
Textbuffer_dealloc(top->textbuffer);
self->topstack = top->next;
free(top);
self->depth--;
}

/*
@@ -1269,10 +1272,14 @@ Tokenizer_parse(Tokenizer* self, int context)
Tokenizer_write_text(self, this);
}
else if (this == next && next == *"{") {
if (Tokenizer_parse_template_or_argument(self))
return NULL;
if (self->topstack->context & LC_FAIL_NEXT)
self->topstack->context ^= LC_FAIL_NEXT;
if (Tokenizer_CAN_RECURSE(self)) {
if (Tokenizer_parse_template_or_argument(self))
return NULL;
if (self->topstack->context & LC_FAIL_NEXT)
self->topstack->context ^= LC_FAIL_NEXT;
}
else
Tokenizer_write_text(self, this);
}
else if (this == *"|" && this_context & LC_TEMPLATE) {
if (Tokenizer_handle_template_param(self))
@@ -1295,7 +1302,8 @@ Tokenizer_parse(Tokenizer* self, int context)
Tokenizer_write_text(self, this);
}
else if (this == next && next == *"[") {
if (!(this_context & LC_WIKILINK_TITLE)) {
if (!(this_context & LC_WIKILINK_TITLE) &&
Tokenizer_CAN_RECURSE(self)) {
if (Tokenizer_parse_wikilink(self))
return NULL;
if (self->topstack->context & LC_FAIL_NEXT)


+ 5
- 0
mwparserfromhell/parser/tokenizer.h Bestand weergeven

@@ -46,6 +46,8 @@ static const char* MARKERS[] = {

#define NUM_MARKERS 18
#define TEXTBUFFER_BLOCKSIZE 1024
#define MAX_DEPTH 40
#define MAX_CYCLES 100000
#define MAX_ENTITY_SIZE 8

static int route_state = 0;
@@ -165,12 +167,15 @@ typedef struct {
Py_ssize_t head; /* current position in text */
Py_ssize_t length; /* length of text */
int global; /* global context */
int depth; /* stack recursion depth */
int cycles; /* total number of stack recursions */
} Tokenizer;


/* Macros for accessing Tokenizer data: */

#define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta)))
/*
 * Evaluates to nonzero while the tokenizer may still open another stack:
 * its `depth` must be below MAX_DEPTH and its `cycles` count must be below
 * MAX_CYCLES. The argument is parenthesized so that any pointer expression
 * (for example `ptr + 1`) expands correctly. Note that the argument is
 * evaluated twice, so it must not have side effects.
 */
#define Tokenizer_CAN_RECURSE(self) \
    ((self)->depth < MAX_DEPTH && (self)->cycles < MAX_CYCLES)


/* Function prototypes: */


+ 19
- 5
mwparserfromhell/parser/tokenizer.py Bestand weergeven

@@ -42,6 +42,8 @@ class Tokenizer(object):
END = object()
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
"/", "-", "!", "\n", END]
MAX_DEPTH = 40
MAX_CYCLES = 100000
regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)

def __init__(self):
@@ -49,6 +51,8 @@ class Tokenizer(object):
self._head = 0
self._stacks = []
self._global = 0
self._depth = 0
self._cycles = 0

@property
def _stack(self):
@@ -76,6 +80,8 @@ class Tokenizer(object):
def _push(self, context=0):
"""Add a new token stack, context, and textbuffer to the list."""
self._stacks.append([[], context, []])
self._depth += 1
self._cycles += 1

def _push_textbuffer(self):
"""Push the textbuffer onto the stack as a Text node and clear it."""
@@ -90,6 +96,7 @@ class Tokenizer(object):
stack's context with the current stack's.
"""
self._push_textbuffer()
self._depth -= 1
if keep_context:
context = self._context
stack = self._stacks.pop()[0]
@@ -97,6 +104,10 @@ class Tokenizer(object):
return stack
return self._stacks.pop()[0]

def _can_recurse(self):
"""Return whether or not our max recursion depth has been exceeded."""
return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES

def _fail_route(self):
"""Fail the current tokenization route.

@@ -418,7 +429,7 @@ class Tokenizer(object):
else:
if this == "\n":
self._context |= contexts.FAIL_ON_TEXT
elif this is not self.END or not this.isspace():
elif this is self.END or not this.isspace():
self._context |= contexts.HAS_TEXT
return True
else:
@@ -479,9 +490,12 @@ class Tokenizer(object):
else:
self._write_text(this)
elif this == next == "{":
self._parse_template_or_argument()
if self._context & contexts.FAIL_NEXT:
self._context ^= contexts.FAIL_NEXT
if self._can_recurse():
self._parse_template_or_argument()
if self._context & contexts.FAIL_NEXT:
self._context ^= contexts.FAIL_NEXT
else:
self._write_text("{")
elif this == "|" and self._context & contexts.TEMPLATE:
self._handle_template_param()
elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
@@ -496,7 +510,7 @@ class Tokenizer(object):
else:
self._write_text("}")
elif this == next == "[":
if not self._context & contexts.WIKILINK_TITLE:
if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
self._parse_wikilink()
if self._context & contexts.FAIL_NEXT:
self._context ^= contexts.FAIL_NEXT


+ 21
- 0
tests/tokenizer/templates.mwtest Bestand weergeven

@@ -481,6 +481,13 @@ output: [TemplateOpen(), Text(text="foo"), TemplateParamSeparator(), Text(text="

---

name: incomplete_stub
label: incomplete templates that should fail gracefully: just an opening
input: "{{"
output: [Text(text="{{")]

---

name: incomplete_plain
label: incomplete templates that should fail gracefully: no close whatsoever
input: "{{stuff}} {{foobar"
@@ -597,3 +604,17 @@ name: incomplete_nested_template_as_param_value
label: incomplete templates that should fail gracefully: a valid nested template as a parameter value
input: "{{stuff}} {{foo|bar={{baz}}"
output: [TemplateOpen(), Text(text="stuff"), TemplateClose(), Text(text=" {{foo|bar="), TemplateOpen(), Text(text="baz"), TemplateClose()]

---

name: recursion_one_hundred_opens
label: test potentially dangerous recursion: one hundred template openings
input: "{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{"
output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{")]

---

name: recursion_opens_and_closes
label: test potentially dangerous recursion: template openings and closings
input: "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}"
output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")]

Laden…
Annuleren
Opslaan