Browse Source

Fix safety checks on template params in some odd cases (closes #24).

Also, fix parsing of wikilinks in both tokenizers such that newlines
in any location within the title are an automatic failure.
tags/v0.2
Ben Kurtovic 11 years ago
parent
commit
d6f2723a06
3 changed files with 52 additions and 22 deletions
  1. +41
    -16
      mwparserfromhell/parser/tokenizer.c
  2. +1
    -0
      mwparserfromhell/parser/tokenizer.h
  3. +10
    -6
      mwparserfromhell/parser/tokenizer.py

+ 41
- 16
mwparserfromhell/parser/tokenizer.c View File

@@ -1144,17 +1144,24 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
Tokenizer_fail_route(self);
return;
}
if (context & (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE)) {
if (data == *"{" || data == *"}" || data == *"[" || data == *"]") {
if (context & LC_WIKILINK_TITLE) {
if (data == *"]" || data == *"{")
self->topstack->context |= LC_FAIL_NEXT;
else if (data == *"\n" || data == *"[" || data == *"}")
Tokenizer_fail_route(self);
return;
}
if (context & LC_TEMPLATE_NAME) {
if (data == *"{" || data == *"}" || data == *"[") {
self->topstack->context |= LC_FAIL_NEXT;
return;
}
if (data == *"|") {
if (context & LC_FAIL_ON_TEXT) {
self->topstack->context ^= LC_FAIL_ON_TEXT;
return;
}
if (data == *"]") {
Tokenizer_fail_route(self);
return;
}
if (data == *"|")
return;
}
else if (context & (LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)) {
if (context & LC_FAIL_ON_EQUALS) {
@@ -1210,6 +1217,28 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
}

/*
    Clear every safety-checking flag that Tokenizer_verify_safe() may have
    raised on the context of the current stack. Called when the context itself
    is kept but the data seen so far no longer counts, for example when moving
    from one template parameter to the next.
*/
static void
Tokenizer_reset_safety_checks(Tokenizer* self)
{
    /* Zero-terminated list of the flags to drop. */
    static const int safety_flags[] = {
        LC_HAS_TEXT, LC_FAIL_ON_TEXT, LC_FAIL_NEXT, LC_FAIL_ON_LBRACE,
        LC_FAIL_ON_RBRACE, LC_FAIL_ON_EQUALS, 0};
    /* Snapshot taken once: each flag is tested against the value on entry. */
    const int entry_context = self->topstack->context;
    const int* flag;

    for (flag = safety_flags; *flag; flag++) {
        /* Toggle the flag off only if it was set when we were called. */
        if (entry_context & *flag)
            self->topstack->context ^= *flag;
    }
}

/*
Parse the wikicode string, using context for when to stop.
*/
static PyObject*
@@ -1274,6 +1303,7 @@ Tokenizer_parse(Tokenizer* self, int context)
self->topstack->context ^= LC_FAIL_NEXT;
}
else if (this == *"|" && this_context & LC_TEMPLATE) {
Tokenizer_reset_safety_checks(self);
if (Tokenizer_handle_template_param(self))
return NULL;
}
@@ -1294,15 +1324,10 @@ Tokenizer_parse(Tokenizer* self, int context)
Tokenizer_write_text(self, this);
}
else if (this == next && next == *"[") {
if (!(this_context & LC_WIKILINK_TITLE)) {
if (Tokenizer_parse_wikilink(self))
return NULL;
if (self->topstack->context & LC_FAIL_NEXT)
self->topstack->context ^= LC_FAIL_NEXT;
}
else {
Tokenizer_write_text(self, this);
}
if (Tokenizer_parse_wikilink(self))
return NULL;
if (self->topstack->context & LC_FAIL_NEXT)
self->topstack->context ^= LC_FAIL_NEXT;
}
else if (this == *"|" && this_context & LC_WIKILINK_TITLE) {
if (Tokenizer_handle_wikilink_separator(self))


+ 1
- 0
mwparserfromhell/parser/tokenizer.h View File

@@ -206,6 +206,7 @@ static int Tokenizer_really_parse_entity(Tokenizer*);
static int Tokenizer_parse_entity(Tokenizer*);
static int Tokenizer_parse_comment(Tokenizer*);
static void Tokenizer_verify_safe(Tokenizer*, int, Py_UNICODE);
/* Unsets the safety-checking context flags set by Tokenizer_verify_safe(). */
static void Tokenizer_reset_safety_checks(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);



+ 10
- 6
mwparserfromhell/parser/tokenizer.py View File

@@ -213,17 +213,21 @@ class Tokenizer(object):
self._write_all(argument)
self._write(tokens.ArgumentClose())

def _verify_safe(self, unsafes):
def _verify_safe(self, unsafes, strip=True):
"""Verify that there are no unsafe characters in the current stack.

The route will be failed if the name contains any element of *unsafes*
in it (not merely at the beginning or end). This is used when parsing a
template name or parameter key, which cannot contain newlines.
in it. This is used when parsing template names, parameter keys, and so
on, which cannot contain newlines and some other characters. If *strip*
is ``True``, the text will be stripped of whitespace, since this is
allowed at the ends of certain elements but not between text.
"""
self._push_textbuffer()
if self._stack:
text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
text = "".join([token.text for token in text]).strip()
text = "".join([token.text for token in text])
if strip:
text = text.strip()
if text and any([unsafe in text for unsafe in unsafes]):
self._fail_route()

@@ -291,7 +295,7 @@ class Tokenizer(object):

def _handle_wikilink_separator(self):
"""Handle the separator between a wikilink's title and its text."""
self._verify_safe(["\n", "{", "}", "[", "]"])
self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
self._context ^= contexts.WIKILINK_TITLE
self._context |= contexts.WIKILINK_TEXT
self._write(tokens.WikilinkSeparator())
@@ -299,7 +303,7 @@ class Tokenizer(object):
def _handle_wikilink_end(self):
"""Handle the end of a wikilink at the head of the string."""
if self._context & contexts.WIKILINK_TITLE:
self._verify_safe(["\n", "{", "}", "[", "]"])
self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
self._head += 1
return self._pop()



Loading…
Cancel
Save