Fix safety checks on template params in some odd cases (closes #24).

Also, fix parsing of wikilinks in both tokenizers such that newlines in any location within the title are an automatic failure.
12 years ago · d6f2723a06
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -1144,17 +1144,24 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
        Tokenizer_fail_route(self);
        return;
    }
    if (context & (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE)) {
        if (data == *"{" || data == *"}" || data == *"[" || data == *"]") {
    if (context & LC_WIKILINK_TITLE) {
        if (data == *"]" || data == *"{")
            self->topstack->context |= LC_FAIL_NEXT;
        else if (data == *"\n" || data == *"[" || data == *"}")
            Tokenizer_fail_route(self);
        return;
    }
    if (context & LC_TEMPLATE_NAME) {
        if (data == *"{" || data == *"}" || data == *"[") {
            self->topstack->context |= LC_FAIL_NEXT;
            return;
        }
        if (data == *"|") {
            if (context & LC_FAIL_ON_TEXT) {
                self->topstack->context ^= LC_FAIL_ON_TEXT;
                return;
            }
        if (data == *"]") {
            Tokenizer_fail_route(self);
            return;
        }
        if (data == *"|")
            return;
    }
    else if (context & (LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)) {
        if (context & LC_FAIL_ON_EQUALS) {
@@ -1210,6 +1217,28 @@ Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
 }

 /*
    Clear every safety-checking flag that Tokenizer_verify_safe() may have
    left on the context at the top of the stack. The context itself is kept;
    only the flags listed below are dropped. This is called when earlier data
    stops being relevant, e.g. when a "|" starts a new template parameter.
 */
 static void
 Tokenizer_reset_safety_checks(Tokenizer* self)
 {
    /* Zero-terminated list of the flags to drop. */
    static int safety_flags[] = {
        LC_HAS_TEXT, LC_FAIL_ON_TEXT, LC_FAIL_NEXT, LC_FAIL_ON_LBRACE,
        LC_FAIL_ON_RBRACE, LC_FAIL_ON_EQUALS, 0};
    /* Snapshot taken once: each flag is tested against the original value. */
    int snapshot = self->topstack->context;
    int* flag;

    for (flag = safety_flags; *flag; flag++) {
        /* Toggle only flags that were set, so the XOR always clears. */
        if (snapshot & *flag)
            self->topstack->context ^= *flag;
    }
 }

 /*
    Parse the wikicode string, using context for when to stop.
 */
 static PyObject*
@@ -1274,6 +1303,7 @@ Tokenizer_parse(Tokenizer* self, int context)
                self->topstack->context ^= LC_FAIL_NEXT;
        }
        else if (this == *"|" && this_context & LC_TEMPLATE) {
            Tokenizer_reset_safety_checks(self);
            if (Tokenizer_handle_template_param(self))
                return NULL;
        }
@@ -1294,15 +1324,10 @@ Tokenizer_parse(Tokenizer* self, int context)
            Tokenizer_write_text(self, this);
        }
        else if (this == next && next == *"[") {
            if (!(this_context & LC_WIKILINK_TITLE)) {
                if (Tokenizer_parse_wikilink(self))
                    return NULL;
                if (self->topstack->context & LC_FAIL_NEXT)
                    self->topstack->context ^= LC_FAIL_NEXT;
            }
            else {
                Tokenizer_write_text(self, this);
            }
            if (Tokenizer_parse_wikilink(self))
                return NULL;
            if (self->topstack->context & LC_FAIL_NEXT)
                self->topstack->context ^= LC_FAIL_NEXT;
        }
        else if (this == *"|" && this_context & LC_WIKILINK_TITLE) {
            if (Tokenizer_handle_wikilink_separator(self))
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -206,6 +206,7 @@ static int Tokenizer_really_parse_entity(Tokenizer*);
 static int Tokenizer_parse_entity(Tokenizer*);
 static int Tokenizer_parse_comment(Tokenizer*);
 static void Tokenizer_verify_safe(Tokenizer*, int, Py_UNICODE);
 static void Tokenizer_reset_safety_checks(Tokenizer*);
 static PyObject* Tokenizer_parse(Tokenizer*, int);
 static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);

--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -213,17 +213,21 @@ class Tokenizer(object):
        self._write_all(argument)
        self._write(tokens.ArgumentClose())

    def _verify_safe(self, unsafes):
    def _verify_safe(self, unsafes, strip=True):
        """Verify that there are no unsafe characters in the current stack.

        The route will be failed if the name contains any element of *unsafes*
        in it (not merely at the beginning or end). This is used when parsing a
        template name or parameter key, which cannot contain newlines.
        in it. This is used when parsing template names, parameter keys, and so
        on, which cannot contain newlines and some other characters. If *strip*
        is ``True``, the text will be stripped of whitespace, since this is
        allowed at the ends of certain elements but not between text.
        """
        self._push_textbuffer()
        if self._stack:
            text = [tok for tok in self._stack if isinstance(tok, tokens.Text)]
            text = "".join([token.text for token in text]).strip()
            text = "".join([token.text for token in text])
            if strip:
                text = text.strip()
            if text and any([unsafe in text for unsafe in unsafes]):
                self._fail_route()

@@ -291,7 +295,7 @@ class Tokenizer(object):

    def _handle_wikilink_separator(self):
        """Handle the separator between a wikilink's title and its text."""
        self._verify_safe(["\n", "{", "}", "[", "]"])
        self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
        self._context ^= contexts.WIKILINK_TITLE
        self._context |= contexts.WIKILINK_TEXT
        self._write(tokens.WikilinkSeparator())
@@ -299,7 +303,7 @@ class Tokenizer(object):
    def _handle_wikilink_end(self):
        """Handle the end of a wikilink at the head of the string."""
        if self._context & contexts.WIKILINK_TITLE:
            self._verify_safe(["\n", "{", "}", "[", "]"])
            self._verify_safe(["\n", "{", "}", "[", "]"], strip=False)
        self._head += 1
        return self._pop()