Fully fix parsing templates with blank names, I hope (#111)

9 years ago · f16c7e25ca
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -8,6 +8,9 @@ v0.4.1 (unreleased):
  includes when denoting tags, but not comments.
 - Fixed the behavior of preserve_spacing in Template.add() and keep_field in
  Template.remove() on parameters with hidden keys.
 - Fixed parser bugs involving:
  - templates with completely blank names;
  - templates with newlines and comments.
 - Fixed some bugs in the release scripts.

 v0.4 (released May 23, 2015):
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -15,6 +15,11 @@ Unreleased
  This includes when denoting tags, but not comments.
 - Fixed the behavior of *preserve_spacing* in :func:`~.Template.add` and
  *keep_field* in :func:`~.Template.remove` on parameters with hidden keys.
 - Fixed parser bugs involving:

  - templates with completely blank names;
  - templates with newlines and comments.

 - Fixed some bugs in the release scripts.

 v0.4
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -89,6 +89,7 @@ Local (stack-specific) contexts:
    * :const:`FAIL_ON_LBRACE`
    * :const:`FAIL_ON_RBRACE`
    * :const:`FAIL_ON_EQUALS`
    * :const:`HAS_TEMPLATE`

 * :const:`TABLE`

@@ -161,15 +162,16 @@ FAIL_NEXT  =     1 << 26
 FAIL_ON_LBRACE = 1 << 27
 FAIL_ON_RBRACE = 1 << 28
 FAIL_ON_EQUALS = 1 << 29
 HAS_TEMPLATE =   1 << 30
 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
                FAIL_ON_RBRACE + FAIL_ON_EQUALS)

 TABLE_OPEN =       1 << 30
 TABLE_CELL_OPEN =  1 << 31
 TABLE_CELL_STYLE = 1 << 32
 TABLE_ROW_OPEN =   1 << 33
 TABLE_TD_LINE =    1 << 34
 TABLE_TH_LINE =    1 << 35
                FAIL_ON_RBRACE + FAIL_ON_EQUALS + HAS_TEMPLATE)

 TABLE_OPEN =       1 << 31
 TABLE_CELL_OPEN =  1 << 32
 TABLE_CELL_STYLE = 1 << 33
 TABLE_ROW_OPEN =   1 << 34
 TABLE_TD_LINE =    1 << 35
 TABLE_TH_LINE =    1 << 36
 TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE
 TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN +
         TABLE_TD_LINE + TABLE_TH_LINE)
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -582,12 +582,16 @@ static PyObject* Tokenizer_read_backwards(Tokenizer* self, Py_ssize_t delta)
 /*
    Parse a template at the head of the wikicode string.
 */
 static int Tokenizer_parse_template(Tokenizer* self)
 static int Tokenizer_parse_template(Tokenizer* self, int has_content)
 {
    PyObject *template;
    Py_ssize_t reset = self->head;
    uint64_t context = LC_TEMPLATE_NAME;

    template = Tokenizer_parse(self, LC_TEMPLATE_NAME, 1);
    if (has_content)
        context |= LC_HAS_TEMPLATE;

    template = Tokenizer_parse(self, context, 1);
    if (BAD_ROUTE) {
        self->head = reset;
        return 0;
@@ -643,6 +647,7 @@ static int Tokenizer_parse_argument(Tokenizer* self)
 static int Tokenizer_parse_template_or_argument(Tokenizer* self)
 {
    unsigned int braces = 2, i;
    int has_content = 0;
    PyObject *tokenlist;

    self->head += 2;
@@ -659,7 +664,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
            return 0;
        }
        if (braces == 2) {
            if (Tokenizer_parse_template(self))
            if (Tokenizer_parse_template(self, has_content))
                return -1;
            if (BAD_ROUTE) {
                RESET_ROUTE();
@@ -673,7 +678,7 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
            return -1;
        if (BAD_ROUTE) {
            RESET_ROUTE();
            if (Tokenizer_parse_template(self))
            if (Tokenizer_parse_template(self, has_content))
                return -1;
            if (BAD_ROUTE) {
                char text[MAX_BRACES + 1];
@@ -689,8 +694,10 @@ static int Tokenizer_parse_template_or_argument(Tokenizer* self)
        }
        else
            braces -= 3;
        if (braces)
        if (braces) {
            has_content = 1;
            self->head++;
        }
    }
    tokenlist = Tokenizer_pop(self);
    if (!tokenlist)
@@ -712,8 +719,13 @@ static int Tokenizer_handle_template_param(Tokenizer* self)
 {
    PyObject *stack;

    if (self->topstack->context & LC_TEMPLATE_NAME)
    if (self->topstack->context & LC_TEMPLATE_NAME) {
        if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE))) {
            Tokenizer_fail_route(self);
            return -1;
        }
        self->topstack->context ^= LC_TEMPLATE_NAME;
    }
    else if (self->topstack->context & LC_TEMPLATE_PARAM_VALUE)
        self->topstack->context ^= LC_TEMPLATE_PARAM_VALUE;
    if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
@@ -764,7 +776,11 @@ static PyObject* Tokenizer_handle_template_end(Tokenizer* self)
 {
    PyObject* stack;

    if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
    if (self->topstack->context & LC_TEMPLATE_NAME) {
        if (!(self->topstack->context & (LC_HAS_TEXT | LC_HAS_TEMPLATE)))
            return Tokenizer_fail_route(self);
    }
    else if (self->topstack->context & LC_TEMPLATE_PARAM_KEY) {
        stack = Tokenizer_pop_keeping_context(self);
        if (!stack)
            return NULL;
@@ -2885,30 +2901,26 @@ Tokenizer_verify_safe(Tokenizer* self, uint64_t context, Py_UNICODE data)
    if (context & LC_TAG_CLOSE)
        return (data == '<') ? -1 : 0;
    if (context & LC_TEMPLATE_NAME) {
        if (data == '{' || data == '}' || data == '[') {
        if (data == '{') {
            self->topstack->context |= LC_HAS_TEMPLATE | LC_FAIL_NEXT;
            return 0;
        }
        if (data == '}' || (data == '<' && Tokenizer_READ(self, 1) == '!')) {
            self->topstack->context |= LC_FAIL_NEXT;
            return 0;
        }
        if (data == ']' || data == '>' || (data == '<' &&
                                           Tokenizer_READ(self, 1) != '!')) {
        if (data == '[' || data == ']' || data == '<' || data == '>') {
            return -1;
        }
        if (data == '|')
            return 0;
        if (context & LC_HAS_TEXT) {
            if (context & LC_FAIL_ON_TEXT) {
                if (!Py_UNICODE_ISSPACE(data)) {
                    if (data == '<' && Tokenizer_READ(self, 1) == '!') {
                        self->topstack->context |= LC_FAIL_NEXT;
                        return 0;
                    }
                if (!Py_UNICODE_ISSPACE(data))
                    return -1;
                }
            }
            else {
                if (data == '\n')
                    self->topstack->context |= LC_FAIL_ON_TEXT;
            }
            else if (data == '\n')
                self->topstack->context |= LC_FAIL_ON_TEXT;
        }
        else if (!Py_UNICODE_ISSPACE(data))
            self->topstack->context |= LC_HAS_TEXT;
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -150,22 +150,23 @@ static PyObject* TagCloseClose;

 #define LC_DLTERM                   0x0000000000800000

 #define LC_SAFETY_CHECK             0x000000003F000000
 #define LC_SAFETY_CHECK             0x000000007F000000
 #define LC_HAS_TEXT                 0x0000000001000000
 #define LC_FAIL_ON_TEXT             0x0000000002000000
 #define LC_FAIL_NEXT                0x0000000004000000
 #define LC_FAIL_ON_LBRACE           0x0000000008000000
 #define LC_FAIL_ON_RBRACE           0x0000000010000000
 #define LC_FAIL_ON_EQUALS           0x0000000020000000

 #define LC_TABLE                    0x0000000FC0000000
 #define LC_TABLE_CELL_LINE_CONTEXTS 0x0000000D00000000
 #define LC_TABLE_OPEN               0x0000000040000000
 #define LC_TABLE_CELL_OPEN          0x0000000080000000
 #define LC_TABLE_CELL_STYLE         0x0000000100000000
 #define LC_TABLE_ROW_OPEN           0x0000000200000000
 #define LC_TABLE_TD_LINE            0x0000000400000000
 #define LC_TABLE_TH_LINE            0x0000000800000000
 #define LC_HAS_TEMPLATE             0x0000000040000000

 #define LC_TABLE                    0x0000001F80000000
 #define LC_TABLE_CELL_LINE_CONTEXTS 0x0000001A00000000
 #define LC_TABLE_OPEN               0x0000000080000000
 #define LC_TABLE_CELL_OPEN          0x0000000100000000
 #define LC_TABLE_CELL_STYLE         0x0000000200000000
 #define LC_TABLE_ROW_OPEN           0x0000000400000000
 #define LC_TABLE_TD_LINE            0x0000000800000000
 #define LC_TABLE_TH_LINE            0x0000001000000000

 /* Global contexts: */

--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -192,11 +192,14 @@ class Tokenizer(object):
                self._fail_route()
            return self.END

    def _parse_template(self):
    def _parse_template(self, has_content):
        """Parse a template at the head of the wikicode string."""
        reset = self._head
        context = contexts.TEMPLATE_NAME
        if has_content:
            context |= contexts.HAS_TEMPLATE
        try:
            template = self._parse(contexts.TEMPLATE_NAME)
            template = self._parse(context)
        except BadRoute:
            self._head = reset
            raise
@@ -223,6 +226,7 @@ class Tokenizer(object):
        while self._read() == "{":
            self._head += 1
            braces += 1
        has_content = False
        self._push()

        while braces:
@@ -230,7 +234,7 @@ class Tokenizer(object):
                return self._emit_text_then_stack("{")
            if braces == 2:
                try:
                    self._parse_template()
                    self._parse_template(has_content)
                except BadRoute:
                    return self._emit_text_then_stack("{{")
                break
@@ -239,11 +243,12 @@ class Tokenizer(object):
                braces -= 3
            except BadRoute:
                try:
                    self._parse_template()
                    self._parse_template(has_content)
                    braces -= 2
                except BadRoute:
                    return self._emit_text_then_stack("{" * braces)
            if braces:
                has_content = True
                self._head += 1

        self._emit_all(self._pop())
@@ -253,6 +258,8 @@ class Tokenizer(object):
    def _handle_template_param(self):
        """Handle a template parameter at the head of the string."""
        if self._context & contexts.TEMPLATE_NAME:
            if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE):
                self._fail_route()
            self._context ^= contexts.TEMPLATE_NAME
        elif self._context & contexts.TEMPLATE_PARAM_VALUE:
            self._context ^= contexts.TEMPLATE_PARAM_VALUE
@@ -271,7 +278,10 @@ class Tokenizer(object):

    def _handle_template_end(self):
        """Handle the end of a template at the head of the string."""
        if self._context & contexts.TEMPLATE_PARAM_KEY:
        if self._context & contexts.TEMPLATE_NAME:
            if not self._context & (contexts.HAS_TEXT | contexts.HAS_TEMPLATE):
                self._fail_route()
        elif self._context & contexts.TEMPLATE_PARAM_KEY:
            self._emit_all(self._pop(keep_context=True))
        self._head += 1
        return self._pop()
@@ -1183,23 +1193,22 @@ class Tokenizer(object):
        elif context & contexts.EXT_LINK_TITLE:
            return this != "\n"
        elif context & contexts.TEMPLATE_NAME:
            if this == "{" or this == "}" or this == "[":
            if this == "{":
                self._context |= contexts.HAS_TEMPLATE | contexts.FAIL_NEXT
                return True
            if this == "}" or (this == "<" and self._read(1) == "!"):
                self._context |= contexts.FAIL_NEXT
                return True
            if this == "]" or this == ">" or (this == "<" and self._read(1) != "!"):
            if this == "[" or this == "]" or this == "<" or this == ">":
                return False
            if this == "|":
                return True
            if context & contexts.HAS_TEXT:
                if context & contexts.FAIL_ON_TEXT:
                    if this is self.END or not this.isspace():
                        if this == "<" and self._read(1) == "!":
                            self._context |= contexts.FAIL_NEXT
                            return True
                        return False
                else:
                    if this == "\n":
                        self._context |= contexts.FAIL_ON_TEXT
                elif this == "\n":
                    self._context |= contexts.FAIL_ON_TEXT
            elif this is self.END or not this.isspace():
                self._context |= contexts.HAS_TEXT
            return True
--- a/tests/tokenizer/templates.mwtest
+++ b/tests/tokenizer/templates.mwtest
@@ -686,5 +686,5 @@ output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{

 name:   recursion_opens_and_closes
 label:  test potentially dangerous recursion: template openings and closings
 input:  "{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}"
 output: [Text(text="{{|"), TemplateOpen(), TemplateClose(), Text(text="{{|"), TemplateOpen(), TemplateClose(), TemplateOpen(), TemplateParamSeparator(), TemplateOpen(), TemplateClose(), Text(text="{{"), TemplateParamSeparator(), Text(text="{{"), TemplateClose(), Text(text="{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}{{|{{}}")]
 input:  "{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}"
 output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), TemplateOpen(), Text(text="x"), TemplateParamSeparator(), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x"), TemplateParamSeparator(), Text(text="{{x"), TemplateClose(), Text(text="{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}")]