
Don't get stuck in tags with unclosed quoted attributes (fixes #190).
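
The failure mode behind #190: wikitext that opens many nested tags whose quoted attribute values never close. Each nesting level speculatively re-parsed the rest of the input as both a quoted and an unquoted value, so parse time grew exponentially with depth. This commit memoizes failed parse routes so each one is attempted only once. A rough reproduction sketch (the exact trigger input is an assumption based on the issue title, not the test case from #190):

    import mwparserfromhell

    # 30 nested tags, none of whose attribute quotes ever close
    text = '<span style="' * 30
    wikicode = mwparserfromhell.parse(text)  # used to hang; now returns
    print(str(wikicode) == text)  # True: the dead ends fall back to text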

Ben Kurtovic committed 6 years ago · commit 86c805d59b · tags/v0.5.1

9 changed files with 71 additions and 25 deletions:
  1. CHANGELOG                                         +3   -1
  2. LICENSE                                           +1   -1
  3. docs/changelog.rst                                +4   -0
  4. docs/conf.py                                      +1   -1
  5. mwparserfromhell/__init__.py                      +2   -2
  6. mwparserfromhell/parser/ctokenizer/tok_parse.c    +12  -4
  7. mwparserfromhell/parser/ctokenizer/tok_support.c  +19  -9
  8. mwparserfromhell/parser/ctokenizer/tok_support.h  +2   -1
  9. mwparserfromhell/parser/tokenizer.py              +27  -6

CHANGELOG (+3, -1)

@@ -1,7 +1,9 @@
 v0.6 (unreleased):

 - Improved behavior when adding parameters to templates (via Template.add())
-  with poorly formatted whitespace conventions.
+  with poorly formatted whitespace conventions. (#185)
+- Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
+  quoted attributes. (#190)

 v0.5 (released June 23, 2017):



LICENSE (+1, -1)

@@ -1,4 +1,4 @@
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal


docs/changelog.rst (+4, -0)

@@ -9,6 +9,10 @@ Unreleased

 - Improved behavior when adding parameters to templates (via
   :meth:`.Template.add`) with poorly formatted whitespace conventions.
+  (`#185 <https://github.com/earwig/mwparserfromhell/issues/185>`_)
+- Fixed the parser getting stuck in deeply nested HTML tags with unclosed,
+  quoted attributes.
+  (`#190 <https://github.com/earwig/mwparserfromhell/issues/190>`_)

 v0.5
 ----


docs/conf.py (+1, -1)

@@ -42,7 +42,7 @@ master_doc = 'index'

 # General information about the project.
 project = u'mwparserfromhell'
-copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic'
+copyright = u'2012–2018 Ben Kurtovic'

 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the


mwparserfromhell/__init__.py (+2, -2)

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -27,7 +27,7 @@ outrageously powerful parser for `MediaWiki <http://mediawiki.org>`_ wikicode.
 """

 __author__ = "Ben Kurtovic"
-__copyright__ = "Copyright (C) 2012-2017 Ben Kurtovic"
+__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic"
 __license__ = "MIT License"
 __version__ = "0.6.dev0"
 __email__ = "ben.kurtovic@gmail.com"


mwparserfromhell/parser/ctokenizer/tok_parse.c (+12, -4)

@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
 */
 static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
 {
-#define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS
 #define NOT_A_LINK \
     if (!brackets && self->topstack->context & LC_DLTERM) \
         return Tokenizer_handle_dl_term(self); \
@@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
     PyObject *link, *kwargs;
     Textbuffer *extra;

-    if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
+    if (self->topstack->context & AGG_NO_EXT_LINKS ||
+            !(Tokenizer_CAN_RECURSE(self))) {
         NOT_A_LINK;
     }
     extra = Textbuffer_new(&self->text);
@@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data(
     else if (data->context & TAG_NOTE_SPACE) {
         if (data->context & TAG_QUOTED) {
             data->context = TAG_ATTR_VALUE;
+            Tokenizer_memoize_bad_route(self);
             trash = Tokenizer_pop(self);
             Py_XDECREF(trash);
             self->head = data->reset - 1; // Will be auto-incremented
@@ -1317,7 +1318,12 @@
         data->context |= TAG_QUOTED;
         data->quoter = chunk;
         data->reset = self->head;
-        if (Tokenizer_push(self, self->topstack->context))
+        if (Tokenizer_check_route(self, self->topstack->context) < 0) {
+            RESET_ROUTE();
+            data->context = TAG_ATTR_VALUE;
+            self->head--;
+        }
+        else if (Tokenizer_push(self, self->topstack->context))
             return -1;
         return 0;
     }
@@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
         if (data->context & TAG_QUOTED) {
             // Unclosed attribute quote: reset, don't die
             data->context = TAG_ATTR_VALUE;
+            Tokenizer_memoize_bad_route(self);
             trash = Tokenizer_pop(self);
             Py_XDECREF(trash);
             self->head = data->reset;
@@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token)
         if (data->context & TAG_QUOTED) {
             // Unclosed attribute quote: reset, don't die
             data->context = TAG_ATTR_VALUE;
+            Tokenizer_memoize_bad_route(self);
             trash = Tokenizer_pop(self);
             Py_XDECREF(trash);
             self->head = data->reset;
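
Every tok_parse.c change above follows the same pattern: before speculatively parsing a quoted attribute value, ask Tokenizer_check_route whether this exact route (the stack ident, i.e. head position plus context) has already failed, and when an unclosed quote forces a reset, memoize the route before popping the stack. A minimal Python rendering of the check-before-push step, with illustrative names rather than the library's API:

    TAG_ATTR_VALUE = 1 << 0  # stand-in for the real context flag

    def start_quoted_value(parser, data):
        """Begin a speculative quoted-value parse unless it is doomed."""
        data.reset = parser.head
        if (parser.head, parser.context) in parser.bad_routes:
            # check_route failed: skip the speculative parse and re-read
            # the quote character as ordinary attribute text instead.
            data.context = TAG_ATTR_VALUE
            parser.head -= 1
            return False
        parser.push(parser.context)  # speculative sub-parse begins here
        return True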


mwparserfromhell/parser/ctokenizer/tok_support.c (+19, -9)

@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -147,6 +147,22 @@ static int compare_nodes(
 }

 /*
+    Remember that the current route (head + context at push) is invalid.
+
+    This will be noticed when calling Tokenizer_check_route with the same head
+    and context, and the route will be failed immediately.
+*/
+void Tokenizer_memoize_bad_route(Tokenizer *self)
+{
+    route_tree_node *node = malloc(sizeof(route_tree_node));
+    if (node) {
+        node->id = self->topstack->ident;
+        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
+            free(node);
+    }
+}
+
+/*
     Fail the current tokenization route. Discards the current
     stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the
     ident of the failed stack so future parsing attempts down this route can be
@@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
     uint64_t context = self->topstack->context;
     PyObject* stack;

-    route_tree_node *node = malloc(sizeof(route_tree_node));
-    if (node) {
-        node->id = self->topstack->ident;
-        if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes))
-            free(node);
-    }
-
+    Tokenizer_memoize_bad_route(self);
     stack = Tokenizer_pop(self);
     Py_XDECREF(stack);
     FAIL_ROUTE(context);
@@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self)
 /*
     Check if pushing a new route here with the given context would definitely
     fail, based on a previous call to Tokenizer_fail_route() with the same
-    stack.
+    stack. (Or any other call to Tokenizer_memoize_bad_route().)

     Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the
     latter case.
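
The C tokenizer keys its bad-route memo on the stack ident and stores it in an AVL tree (avl_tree_insert); the Python tokenizer below uses a plain set for the same contract. A self-contained sketch of that contract, with illustrative names:

    class RouteMemo:
        """Remember (head, context) routes that already failed to parse."""

        def __init__(self):
            self._bad_routes = set()

        def memoize(self, head, context):
            # Analogue of Tokenizer_memoize_bad_route(): record the route.
            self._bad_routes.add((head, context))

        def check(self, head, context):
            # Analogue of Tokenizer_check_route(): is this route doomed?
            return (head, context) in self._bad_routes

    memo = RouteMemo()
    memo.memoize(17, 0b100)       # parsing at head 17 in this context failed
    assert memo.check(17, 0b100)  # the same route is now refused instantly
    assert not memo.check(18, 0b100)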


mwparserfromhell/parser/ctokenizer/tok_support.h (+2, -1)

@@ -1,5 +1,5 @@
 /*
-Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>

 Permission is hereby granted, free of charge, to any person obtaining a copy of
 this software and associated documentation files (the "Software"), to deal in
@@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*);
 void Tokenizer_delete_top_of_stack(Tokenizer*);
 PyObject* Tokenizer_pop(Tokenizer*);
 PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
+void Tokenizer_memoize_bad_route(Tokenizer*);
 void* Tokenizer_fail_route(Tokenizer*);
 int Tokenizer_check_route(Tokenizer*, uint64_t);
 void Tokenizer_free_bad_route_tree(Tokenizer*);


mwparserfromhell/parser/tokenizer.py (+27, -6)

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com>
+# Copyright (C) 2012-2018 Ben Kurtovic <ben.kurtovic@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -144,6 +144,14 @@ class Tokenizer(object):
         """Return whether or not our max recursion depth has been exceeded."""
         return self._depth < self.MAX_DEPTH

+    def _memoize_bad_route(self):
+        """Remember that the current route (head + context at push) is invalid.
+
+        This will be noticed when calling _push with the same head and context,
+        and the route will be failed immediately.
+        """
+        self._bad_routes.add(self._stack_ident)
+
     def _fail_route(self):
         """Fail the current tokenization route.

@@ -151,7 +159,7 @@
             :exc:`.BadRoute`.
         """
         context = self._context
-        self._bad_routes.add(self._stack_ident)
+        self._memoize_bad_route()
         self._pop()
         raise BadRoute(context)

@@ -506,12 +514,16 @@

     def _parse_external_link(self, brackets):
         """Parse an external link at the head of the wikicode string."""
+        if self._context & contexts.NO_EXT_LINKS or not self._can_recurse():
+            if not brackets and self._context & contexts.DL_TERM:
+                self._handle_dl_term()
+            else:
+                self._emit_text(self._read())
+            return
+
         reset = self._head
         self._head += 1
         try:
-            bad_context = self._context & contexts.NO_EXT_LINKS
-            if bad_context or not self._can_recurse():
-                raise BadRoute()
             link, extra, delta = self._really_parse_external_link(brackets)
         except BadRoute:
             self._head = reset
@@ -719,6 +731,7 @@
         elif data.context & data.CX_NOTE_SPACE:
             if data.context & data.CX_QUOTED:
                 data.context = data.CX_ATTR_VALUE
+                self._memoize_bad_route()
                 self._pop()
                 self._head = data.reset - 1  # Will be auto-incremented
                 return  # Break early
@@ -743,7 +756,13 @@
                 data.context |= data.CX_QUOTED
                 data.quoter = chunk
                 data.reset = self._head
-                self._push(self._context)
+                try:
+                    self._push(self._context)
+                except BadRoute:
+                    # Already failed to parse this as a quoted string
+                    data.context = data.CX_ATTR_VALUE
+                    self._head -= 1
+                    return
                 continue
             elif data.context & data.CX_QUOTED:
                 if chunk == data.quoter and not escaped:
@@ -845,6 +864,7 @@
             if data.context & data.CX_QUOTED:
                 # Unclosed attribute quote: reset, don't die
                 data.context = data.CX_ATTR_VALUE
+                self._memoize_bad_route()
                 self._pop()
                 self._head = data.reset
                 continue
@@ -1084,6 +1104,7 @@
             if data.context & data.CX_QUOTED:
                 # Unclosed attribute quote: reset, don't die
                 data.context = data.CX_ATTR_VALUE
+                self._memoize_bad_route()
                 self._pop()
                 self._head = data.reset
                 continue
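
Taken together, the tokenizer.py hunks implement "reset, don't die" plus memoization: an unclosed quote abandons the speculative quoted parse, records the route as bad, rewinds, and re-reads the quote as plain attribute text, and the recorded route stops any enclosing tag from retrying the same dead end. A standalone sketch of that control flow (hypothetical helper, far simpler than the real tokenizer):

    def read_attr_value(text, pos, bad_routes):
        """Return (value, next_pos); prefer a quoted value, else fall back."""
        if text[pos] in '"\'' and pos not in bad_routes:
            end = text.find(text[pos], pos + 1)
            if end != -1:
                return text[pos + 1:end], end + 1  # quote closed: use it
            bad_routes.add(pos)  # memoize: this quoted parse can never work
        # Fallback route: an unquoted value ends at whitespace or '>'
        end = pos
        while end < len(text) and text[end] not in ' \t\n>':
            end += 1
        return text[pos:end], end

    bad = set()
    print(read_attr_value('"never closes', 0, bad))  # ('"never', 6)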

