# Patch summary (mwparserfromhell, v0.6 unreleased): stop the tokenizer from
# getting stuck in deeply nested HTML tags with unclosed, quoted attributes
# (changelog entry #190). The C tokenizer and the Python tokenizer are changed
# in parallel.
#
# Files touched and what each hunk does:
# - CHANGELOG, docs/changelog.rst: add the #190 entry; tag the existing
#   Template.add() whitespace entry with #185.
# - LICENSE, docs/conf.py, mwparserfromhell/__init__.py, tok_parse.c,
#   tok_support.c, tok_support.h, tokenizer.py: copyright range extended to 2018.
# - tok_support.c / tok_support.h: new Tokenizer_memoize_bad_route(), factored
#   out of Tokenizer_fail_route(); it inserts self->topstack->ident into
#   self->bad_routes (a failed malloc or a duplicate insert is ignored, as in
#   the code it replaces). Tokenizer_fail_route() now calls it before popping.
# - tok_parse.c: at the three "unclosed attribute quote" reset sites
#   (Tokenizer_handle_tag_data, Tokenizer_really_parse_tag,
#   Tokenizer_handle_table_style) the current route is memoized as bad before
#   Tokenizer_pop(). When a quote is opened, Tokenizer_check_route() is called
#   first; if it reports a known-bad route, the route flag is reset, the context
#   falls back to TAG_ATTR_VALUE and head is stepped back instead of pushing a
#   new stack. The INVALID_CONTEXT macro in Tokenizer_parse_external_link is
#   inlined into its single use.
# - tokenizer.py: mirrors the C change with _memoize_bad_route(), used by
#   _fail_route() and the three unclosed-quote reset sites; BadRoute raised by
#   _push() when opening a quote is caught and handled as an unquoted value.
#   _parse_external_link() now tests NO_EXT_LINKS / _can_recurse() up front,
#   before moving the head, instead of raising BadRoute inside the try block.
#
# NOTE(review): the original line breaks of this patch appear to have been
# collapsed, so the hunks will not apply as-is -- restore from the upstream
# commit before use.
# NOTE(review): the RST link targets in docs/changelog.rst ("`#185 `_",
# "`#190 `_") and in the __init__.py docstring ("`MediaWiki `_") look stripped;
# confirm the URLs against upstream.
diff --git a/CHANGELOG b/CHANGELOG index d3a2b2b..ebe4d7d 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,7 +1,9 @@ v0.6 (unreleased): - Improved behavior when adding parameters to templates (via Template.add()) - with poorly formatted whitespace conventions. + with poorly formatted whitespace conventions. (#185) +- Fixed the parser getting stuck in deeply nested HTML tags with unclosed, + quoted attributes. (#190) v0.5 (released June 23, 2017): diff --git a/LICENSE b/LICENSE index 588e737..f353cd7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/changelog.rst b/docs/changelog.rst index 841f04a..7aa8f22 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,6 +9,10 @@ Unreleased - Improved behavior when adding parameters to templates (via :meth:`.Template.add`) with poorly formatted whitespace conventions. + (`#185 `_) +- Fixed the parser getting stuck in deeply nested HTML tags with unclosed, + quoted attributes. + (`#190 `_) v0.5 ---- diff --git a/docs/conf.py b/docs/conf.py index 3739429..5ac9c70 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -42,7 +42,7 @@ master_doc = 'index' # General information about the project. 
project = u'mwparserfromhell' -copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic' +copyright = u'2012–2018 Ben Kurtovic' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 1a9c542..11e1094 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2017 Ben Kurtovic +# Copyright (C) 2012-2018 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -27,7 +27,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode. """ __author__ = "Ben Kurtovic" -__copyright__ = "Copyright (C) 2012-2017 Ben Kurtovic" +__copyright__ = "Copyright (C) 2012-2018 Ben Kurtovic" __license__ = "MIT License" __version__ = "0.6.dev0" __email__ = "ben.kurtovic@gmail.com" diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index 90ee19d..1998368 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -722,7 +722,6 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) */ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) { - #define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS #define NOT_A_LINK \ if (!brackets && self->topstack->context & LC_DLTERM) \ return Tokenizer_handle_dl_term(self); \ @@ -732,7 +731,8 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int 
brackets) PyObject *link, *kwargs; Textbuffer *extra; - if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { + if (self->topstack->context & AGG_NO_EXT_LINKS || + !(Tokenizer_CAN_RECURSE(self))) { NOT_A_LINK; } extra = Textbuffer_new(&self->text); @@ -1280,6 +1280,7 @@ static int Tokenizer_handle_tag_data( else if (data->context & TAG_NOTE_SPACE) { if (data->context & TAG_QUOTED) { data->context = TAG_ATTR_VALUE; + Tokenizer_memoize_bad_route(self); trash = Tokenizer_pop(self); Py_XDECREF(trash); self->head = data->reset - 1; // Will be auto-incremented @@ -1317,7 +1318,12 @@ static int Tokenizer_handle_tag_data( data->context |= TAG_QUOTED; data->quoter = chunk; data->reset = self->head; - if (Tokenizer_push(self, self->topstack->context)) + if (Tokenizer_check_route(self, self->topstack->context) < 0) { + RESET_ROUTE(); + data->context = TAG_ATTR_VALUE; + self->head--; + } + else if (Tokenizer_push(self, self->topstack->context)) return -1; return 0; } @@ -1613,6 +1619,7 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) if (data->context & TAG_QUOTED) { // Unclosed attribute quote: reset, don't die data->context = TAG_ATTR_VALUE; + Tokenizer_memoize_bad_route(self); trash = Tokenizer_pop(self); Py_XDECREF(trash); self->head = data->reset; @@ -2185,6 +2192,7 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token if (data->context & TAG_QUOTED) { // Unclosed attribute quote: reset, don't die data->context = TAG_ATTR_VALUE; + Tokenizer_memoize_bad_route(self); trash = Tokenizer_pop(self); Py_XDECREF(trash); self->head = data->reset; diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.c b/mwparserfromhell/parser/ctokenizer/tok_support.c index 062c631..30dc2a1 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.c +++ b/mwparserfromhell/parser/ctokenizer/tok_support.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, 
to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -147,6 +147,22 @@ static int compare_nodes( } /* + Remember that the current route (head + context at push) is invalid. + + This will be noticed when calling Tokenizer_check_route with the same head + and context, and the route will be failed immediately. +*/ +void Tokenizer_memoize_bad_route(Tokenizer *self) +{ + route_tree_node *node = malloc(sizeof(route_tree_node)); + if (node) { + node->id = self->topstack->ident; + if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) + free(node); + } +} + +/* Fail the current tokenization route. Discards the current stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the ident of the failed stack so future parsing attempts down this route can be @@ -157,13 +173,7 @@ void* Tokenizer_fail_route(Tokenizer* self) uint64_t context = self->topstack->context; PyObject* stack; - route_tree_node *node = malloc(sizeof(route_tree_node)); - if (node) { - node->id = self->topstack->ident; - if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) - free(node); - } - + Tokenizer_memoize_bad_route(self); stack = Tokenizer_pop(self); Py_XDECREF(stack); FAIL_ROUTE(context); @@ -173,7 +183,7 @@ void* Tokenizer_fail_route(Tokenizer* self) /* Check if pushing a new route here with the given context would definitely fail, based on a previous call to Tokenizer_fail_route() with the same - stack. + stack. (Or any other call to Tokenizer_memoize_bad_route().) Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the latter case. 
diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.h b/mwparserfromhell/parser/ctokenizer/tok_support.h index 57f4126..f65d102 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.h +++ b/mwparserfromhell/parser/ctokenizer/tok_support.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2017 Ben Kurtovic +Copyright (C) 2012-2018 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -31,6 +31,7 @@ int Tokenizer_push_textbuffer(Tokenizer*); void Tokenizer_delete_top_of_stack(Tokenizer*); PyObject* Tokenizer_pop(Tokenizer*); PyObject* Tokenizer_pop_keeping_context(Tokenizer*); +void Tokenizer_memoize_bad_route(Tokenizer*); void* Tokenizer_fail_route(Tokenizer*); int Tokenizer_check_route(Tokenizer*, uint64_t); void Tokenizer_free_bad_route_tree(Tokenizer*); diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index d7a0282..1bfbc8d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2017 Ben Kurtovic +# Copyright (C) 2012-2018 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -144,6 +144,14 @@ class Tokenizer(object): """Return whether or not our max recursion depth has been exceeded.""" return self._depth < self.MAX_DEPTH + def _memoize_bad_route(self): + """Remember that the current route (head + context at push) is invalid. + + This will be noticed when calling _push with the same head and context, + and the route will be failed immediately. + """ + self._bad_routes.add(self._stack_ident) + def _fail_route(self): """Fail the current tokenization route. @@ -151,7 +159,7 @@ class Tokenizer(object): :exc:`.BadRoute`. 
""" context = self._context - self._bad_routes.add(self._stack_ident) + self._memoize_bad_route() self._pop() raise BadRoute(context) @@ -506,12 +514,16 @@ class Tokenizer(object): def _parse_external_link(self, brackets): """Parse an external link at the head of the wikicode string.""" + if self._context & contexts.NO_EXT_LINKS or not self._can_recurse(): + if not brackets and self._context & contexts.DL_TERM: + self._handle_dl_term() + else: + self._emit_text(self._read()) + return + reset = self._head self._head += 1 try: - bad_context = self._context & contexts.NO_EXT_LINKS - if bad_context or not self._can_recurse(): - raise BadRoute() link, extra, delta = self._really_parse_external_link(brackets) except BadRoute: self._head = reset @@ -719,6 +731,7 @@ class Tokenizer(object): elif data.context & data.CX_NOTE_SPACE: if data.context & data.CX_QUOTED: data.context = data.CX_ATTR_VALUE + self._memoize_bad_route() self._pop() self._head = data.reset - 1 # Will be auto-incremented return # Break early @@ -743,7 +756,13 @@ class Tokenizer(object): data.context |= data.CX_QUOTED data.quoter = chunk data.reset = self._head - self._push(self._context) + try: + self._push(self._context) + except BadRoute: + # Already failed to parse this as a quoted string + data.context = data.CX_ATTR_VALUE + self._head -= 1 + return continue elif data.context & data.CX_QUOTED: if chunk == data.quoter and not escaped: @@ -845,6 +864,7 @@ class Tokenizer(object): if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die data.context = data.CX_ATTR_VALUE + self._memoize_bad_route() self._pop() self._head = data.reset continue @@ -1084,6 +1104,7 @@ class Tokenizer(object): if data.context & data.CX_QUOTED: # Unclosed attribute quote: reset, don't die data.context = data.CX_ATTR_VALUE + self._memoize_bad_route() self._pop() self._head = data.reset continue