From 8a9c9224be6cb2020ed4ad67a401081096dd21d1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 01:08:19 -0400 Subject: [PATCH] Speed up parsing deeply nested syntax by caching bad routes (fixes #42) Also removed the max cycles stop-gap, allowing much more complex pages to be parsed quickly without losing nodes at the end Also fixes #65, fixes #102, fixes #165, fixes #183 Also fixes #81 (Rafael Nadal parsing bug) Also fixes #53, fixes #58, fixes #88, fixes #152 (duplicate issues) --- CHANGELOG | 4 + LICENSE | 2 +- docs/changelog.rst | 9 +- mwparserfromhell/parser/contexts.py | 6 +- mwparserfromhell/parser/ctokenizer/avl_tree.c | 789 +++++++++++++++++++++++ mwparserfromhell/parser/ctokenizer/avl_tree.h | 358 ++++++++++ mwparserfromhell/parser/ctokenizer/common.h | 19 +- mwparserfromhell/parser/ctokenizer/contexts.h | 4 +- mwparserfromhell/parser/ctokenizer/tok_parse.c | 53 +- mwparserfromhell/parser/ctokenizer/tok_support.c | 58 +- mwparserfromhell/parser/ctokenizer/tok_support.h | 10 +- mwparserfromhell/parser/ctokenizer/tokenizer.c | 14 +- mwparserfromhell/parser/tokenizer.py | 55 +- tests/tokenizer/integration.mwtest | 7 + tests/tokenizer/templates.mwtest | 2 +- 15 files changed, 1337 insertions(+), 53 deletions(-) create mode 100644 mwparserfromhell/parser/ctokenizer/avl_tree.c create mode 100644 mwparserfromhell/parser/ctokenizer/avl_tree.h diff --git a/CHANGELOG b/CHANGELOG index 7d34015..bebacbf 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,10 @@ v0.5 (unreleased): contained within another Wikicode object. - Added Wikicode.get_ancestors() and Wikicode.get_parent() to find all ancestors and the direct parent of a Node, respectively. +- Fixed a long-standing performance issue with deeply nested, invalid syntax + (issue #42). The parser should be much faster on certain complex pages. The + "max cycle" restriction has also been removed, so some situations where + templates at the end of a page were being skipped are now resolved. 
- Made Template.remove(keep_field=True) behave more reasonably when the parameter is already empty. - Added the keep_template_params argument to Wikicode.strip_code(). If True, diff --git a/LICENSE b/LICENSE index 230bc5c..588e737 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/changelog.rst b/docs/changelog.rst index 4d0d6fd..c558579 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -12,6 +12,11 @@ Unreleased object. - Added :meth:`.Wikicode.get_ancestors` and :meth:`.Wikicode.get_parent` to find all ancestors and the direct parent of a :class:`.Node`, respectively. +- Fixed a long-standing performance issue with deeply nested, invalid syntax + (`issue #42 `_). The + parser should be much faster on certain complex pages. The "max cycle" + restriction has also been removed, so some situations where templates at the + end of a page were being skipped are now resolved. - Made :meth:`Template.remove(keep_field=True) <.Template.remove>` behave more reasonably when the parameter is already empty. - Added the *keep_template_params* argument to :meth:`.Wikicode.strip_code`. @@ -54,7 +59,7 @@ v0.4.3 v0.4.2 ------ -`Released July 30, 2015 `_ +`Released July 30, 2015 `__ (`changes `__): - Fixed setup script not including header files in releases. 
@@ -63,7 +68,7 @@ v0.4.2 v0.4.1 ------ -`Released July 30, 2015 `_ +`Released July 30, 2015 `__ (`changes `__): - The process for building Windows binaries has been fixed, and these should be diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 405a027..af6dea6 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -100,6 +100,8 @@ Local (stack-specific) contexts: * :const:`TABLE_TH_LINE` * :const:`TABLE_CELL_LINE_CONTEXTS` +* :const:`HTML_ENTITY` + Global contexts: * :const:`GL_HEADING` @@ -176,6 +178,8 @@ TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + TABLE_TD_LINE + TABLE_TH_LINE) +HTML_ENTITY = 1 << 37 + # Global contexts: GL_HEADING = 1 << 0 diff --git a/mwparserfromhell/parser/ctokenizer/avl_tree.c b/mwparserfromhell/parser/ctokenizer/avl_tree.c new file mode 100644 index 0000000..4fdff6f --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/avl_tree.c @@ -0,0 +1,789 @@ +/* + * avl_tree.c - intrusive, nonrecursive AVL tree data structure (self-balancing + * binary search tree), implementation file + * + * Written in 2014-2016 by Eric Biggers + * + * To the extent possible under law, the author(s) have dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide via the Creative Commons Zero 1.0 Universal Public Domain + * Dedication (the "CC0"). + * + * This software is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the CC0 for more details. + * + * You should have received a copy of the CC0 along with this software; if not + * see <http://creativecommons.org/publicdomain/zero/1.0/>. + */ + +#include "avl_tree.h" + +/* Returns the left child (sign < 0) or the right child (sign > 0) of the + * specified AVL tree node. + * Note: for all calls of this, 'sign' is constant at compilation time, + * so the compiler can remove the conditional. */ +static AVL_INLINE struct avl_tree_node * +avl_get_child(const struct avl_tree_node *parent, int sign) +{ + if (sign < 0) + return parent->left; + else + return parent->right; +} + +static AVL_INLINE struct avl_tree_node * +avl_tree_first_or_last_in_order(const struct avl_tree_node *root, int sign) +{ + const struct avl_tree_node *first = root; + + if (first) + while (avl_get_child(first, +sign)) + first = avl_get_child(first, +sign); + return (struct avl_tree_node *)first; +} + +/* Starts an in-order traversal of the tree: returns the least-valued node, or + * NULL if the tree is empty. */ +struct avl_tree_node * +avl_tree_first_in_order(const struct avl_tree_node *root) +{ + return avl_tree_first_or_last_in_order(root, -1); +} + +/* Starts a *reverse* in-order traversal of the tree: returns the + * greatest-valued node, or NULL if the tree is empty. */ +struct avl_tree_node * +avl_tree_last_in_order(const struct avl_tree_node *root) +{ + return avl_tree_first_or_last_in_order(root, 1); +} + +static AVL_INLINE struct avl_tree_node * +avl_tree_next_or_prev_in_order(const struct avl_tree_node *node, int sign) +{ + const struct avl_tree_node *next; + + if (avl_get_child(node, +sign)) + for (next = avl_get_child(node, +sign); + avl_get_child(next, -sign); + next = avl_get_child(next, -sign)) + ; + else + for (next = avl_get_parent(node); + next && node == avl_get_child(next, +sign); + node = next, next = avl_get_parent(next)) + ; + return (struct avl_tree_node *)next; +} + +/* Continues an in-order traversal of the tree: returns the next-greatest-valued + * node, or NULL if there is none. 
*/ +struct avl_tree_node * +avl_tree_next_in_order(const struct avl_tree_node *node) +{ + return avl_tree_next_or_prev_in_order(node, 1); +} + +/* Continues a *reverse* in-order traversal of the tree: returns the + * previous-greatest-valued node, or NULL if there is none. */ +struct avl_tree_node * +avl_tree_prev_in_order(const struct avl_tree_node *node) +{ + return avl_tree_next_or_prev_in_order(node, -1); +} + +/* Starts a postorder traversal of the tree. */ +struct avl_tree_node * +avl_tree_first_in_postorder(const struct avl_tree_node *root) +{ + const struct avl_tree_node *first = root; + + if (first) + while (first->left || first->right) + first = first->left ? first->left : first->right; + + return (struct avl_tree_node *)first; +} + +/* Continues a postorder traversal of the tree. @prev will not be dereferenced as + * it's allowed that its memory has been freed; @prev_parent must be its saved + * parent node. Returns NULL if there are no more nodes (i.e. @prev was the + * root of the tree). */ +struct avl_tree_node * +avl_tree_next_in_postorder(const struct avl_tree_node *prev, + const struct avl_tree_node *prev_parent) +{ + const struct avl_tree_node *next = prev_parent; + + if (next && prev == next->left && next->right) + for (next = next->right; + next->left || next->right; + next = next->left ? next->left : next->right) + ; + return (struct avl_tree_node *)next; +} + +/* Sets the left child (sign < 0) or the right child (sign > 0) of the + * specified AVL tree node. + * Note: for all calls of this, 'sign' is constant at compilation time, + * so the compiler can remove the conditional. */ +static AVL_INLINE void +avl_set_child(struct avl_tree_node *parent, int sign, + struct avl_tree_node *child) +{ + if (sign < 0) + parent->left = child; + else + parent->right = child; +} + +/* Sets the parent and balance factor of the specified AVL tree node. 
*/ +static AVL_INLINE void +avl_set_parent_balance(struct avl_tree_node *node, struct avl_tree_node *parent, + int balance_factor) +{ + node->parent_balance = (uintptr_t)parent | (balance_factor + 1); +} + +/* Sets the parent of the specified AVL tree node. */ +static AVL_INLINE void +avl_set_parent(struct avl_tree_node *node, struct avl_tree_node *parent) +{ + node->parent_balance = (uintptr_t)parent | (node->parent_balance & 3); +} + +/* Returns the balance factor of the specified AVL tree node --- that is, the + * height of its right subtree minus the height of its left subtree. */ +static AVL_INLINE int +avl_get_balance_factor(const struct avl_tree_node *node) +{ + return (int)(node->parent_balance & 3) - 1; +} + +/* Adds @amount to the balance factor of the specified AVL tree node. + * The caller must ensure this still results in a valid balance factor + * (-1, 0, or 1). */ +static AVL_INLINE void +avl_adjust_balance_factor(struct avl_tree_node *node, int amount) +{ + node->parent_balance += amount; +} + +static AVL_INLINE void +avl_replace_child(struct avl_tree_node **root_ptr, + struct avl_tree_node *parent, + struct avl_tree_node *old_child, + struct avl_tree_node *new_child) +{ + if (parent) { + if (old_child == parent->left) + parent->left = new_child; + else + parent->right = new_child; + } else { + *root_ptr = new_child; + } +} + +/* + * Template for performing a single rotation --- + * + * sign > 0: Rotate clockwise (right) rooted at A: + * + * P? P? + * | | + * A B + * / \ / \ + * B C? => D? A + * / \ / \ + * D? E? E? C? + * + * (nodes marked with ? may not exist) + * + * sign < 0: Rotate counterclockwise (left) rooted at A: + * + * P? P? + * | | + * A B + * / \ / \ + * C? B => A D? + * / \ / \ + * E? D? C? E? + * + * This updates pointers but not balance factors! 
+ */ +static AVL_INLINE void +avl_rotate(struct avl_tree_node ** const root_ptr, + struct avl_tree_node * const A, const int sign) +{ + struct avl_tree_node * const B = avl_get_child(A, -sign); + struct avl_tree_node * const E = avl_get_child(B, +sign); + struct avl_tree_node * const P = avl_get_parent(A); + + avl_set_child(A, -sign, E); + avl_set_parent(A, B); + + avl_set_child(B, +sign, A); + avl_set_parent(B, P); + + if (E) + avl_set_parent(E, A); + + avl_replace_child(root_ptr, P, A, B); +} + +/* + * Template for performing a double rotation --- + * + * sign > 0: Rotate counterclockwise (left) rooted at B, then + * clockwise (right) rooted at A: + * + * P? P? P? + * | | | + * A A E + * / \ / \ / \ + * B C? => E C? => B A + * / \ / \ / \ / \ + * D? E B G? D? F?G? C? + * / \ / \ + * F? G? D? F? + * + * (nodes marked with ? may not exist) + * + * sign < 0: Rotate clockwise (right) rooted at B, then + * counterclockwise (left) rooted at A: + * + * P? P? P? + * | | | + * A A E + * / \ / \ / \ + * C? B => C? E => A B + * / \ / \ / \ / \ + * E D? G? B C? G?F? D? + * / \ / \ + * G? F? F? D? + * + * Returns a pointer to E and updates balance factors. Except for those + * two things, this function is equivalent to: + * avl_rotate(root_ptr, B, -sign); + * avl_rotate(root_ptr, A, +sign); + * + * See comment in avl_handle_subtree_growth() for explanation of balance + * factor updates. + */ +static AVL_INLINE struct avl_tree_node * +avl_do_double_rotate(struct avl_tree_node ** const root_ptr, + struct avl_tree_node * const B, + struct avl_tree_node * const A, const int sign) +{ + struct avl_tree_node * const E = avl_get_child(B, +sign); + struct avl_tree_node * const F = avl_get_child(E, -sign); + struct avl_tree_node * const G = avl_get_child(E, +sign); + struct avl_tree_node * const P = avl_get_parent(A); + const int e = avl_get_balance_factor(E); + + avl_set_child(A, -sign, G); + avl_set_parent_balance(A, E, ((sign * e >= 0) ? 
0 : -e)); + + avl_set_child(B, +sign, F); + avl_set_parent_balance(B, E, ((sign * e <= 0) ? 0 : -e)); + + avl_set_child(E, +sign, A); + avl_set_child(E, -sign, B); + avl_set_parent_balance(E, P, 0); + + if (G) + avl_set_parent(G, A); + + if (F) + avl_set_parent(F, B); + + avl_replace_child(root_ptr, P, A, E); + + return E; +} + +/* + * This function handles the growth of a subtree due to an insertion. + * + * @root_ptr + * Location of the tree's root pointer. + * + * @node + * A subtree that has increased in height by 1 due to an insertion. + * + * @parent + * Parent of @node; must not be NULL. + * + * @sign + * -1 if @node is the left child of @parent; + * +1 if @node is the right child of @parent. + * + * This function will adjust @parent's balance factor, then do a (single + * or double) rotation if necessary. The return value will be %true if + * the full AVL tree is now adequately balanced, or %false if the subtree + * rooted at @parent is now adequately balanced but has increased in + * height by 1, so the caller should continue up the tree. + * + * Note that if %false is returned, no rotation will have been done. + * Indeed, a single node insertion cannot require that more than one + * (single or double) rotation be done. + */ +static AVL_INLINE bool +avl_handle_subtree_growth(struct avl_tree_node ** const root_ptr, + struct avl_tree_node * const node, + struct avl_tree_node * const parent, + const int sign) +{ + int old_balance_factor, new_balance_factor; + + old_balance_factor = avl_get_balance_factor(parent); + + if (old_balance_factor == 0) { + avl_adjust_balance_factor(parent, sign); + /* @parent is still sufficiently balanced (-1 or +1 + * balance factor), but must have increased in height. + * Continue up the tree. */ + return false; + } + + new_balance_factor = old_balance_factor + sign; + + if (new_balance_factor == 0) { + avl_adjust_balance_factor(parent, sign); + /* @parent is now perfectly balanced (0 balance factor). 
+ * It cannot have increased in height, so there is + * nothing more to do. */ + return true; + } + + /* @parent is too left-heavy (new_balance_factor == -2) or + * too right-heavy (new_balance_factor == +2). */ + + /* Test whether @node is left-heavy (-1 balance factor) or + * right-heavy (+1 balance factor). + * Note that it cannot be perfectly balanced (0 balance factor) + * because here we are under the invariant that @node has + * increased in height due to the insertion. */ + if (sign * avl_get_balance_factor(node) > 0) { + + /* @node (B below) is heavy in the same direction @parent + * (A below) is heavy. + * + * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * The comment, diagram, and equations below assume sign < 0. + * The other case is symmetric! + * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * + * Do a clockwise rotation rooted at @parent (A below): + * + * A B + * / \ / \ + * B C? => D A + * / \ / \ / \ + * D E? F? G?E? C? + * / \ + * F? G? + * + * Before the rotation: + * balance(A) = -2 + * balance(B) = -1 + * Let x = height(C). Then: + * height(B) = x + 2 + * height(D) = x + 1 + * height(E) = x + * max(height(F), height(G)) = x. + * + * After the rotation: + * height(D) = max(height(F), height(G)) + 1 + * = x + 1 + * height(A) = max(height(E), height(C)) + 1 + * = max(x, x) + 1 = x + 1 + * balance(B) = 0 + * balance(A) = 0 + */ + avl_rotate(root_ptr, parent, -sign); + + /* Equivalent to setting @parent's balance factor to 0. */ + avl_adjust_balance_factor(parent, -sign); /* A */ + + /* Equivalent to setting @node's balance factor to 0. */ + avl_adjust_balance_factor(node, -sign); /* B */ + } else { + /* @node (B below) is heavy in the direction opposite + * from the direction @parent (A below) is heavy. + * + * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * The comment, diagram, and equations below assume sign < 0. + * The other case is symmetric! 
+ * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * + * Do a counterclockwise rotation rooted at @node (B below), + * then a clockwise rotation rooted at @parent (A below): + * + * A A E + * / \ / \ / \ + * B C? => E C? => B A + * / \ / \ / \ / \ + * D? E B G? D? F?G? C? + * / \ / \ + * F? G? D? F? + * + * Before the rotation: + * balance(A) = -2 + * balance(B) = +1 + * Let x = height(C). Then: + * height(B) = x + 2 + * height(E) = x + 1 + * height(D) = x + * max(height(F), height(G)) = x + * + * After both rotations: + * height(A) = max(height(G), height(C)) + 1 + * = x + 1 + * balance(A) = balance(E{orig}) >= 0 ? 0 : -balance(E{orig}) + * height(B) = max(height(D), height(F)) + 1 + * = x + 1 + * balance(B) = balance(E{orig}) <= 0 ? 0 : -balance(E{orig}) + * + * height(E) = x + 2 + * balance(E) = 0 + */ + avl_do_double_rotate(root_ptr, node, parent, -sign); + } + + /* Height after rotation is unchanged; nothing more to do. */ + return true; +} + +/* Rebalance the tree after insertion of the specified node. */ +void +avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr, + struct avl_tree_node *inserted) +{ + struct avl_tree_node *node, *parent; + bool done; + + inserted->left = NULL; + inserted->right = NULL; + + node = inserted; + + /* Adjust balance factor of new node's parent. + * No rotation will need to be done at this level. */ + + parent = avl_get_parent(node); + if (!parent) + return; + + if (node == parent->left) + avl_adjust_balance_factor(parent, -1); + else + avl_adjust_balance_factor(parent, +1); + + if (avl_get_balance_factor(parent) == 0) + /* @parent did not change in height. Nothing more to do. */ + return; + + /* The subtree rooted at @parent increased in height by 1. */ + + do { + /* Adjust balance factor of next ancestor. */ + + node = parent; + parent = avl_get_parent(node); + if (!parent) + return; + + /* The subtree rooted at @node has increased in height by 1. 
*/ + if (node == parent->left) + done = avl_handle_subtree_growth(root_ptr, node, + parent, -1); + else + done = avl_handle_subtree_growth(root_ptr, node, + parent, +1); + } while (!done); +} + +/* + * This function handles the shrinkage of a subtree due to a deletion. + * + * @root_ptr + * Location of the tree's root pointer. + * + * @parent + * A node in the tree, exactly one of whose subtrees has decreased + * in height by 1 due to a deletion. (This includes the case where + * one of the child pointers has become NULL, since we can consider + * the "NULL" subtree to have a height of 0.) + * + * @sign + * +1 if the left subtree of @parent has decreased in height by 1; + * -1 if the right subtree of @parent has decreased in height by 1. + * + * @left_deleted_ret + * If the return value is not NULL, this will be set to %true if the + * left subtree of the returned node has decreased in height by 1, + * or %false if the right subtree of the returned node has decreased + * in height by 1. + * + * This function will adjust @parent's balance factor, then do a (single + * or double) rotation if necessary. The return value will be NULL if + * the full AVL tree is now adequately balanced, or a pointer to the + * parent of @parent if @parent is now adequately balanced but has + * decreased in height by 1. Also in the latter case, *left_deleted_ret + * will be set. + */ +static AVL_INLINE struct avl_tree_node * +avl_handle_subtree_shrink(struct avl_tree_node ** const root_ptr, + struct avl_tree_node *parent, + const int sign, + bool * const left_deleted_ret) +{ + struct avl_tree_node *node; + int old_balance_factor, new_balance_factor; + + old_balance_factor = avl_get_balance_factor(parent); + + if (old_balance_factor == 0) { + /* Prior to the deletion, the subtree rooted at + * @parent was perfectly balanced. It's now + * unbalanced by 1, but that's okay and its height + * hasn't changed. Nothing more to do. 
*/ + avl_adjust_balance_factor(parent, sign); + return NULL; + } + + new_balance_factor = old_balance_factor + sign; + + if (new_balance_factor == 0) { + /* The subtree rooted at @parent is now perfectly + * balanced, whereas before the deletion it was + * unbalanced by 1. Its height must have decreased + * by 1. No rotation is needed at this location, + * but continue up the tree. */ + avl_adjust_balance_factor(parent, sign); + node = parent; + } else { + /* @parent is too left-heavy (new_balance_factor == -2) or + * too right-heavy (new_balance_factor == +2). */ + + node = avl_get_child(parent, sign); + + /* The rotations below are similar to those done during + * insertion (see avl_handle_subtree_growth()), so full + * comments are not provided. The only new case is the + * one where @node has a balance factor of 0, and that is + * commented. */ + + if (sign * avl_get_balance_factor(node) >= 0) { + + avl_rotate(root_ptr, parent, -sign); + + if (avl_get_balance_factor(node) == 0) { + /* + * @node (B below) is perfectly balanced. + * + * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * The comment, diagram, and equations + * below assume sign < 0. The other case + * is symmetric! + * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * + * Do a clockwise rotation rooted at + * @parent (A below): + * + * A B + * / \ / \ + * B C? => D A + * / \ / \ / \ + * D E F? G?E C? + * / \ + * F? G? + * + * Before the rotation: + * balance(A) = -2 + * balance(B) = 0 + * Let x = height(C). Then: + * height(B) = x + 2 + * height(D) = x + 1 + * height(E) = x + 1 + * max(height(F), height(G)) = x. + * + * After the rotation: + * height(D) = max(height(F), height(G)) + 1 + * = x + 1 + * height(A) = max(height(E), height(C)) + 1 + * = max(x + 1, x) + 1 = x + 2 + * balance(A) = -1 + * balance(B) = +1 + */ + + /* A: -2 => -1 (sign < 0) + * or +2 => +1 (sign > 0) + * No change needed --- that's the same as + * old_balance_factor. 
*/ + + /* B: 0 => +1 (sign < 0) + * or 0 => -1 (sign > 0) */ + avl_adjust_balance_factor(node, -sign); + + /* Height is unchanged; nothing more to do. */ + return NULL; + } else { + avl_adjust_balance_factor(parent, -sign); + avl_adjust_balance_factor(node, -sign); + } + } else { + node = avl_do_double_rotate(root_ptr, node, + parent, -sign); + } + } + parent = avl_get_parent(node); + if (parent) + *left_deleted_ret = (node == parent->left); + return parent; +} + +/* Swaps node X, which must have 2 children, with its in-order successor, then + * unlinks node X. Returns the parent of X just before unlinking, without its + * balance factor having been updated to account for the unlink. */ +static AVL_INLINE struct avl_tree_node * +avl_tree_swap_with_successor(struct avl_tree_node **root_ptr, + struct avl_tree_node *X, + bool *left_deleted_ret) +{ + struct avl_tree_node *Y, *ret; + + Y = X->right; + if (!Y->left) { + /* + * P? P? P? + * | | | + * X Y Y + * / \ / \ / \ + * A Y => A X => A B? + * / \ / \ + * (0) B? (0) B? + * + * [ X unlinked, Y returned ] + */ + ret = Y; + *left_deleted_ret = false; + } else { + struct avl_tree_node *Q; + + do { + Q = Y; + Y = Y->left; + } while (Y->left); + + /* + * P? P? P? + * | | | + * X Y Y + * / \ / \ / \ + * A ... => A ... => A ... + * | | | + * Q Q Q + * / / / + * Y X B? + * / \ / \ + * (0) B? (0) B? + * + * + * [ X unlinked, Q returned ] + */ + + Q->left = Y->right; + if (Q->left) + avl_set_parent(Q->left, Q); + Y->right = X->right; + avl_set_parent(X->right, Y); + ret = Q; + *left_deleted_ret = true; + } + + Y->left = X->left; + avl_set_parent(X->left, Y); + + Y->parent_balance = X->parent_balance; + avl_replace_child(root_ptr, avl_get_parent(X), X, Y); + + return ret; +} + +/* + * Removes an item from the specified AVL tree. + * + * @root_ptr + * Location of the AVL tree's root pointer. 
Indirection is needed + * because the root node may change if the tree needed to be rebalanced + * because of the deletion or if @node was the root node. + * + * @node + * Pointer to the `struct avl_tree_node' embedded in the item to + * remove from the tree. + * + * Note: This function *only* removes the node and rebalances the tree. + * It does not free any memory, nor does it do the equivalent of + * avl_tree_node_set_unlinked(). + */ +void +avl_tree_remove(struct avl_tree_node **root_ptr, struct avl_tree_node *node) +{ + struct avl_tree_node *parent; + bool left_deleted = false; + + if (node->left && node->right) { + /* @node is fully internal, with two children. Swap it + * with its in-order successor (which must exist in the + * right subtree of @node and can have, at most, a right + * child), then unlink @node. */ + parent = avl_tree_swap_with_successor(root_ptr, node, + &left_deleted); + /* @parent is now the parent of what was @node's in-order + * successor. It cannot be NULL, since @node itself was + * an ancestor of its in-order successor. + * @left_deleted has been set to %true if @node's + * in-order successor was the left child of @parent, + * otherwise %false. */ + } else { + struct avl_tree_node *child; + + /* @node is missing at least one child. Unlink it. Set + * @parent to @node's parent, and set @left_deleted to + * reflect which child of @parent @node was. Or, if + * @node was the root node, simply update the root node + * and return. */ + child = node->left ? node->left : node->right; + parent = avl_get_parent(node); + if (parent) { + if (node == parent->left) { + parent->left = child; + left_deleted = true; + } else { + parent->right = child; + left_deleted = false; + } + if (child) + avl_set_parent(child, parent); + } else { + if (child) + avl_set_parent(child, parent); + *root_ptr = child; + return; + } + } + + /* Rebalance the tree. 
*/ + do { + if (left_deleted) + parent = avl_handle_subtree_shrink(root_ptr, parent, + +1, &left_deleted); + else + parent = avl_handle_subtree_shrink(root_ptr, parent, + -1, &left_deleted); + } while (parent); +} diff --git a/mwparserfromhell/parser/ctokenizer/avl_tree.h b/mwparserfromhell/parser/ctokenizer/avl_tree.h new file mode 100644 index 0000000..86ade3f --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/avl_tree.h @@ -0,0 +1,358 @@ +/* + * avl_tree.h - intrusive, nonrecursive AVL tree data structure (self-balancing + * binary search tree), header file + * + * Written in 2014-2016 by Eric Biggers + * + * To the extent possible under law, the author(s) have dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide via the Creative Commons Zero 1.0 Universal Public Domain + * Dedication (the "CC0"). + * + * This software is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the CC0 for more details. + * + * You should have received a copy of the CC0 along with this software; if not + * see <http://creativecommons.org/publicdomain/zero/1.0/>. + */ + +#ifndef _AVL_TREE_H_ +#define _AVL_TREE_H_ + +#include <stdbool.h> +#include <stddef.h> +#include <inttypes.h> /* for uintptr_t */ + +#ifdef __GNUC__ +# define AVL_INLINE inline __attribute__((always_inline)) +#else +# define AVL_INLINE inline +#endif + +/* Node in an AVL tree. Embed this in some other data structure. */ +struct avl_tree_node { + + /* Pointer to left child or NULL */ + struct avl_tree_node *left; + + /* Pointer to right child or NULL */ + struct avl_tree_node *right; + + /* Pointer to parent combined with the balance factor. This saves 4 or + * 8 bytes of memory depending on the CPU architecture. + * + * Low 2 bits: One greater than the balance factor of this subtree, + * which is equal to height(right) - height(left). 
The mapping is: + * + * 00 => -1 + * 01 => 0 + * 10 => +1 + * 11 => undefined + * + * The rest of the bits are the pointer to the parent node. It must be + * 4-byte aligned, and it will be NULL if this is the root node and + * therefore has no parent. */ + uintptr_t parent_balance; +}; + +/* Cast an AVL tree node to the containing data structure. */ +#define avl_tree_entry(entry, type, member) \ + ((type*) ((char *)(entry) - offsetof(type, member))) + +/* Returns a pointer to the parent of the specified AVL tree node, or NULL if it + * is already the root of the tree. */ +static AVL_INLINE struct avl_tree_node * +avl_get_parent(const struct avl_tree_node *node) +{ + return (struct avl_tree_node *)(node->parent_balance & ~3); +} + +/* Marks the specified AVL tree node as unlinked from any tree. */ +static AVL_INLINE void +avl_tree_node_set_unlinked(struct avl_tree_node *node) +{ + node->parent_balance = (uintptr_t)node; +} + +/* Returns true iff the specified AVL tree node has been marked with + * avl_tree_node_set_unlinked() and has not subsequently been inserted into a + * tree. */ +static AVL_INLINE bool +avl_tree_node_is_unlinked(const struct avl_tree_node *node) +{ + return node->parent_balance == (uintptr_t)node; +} + +/* (Internal use only) */ +extern void +avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr, + struct avl_tree_node *inserted); + +/* + * Looks up an item in the specified AVL tree. + * + * @root + * Pointer to the root of the AVL tree. (This can be NULL --- that just + * means the tree is empty.) + * + * @cmp_ctx + * First argument to pass to the comparison callback. This generally + * should be a pointer to an object equal to the one being searched for. + * + * @cmp + * Comparison callback. Must return < 0, 0, or > 0 if the first argument + * is less than, equal to, or greater than the second argument, + * respectively. 
The first argument will be @cmp_ctx and the second + * argument will be a pointer to the AVL tree node of an item in the tree. + * + * Returns a pointer to the AVL tree node of the resulting item, or NULL if the + * item was not found. + * + * Example: + * + * struct int_wrapper { + * int data; + * struct avl_tree_node index_node; + * }; + * + * static int _avl_cmp_int_to_node(const void *intptr, + * const struct avl_tree_node *nodeptr) + * { + * int n1 = *(const int *)intptr; + * int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data; + * if (n1 < n2) + * return -1; + * else if (n1 > n2) + * return 1; + * else + * return 0; + * } + * + * bool contains_int(struct avl_tree_node *root, int n) + * { + * struct avl_tree_node *result; + * + * result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node); + * return result ? true : false; + * } + */ +static AVL_INLINE struct avl_tree_node * +avl_tree_lookup(const struct avl_tree_node *root, + const void *cmp_ctx, + int (*cmp)(const void *, const struct avl_tree_node *)) +{ + const struct avl_tree_node *cur = root; + + while (cur) { + int res = (*cmp)(cmp_ctx, cur); + if (res < 0) + cur = cur->left; + else if (res > 0) + cur = cur->right; + else + break; + } + return (struct avl_tree_node*)cur; +} + +/* Same as avl_tree_lookup(), but uses a more specific type for the comparison + * function. Specifically, with this function the item being searched for is + * expected to be in the same format as those already in the tree, with an + * embedded 'struct avl_tree_node'. 
*/ +static AVL_INLINE struct avl_tree_node * +avl_tree_lookup_node(const struct avl_tree_node *root, + const struct avl_tree_node *node, + int (*cmp)(const struct avl_tree_node *, + const struct avl_tree_node *)) +{ + const struct avl_tree_node *cur = root; + + while (cur) { + int res = (*cmp)(node, cur); + if (res < 0) + cur = cur->left; + else if (res > 0) + cur = cur->right; + else + break; + } + return (struct avl_tree_node*)cur; +} + +/* + * Inserts an item into the specified AVL tree. + * + * @root_ptr + * Location of the AVL tree's root pointer. Indirection is needed because + * the root node may change as a result of rotations caused by the + * insertion. Initialize *root_ptr to NULL for an empty tree. + * + * @item + * Pointer to the `struct avl_tree_node' embedded in the item to insert. + * No members in it need be pre-initialized, although members in the + * containing structure should be pre-initialized so that @cmp can use them + * in comparisons. + * + * @cmp + * Comparison callback. Must return < 0, 0, or > 0 if the first argument + * is less than, equal to, or greater than the second argument, + * respectively. The first argument will be @item and the second + * argument will be a pointer to an AVL tree node embedded in some + * previously-inserted item to which @item is being compared. + * + * If no item in the tree is comparatively equal (via @cmp) to @item, inserts + * @item and returns NULL. Otherwise does nothing and returns a pointer to the + * AVL tree node embedded in the previously-inserted item which compared equal + * to @item. 
+ *
+ * Example:
+ *
+ * struct int_wrapper {
+ *	int data;
+ *	struct avl_tree_node index_node;
+ * };
+ *
+ * #define GET_DATA(i) avl_tree_entry((i), struct int_wrapper, index_node)->data
+ *
+ * static int _avl_cmp_ints(const struct avl_tree_node *node1,
+ *			    const struct avl_tree_node *node2)
+ * {
+ *	int n1 = GET_DATA(node1);
+ *	int n2 = GET_DATA(node2);
+ *	if (n1 < n2)
+ *		return -1;
+ *	else if (n1 > n2)
+ *		return 1;
+ *	else
+ *		return 0;
+ * }
+ *
+ * bool insert_int(struct avl_tree_node **root_ptr, int data)
+ * {
+ *	struct int_wrapper *i = malloc(sizeof(struct int_wrapper));
+ *	i->data = data;
+ *	if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) {
+ *		// Duplicate.
+ *		free(i);
+ *		return false;
+ *	}
+ *	return true;
+ * }
+ */
+static AVL_INLINE struct avl_tree_node *
+avl_tree_insert(struct avl_tree_node **root_ptr,
+		struct avl_tree_node *item,
+		int (*cmp)(const struct avl_tree_node *,
+			   const struct avl_tree_node *))
+{
+	struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL;
+	int res;
+	/* Find the empty child link where @item belongs; cur trails as its parent. */
+	while (*cur_ptr) {
+		cur = *cur_ptr;
+		res = (*cmp)(item, cur);
+		if (res < 0)
+			cur_ptr = &cur->left;
+		else if (res > 0)
+			cur_ptr = &cur->right;
+		else
+			return cur;	/* equal item already present: insert nothing */
+	}
+	*cur_ptr = item;	/* attach at the empty link (the root link if the tree was empty) */
+	item->parent_balance = (uintptr_t)cur | 1;	/* parent pointer plus low tag bits; NOTE(review): presumably 1 means "balanced" - confirm */
+	avl_tree_rebalance_after_insert(root_ptr, item);	/* may replace *root_ptr, per the @root_ptr note above */
+	return NULL;
+}
+
+/* Removes an item from the specified AVL tree.
+ * See implementation for details.
*/
+extern void
+avl_tree_remove(struct avl_tree_node **root_ptr, struct avl_tree_node *node);
+
+/* Nonrecursive AVL tree traversal functions (declared here, defined elsewhere) */
+
+extern struct avl_tree_node *
+avl_tree_first_in_order(const struct avl_tree_node *root);
+
+extern struct avl_tree_node *
+avl_tree_last_in_order(const struct avl_tree_node *root);
+
+extern struct avl_tree_node *
+avl_tree_next_in_order(const struct avl_tree_node *node);
+
+extern struct avl_tree_node *
+avl_tree_prev_in_order(const struct avl_tree_node *node);
+
+extern struct avl_tree_node *
+avl_tree_first_in_postorder(const struct avl_tree_node *root);
+
+extern struct avl_tree_node *
+avl_tree_next_in_postorder(const struct avl_tree_node *prev,
+			   const struct avl_tree_node *prev_parent);
+
+/*
+ * Iterate through the nodes in an AVL tree in sorted order. The cursor `_cur'
+ * is declared in the for statement. You may not modify the tree while iterating.
+ *
+ * @child_struct
+ *	Variable that will receive a pointer to each struct inserted into the
+ *	tree. It is assigned before every pass through the loop body.
+ * @root
+ *	Root of the AVL tree. Evaluated once, in the loop initializer.
+ * @struct_name
+ *	Type of *child_struct.
+ * @struct_member
+ *	Member of @struct_name type that is the AVL tree node.
+ *
+ * Example:
+ *
+ * struct int_wrapper {
+ *	int data;
+ *	struct avl_tree_node index_node;
+ * };
+ *
+ * void print_ints(struct avl_tree_node *root)
+ * {
+ *	struct int_wrapper *i;
+ *
+ *	avl_tree_for_each_in_order(i, root, struct int_wrapper, index_node)
+ *		printf("%d\n", i->data);
+ * }
+ */
+#define avl_tree_for_each_in_order(child_struct, root,		\
+				   struct_name, struct_member)	\
+	for (struct avl_tree_node *_cur =			\
+		avl_tree_first_in_order(root);			\
+	     _cur && ((child_struct) =				\
+		      avl_tree_entry(_cur, struct_name,		\
+				     struct_member), 1);	\
+	     _cur = avl_tree_next_in_order(_cur))
+
+/*
+ * Like avl_tree_for_each_in_order(), but uses the reverse order.
+ */
+#define avl_tree_for_each_in_reverse_order(child_struct, root,		\
+					   struct_name, struct_member)	\
+	for (struct avl_tree_node *_cur =				\
+		avl_tree_last_in_order(root);				\
+	     _cur && ((child_struct) =					\
+		      avl_tree_entry(_cur, struct_name,			\
+				     struct_member), 1);		\
+	     _cur = avl_tree_prev_in_order(_cur))
+
+/*
+ * Like avl_tree_for_each_in_order(), but iterates through the nodes in
+ * postorder, so the current node may be deleted or freed in the loop body.
+ */
+#define avl_tree_for_each_in_postorder(child_struct, root,		\
+				       struct_name, struct_member)	\
+	for (struct avl_tree_node *_cur =				\
+		avl_tree_first_in_postorder(root), *_parent;		\
+	     _cur && ((child_struct) =					\
+		      avl_tree_entry(_cur, struct_name,			\
+				     struct_member), 1)			\
+	          && (_parent = avl_get_parent(_cur), 1); /* read the parent before the body can free _cur */ \
+	     _cur = avl_tree_next_in_postorder(_cur, _parent))
+
+#endif /* _AVL_TREE_H_ */
diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h index 3bd22af..f3d51f4 100644 --- a/mwparserfromhell/parser/ctokenizer/common.h +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -30,6 +30,8 @@ SOFTWARE.
#include #include +#include "avl_tree.h" + /* Compatibility macros */ #if PY_MAJOR_VERSION >= 3
@@ -92,10 +94,16 @@ typedef struct {
 #endif
 } Textbuffer;
 
+typedef struct {
+    Py_ssize_t head;    /* tokenizer head position at the time the stack was pushed */
+    uint64_t context;   /* context the stack was pushed with */
+} StackIdent;
+
 struct Stack {
     PyObject* stack;
     uint64_t context;
     Textbuffer* textbuffer;
+    StackIdent ident;   /* (head, context) pair recorded for this stack when it is pushed */
     struct Stack* next;
 };
 typedef struct Stack Stack;
@@ -111,6 +119,13 @@ typedef struct {
 #endif
 } TokenizerInput;
 
+typedef struct avl_tree_node avl_tree;
+
+typedef struct {
+    StackIdent id;              /* key: ordered by head, then by context */
+    struct avl_tree_node node;  /* tree linkage used by Tokenizer.bad_routes */
+} route_tree_node;
+
 typedef struct {
     PyObject_HEAD
     TokenizerInput text;    /* text to tokenize */
@@ -118,8 +133,8 @@ typedef struct { Py_ssize_t head; /* current position in text */ int global; /* global context */ int depth; /* stack recursion depth */ - int cycles; /* total number of stack recursions */ int route_state; /* whether a BadRoute has been triggered */ uint64_t route_context; /* context when the last BadRoute was triggered */ + avl_tree* bad_routes; /* stack idents for routes known to fail */ int skip_style_tags; /* temp fix for the sometimes broken tag parser */ } Tokenizer; diff --git a/mwparserfromhell/parser/ctokenizer/contexts.h b/mwparserfromhell/parser/ctokenizer/contexts.h index 96afd6c..2696925 100644 --- a/mwparserfromhell/parser/ctokenizer/contexts.h +++ b/mwparserfromhell/parser/ctokenizer/contexts.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -81,6 +81,8 @@ SOFTWARE.
#define LC_TABLE_TD_LINE 0x0000000800000000 #define LC_TABLE_TH_LINE 0x0000001000000000 +#define LC_HTML_ENTITY 0x0000002000000000 + /* Global contexts */ #define GL_HEADING 0x1 diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index f4e9606..27eed67 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -445,6 +445,8 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) Unicode this; int slashes, i; + if (Tokenizer_check_route(self, LC_EXT_LINK_URI) < 0) + return 0; if (Tokenizer_push(self, LC_EXT_LINK_URI)) return -1; if (Tokenizer_read(self, 0) == '/' && Tokenizer_read(self, 1) == '/') { @@ -461,7 +463,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) while (1) { if (!valid[i]) goto end_of_loop; - if (this == valid[i]) + if (this == (Unicode) valid[i]) break; i++; } @@ -533,7 +535,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) FAIL_ROUTE(0); return 0; } - } while (chunk != valid[j++]); + } while (chunk != (Unicode) valid[j++]); Textbuffer_write(scheme_buffer, chunk); } end_of_loop: @@ -552,7 +554,12 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) return 0; } Py_DECREF(scheme); - if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) { + uint64_t new_context = self->topstack->context | LC_EXT_LINK_URI; + if (Tokenizer_check_route(self, new_context) < 0) { + Textbuffer_dealloc(scheme_buffer); + return 0; + } + if (Tokenizer_push(self, new_context)) { Textbuffer_dealloc(scheme_buffer); return -1; } @@ -1000,7 +1007,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) while (1) { if (!valid[j]) 
FAIL_ROUTE_AND_EXIT() - if (this == valid[j]) + if (this == (Unicode) valid[j]) break; j++; } @@ -1065,11 +1072,14 @@ static int Tokenizer_parse_entity(Tokenizer* self) Py_ssize_t reset = self->head; PyObject *tokenlist; - if (Tokenizer_push(self, 0)) + if (Tokenizer_check_route(self, LC_HTML_ENTITY) < 0) + goto on_bad_route; + if (Tokenizer_push(self, LC_HTML_ENTITY)) return -1; if (Tokenizer_really_parse_entity(self)) return -1; if (BAD_ROUTE) { + on_bad_route: RESET_ROUTE(); self->head = reset; if (Tokenizer_emit_char(self, '&'))
@@ -1574,6 +1584,10 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self)
 
     if (!data)
         return NULL;
+    if (Tokenizer_check_route(self, LC_TAG_OPEN) < 0) {
+        TagData_dealloc(data);  /* known-bad route: free data, as the push-failure path below does */
+        return NULL;
+    }
     if (Tokenizer_push(self, LC_TAG_OPEN)) {
         TagData_dealloc(data);
         return NULL;
@@ -2191,14 +2203,17 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token static int Tokenizer_parse_table(Tokenizer* self) { Py_ssize_t reset = self->head; - PyObject *style, *padding; + PyObject *style, *padding, *trash; PyObject *table = NULL; self->head += 2; - if(Tokenizer_push(self, LC_TABLE_OPEN)) + if (Tokenizer_check_route(self, LC_TABLE_OPEN) < 0) + goto on_bad_route; + if (Tokenizer_push(self, LC_TABLE_OPEN)) return -1; padding = Tokenizer_handle_table_style(self, '\n'); if (BAD_ROUTE) { + on_bad_route: RESET_ROUTE(); self->head = reset; if (Tokenizer_emit_char(self, '{')) @@ -2214,11 +2229,16 @@ static int Tokenizer_parse_table(Tokenizer* self) } self->head++; + StackIdent restore_point = self->topstack->ident; table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); if (BAD_ROUTE) { RESET_ROUTE(); Py_DECREF(padding); Py_DECREF(style); + while (!Tokenizer_IS_CURRENT_STACK(self, restore_point)) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + } self->head = reset; if (Tokenizer_emit_char(self, '{')) return -1; @@ -2243,7 +2263,7 @@ static int Tokenizer_parse_table(Tokenizer* self) */ static int Tokenizer_handle_table_row(Tokenizer* self) { - PyObject
*padding, *style, *row, *trash; + PyObject *padding, *style, *row; self->head += 2; if (!Tokenizer_CAN_RECURSE(self)) { @@ -2253,14 +2273,13 @@ static int Tokenizer_handle_table_row(Tokenizer* self) return 0; } - if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) + if (Tokenizer_check_route(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN) < 0) + return 0; + if (Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) return -1; padding = Tokenizer_handle_table_style(self, '\n'); - if (BAD_ROUTE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); + if (BAD_ROUTE) return 0; - } if (!padding) return -1; style = Tokenizer_pop(self); @@ -2319,8 +2338,8 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if (cell_context & LC_TABLE_CELL_STYLE) { Py_DECREF(cell); self->head = reset; - if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | - line_context)) + if (Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | + line_context)) return -1; padding = Tokenizer_handle_table_style(self, '|'); if (!padding) @@ -2541,6 +2560,8 @@ PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) PyObject* temp; if (push) { + if (Tokenizer_check_route(self, context) < 0) + return NULL; if (Tokenizer_push(self, context)) return NULL; } diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.c b/mwparserfromhell/parser/ctokenizer/tok_support.c index 31c6bb9..08bfe9c 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.c +++ b/mwparserfromhell/parser/ctokenizer/tok_support.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -40,10 +40,11 @@ int Tokenizer_push(Tokenizer* self, uint64_t context) top->textbuffer = Textbuffer_new(&self->text); if (!top->textbuffer) return -1; + top->ident.head = self->head; + top->ident.context = 
context; top->next = self->topstack; self->topstack = top; self->depth++; - self->cycles++; return 0; }
@@ -130,12 +131,38 @@ PyObject* Tokenizer_pop_keeping_context(Tokenizer* self)
 }
 
 /*
+    Compare two route_tree_nodes that are passed as their embedded avl_tree_nodes.
+*/
+static int compare_nodes(
+    const struct avl_tree_node* na, const struct avl_tree_node* nb)
+{
+    route_tree_node *a = avl_tree_entry(na, route_tree_node, node);
+    route_tree_node *b = avl_tree_entry(nb, route_tree_node, node);
+    /* Order by id.head first, then by id.context; the result is -1, 0 or 1. */
+    if (a->id.head < b->id.head)
+        return -1;
+    if (a->id.head > b->id.head)
+        return 1;
+    return (a->id.context > b->id.context) - (a->id.context < b->id.context);  /* comparisons, not a subtraction of the uint64_t values */
+}
+
+/*
Fail the current tokenization route. Discards the current - stack/context/textbuffer and sets the BAD_ROUTE flag. + stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the + ident of the failed stack so future parsing attempts down this route can be + stopped early. */ void* Tokenizer_fail_route(Tokenizer* self) { uint64_t context = self->topstack->context; + + route_tree_node *node = malloc(sizeof(route_tree_node)); + if (node) { + node->id = self->topstack->ident; + if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) + free(node); + } + PyObject* stack = Tokenizer_pop(self); Py_XDECREF(stack); @@ -144,6 +171,31 @@ void* Tokenizer_fail_route(Tokenizer* self) } /* + Check if pushing a new route here with the given context would definitely + fail, based on a previous call to Tokenizer_fail_route() with the same + stack. + + Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the + latter case. + + This function is not necessary to call and works as an optimization + implementation detail. (The Python tokenizer checks every route on push, + but this would introduce too much overhead in C tokenizer due to the need + to check for a bad route after every call to Tokenizer_push.)
+*/
+int Tokenizer_check_route(Tokenizer* self, uint64_t context)
+{
+    /* Search with a real route_tree_node as the key: compare_nodes() maps each
+       avl_tree_node back to its enclosing struct, so the key has to be one. */
+    route_tree_node key = {{self->head, context}, {0}};
+    if (avl_tree_lookup_node(self->bad_routes, &key.node, compare_nodes)) {
+        FAIL_ROUTE(context);
+        return -1;
+    }
+    return 0;
+}
+
+/*
Write a token to the current token stack. */ int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.h b/mwparserfromhell/parser/ctokenizer/tok_support.h index 182f9a0..ccc6af5 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.h +++ b/mwparserfromhell/parser/ctokenizer/tok_support.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -32,6 +32,7 @@ void Tokenizer_delete_top_of_stack(Tokenizer*); PyObject* Tokenizer_pop(Tokenizer*); PyObject* Tokenizer_pop_keeping_context(Tokenizer*); void* Tokenizer_fail_route(Tokenizer*); +int Tokenizer_check_route(Tokenizer*, uint64_t); int Tokenizer_emit_token(Tokenizer*, PyObject*, int); int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int); @@ -47,10 +48,11 @@ Unicode Tokenizer_read_backwards(Tokenizer*, Py_ssize_t); /* Macros */ #define MAX_DEPTH 40 -#define MAX_CYCLES 100000 - #define Tokenizer_CAN_RECURSE(self) \ - (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES) + (self->depth < MAX_DEPTH) +#define Tokenizer_IS_CURRENT_STACK(self, id) \ + (self->topstack->ident.head == (id).head && \ + self->topstack->ident.context == (id).context) #define Tokenizer_emit(self, token) \ Tokenizer_emit_token(self, token, 0) diff --git a/mwparserfromhell/parser/ctokenizer/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tokenizer.c index 47d2993..213c47b 100644 --- a/mwparserfromhell/parser/ctokenizer/tokenizer.c +++
b/mwparserfromhell/parser/ctokenizer/tokenizer.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -103,8 +103,9 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) return -1; init_tokenizer_text(&self->text); self->topstack = NULL; - self->head = self->global = self->depth = self->cycles = 0; + self->head = self->global = self->depth = 0; self->route_context = self->route_state = 0; + self->bad_routes = NULL; self->skip_style_tags = 0; return 0; } @@ -158,10 +159,17 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) return NULL; } - self->head = self->global = self->depth = self->cycles = 0; + self->head = self->global = self->depth = 0; self->skip_style_tags = skip_style_tags; + self->bad_routes = NULL; + tokens = Tokenizer_parse(self, context, 1); + route_tree_node *n; + avl_tree_for_each_in_postorder(n, self->bad_routes, route_tree_node, node) + free(n); + self->bad_routes = NULL; + if (!tokens || self->topstack) { Py_XDECREF(tokens); if (PyErr_Occurred()) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 309d0d3..b3e5883 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -65,7 +65,6 @@ class Tokenizer(object): MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", ":", "/", "-", "!", "\n", START, END] MAX_DEPTH = 40 - MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) tag_splitter = 
re.compile(r"([\s\"\'\\]+)") @@ -75,7 +74,8 @@ class Tokenizer(object): self._stacks = [] self._global = 0 self._depth = 0 - self._cycles = 0 + self._bad_routes = set() + self._skip_style_tags = False @property def _stack(self): @@ -100,11 +100,24 @@ class Tokenizer(object): def _textbuffer(self, value): self._stacks[-1][2] = value + @property + def _stack_ident(self): + """An identifier for the current stack. + + This is based on the starting head position and context. Stacks with + the same identifier are always parsed in the same way. This can be used + to cache intermediate parsing info. + """ + return self._stacks[-1][3] + def _push(self, context=0): """Add a new token stack, context, and textbuffer to the list.""" - self._stacks.append([[], context, []]) + new_ident = (self._head, context) + if new_ident in self._bad_routes: + raise BadRoute(context) + + self._stacks.append([[], context, [], new_ident]) self._depth += 1 - self._cycles += 1 def _push_textbuffer(self): """Push the textbuffer onto the stack as a Text node and clear it.""" @@ -129,7 +142,7 @@ class Tokenizer(object): def _can_recurse(self): """Return whether or not our max recursion depth has been exceeded.""" - return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES + return self._depth < self.MAX_DEPTH def _fail_route(self): """Fail the current tokenization route. @@ -138,6 +151,7 @@ class Tokenizer(object): :exc:`.BadRoute`. 
""" context = self._context + self._bad_routes.add(self._stack_ident) self._pop() raise BadRoute(context) @@ -609,8 +623,8 @@ class Tokenizer(object): def _parse_entity(self): """Parse an HTML entity at the head of the wikicode string.""" reset = self._head - self._push() try: + self._push(contexts.HTML_ENTITY) self._really_parse_entity() except BadRoute: self._head = reset @@ -650,8 +664,9 @@ class Tokenizer(object): self._emit_first(tokens.TagAttrQuote(char=data.quoter)) self._emit_all(self._pop()) buf = data.padding_buffer - self._emit_first(tokens.TagAttrStart(pad_first=buf["first"], - pad_before_eq=buf["before_eq"], pad_after_eq=buf["after_eq"])) + self._emit_first(tokens.TagAttrStart( + pad_first=buf["first"], pad_before_eq=buf["before_eq"], + pad_after_eq=buf["after_eq"])) self._emit_all(self._pop()) for key in data.padding_buffer: data.padding_buffer[key] = "" @@ -1076,8 +1091,8 @@ class Tokenizer(object): """Parse a wikicode table by starting with the first line.""" reset = self._head self._head += 2 - self._push(contexts.TABLE_OPEN) try: + self._push(contexts.TABLE_OPEN) padding = self._handle_table_style("\n") except BadRoute: self._head = reset @@ -1086,9 +1101,12 @@ class Tokenizer(object): style = self._pop() self._head += 1 + restore_point = self._stack_ident try: table = self._parse(contexts.TABLE_OPEN) except BadRoute: + while self._stack_ident != restore_point: + self._pop() self._head = reset self._emit_text("{") return @@ -1106,11 +1124,7 @@ class Tokenizer(object): return self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) - try: - padding = self._handle_table_style("\n") - except BadRoute: - self._pop() - raise + padding = self._handle_table_style("\n") style = self._pop() # Don't parse the style separator: @@ -1348,7 +1362,8 @@ class Tokenizer(object): # Kill potential table contexts self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS # Start of table parsing - elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) 
or + elif this == "{" and next == "|" and ( + self._read(-1) in ("\n", self.START) or (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): if self._can_recurse(): self._parse_table() @@ -1374,7 +1389,7 @@ class Tokenizer(object): self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS self._emit_text(this) elif (self._read(-1) in ("\n", self.START) or - (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): + (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): if this == "|" and next == "}": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() @@ -1406,10 +1421,12 @@ class Tokenizer(object): def tokenize(self, text, context=0, skip_style_tags=False): """Build a list of tokens from a string of wikicode and return it.""" - self._skip_style_tags = skip_style_tags split = self.regex.split(text) self._text = [segment for segment in split if segment] - self._head = self._global = self._depth = self._cycles = 0 + self._head = self._global = self._depth = 0 + self._bad_routes = set() + self._skip_style_tags = skip_style_tags + try: tokens = self._parse(context) except BadRoute: # pragma: no cover (untestable/exceptional case) diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index 831f4d0..7137c50 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -346,3 +346,10 @@ name: tables_in_templates_2 label: catch error handling mistakes when wikitables are inside templates input: "{{hello|test\n{|\n| }}" output: [TemplateOpen(), Text(text="hello"), TemplateParamSeparator(), Text(text="test\n{"), TemplateParamSeparator(), Text(text="\n"), TemplateParamSeparator(), Text(text=" "), TemplateClose()] + +--- + +name: many_invalid_nested_tags +label: many unending nested tags that should be treated as plain text, followed by valid wikitext (see issues #42, #183) +input: "[[{{x}}" +output: [Text(text="[["), TemplateOpen(), Text(text="x"), 
TemplateClose()] diff --git a/tests/tokenizer/templates.mwtest b/tests/tokenizer/templates.mwtest index dccee37..8d30069 100644 --- a/tests/tokenizer/templates.mwtest +++ b/tests/tokenizer/templates.mwtest @@ -694,4 +694,4 @@ output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ name: recursion_opens_and_closes label: test potentially dangerous recursion: template openings and closings input: "{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}" -output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), TemplateOpen(), Text(text="x"), TemplateParamSeparator(), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x"), TemplateParamSeparator(), Text(text="{{x"), TemplateClose(), Text(text="{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}")] +output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose()]