Also removed the max cycles stop-gap, allowing much more complex pages to be parsed quickly without losing nodes at the end. Also fixes #65, fixes #102, fixes #165, fixes #183. Also fixes #81 (Rafael Nadal parsing bug). Also fixes #53, fixes #58, fixes #88, fixes #152 (duplicate issues). [tags/v0.5]
@@ -4,6 +4,10 @@ v0.5 (unreleased): | |||
contained within another Wikicode object. | |||
- Added Wikicode.get_ancestors() and Wikicode.get_parent() to find all | |||
ancestors and the direct parent of a Node, respectively. | |||
- Fixed a long-standing performance issue with deeply nested, invalid syntax | |||
(issue #42). The parser should be much faster on certain complex pages. The | |||
"max cycle" restriction has also been removed, so some situations where | |||
templates at the end of a page were being skipped are now resolved. | |||
- Made Template.remove(keep_field=True) behave more reasonably when the | |||
parameter is already empty. | |||
- Added the keep_template_params argument to Wikicode.strip_code(). If True, | |||
@@ -1,4 +1,4 @@ | |||
Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
@@ -12,6 +12,11 @@ Unreleased | |||
object. | |||
- Added :meth:`.Wikicode.get_ancestors` and :meth:`.Wikicode.get_parent` to | |||
find all ancestors and the direct parent of a :class:`.Node`, respectively. | |||
- Fixed a long-standing performance issue with deeply nested, invalid syntax | |||
(`issue #42 <https://github.com/earwig/mwparserfromhell/issues/42>`_). The | |||
parser should be much faster on certain complex pages. The "max cycle" | |||
restriction has also been removed, so some situations where templates at the | |||
end of a page were being skipped are now resolved. | |||
- Made :meth:`Template.remove(keep_field=True) <.Template.remove>` behave more | |||
reasonably when the parameter is already empty. | |||
- Added the *keep_template_params* argument to :meth:`.Wikicode.strip_code`. | |||
@@ -54,7 +59,7 @@ v0.4.3 | |||
v0.4.2 | |||
------ | |||
`Released July 30, 2015 <https://github.com/earwig/mwparserfromhell/tree/v0.4.2>`_ | |||
`Released July 30, 2015 <https://github.com/earwig/mwparserfromhell/tree/v0.4.2>`__ | |||
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.4.1...v0.4.2>`__): | |||
- Fixed setup script not including header files in releases. | |||
@@ -63,7 +68,7 @@ v0.4.2 | |||
v0.4.1 | |||
------ | |||
`Released July 30, 2015 <https://github.com/earwig/mwparserfromhell/tree/v0.4.1>`_ | |||
`Released July 30, 2015 <https://github.com/earwig/mwparserfromhell/tree/v0.4.1>`__ | |||
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.4...v0.4.1>`__): | |||
- The process for building Windows binaries has been fixed, and these should be | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -100,6 +100,8 @@ Local (stack-specific) contexts: | |||
* :const:`TABLE_TH_LINE` | |||
* :const:`TABLE_CELL_LINE_CONTEXTS` | |||
* :const:`HTML_ENTITY` | |||
Global contexts: | |||
* :const:`GL_HEADING` | |||
@@ -176,6 +178,8 @@ TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE | |||
TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + | |||
TABLE_TD_LINE + TABLE_TH_LINE) | |||
HTML_ENTITY = 1 << 37 | |||
# Global contexts: | |||
GL_HEADING = 1 << 0 | |||
@@ -0,0 +1,789 @@ | |||
/* | |||
* avl_tree.c - intrusive, nonrecursive AVL tree data structure (self-balancing | |||
* binary search tree), implementation file | |||
* | |||
* Written in 2014-2016 by Eric Biggers <ebiggers3@gmail.com> | |||
* | |||
* To the extent possible under law, the author(s) have dedicated all copyright | |||
* and related and neighboring rights to this software to the public domain | |||
* worldwide via the Creative Commons Zero 1.0 Universal Public Domain | |||
* Dedication (the "CC0"). | |||
* | |||
* This software is distributed in the hope that it will be useful, but WITHOUT | |||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |||
* FOR A PARTICULAR PURPOSE. See the CC0 for more details. | |||
* | |||
* You should have received a copy of the CC0 along with this software; if not | |||
* see <http://creativecommons.org/publicdomain/zero/1.0/>. | |||
*/ | |||
#include "avl_tree.h" | |||
/* Returns the left child (sign < 0) or the right child (sign > 0) of the | |||
* specified AVL tree node. | |||
* Note: for all calls of this, 'sign' is constant at compilation time, | |||
* so the compiler can remove the conditional. */ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_get_child(const struct avl_tree_node *parent, int sign) | |||
{ | |||
if (sign < 0) | |||
return parent->left; | |||
else | |||
return parent->right; | |||
} | |||
/* Walks down from @root, always following the child selected by @sign (left
 * for sign < 0, right otherwise), and returns the node where the walk stops.
 * Returns NULL when @root is NULL. */
static AVL_INLINE struct avl_tree_node *
avl_tree_first_or_last_in_order(const struct avl_tree_node *root, int sign)
{
	const struct avl_tree_node *cur = root;
	const struct avl_tree_node *child;

	if (!cur)
		return NULL;

	while ((child = avl_get_child(cur, +sign)) != NULL)
		cur = child;

	return (struct avl_tree_node *)cur;
}
/* Begins an in-order traversal: yields the leftmost (least-valued) node of the
 * tree rooted at @root, or NULL for an empty tree. */
struct avl_tree_node *
avl_tree_first_in_order(const struct avl_tree_node *root)
{
	/* -1 selects the left child at every step. */
	return avl_tree_first_or_last_in_order(root, -1);
}
/* Begins a *reverse* in-order traversal: yields the rightmost (greatest-valued)
 * node of the tree rooted at @root, or NULL for an empty tree. */
struct avl_tree_node *
avl_tree_last_in_order(const struct avl_tree_node *root)
{
	/* +1 selects the right child at every step. */
	return avl_tree_first_or_last_in_order(root, 1);
}
/* Shared stepping logic for in-order traversal.  With sign > 0 this yields the
 * in-order successor of @node; with sign < 0 the in-order predecessor.
 * Returns NULL when there is no such node. */
static AVL_INLINE struct avl_tree_node *
avl_tree_next_or_prev_in_order(const struct avl_tree_node *node, int sign)
{
	const struct avl_tree_node *cur = avl_get_child(node, +sign);

	if (cur) {
		/* There is a subtree in the travel direction: step into it,
		 * then go as far as possible in the opposite direction. */
		while (avl_get_child(cur, -sign))
			cur = avl_get_child(cur, -sign);
		return (struct avl_tree_node *)cur;
	}

	/* No subtree in the travel direction: climb until we leave a subtree
	 * that hangs on the opposite side of its parent (or run out of
	 * ancestors). */
	cur = avl_get_parent(node);
	while (cur && node == avl_get_child(cur, +sign)) {
		node = cur;
		cur = avl_get_parent(cur);
	}
	return (struct avl_tree_node *)cur;
}
/* Advances an in-order traversal: yields the node that follows @node in
 * ascending order, or NULL if @node is the last one. */
struct avl_tree_node *
avl_tree_next_in_order(const struct avl_tree_node *node)
{
	/* +1 means "travel towards greater values". */
	return avl_tree_next_or_prev_in_order(node, 1);
}
/* Advances a *reverse* in-order traversal: yields the node that precedes @node
 * in ascending order, or NULL if @node is the first one. */
struct avl_tree_node *
avl_tree_prev_in_order(const struct avl_tree_node *node)
{
	/* -1 means "travel towards lesser values". */
	return avl_tree_next_or_prev_in_order(node, -1);
}
/* Starts a postorder traversal of the tree. */ | |||
struct avl_tree_node * | |||
avl_tree_first_in_postorder(const struct avl_tree_node *root) | |||
{ | |||
const struct avl_tree_node *first = root; | |||
if (first) | |||
while (first->left || first->right) | |||
first = first->left ? first->left : first->right; | |||
return (struct avl_tree_node *)first; | |||
} | |||
/* Continues a postorder traversal of the tree. @prev will not be deferenced as | |||
* it's allowed that its memory has been freed; @prev_parent must be its saved | |||
* parent node. Returns NULL if there are no more nodes (i.e. @prev was the | |||
* root of the tree). */ | |||
struct avl_tree_node * | |||
avl_tree_next_in_postorder(const struct avl_tree_node *prev, | |||
const struct avl_tree_node *prev_parent) | |||
{ | |||
const struct avl_tree_node *next = prev_parent; | |||
if (next && prev == next->left && next->right) | |||
for (next = next->right; | |||
next->left || next->right; | |||
next = next->left ? next->left : next->right) | |||
; | |||
return (struct avl_tree_node *)next; | |||
} | |||
/* Sets the left child (sign < 0) or the right child (sign > 0) of the | |||
* specified AVL tree node. | |||
* Note: for all calls of this, 'sign' is constant at compilation time, | |||
* so the compiler can remove the conditional. */ | |||
static AVL_INLINE void | |||
avl_set_child(struct avl_tree_node *parent, int sign, | |||
struct avl_tree_node *child) | |||
{ | |||
if (sign < 0) | |||
parent->left = child; | |||
else | |||
parent->right = child; | |||
} | |||
/* Sets the parent and balance factor of the specified AVL tree node. */ | |||
static AVL_INLINE void | |||
avl_set_parent_balance(struct avl_tree_node *node, struct avl_tree_node *parent, | |||
int balance_factor) | |||
{ | |||
node->parent_balance = (uintptr_t)parent | (balance_factor + 1); | |||
} | |||
/* Sets the parent of the specified AVL tree node. */ | |||
static AVL_INLINE void | |||
avl_set_parent(struct avl_tree_node *node, struct avl_tree_node *parent) | |||
{ | |||
node->parent_balance = (uintptr_t)parent | (node->parent_balance & 3); | |||
} | |||
/* Returns the balance factor of the specified AVL tree node --- that is, the | |||
* height of its right subtree minus the height of its left subtree. */ | |||
static AVL_INLINE int | |||
avl_get_balance_factor(const struct avl_tree_node *node) | |||
{ | |||
return (int)(node->parent_balance & 3) - 1; | |||
} | |||
/* Adds @amount to the balance factor of the specified AVL tree node. | |||
* The caller must ensure this still results in a valid balance factor | |||
* (-1, 0, or 1). */ | |||
static AVL_INLINE void | |||
avl_adjust_balance_factor(struct avl_tree_node *node, int amount) | |||
{ | |||
node->parent_balance += amount; | |||
} | |||
static AVL_INLINE void | |||
avl_replace_child(struct avl_tree_node **root_ptr, | |||
struct avl_tree_node *parent, | |||
struct avl_tree_node *old_child, | |||
struct avl_tree_node *new_child) | |||
{ | |||
if (parent) { | |||
if (old_child == parent->left) | |||
parent->left = new_child; | |||
else | |||
parent->right = new_child; | |||
} else { | |||
*root_ptr = new_child; | |||
} | |||
} | |||
/* | |||
* Template for performing a single rotation --- | |||
* | |||
* sign > 0: Rotate clockwise (right) rooted at A: | |||
* | |||
* P? P? | |||
* | | | |||
* A B | |||
* / \ / \ | |||
* B C? => D? A | |||
* / \ / \ | |||
* D? E? E? C? | |||
* | |||
* (nodes marked with ? may not exist) | |||
* | |||
* sign < 0: Rotate counterclockwise (left) rooted at A: | |||
* | |||
* P? P? | |||
* | | | |||
* A B | |||
* / \ / \ | |||
* C? B => A D? | |||
* / \ / \ | |||
* E? D? C? E? | |||
* | |||
* This updates pointers but not balance factors! | |||
*/ | |||
static AVL_INLINE void
avl_rotate(struct avl_tree_node ** const root_ptr,
	   struct avl_tree_node * const A, const int sign)
{
	/* Capture every node involved before any link is rewritten. */
	struct avl_tree_node * const B = avl_get_child(A, -sign);
	struct avl_tree_node * const E = avl_get_child(B, +sign);
	struct avl_tree_node * const P = avl_get_parent(A);

	/* E (if present) moves from under B to under A. */
	avl_set_child(A, -sign, E);
	if (E)
		avl_set_parent(E, A);

	/* A becomes a child of B. */
	avl_set_child(B, +sign, A);
	avl_set_parent(A, B);

	/* B takes over A's former position under P (or as the root). */
	avl_set_parent(B, P);
	avl_replace_child(root_ptr, P, A, B);
}
/* | |||
* Template for performing a double rotation --- | |||
* | |||
* sign > 0: Rotate counterclockwise (left) rooted at B, then | |||
* clockwise (right) rooted at A: | |||
* | |||
* P? P? P? | |||
* | | | | |||
* A A E | |||
* / \ / \ / \ | |||
* B C? => E C? => B A | |||
* / \ / \ / \ / \ | |||
* D? E B G? D? F?G? C? | |||
* / \ / \ | |||
* F? G? D? F? | |||
* | |||
* (nodes marked with ? may not exist) | |||
* | |||
* sign < 0: Rotate clockwise (right) rooted at B, then | |||
* counterclockwise (left) rooted at A: | |||
* | |||
* P? P? P? | |||
* | | | | |||
* A A E | |||
* / \ / \ / \ | |||
* C? B => C? E => A B | |||
* / \ / \ / \ / \ | |||
* E D? G? B C? G?F? D? | |||
* / \ / \ | |||
* G? F? F? D? | |||
* | |||
* Returns a pointer to E and updates balance factors. Except for those | |||
* two things, this function is equivalent to: | |||
* avl_rotate(root_ptr, B, -sign); | |||
* avl_rotate(root_ptr, A, +sign); | |||
* | |||
* See comment in avl_handle_subtree_growth() for explanation of balance | |||
* factor updates. | |||
*/ | |||
static AVL_INLINE struct avl_tree_node *
avl_do_double_rotate(struct avl_tree_node ** const root_ptr,
		     struct avl_tree_node * const B,
		     struct avl_tree_node * const A, const int sign)
{
	/* Capture every node and E's original balance before rewriting. */
	struct avl_tree_node * const E = avl_get_child(B, +sign);
	struct avl_tree_node * const F = avl_get_child(E, -sign);
	struct avl_tree_node * const G = avl_get_child(E, +sign);
	struct avl_tree_node * const P = avl_get_parent(A);
	const int e = avl_get_balance_factor(E);
	int a_balance = 0;
	int b_balance = 0;

	/* At most one of A and B ends up unbalanced, depending on which side
	 * E was leaning towards. */
	if (sign * e < 0)
		a_balance = -e;
	if (sign * e > 0)
		b_balance = -e;

	/* E's two subtrees are handed down to B and A. */
	avl_set_child(B, +sign, F);
	if (F)
		avl_set_parent(F, B);
	avl_set_child(A, -sign, G);
	if (G)
		avl_set_parent(G, A);

	/* B and A become the children of E. */
	avl_set_child(E, -sign, B);
	avl_set_parent_balance(B, E, b_balance);
	avl_set_child(E, +sign, A);
	avl_set_parent_balance(A, E, a_balance);

	/* E takes over A's former position and is perfectly balanced. */
	avl_set_parent_balance(E, P, 0);
	avl_replace_child(root_ptr, P, A, E);

	return E;
}
/* | |||
* This function handles the growth of a subtree due to an insertion. | |||
* | |||
* @root_ptr | |||
* Location of the tree's root pointer. | |||
* | |||
* @node | |||
* A subtree that has increased in height by 1 due to an insertion. | |||
* | |||
* @parent | |||
* Parent of @node; must not be NULL. | |||
* | |||
* @sign | |||
* -1 if @node is the left child of @parent; | |||
* +1 if @node is the right child of @parent. | |||
* | |||
* This function will adjust @parent's balance factor, then do a (single | |||
* or double) rotation if necessary. The return value will be %true if | |||
* the full AVL tree is now adequately balanced, or %false if the subtree | |||
* rooted at @parent is now adequately balanced but has increased in | |||
* height by 1, so the caller should continue up the tree. | |||
* | |||
* Note that if %false is returned, no rotation will have been done. | |||
* Indeed, a single node insertion cannot require that more than one | |||
* (single or double) rotation be done. | |||
*/ | |||
static AVL_INLINE bool
avl_handle_subtree_growth(struct avl_tree_node ** const root_ptr,
			  struct avl_tree_node * const node,
			  struct avl_tree_node * const parent,
			  const int sign)
{
	const int parent_balance = avl_get_balance_factor(parent);

	if (parent_balance == 0) {
		/* @parent was perfectly balanced and now leans by one towards
		 * @node.  That is still acceptable, but @parent has grown in
		 * height, so the caller must keep walking up. */
		avl_adjust_balance_factor(parent, sign);
		return false;
	}

	if (parent_balance + sign == 0) {
		/* @parent was leaning away from @node and is now perfectly
		 * balanced.  Its height did not change; we are finished. */
		avl_adjust_balance_factor(parent, sign);
		return true;
	}

	/* Otherwise @parent would reach a balance factor of -2 or +2 and a
	 * rotation is required.  @node cannot have a balance factor of 0
	 * here, because the invariant is that @node just grew in height due
	 * to the insertion.
	 *
	 * The notes below are written for sign < 0 (A = @parent, B = @node,
	 * B is the left child of A); the other case is the mirror image. */

	if (sign * avl_get_balance_factor(node) <= 0) {
		/* B leans the opposite way from A.  Rotate counterclockwise
		 * at B, then clockwise at A:
		 *
		 *          A                A                 E
		 *         / \              / \              /   \
		 *        B   C?    =>     E   C?    =>     B     A
		 *       / \              / \              / \   / \
		 *      D?  E            B   G?           D?  F?G?  C?
		 *         / \          / \
		 *        F?  G?       D?  F?
		 *
		 * Before: balance(A) = -2, balance(B) = +1.  With
		 * x = height(C): height(B) = x + 2, height(E) = x + 1,
		 * height(D) = x, max(height(F), height(G)) = x.
		 *
		 * After: height(A) = height(B) = x + 1, height(E) = x + 2,
		 * balance(E) = 0, and with e = the original balance(E):
		 *   balance(A) = (e >= 0) ? 0 : -e
		 *   balance(B) = (e <= 0) ? 0 : -e
		 * The helper performs the pointer and balance updates. */
		avl_do_double_rotate(root_ptr, node, parent, -sign);
		return true;
	}

	/* B leans the same way as A.  Rotate clockwise at A:
	 *
	 *            A                B
	 *           / \             /   \
	 *          B   C?    =>    D     A
	 *         / \             / \   / \
	 *        D   E?          F?  G?E?  C?
	 *       / \
	 *      F?  G?
	 *
	 * Before: balance(A) = -2, balance(B) = -1.  With x = height(C):
	 * height(B) = x + 2, height(D) = x + 1, height(E) = x,
	 * max(height(F), height(G)) = x.
	 *
	 * After: height(D) = x + 1 and height(A) = max(x, x) + 1 = x + 1, so
	 * balance(A) = balance(B) = 0. */
	avl_rotate(root_ptr, parent, -sign);

	/* Both adjustments bring the respective balance factor to 0. */
	avl_adjust_balance_factor(parent, -sign);	/* A */
	avl_adjust_balance_factor(node, -sign);		/* B */

	/* The rotated subtree has the height it had before the insertion. */
	return true;
}
/* Rebalance the tree after insertion of the specified node. */ | |||
void | |||
avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr, | |||
struct avl_tree_node *inserted) | |||
{ | |||
struct avl_tree_node *node, *parent; | |||
bool done; | |||
inserted->left = NULL; | |||
inserted->right = NULL; | |||
node = inserted; | |||
/* Adjust balance factor of new node's parent. | |||
* No rotation will need to be done at this level. */ | |||
parent = avl_get_parent(node); | |||
if (!parent) | |||
return; | |||
if (node == parent->left) | |||
avl_adjust_balance_factor(parent, -1); | |||
else | |||
avl_adjust_balance_factor(parent, +1); | |||
if (avl_get_balance_factor(parent) == 0) | |||
/* @parent did not change in height. Nothing more to do. */ | |||
return; | |||
/* The subtree rooted at @parent increased in height by 1. */ | |||
do { | |||
/* Adjust balance factor of next ancestor. */ | |||
node = parent; | |||
parent = avl_get_parent(node); | |||
if (!parent) | |||
return; | |||
/* The subtree rooted at @node has increased in height by 1. */ | |||
if (node == parent->left) | |||
done = avl_handle_subtree_growth(root_ptr, node, | |||
parent, -1); | |||
else | |||
done = avl_handle_subtree_growth(root_ptr, node, | |||
parent, +1); | |||
} while (!done); | |||
} | |||
/* | |||
* This function handles the shrinkage of a subtree due to a deletion. | |||
* | |||
* @root_ptr | |||
* Location of the tree's root pointer. | |||
* | |||
* @parent | |||
* A node in the tree, exactly one of whose subtrees has decreased | |||
* in height by 1 due to a deletion. (This includes the case where | |||
* one of the child pointers has become NULL, since we can consider | |||
* the "NULL" subtree to have a height of 0.) | |||
* | |||
* @sign | |||
* +1 if the left subtree of @parent has decreased in height by 1; | |||
* -1 if the right subtree of @parent has decreased in height by 1. | |||
* | |||
* @left_deleted_ret | |||
* If the return value is not NULL, this will be set to %true if the | |||
* left subtree of the returned node has decreased in height by 1, | |||
* or %false if the right subtree of the returned node has decreased | |||
* in height by 1. | |||
* | |||
* This function will adjust @parent's balance factor, then do a (single | |||
* or double) rotation if necessary. The return value will be NULL if | |||
* the full AVL tree is now adequately balanced, or a pointer to the | |||
* parent of @parent if @parent is now adequately balanced but has | |||
* decreased in height by 1. Also in the latter case, *left_deleted_ret | |||
* will be set. | |||
*/ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_handle_subtree_shrink(struct avl_tree_node ** const root_ptr, | |||
struct avl_tree_node *parent, | |||
const int sign, | |||
bool * const left_deleted_ret) | |||
{ | |||
struct avl_tree_node *node; | |||
int old_balance_factor, new_balance_factor; | |||
old_balance_factor = avl_get_balance_factor(parent); | |||
if (old_balance_factor == 0) { | |||
/* Prior to the deletion, the subtree rooted at | |||
* @parent was perfectly balanced. It's now | |||
* unbalanced by 1, but that's okay and its height | |||
* hasn't changed. Nothing more to do. */ | |||
avl_adjust_balance_factor(parent, sign); | |||
return NULL; | |||
} | |||
new_balance_factor = old_balance_factor + sign; | |||
if (new_balance_factor == 0) { | |||
/* The subtree rooted at @parent is now perfectly | |||
* balanced, whereas before the deletion it was | |||
* unbalanced by 1. Its height must have decreased | |||
* by 1. No rotation is needed at this location, | |||
* but continue up the tree. */ | |||
avl_adjust_balance_factor(parent, sign); | |||
node = parent; | |||
} else { | |||
/* @parent is too left-heavy (new_balance_factor == -2) or | |||
* too right-heavy (new_balance_factor == +2). */ | |||
node = avl_get_child(parent, sign); | |||
/* The rotations below are similar to those done during | |||
* insertion (see avl_handle_subtree_growth()), so full | |||
* comments are not provided. The only new case is the | |||
* one where @node has a balance factor of 0, and that is | |||
* commented. */ | |||
if (sign * avl_get_balance_factor(node) >= 0) { | |||
avl_rotate(root_ptr, parent, -sign); | |||
if (avl_get_balance_factor(node) == 0) { | |||
/* | |||
* @node (B below) is perfectly balanced. | |||
* | |||
* @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |||
* The comment, diagram, and equations | |||
* below assume sign < 0. The other case | |||
* is symmetric! | |||
* @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ | |||
* | |||
* Do a clockwise rotation rooted at | |||
* @parent (A below): | |||
* | |||
* A B | |||
* / \ / \ | |||
* B C? => D A | |||
* / \ / \ / \ | |||
* D E F? G?E C? | |||
* / \ | |||
* F? G? | |||
* | |||
* Before the rotation: | |||
* balance(A) = -2 | |||
* balance(B) = 0 | |||
* Let x = height(C). Then: | |||
* height(B) = x + 2 | |||
* height(D) = x + 1 | |||
* height(E) = x + 1 | |||
* max(height(F), height(G)) = x. | |||
* | |||
* After the rotation: | |||
* height(D) = max(height(F), height(G)) + 1 | |||
* = x + 1 | |||
* height(A) = max(height(E), height(C)) + 1 | |||
* = max(x + 1, x) + 1 = x + 2 | |||
* balance(A) = -1 | |||
* balance(B) = +1 | |||
*/ | |||
/* A: -2 => -1 (sign < 0) | |||
* or +2 => +1 (sign > 0) | |||
* No change needed --- that's the same as | |||
* old_balance_factor. */ | |||
/* B: 0 => +1 (sign < 0) | |||
* or 0 => -1 (sign > 0) */ | |||
avl_adjust_balance_factor(node, -sign); | |||
/* Height is unchanged; nothing more to do. */ | |||
return NULL; | |||
} else { | |||
avl_adjust_balance_factor(parent, -sign); | |||
avl_adjust_balance_factor(node, -sign); | |||
} | |||
} else { | |||
node = avl_do_double_rotate(root_ptr, node, | |||
parent, -sign); | |||
} | |||
} | |||
parent = avl_get_parent(node); | |||
if (parent) | |||
*left_deleted_ret = (node == parent->left); | |||
return parent; | |||
} | |||
/* Swaps node X, which must have 2 children, with its in-order successor, then | |||
* unlinks node X. Returns the parent of X just before unlinking, without its | |||
* balance factor having been updated to account for the unlink. */ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_tree_swap_with_successor(struct avl_tree_node **root_ptr, | |||
struct avl_tree_node *X, | |||
bool *left_deleted_ret) | |||
{ | |||
struct avl_tree_node *Y, *ret; | |||
Y = X->right; | |||
if (!Y->left) { | |||
/* | |||
* P? P? P? | |||
* | | | | |||
* X Y Y | |||
* / \ / \ / \ | |||
* A Y => A X => A B? | |||
* / \ / \ | |||
* (0) B? (0) B? | |||
* | |||
* [ X unlinked, Y returned ] | |||
*/ | |||
ret = Y; | |||
*left_deleted_ret = false; | |||
} else { | |||
struct avl_tree_node *Q; | |||
do { | |||
Q = Y; | |||
Y = Y->left; | |||
} while (Y->left); | |||
/* | |||
* P? P? P? | |||
* | | | | |||
* X Y Y | |||
* / \ / \ / \ | |||
* A ... => A ... => A ... | |||
* | | | | |||
* Q Q Q | |||
* / / / | |||
* Y X B? | |||
* / \ / \ | |||
* (0) B? (0) B? | |||
* | |||
* | |||
* [ X unlinked, Q returned ] | |||
*/ | |||
Q->left = Y->right; | |||
if (Q->left) | |||
avl_set_parent(Q->left, Q); | |||
Y->right = X->right; | |||
avl_set_parent(X->right, Y); | |||
ret = Q; | |||
*left_deleted_ret = true; | |||
} | |||
Y->left = X->left; | |||
avl_set_parent(X->left, Y); | |||
Y->parent_balance = X->parent_balance; | |||
avl_replace_child(root_ptr, avl_get_parent(X), X, Y); | |||
return ret; | |||
} | |||
/* | |||
* Removes an item from the specified AVL tree. | |||
* | |||
* @root_ptr | |||
* Location of the AVL tree's root pointer. Indirection is needed | |||
* because the root node may change if the tree needed to be rebalanced | |||
* because of the deletion or if @node was the root node. | |||
* | |||
* @node | |||
* Pointer to the `struct avl_tree_node' embedded in the item to | |||
* remove from the tree. | |||
* | |||
* Note: This function *only* removes the node and rebalances the tree. | |||
* It does not free any memory, nor does it do the equivalent of | |||
* avl_tree_node_set_unlinked(). | |||
*/ | |||
void | |||
avl_tree_remove(struct avl_tree_node **root_ptr, struct avl_tree_node *node) | |||
{ | |||
struct avl_tree_node *parent; | |||
bool left_deleted = false; | |||
if (node->left && node->right) { | |||
/* @node is fully internal, with two children. Swap it | |||
* with its in-order successor (which must exist in the | |||
* right subtree of @node and can have, at most, a right | |||
* child), then unlink @node. */ | |||
parent = avl_tree_swap_with_successor(root_ptr, node, | |||
&left_deleted); | |||
/* @parent is now the parent of what was @node's in-order | |||
* successor. It cannot be NULL, since @node itself was | |||
* an ancestor of its in-order successor. | |||
* @left_deleted has been set to %true if @node's | |||
* in-order successor was the left child of @parent, | |||
* otherwise %false. */ | |||
} else { | |||
struct avl_tree_node *child; | |||
/* @node is missing at least one child. Unlink it. Set | |||
* @parent to @node's parent, and set @left_deleted to | |||
* reflect which child of @parent @node was. Or, if | |||
* @node was the root node, simply update the root node | |||
* and return. */ | |||
child = node->left ? node->left : node->right; | |||
parent = avl_get_parent(node); | |||
if (parent) { | |||
if (node == parent->left) { | |||
parent->left = child; | |||
left_deleted = true; | |||
} else { | |||
parent->right = child; | |||
left_deleted = false; | |||
} | |||
if (child) | |||
avl_set_parent(child, parent); | |||
} else { | |||
if (child) | |||
avl_set_parent(child, parent); | |||
*root_ptr = child; | |||
return; | |||
} | |||
} | |||
/* Rebalance the tree. */ | |||
do { | |||
if (left_deleted) | |||
parent = avl_handle_subtree_shrink(root_ptr, parent, | |||
+1, &left_deleted); | |||
else | |||
parent = avl_handle_subtree_shrink(root_ptr, parent, | |||
-1, &left_deleted); | |||
} while (parent); | |||
} |
@@ -0,0 +1,358 @@ | |||
/* | |||
* avl_tree.h - intrusive, nonrecursive AVL tree data structure (self-balancing | |||
* binary search tree), header file | |||
* | |||
* Written in 2014-2016 by Eric Biggers <ebiggers3@gmail.com> | |||
* | |||
* To the extent possible under law, the author(s) have dedicated all copyright | |||
* and related and neighboring rights to this software to the public domain | |||
* worldwide via the Creative Commons Zero 1.0 Universal Public Domain | |||
* Dedication (the "CC0"). | |||
* | |||
* This software is distributed in the hope that it will be useful, but WITHOUT | |||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |||
* FOR A PARTICULAR PURPOSE. See the CC0 for more details. | |||
* | |||
* You should have received a copy of the CC0 along with this software; if not | |||
* see <http://creativecommons.org/publicdomain/zero/1.0/>. | |||
*/ | |||
#ifndef _AVL_TREE_H_ | |||
#define _AVL_TREE_H_ | |||
#include <stdbool.h> | |||
#include <stddef.h> | |||
#include <inttypes.h> /* for uintptr_t */ | |||
#ifdef __GNUC__ | |||
# define AVL_INLINE inline __attribute__((always_inline)) | |||
#else | |||
# define AVL_INLINE inline | |||
#endif | |||
/* Link fields of one AVL tree node.  This struct is meant to be embedded as a
 * member of the data structure that is to be indexed. */
struct avl_tree_node {

	/* Left child, or NULL if there is none. */
	struct avl_tree_node *left;

	/* Right child, or NULL if there is none. */
	struct avl_tree_node *right;

	/* Parent pointer and balance factor packed into a single word, which
	 * saves 4 or 8 bytes per node depending on the CPU architecture.
	 *
	 * Bits 0-1 hold (balance factor + 1), where the balance factor is
	 * height(right) - height(left):
	 *
	 *	00 => -1
	 *	01 =>  0
	 *	10 => +1
	 *	11 => undefined
	 *
	 * All remaining bits hold the address of the parent node, which
	 * therefore has to be 4-byte aligned.  The address part is NULL for
	 * the root node, since it has no parent. */
	uintptr_t parent_balance;
};
/* Given @entry, a pointer to the @member field embedded in an object of type
 * @type, yields a pointer to that enclosing object. */
#define avl_tree_entry(entry, type, member)				\
	((type *)((char *)(entry) - offsetof(type, member)))
/* Returns a pointer to the parent of the specified AVL tree node, or NULL if it | |||
* is already the root of the tree. */ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_get_parent(const struct avl_tree_node *node) | |||
{ | |||
return (struct avl_tree_node *)(node->parent_balance & ~3); | |||
} | |||
/* Marks the specified AVL tree node as unlinked from any tree. */ | |||
static AVL_INLINE void | |||
avl_tree_node_set_unlinked(struct avl_tree_node *node) | |||
{ | |||
node->parent_balance = (uintptr_t)node; | |||
} | |||
/* Returns true iff the specified AVL tree node has been marked with | |||
* avl_tree_node_set_unlinked() and has not subsequently been inserted into a | |||
* tree. */ | |||
static AVL_INLINE bool | |||
avl_tree_node_is_unlinked(const struct avl_tree_node *node) | |||
{ | |||
return node->parent_balance == (uintptr_t)node; | |||
} | |||
/* Internal use only: rebalancing step that runs after a node has been linked
 * into the tree. */
extern void avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr,
					    struct avl_tree_node *inserted);
/* | |||
* Looks up an item in the specified AVL tree. | |||
* | |||
* @root | |||
* Pointer to the root of the AVL tree. (This can be NULL --- that just | |||
* means the tree is empty.) | |||
* | |||
* @cmp_ctx | |||
* First argument to pass to the comparison callback. This generally | |||
* should be a pointer to an object equal to the one being searched for. | |||
* | |||
* @cmp | |||
* Comparison callback. Must return < 0, 0, or > 0 if the first argument | |||
* is less than, equal to, or greater than the second argument, | |||
* respectively. The first argument will be @cmp_ctx and the second | |||
* argument will be a pointer to the AVL tree node of an item in the tree. | |||
* | |||
* Returns a pointer to the AVL tree node of the resulting item, or NULL if the | |||
* item was not found. | |||
* | |||
* Example: | |||
* | |||
* struct int_wrapper { | |||
* int data; | |||
* struct avl_tree_node index_node; | |||
* }; | |||
* | |||
* static int _avl_cmp_int_to_node(const void *intptr, | |||
* const struct avl_tree_node *nodeptr) | |||
* { | |||
* int n1 = *(const int *)intptr; | |||
* int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data; | |||
* if (n1 < n2) | |||
* return -1; | |||
* else if (n1 > n2) | |||
* return 1; | |||
* else | |||
* return 0; | |||
* } | |||
* | |||
* bool contains_int(struct avl_tree_node *root, int n) | |||
* { | |||
* struct avl_tree_node *result; | |||
* | |||
* result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node); | |||
* return result ? true : false; | |||
* } | |||
*/ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_tree_lookup(const struct avl_tree_node *root, | |||
const void *cmp_ctx, | |||
int (*cmp)(const void *, const struct avl_tree_node *)) | |||
{ | |||
const struct avl_tree_node *cur = root; | |||
while (cur) { | |||
int res = (*cmp)(cmp_ctx, cur); | |||
if (res < 0) | |||
cur = cur->left; | |||
else if (res > 0) | |||
cur = cur->right; | |||
else | |||
break; | |||
} | |||
return (struct avl_tree_node*)cur; | |||
} | |||
/* Same as avl_tree_lookup(), but uses a more specific type for the comparison | |||
* function. Specifically, with this function the item being searched for is | |||
* expected to be in the same format as those already in the tree, with an | |||
* embedded 'struct avl_tree_node'. */ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_tree_lookup_node(const struct avl_tree_node *root, | |||
const struct avl_tree_node *node, | |||
int (*cmp)(const struct avl_tree_node *, | |||
const struct avl_tree_node *)) | |||
{ | |||
const struct avl_tree_node *cur = root; | |||
while (cur) { | |||
int res = (*cmp)(node, cur); | |||
if (res < 0) | |||
cur = cur->left; | |||
else if (res > 0) | |||
cur = cur->right; | |||
else | |||
break; | |||
} | |||
return (struct avl_tree_node*)cur; | |||
} | |||
/* | |||
* Inserts an item into the specified AVL tree. | |||
* | |||
* @root_ptr | |||
* Location of the AVL tree's root pointer. Indirection is needed because | |||
* the root node may change as a result of rotations caused by the | |||
* insertion. Initialize *root_ptr to NULL for an empty tree. | |||
* | |||
* @item | |||
* Pointer to the `struct avl_tree_node' embedded in the item to insert. | |||
* No members in it need be pre-initialized, although members in the | |||
* containing structure should be pre-initialized so that @cmp can use them | |||
* in comparisons. | |||
* | |||
* @cmp | |||
* Comparison callback. Must return < 0, 0, or > 0 if the first argument | |||
* is less than, equal to, or greater than the second argument, | |||
* respectively. The first argument will be @item and the second | |||
* argument will be a pointer to an AVL tree node embedded in some | |||
* previously-inserted item to which @item is being compared. | |||
* | |||
* If no item in the tree is comparatively equal (via @cmp) to @item, inserts | |||
* @item and returns NULL. Otherwise does nothing and returns a pointer to the | |||
* AVL tree node embedded in the previously-inserted item which compared equal | |||
* to @item. | |||
* | |||
* Example: | |||
* | |||
* struct int_wrapper { | |||
* int data; | |||
* struct avl_tree_node index_node; | |||
* }; | |||
* | |||
* #define GET_DATA(i) avl_tree_entry((i), struct int_wrapper, index_node)->data | |||
* | |||
* static int _avl_cmp_ints(const struct avl_tree_node *node1, | |||
* const struct avl_tree_node *node2) | |||
* { | |||
* int n1 = GET_DATA(node1); | |||
* int n2 = GET_DATA(node2); | |||
* if (n1 < n2) | |||
* return -1; | |||
* else if (n1 > n2) | |||
* return 1; | |||
* else | |||
* return 0; | |||
* } | |||
* | |||
* bool insert_int(struct avl_tree_node **root_ptr, int data) | |||
* { | |||
* struct int_wrapper *i = malloc(sizeof(struct int_wrapper)); | |||
* i->data = data; | |||
* if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) { | |||
* // Duplicate. | |||
* free(i); | |||
* return false; | |||
* } | |||
* return true; | |||
* } | |||
*/ | |||
static AVL_INLINE struct avl_tree_node * | |||
avl_tree_insert(struct avl_tree_node **root_ptr, | |||
struct avl_tree_node *item, | |||
int (*cmp)(const struct avl_tree_node *, | |||
const struct avl_tree_node *)) | |||
{ | |||
struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL; | |||
int res; | |||
while (*cur_ptr) { | |||
cur = *cur_ptr; | |||
res = (*cmp)(item, cur); | |||
if (res < 0) | |||
cur_ptr = &cur->left; | |||
else if (res > 0) | |||
cur_ptr = &cur->right; | |||
else | |||
return cur; | |||
} | |||
*cur_ptr = item; | |||
item->parent_balance = (uintptr_t)cur | 1; | |||
avl_tree_rebalance_after_insert(root_ptr, item); | |||
return NULL; | |||
} | |||
/* Removes an item from the specified AVL tree. | |||
* See implementation for details. */ | |||
extern void | |||
avl_tree_remove(struct avl_tree_node **root_ptr, struct avl_tree_node *node); | |||
/* Nonrecursive AVL tree traversal functions */ | |||
extern struct avl_tree_node * | |||
avl_tree_first_in_order(const struct avl_tree_node *root); | |||
extern struct avl_tree_node * | |||
avl_tree_last_in_order(const struct avl_tree_node *root); | |||
extern struct avl_tree_node * | |||
avl_tree_next_in_order(const struct avl_tree_node *node); | |||
extern struct avl_tree_node * | |||
avl_tree_prev_in_order(const struct avl_tree_node *node); | |||
extern struct avl_tree_node * | |||
avl_tree_first_in_postorder(const struct avl_tree_node *root); | |||
extern struct avl_tree_node * | |||
avl_tree_next_in_postorder(const struct avl_tree_node *prev, | |||
const struct avl_tree_node *prev_parent); | |||
/* | |||
* Iterate through the nodes in an AVL tree in sorted order. | |||
* You may not modify the tree during the iteration. | |||
* | |||
* @child_struct | |||
* Variable that will receive a pointer to each struct inserted into the | |||
* tree. | |||
* @root | |||
* Root of the AVL tree. | |||
* @struct_name | |||
* Type of *child_struct. | |||
* @struct_member | |||
* Member of @struct_name type that is the AVL tree node. | |||
* | |||
* Example: | |||
* | |||
* struct int_wrapper { | |||
* int data; | |||
* struct avl_tree_node index_node; | |||
* }; | |||
* | |||
* void print_ints(struct avl_tree_node *root) | |||
* { | |||
* struct int_wrapper *i; | |||
* | |||
* avl_tree_for_each_in_order(i, root, struct int_wrapper, index_node) | |||
* printf("%d\n", i->data); | |||
* } | |||
*/ | |||
#define avl_tree_for_each_in_order(child_struct, root,                     \
                                   struct_name, struct_member)             \
    for (struct avl_tree_node *_cur = avl_tree_first_in_order(root);       \
         _cur && ((child_struct) = avl_tree_entry(_cur, struct_name,       \
                                                  struct_member), 1);      \
         _cur = avl_tree_next_in_order(_cur))
/*
 * Same loop shape as avl_tree_for_each_in_order(), except that it starts at
 * the last node and steps backwards, so items are visited in reverse order.
 */
#define avl_tree_for_each_in_reverse_order(child_struct, root,             \
                                           struct_name, struct_member)     \
    for (struct avl_tree_node *_cur = avl_tree_last_in_order(root);        \
         _cur && ((child_struct) = avl_tree_entry(_cur, struct_name,       \
                                                  struct_member), 1);      \
         _cur = avl_tree_prev_in_order(_cur))
/*
 * Postorder counterpart of avl_tree_for_each_in_order().  The parent of the
 * current node is read in the loop condition, before the loop body runs, so
 * the body may delete or free the current node.
 */
#define avl_tree_for_each_in_postorder(child_struct, root,                 \
                                       struct_name, struct_member)         \
    for (struct avl_tree_node *_cur = avl_tree_first_in_postorder(root),   \
                              *_parent;                                    \
         _cur                                                              \
         && ((child_struct) = avl_tree_entry(_cur, struct_name,            \
                                             struct_member), 1)            \
         && (_parent = avl_get_parent(_cur), 1);                           \
         _cur = avl_tree_next_in_postorder(_cur, _parent))
#endif /* _AVL_TREE_H_ */ |
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -30,6 +30,8 @@ SOFTWARE. | |||
#include <structmember.h> | |||
#include <bytesobject.h> | |||
#include "avl_tree.h" | |||
/* Compatibility macros */ | |||
#if PY_MAJOR_VERSION >= 3 | |||
@@ -92,10 +94,16 @@ typedef struct { | |||
#endif | |||
} Textbuffer; | |||
typedef struct { | |||
Py_ssize_t head; | |||
uint64_t context; | |||
} StackIdent; | |||
struct Stack { | |||
PyObject* stack; | |||
uint64_t context; | |||
Textbuffer* textbuffer; | |||
StackIdent ident; | |||
struct Stack* next; | |||
}; | |||
typedef struct Stack Stack; | |||
@@ -111,6 +119,13 @@ typedef struct { | |||
#endif | |||
} TokenizerInput; | |||
typedef struct avl_tree_node avl_tree; | |||
typedef struct { | |||
StackIdent id; | |||
struct avl_tree_node node; | |||
} route_tree_node; | |||
typedef struct { | |||
PyObject_HEAD | |||
TokenizerInput text; /* text to tokenize */ | |||
@@ -118,8 +133,8 @@ typedef struct { | |||
Py_ssize_t head; /* current position in text */ | |||
int global; /* global context */ | |||
int depth; /* stack recursion depth */ | |||
int cycles; /* total number of stack recursions */ | |||
int route_state; /* whether a BadRoute has been triggered */ | |||
uint64_t route_context; /* context when the last BadRoute was triggered */ | |||
avl_tree* bad_routes; /* stack idents for routes known to fail */ | |||
int skip_style_tags; /* temp fix for the sometimes broken tag parser */ | |||
} Tokenizer; |
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -81,6 +81,8 @@ SOFTWARE. | |||
#define LC_TABLE_TD_LINE 0x0000000800000000 | |||
#define LC_TABLE_TH_LINE 0x0000001000000000 | |||
#define LC_HTML_ENTITY 0x0000002000000000 | |||
/* Global contexts */ | |||
#define GL_HEADING 0x1 | |||
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -445,6 +445,8 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | |||
Unicode this; | |||
int slashes, i; | |||
if (Tokenizer_check_route(self, LC_EXT_LINK_URI) < 0) | |||
return 0; | |||
if (Tokenizer_push(self, LC_EXT_LINK_URI)) | |||
return -1; | |||
if (Tokenizer_read(self, 0) == '/' && Tokenizer_read(self, 1) == '/') { | |||
@@ -461,7 +463,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | |||
while (1) { | |||
if (!valid[i]) | |||
goto end_of_loop; | |||
if (this == valid[i]) | |||
if (this == (Unicode) valid[i]) | |||
break; | |||
i++; | |||
} | |||
@@ -533,7 +535,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||
FAIL_ROUTE(0); | |||
return 0; | |||
} | |||
} while (chunk != valid[j++]); | |||
} while (chunk != (Unicode) valid[j++]); | |||
Textbuffer_write(scheme_buffer, chunk); | |||
} | |||
end_of_loop: | |||
@@ -552,7 +554,12 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||
return 0; | |||
} | |||
Py_DECREF(scheme); | |||
if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) { | |||
uint64_t new_context = self->topstack->context | LC_EXT_LINK_URI; | |||
if (Tokenizer_check_route(self, new_context) < 0) { | |||
Textbuffer_dealloc(scheme_buffer); | |||
return 0; | |||
} | |||
if (Tokenizer_push(self, new_context)) { | |||
Textbuffer_dealloc(scheme_buffer); | |||
return -1; | |||
} | |||
@@ -1000,7 +1007,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) | |||
while (1) { | |||
if (!valid[j]) | |||
FAIL_ROUTE_AND_EXIT() | |||
if (this == valid[j]) | |||
if (this == (Unicode) valid[j]) | |||
break; | |||
j++; | |||
} | |||
@@ -1065,11 +1072,14 @@ static int Tokenizer_parse_entity(Tokenizer* self) | |||
Py_ssize_t reset = self->head; | |||
PyObject *tokenlist; | |||
if (Tokenizer_push(self, 0)) | |||
if (Tokenizer_check_route(self, LC_HTML_ENTITY) < 0) | |||
goto on_bad_route; | |||
if (Tokenizer_push(self, LC_HTML_ENTITY)) | |||
return -1; | |||
if (Tokenizer_really_parse_entity(self)) | |||
return -1; | |||
if (BAD_ROUTE) { | |||
on_bad_route: | |||
RESET_ROUTE(); | |||
self->head = reset; | |||
if (Tokenizer_emit_char(self, '&')) | |||
@@ -1574,6 +1584,8 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) | |||
if (!data) | |||
return NULL; | |||
if (Tokenizer_check_route(self, LC_TAG_OPEN) < 0) | |||
return NULL; | |||
if (Tokenizer_push(self, LC_TAG_OPEN)) { | |||
TagData_dealloc(data); | |||
return NULL; | |||
@@ -2191,14 +2203,17 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token | |||
static int Tokenizer_parse_table(Tokenizer* self) | |||
{ | |||
Py_ssize_t reset = self->head; | |||
PyObject *style, *padding; | |||
PyObject *style, *padding, *trash; | |||
PyObject *table = NULL; | |||
self->head += 2; | |||
if(Tokenizer_push(self, LC_TABLE_OPEN)) | |||
if (Tokenizer_check_route(self, LC_TABLE_OPEN) < 0) | |||
goto on_bad_route; | |||
if (Tokenizer_push(self, LC_TABLE_OPEN)) | |||
return -1; | |||
padding = Tokenizer_handle_table_style(self, '\n'); | |||
if (BAD_ROUTE) { | |||
on_bad_route: | |||
RESET_ROUTE(); | |||
self->head = reset; | |||
if (Tokenizer_emit_char(self, '{')) | |||
@@ -2214,11 +2229,16 @@ static int Tokenizer_parse_table(Tokenizer* self) | |||
} | |||
self->head++; | |||
StackIdent restore_point = self->topstack->ident; | |||
table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); | |||
if (BAD_ROUTE) { | |||
RESET_ROUTE(); | |||
Py_DECREF(padding); | |||
Py_DECREF(style); | |||
while (!Tokenizer_IS_CURRENT_STACK(self, restore_point)) { | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
} | |||
self->head = reset; | |||
if (Tokenizer_emit_char(self, '{')) | |||
return -1; | |||
@@ -2243,7 +2263,7 @@ static int Tokenizer_parse_table(Tokenizer* self) | |||
*/ | |||
static int Tokenizer_handle_table_row(Tokenizer* self) | |||
{ | |||
PyObject *padding, *style, *row, *trash; | |||
PyObject *padding, *style, *row; | |||
self->head += 2; | |||
if (!Tokenizer_CAN_RECURSE(self)) { | |||
@@ -2253,14 +2273,13 @@ static int Tokenizer_handle_table_row(Tokenizer* self) | |||
return 0; | |||
} | |||
if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) | |||
if (Tokenizer_check_route(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN) < 0) | |||
return 0; | |||
if (Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) | |||
return -1; | |||
padding = Tokenizer_handle_table_style(self, '\n'); | |||
if (BAD_ROUTE) { | |||
trash = Tokenizer_pop(self); | |||
Py_XDECREF(trash); | |||
if (BAD_ROUTE) | |||
return 0; | |||
} | |||
if (!padding) | |||
return -1; | |||
style = Tokenizer_pop(self); | |||
@@ -2319,8 +2338,8 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, | |||
if (cell_context & LC_TABLE_CELL_STYLE) { | |||
Py_DECREF(cell); | |||
self->head = reset; | |||
if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | | |||
line_context)) | |||
if (Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | | |||
line_context)) | |||
return -1; | |||
padding = Tokenizer_handle_table_style(self, '|'); | |||
if (!padding) | |||
@@ -2541,6 +2560,8 @@ PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) | |||
PyObject* temp; | |||
if (push) { | |||
if (Tokenizer_check_route(self, context) < 0) | |||
return NULL; | |||
if (Tokenizer_push(self, context)) | |||
return NULL; | |||
} | |||
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -40,10 +40,11 @@ int Tokenizer_push(Tokenizer* self, uint64_t context) | |||
top->textbuffer = Textbuffer_new(&self->text); | |||
if (!top->textbuffer) | |||
return -1; | |||
top->ident.head = self->head; | |||
top->ident.context = context; | |||
top->next = self->topstack; | |||
self->topstack = top; | |||
self->depth++; | |||
self->cycles++; | |||
return 0; | |||
} | |||
@@ -130,12 +131,38 @@ PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) | |||
} | |||
/* | |||
Compare two route_tree_nodes that are in their avl_tree_node forms. | |||
*/ | |||
static int compare_nodes( | |||
const struct avl_tree_node* na, const struct avl_tree_node* nb) | |||
{ | |||
route_tree_node *a = avl_tree_entry(na, route_tree_node, node); | |||
route_tree_node *b = avl_tree_entry(nb, route_tree_node, node); | |||
if (a->id.head < b->id.head) | |||
return -1; | |||
if (a->id.head > b->id.head) | |||
return 1; | |||
return (a->id.context > b->id.context) - (a->id.context < b->id.context); | |||
} | |||
/* | |||
Fail the current tokenization route. Discards the current | |||
stack/context/textbuffer and sets the BAD_ROUTE flag. | |||
stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the | |||
ident of the failed stack so future parsing attempts down this route can be | |||
stopped early. | |||
*/ | |||
void* Tokenizer_fail_route(Tokenizer* self) | |||
{ | |||
uint64_t context = self->topstack->context; | |||
route_tree_node *node = malloc(sizeof(route_tree_node)); | |||
if (node) { | |||
node->id = self->topstack->ident; | |||
if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) | |||
free(node); | |||
} | |||
PyObject* stack = Tokenizer_pop(self); | |||
Py_XDECREF(stack); | |||
@@ -144,6 +171,31 @@ void* Tokenizer_fail_route(Tokenizer* self) | |||
} | |||
/* | |||
Check if pushing a new route here with the given context would definitely | |||
fail, based on a previous call to Tokenizer_fail_route() with the same | |||
stack. | |||
Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the | |||
latter case. | |||
This function is not necessary to call and works as an optimization | |||
implementation detail. (The Python tokenizer checks every route on push, | |||
but this would introduce too much overhead in C tokenizer due to the need | |||
to check for a bad route after every call to Tokenizer_push.) | |||
*/ | |||
int Tokenizer_check_route(Tokenizer* self, uint64_t context) | |||
{ | |||
StackIdent ident = {self->head, context}; | |||
struct avl_tree_node *node = (struct avl_tree_node*) (&ident + 1); | |||
if (avl_tree_lookup_node(self->bad_routes, node, compare_nodes)) { | |||
FAIL_ROUTE(context); | |||
return -1; | |||
} | |||
return 0; | |||
} | |||
/* | |||
Write a token to the current token stack. | |||
*/ | |||
int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) | |||
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -32,6 +32,7 @@ void Tokenizer_delete_top_of_stack(Tokenizer*); | |||
PyObject* Tokenizer_pop(Tokenizer*); | |||
PyObject* Tokenizer_pop_keeping_context(Tokenizer*); | |||
void* Tokenizer_fail_route(Tokenizer*); | |||
int Tokenizer_check_route(Tokenizer*, uint64_t); | |||
int Tokenizer_emit_token(Tokenizer*, PyObject*, int); | |||
int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int); | |||
@@ -47,10 +48,11 @@ Unicode Tokenizer_read_backwards(Tokenizer*, Py_ssize_t); | |||
/* Macros */ | |||
#define MAX_DEPTH 40 | |||
#define MAX_CYCLES 100000 | |||
#define Tokenizer_CAN_RECURSE(self) \ | |||
(self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES) | |||
(self->depth < MAX_DEPTH) | |||
/* True when the stack currently on top of `self` has the same ident (head
   position and context) as `id`. Both arguments are parenthesized so that
   arbitrary expressions can be passed safely. */
#define Tokenizer_IS_CURRENT_STACK(self, id)                                  \
    ((self)->topstack->ident.head == (id).head &&                             \
     (self)->topstack->ident.context == (id).context)
#define Tokenizer_emit(self, token) \ | |||
Tokenizer_emit_token(self, token, 0) | |||
@@ -1,5 +1,5 @@ | |||
/* | |||
Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
Permission is hereby granted, free of charge, to any person obtaining a copy of | |||
this software and associated documentation files (the "Software"), to deal in | |||
@@ -103,8 +103,9 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) | |||
return -1; | |||
init_tokenizer_text(&self->text); | |||
self->topstack = NULL; | |||
self->head = self->global = self->depth = self->cycles = 0; | |||
self->head = self->global = self->depth = 0; | |||
self->route_context = self->route_state = 0; | |||
self->bad_routes = NULL; | |||
self->skip_style_tags = 0; | |||
return 0; | |||
} | |||
@@ -158,10 +159,17 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||
return NULL; | |||
} | |||
self->head = self->global = self->depth = self->cycles = 0; | |||
self->head = self->global = self->depth = 0; | |||
self->skip_style_tags = skip_style_tags; | |||
self->bad_routes = NULL; | |||
tokens = Tokenizer_parse(self, context, 1); | |||
route_tree_node *n; | |||
avl_tree_for_each_in_postorder(n, self->bad_routes, route_tree_node, node) | |||
free(n); | |||
self->bad_routes = NULL; | |||
if (!tokens || self->topstack) { | |||
Py_XDECREF(tokens); | |||
if (PyErr_Occurred()) | |||
@@ -1,6 +1,6 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2016 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# Copyright (C) 2012-2017 Ben Kurtovic <ben.kurtovic@gmail.com> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
@@ -65,7 +65,6 @@ class Tokenizer(object): | |||
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", | |||
":", "/", "-", "!", "\n", START, END] | |||
MAX_DEPTH = 40 | |||
MAX_CYCLES = 100000 | |||
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) | |||
tag_splitter = re.compile(r"([\s\"\'\\]+)") | |||
@@ -75,7 +74,8 @@ class Tokenizer(object): | |||
self._stacks = [] | |||
self._global = 0 | |||
self._depth = 0 | |||
self._cycles = 0 | |||
self._bad_routes = set() | |||
self._skip_style_tags = False | |||
@property | |||
def _stack(self): | |||
@@ -100,11 +100,24 @@ class Tokenizer(object): | |||
def _textbuffer(self, value): | |||
self._stacks[-1][2] = value | |||
@property | |||
def _stack_ident(self): | |||
"""An identifier for the current stack. | |||
This is based on the starting head position and context. Stacks with | |||
the same identifier are always parsed in the same way. This can be used | |||
to cache intermediate parsing info. | |||
""" | |||
return self._stacks[-1][3] | |||
def _push(self, context=0): | |||
"""Add a new token stack, context, and textbuffer to the list.""" | |||
self._stacks.append([[], context, []]) | |||
new_ident = (self._head, context) | |||
if new_ident in self._bad_routes: | |||
raise BadRoute(context) | |||
self._stacks.append([[], context, [], new_ident]) | |||
self._depth += 1 | |||
self._cycles += 1 | |||
def _push_textbuffer(self): | |||
"""Push the textbuffer onto the stack as a Text node and clear it.""" | |||
@@ -129,7 +142,7 @@ class Tokenizer(object): | |||
def _can_recurse(self): | |||
"""Return whether or not our max recursion depth has been exceeded.""" | |||
return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES | |||
return self._depth < self.MAX_DEPTH | |||
def _fail_route(self): | |||
"""Fail the current tokenization route. | |||
@@ -138,6 +151,7 @@ class Tokenizer(object): | |||
:exc:`.BadRoute`. | |||
""" | |||
context = self._context | |||
self._bad_routes.add(self._stack_ident) | |||
self._pop() | |||
raise BadRoute(context) | |||
@@ -609,8 +623,8 @@ class Tokenizer(object): | |||
def _parse_entity(self): | |||
"""Parse an HTML entity at the head of the wikicode string.""" | |||
reset = self._head | |||
self._push() | |||
try: | |||
self._push(contexts.HTML_ENTITY) | |||
self._really_parse_entity() | |||
except BadRoute: | |||
self._head = reset | |||
@@ -650,8 +664,9 @@ class Tokenizer(object): | |||
self._emit_first(tokens.TagAttrQuote(char=data.quoter)) | |||
self._emit_all(self._pop()) | |||
buf = data.padding_buffer | |||
self._emit_first(tokens.TagAttrStart(pad_first=buf["first"], | |||
pad_before_eq=buf["before_eq"], pad_after_eq=buf["after_eq"])) | |||
self._emit_first(tokens.TagAttrStart( | |||
pad_first=buf["first"], pad_before_eq=buf["before_eq"], | |||
pad_after_eq=buf["after_eq"])) | |||
self._emit_all(self._pop()) | |||
for key in data.padding_buffer: | |||
data.padding_buffer[key] = "" | |||
@@ -1076,8 +1091,8 @@ class Tokenizer(object): | |||
"""Parse a wikicode table by starting with the first line.""" | |||
reset = self._head | |||
self._head += 2 | |||
self._push(contexts.TABLE_OPEN) | |||
try: | |||
self._push(contexts.TABLE_OPEN) | |||
padding = self._handle_table_style("\n") | |||
except BadRoute: | |||
self._head = reset | |||
@@ -1086,9 +1101,12 @@ class Tokenizer(object): | |||
style = self._pop() | |||
self._head += 1 | |||
restore_point = self._stack_ident | |||
try: | |||
table = self._parse(contexts.TABLE_OPEN) | |||
except BadRoute: | |||
while self._stack_ident != restore_point: | |||
self._pop() | |||
self._head = reset | |||
self._emit_text("{") | |||
return | |||
@@ -1106,11 +1124,7 @@ class Tokenizer(object): | |||
return | |||
self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) | |||
try: | |||
padding = self._handle_table_style("\n") | |||
except BadRoute: | |||
self._pop() | |||
raise | |||
padding = self._handle_table_style("\n") | |||
style = self._pop() | |||
# Don't parse the style separator: | |||
@@ -1348,7 +1362,8 @@ class Tokenizer(object): | |||
# Kill potential table contexts | |||
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS | |||
# Start of table parsing | |||
elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) or | |||
elif this == "{" and next == "|" and ( | |||
self._read(-1) in ("\n", self.START) or | |||
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): | |||
if self._can_recurse(): | |||
self._parse_table() | |||
@@ -1374,7 +1389,7 @@ class Tokenizer(object): | |||
self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS | |||
self._emit_text(this) | |||
elif (self._read(-1) in ("\n", self.START) or | |||
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): | |||
(self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): | |||
if this == "|" and next == "}": | |||
if self._context & contexts.TABLE_CELL_OPEN: | |||
return self._handle_table_cell_end() | |||
@@ -1406,10 +1421,12 @@ class Tokenizer(object): | |||
def tokenize(self, text, context=0, skip_style_tags=False): | |||
"""Build a list of tokens from a string of wikicode and return it.""" | |||
self._skip_style_tags = skip_style_tags | |||
split = self.regex.split(text) | |||
self._text = [segment for segment in split if segment] | |||
self._head = self._global = self._depth = self._cycles = 0 | |||
self._head = self._global = self._depth = 0 | |||
self._bad_routes = set() | |||
self._skip_style_tags = skip_style_tags | |||
try: | |||
tokens = self._parse(context) | |||
except BadRoute: # pragma: no cover (untestable/exceptional case) | |||
@@ -346,3 +346,10 @@ name: tables_in_templates_2 | |||
label: catch error handling mistakes when wikitables are inside templates | |||
input: "{{hello|test\n{|\n| }}" | |||
output: [TemplateOpen(), Text(text="hello"), TemplateParamSeparator(), Text(text="test\n{"), TemplateParamSeparator(), Text(text="\n"), TemplateParamSeparator(), Text(text=" "), TemplateClose()] | |||
--- | |||
name: many_invalid_nested_tags | |||
label: many unending nested tags that should be treated as plain text, followed by valid wikitext (see issues #42, #183) | |||
input: "<b><b><b><b><b><b><b><b><b><b><b><b><b><b><b><b><b><b>[[{{x}}" | |||
output: [Text(text="<b><b><b><b><b><b><b><b><b><b><b><b><b><b><b><b><b><b>[["), TemplateOpen(), Text(text="x"), TemplateClose()] |
@@ -694,4 +694,4 @@ output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ | |||
name: recursion_opens_and_closes | |||
label: test potentially dangerous recursion: template openings and closings | |||
input: "{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}" | |||
output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), TemplateOpen(), Text(text="x"), TemplateParamSeparator(), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x"), TemplateParamSeparator(), Text(text="{{x"), TemplateClose(), Text(text="{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}")] | |||
output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose()] |