From d54509e190faf97e8adda385a47d09a7ab15901c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 30 Dec 2016 04:15:16 -0500 Subject: [PATCH 01/24] Fix release script. --- CHANGELOG | 4 ++++ docs/changelog.rst | 8 ++++++++ scripts/release.sh | 6 +++--- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 053b37e..05b64ef 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,7 @@ +v0.5 (unreleased): + +- Fixed release script after changes to PyPI. + v0.4.4 (released December 30, 2016): - Added support for Python 3.6. diff --git a/docs/changelog.rst b/docs/changelog.rst index 43400a9..ec12e6d 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,14 @@ Changelog ========= +v0.5 +---- + +Unreleased +(`changes `__): + +- Fixed release script after changes to PyPI. + v0.4.4 ------ diff --git a/scripts/release.sh b/scripts/release.sh index 1171718..4f1e9b0 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -117,11 +117,11 @@ test_release() { fi pip -q uninstall -y mwparserfromhell echo -n "Downloading mwparserfromhell source tarball and GPG signature..." - curl -sL "https://pypi.python.org/packages/source/m/mwparserfromhell/mwparserfromhell-$VERSION.tar.gz" -o "mwparserfromhell.tar.gz" - curl -sL "https://pypi.python.org/packages/source/m/mwparserfromhell/mwparserfromhell-$VERSION.tar.gz.asc" -o "mwparserfromhell.tar.gz.asc" + curl -sL "https://pypi.io/packages/source/m/mwparserfromhell/mwparserfromhell-$VERSION.tar.gz" -o "mwparserfromhell.tar.gz" + curl -sL "https://pypi.io/packages/source/m/mwparserfromhell/mwparserfromhell-$VERSION.tar.gz.asc" -o "mwparserfromhell.tar.gz.asc" echo " done." echo "Verifying tarball..." - gpg --verify mwparserfromhell.tar.gz.asc + gpg --verify mwparserfromhell.tar.gz.asc mwparserfromhell.tar.gz if [[ "$?" != "0" ]]; then echo "*** ERROR: GPG signature verification failed!" 
deactivate From f34f662f35075cd51c893979c2353ccf92e7c6a1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Tue, 10 Jan 2017 02:34:21 -0500 Subject: [PATCH 02/24] Fix len() sometimes raising ValueError on empty node lists (fixes #174) --- CHANGELOG | 1 + docs/changelog.rst | 1 + mwparserfromhell/smart_list.py | 2 +- tests/test_smart_list.py | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 05b64ef..f3728dd 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,6 @@ v0.5 (unreleased): +- Fixed len() sometimes raising ValueError on empty node lists. - Fixed release script after changes to PyPI. v0.4.4 (released December 30, 2016): diff --git a/docs/changelog.rst b/docs/changelog.rst index ec12e6d..edf5ab9 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,6 +7,7 @@ v0.5 Unreleased (`changes `__): +- Fixed ``len()`` sometimes raising ``ValueError`` on empty node lists. - Fixed release script after changes to PyPI. v0.4.4 diff --git a/mwparserfromhell/smart_list.py b/mwparserfromhell/smart_list.py index c59a363..e7fa59f 100644 --- a/mwparserfromhell/smart_list.py +++ b/mwparserfromhell/smart_list.py @@ -271,7 +271,7 @@ class _ListProxy(_SliceNormalizerMixIn, list): return bool(self._render()) def __len__(self): - return (self._stop - self._start) // self._step + return max((self._stop - self._start) // self._step, 0) def __getitem__(self, key): if isinstance(key, slice): diff --git a/tests/test_smart_list.py b/tests/test_smart_list.py index 0330aed..3de7db7 100644 --- a/tests/test_smart_list.py +++ b/tests/test_smart_list.py @@ -398,6 +398,7 @@ class TestSmartList(unittest.TestCase): self.assertEqual([4, 3, 2, 1.9, 1.8, 5, 6], child1) self.assertEqual([4, 3, 2, 1.9, 1.8], child2) self.assertEqual([], child3) + self.assertEqual(0, len(child3)) del child1 self.assertEqual([1, 4, 3, 2, 1.9, 1.8, 5, 6], parent) From 120d6a036607d911a58527ae43c789aa0cc348ed Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 14 Jan 2017 
23:32:05 -0600 Subject: [PATCH 03/24] Fix Wikicode.matches behavior on non-list/tuple iterables. --- CHANGELOG | 1 + docs/changelog.rst | 1 + mwparserfromhell/wikicode.py | 27 ++++++++++++++------------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index f3728dd..4988112 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,6 @@ v0.5 (unreleased): +- Fixed Wikicode.matches() on iterables besides lists and tuples. - Fixed len() sometimes raising ValueError on empty node lists. - Fixed release script after changes to PyPI. diff --git a/docs/changelog.rst b/docs/changelog.rst index edf5ab9..e1e8ac8 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,6 +7,7 @@ v0.5 Unreleased (`changes `__): +- Fixed :meth:`.Wikicode.matches` on iterables besides lists and tuples. - Fixed ``len()`` sometimes raising ``ValueError`` on empty node lists. - Fixed release script after changes to PyPI. diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index e3f6b92..447f6ff 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -24,7 +24,7 @@ from __future__ import unicode_literals from itertools import chain import re -from .compat import py3k, range, str +from .compat import bytes, py3k, range, str from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Node, Tag, Template, Text, Wikilink) from .string_mixin import StringMixIn @@ -413,22 +413,23 @@ class Wikicode(StringMixIn): """Do a loose equivalency test suitable for comparing page names. *other* can be any string-like object, including :class:`.Wikicode`, or - a tuple of these. This operation is symmetric; both sides are adjusted. 
- Specifically, whitespace and markup is stripped and the first letter's - case is normalized. Typical usage is + an iterable of these. This operation is symmetric; both sides are + adjusted. Specifically, whitespace and markup is stripped and the first + letter's case is normalized. Typical usage is ``if template.name.matches("stub"): ...``. """ cmp = lambda a, b: (a[0].upper() + a[1:] == b[0].upper() + b[1:] if a and b else a == b) this = self.strip_code().strip() - if isinstance(other, (tuple, list)): - for obj in other: - that = parse_anything(obj).strip_code().strip() - if cmp(this, that): - return True - return False - that = parse_anything(other).strip_code().strip() - return cmp(this, that) + if isinstance(other, (str, bytes, Wikicode, Node)): + that = parse_anything(other).strip_code().strip() + return cmp(this, that) + + for obj in other: + that = parse_anything(obj).strip_code().strip() + if cmp(this, that): + return True + return False def ifilter(self, recursive=True, matches=None, flags=FLAGS, forcetype=None): From 6159171e0464428a8568566d34e2ebffee413530 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 15 Jan 2017 04:12:34 -0600 Subject: [PATCH 04/24] Make Template.remove(keep_field=True) slightly more reasonable. --- CHANGELOG | 4 +++- docs/changelog.rst | 5 ++++- mwparserfromhell/nodes/template.py | 34 +++++++++++++++++++++------------- tests/test_template.py | 3 +++ 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 4988112..5b592cd 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,8 @@ v0.5 (unreleased): -- Fixed Wikicode.matches() on iterables besides lists and tuples. +- Made Template.remove(keep_field=True) behave more reasonably when the + parameter is already empty. +- Fixed Wikicode.matches()'s behavior on iterables besides lists and tuples. - Fixed len() sometimes raising ValueError on empty node lists. - Fixed release script after changes to PyPI. 
diff --git a/docs/changelog.rst b/docs/changelog.rst index e1e8ac8..bf0f492 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,7 +7,10 @@ v0.5 Unreleased (`changes `__): -- Fixed :meth:`.Wikicode.matches` on iterables besides lists and tuples. +- Made :meth:`Template.remove(keep_field=True) <.Template.remove>` behave more + reasonably when the parameter is already empty. +- Fixed :meth:`.Wikicode.matches`\ 's behavior on iterables besides lists and + tuples. - Fixed ``len()`` sometimes raising ``ValueError`` on empty node lists. - Fixed release script after changes to PyPI. diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 57fec70..ccc63fd 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -70,7 +70,8 @@ class Template(Node): get(param.value) write("}}") - def _surface_escape(self, code, char): + @staticmethod + def _surface_escape(code, char): """Return *code* with *char* escaped as an HTML entity. The main use of this is to escape pipes (``|``) or equal signs (``=``) @@ -82,7 +83,8 @@ class Template(Node): if char in node: code.replace(node, node.replace(char, replacement), False) - def _select_theory(self, theories): + @staticmethod + def _select_theory(theories): """Return the most likely spacing convention given different options. Given a dictionary of convention options as keys and their occurrence @@ -96,6 +98,22 @@ class Template(Node): if confidence >= 0.75: return tuple(theories.keys())[values.index(best)] + @staticmethod + def _blank_param_value(value): + """Remove the content from *value* while keeping its whitespace. 
+ + Replace *value*\ 's nodes with two text nodes, the first containing + whitespace from before its content and the second containing whitespace + from after its content. + """ + sval = str(value) + if sval.isspace(): + before, after = "", sval + else: + match = re.search(r"^(\s*).*?(\s*)$", sval, FLAGS) + before, after = match.group(1), match.group(2) + value.nodes = [Text(before), Text(after)] + def _get_spacing_conventions(self, use_names): """Try to determine the whitespace conventions for parameters. @@ -119,16 +137,6 @@ class Template(Node): after = self._select_theory(after_theories) return before, after - def _blank_param_value(self, value): - """Remove the content from *value* while keeping its whitespace. - - Replace *value*\ 's nodes with two text nodes, the first containing - whitespace from before its content and the second containing whitespace - from after its content. - """ - match = re.search(r"^(\s*).*?(\s*)$", str(value), FLAGS) - value.nodes = [Text(match.group(1)), Text(match.group(2))] - def _fix_dependendent_params(self, i): """Unhide keys if necessary after removing the param at index *i*.""" if not self.params[i].showkey: diff --git a/tests/test_template.py b/tests/test_template.py index c306b60..a97d6de 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -216,6 +216,7 @@ class TestTemplate(TreeEqualityTestCase): node39 = Template(wraptext("a"), [pgenh("1", " b ")]) node40 = Template(wraptext("a"), [pgenh("1", " b"), pgenh("2", " c")]) node41 = Template(wraptext("a"), [pgens("1", " b"), pgens("2", " c")]) + node42 = Template(wraptext("a"), [pgens("b", " \n")]) node1.add("e", "f", showkey=True) node2.add(2, "g", showkey=False) @@ -261,6 +262,7 @@ class TestTemplate(TreeEqualityTestCase): node39.add("1", "c") node40.add("3", "d") node41.add("3", "d") + node42.add("b", "hello") self.assertEqual("{{a|b=c|d|e=f}}", node1) self.assertEqual("{{a|b=c|d|g}}", node2) @@ -308,6 +310,7 @@ class TestTemplate(TreeEqualityTestCase): 
self.assertEqual("{{a|c}}", node39) self.assertEqual("{{a| b| c|d}}", node40) self.assertEqual("{{a|1= b|2= c|3= d}}", node41) + self.assertEqual("{{a|b=hello \n}}", node42) def test_remove(self): """test Template.remove()""" From 6ffdfa52efdde478d667add0b850742a084c9838 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 3 Mar 2017 20:42:23 -0600 Subject: [PATCH 05/24] Allow Wikicode objects to be pickled properly. --- CHANGELOG | 2 ++ docs/changelog.rst | 3 +++ mwparserfromhell/string_mixin.py | 3 +++ 3 files changed, 8 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 5b592cd..4480035 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,8 @@ v0.5 (unreleased): - Made Template.remove(keep_field=True) behave more reasonably when the parameter is already empty. +- Wikicode objects can now be pickled properly (fixed infinite recursion error + on incompletely-constructed StringMixIn subclasses). - Fixed Wikicode.matches()'s behavior on iterables besides lists and tuples. - Fixed len() sometimes raising ValueError on empty node lists. - Fixed release script after changes to PyPI. diff --git a/docs/changelog.rst b/docs/changelog.rst index bf0f492..669b448 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,6 +9,9 @@ Unreleased - Made :meth:`Template.remove(keep_field=True) <.Template.remove>` behave more reasonably when the parameter is already empty. +- :class:`.Wikicode` objects can now be pickled properly (fixed infinite + recursion error on incompletely-constructed :class:`.StringMixIn` + subclasses). - Fixed :meth:`.Wikicode.matches`\ 's behavior on iterables besides lists and tuples. - Fixed ``len()`` sometimes raising ``ValueError`` on empty node lists. 
diff --git a/mwparserfromhell/string_mixin.py b/mwparserfromhell/string_mixin.py index b5ba5a4..88898a1 100644 --- a/mwparserfromhell/string_mixin.py +++ b/mwparserfromhell/string_mixin.py @@ -108,6 +108,9 @@ class StringMixIn(object): return str(item) in self.__unicode__() def __getattr__(self, attr): + if not hasattr(str, attr): + raise AttributeError("{0!r} object has no attribute {1!r}".format( + type(self).__name__, attr)) return getattr(self.__unicode__(), attr) if py3k: From 68ded2f890c7965cc560471602f5cdad5ca435bc Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 18 Mar 2017 23:43:30 -0400 Subject: [PATCH 06/24] Add keep_template_params to Wikicode.strip_code (#175) --- CHANGELOG | 2 ++ docs/changelog.rst | 2 ++ mwparserfromhell/nodes/__init__.py | 2 +- mwparserfromhell/nodes/argument.py | 4 ++-- mwparserfromhell/nodes/external_link.py | 6 +++--- mwparserfromhell/nodes/heading.py | 4 ++-- mwparserfromhell/nodes/html_entity.py | 4 ++-- mwparserfromhell/nodes/tag.py | 4 ++-- mwparserfromhell/nodes/template.py | 6 ++++++ mwparserfromhell/nodes/text.py | 2 +- mwparserfromhell/nodes/wikilink.py | 6 +++--- mwparserfromhell/wikicode.py | 20 +++++++++++++++----- tests/test_argument.py | 8 +++----- tests/test_comment.py | 4 +--- tests/test_external_link.py | 11 +++++------ tests/test_heading.py | 4 +--- tests/test_html_entity.py | 14 +++++++------- tests/test_tag.py | 9 ++++----- tests/test_template.py | 19 +++++++++++++------ tests/test_text.py | 4 +--- tests/test_wikicode.py | 5 ++++- tests/test_wikilink.py | 6 ++---- 22 files changed, 82 insertions(+), 64 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 4480035..3832524 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,8 @@ v0.5 (unreleased): - Made Template.remove(keep_field=True) behave more reasonably when the parameter is already empty. +- Added the keep_template_params argument to Wikicode.strip_code(). If True, + then template parameters will be preserved in the output. 
- Wikicode objects can now be pickled properly (fixed infinite recursion error on incompletely-constructed StringMixIn subclasses). - Fixed Wikicode.matches()'s behavior on iterables besides lists and tuples. diff --git a/docs/changelog.rst b/docs/changelog.rst index 669b448..2c6be16 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,6 +9,8 @@ Unreleased - Made :meth:`Template.remove(keep_field=True) <.Template.remove>` behave more reasonably when the parameter is already empty. +- Added the *keep_template_params* argument to :meth:`.Wikicode.strip_code`. + If *True*, then template parameters will be preserved in the output. - :class:`.Wikicode` objects can now be pickled properly (fixed infinite recursion error on incompletely-constructed :class:`.StringMixIn` subclasses). diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index 91678c8..17ad3c3 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -58,7 +58,7 @@ class Node(StringMixIn): return yield # pragma: no cover (this is a generator that yields nothing) - def __strip__(self, normalize, collapse): + def __strip__(self, **kwargs): return None def __showtree__(self, write, get, mark): diff --git a/mwparserfromhell/nodes/argument.py b/mwparserfromhell/nodes/argument.py index 9146704..4259a35 100644 --- a/mwparserfromhell/nodes/argument.py +++ b/mwparserfromhell/nodes/argument.py @@ -47,9 +47,9 @@ class Argument(Node): if self.default is not None: yield self.default - def __strip__(self, normalize, collapse): + def __strip__(self, **kwargs): if self.default is not None: - return self.default.strip_code(normalize, collapse) + return self.default.strip_code(**kwargs) return None def __showtree__(self, write, get, mark): diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index 8493a25..f2659ab 100644 --- a/mwparserfromhell/nodes/external_link.py +++ 
b/mwparserfromhell/nodes/external_link.py @@ -49,12 +49,12 @@ class ExternalLink(Node): if self.title is not None: yield self.title - def __strip__(self, normalize, collapse): + def __strip__(self, **kwargs): if self.brackets: if self.title: - return self.title.strip_code(normalize, collapse) + return self.title.strip_code(**kwargs) return None - return self.url.strip_code(normalize, collapse) + return self.url.strip_code(**kwargs) def __showtree__(self, write, get, mark): if self.brackets: diff --git a/mwparserfromhell/nodes/heading.py b/mwparserfromhell/nodes/heading.py index 7bba702..79f3364 100644 --- a/mwparserfromhell/nodes/heading.py +++ b/mwparserfromhell/nodes/heading.py @@ -42,8 +42,8 @@ class Heading(Node): def __children__(self): yield self.title - def __strip__(self, normalize, collapse): - return self.title.strip_code(normalize, collapse) + def __strip__(self, **kwargs): + return self.title.strip_code(**kwargs) def __showtree__(self, write, get, mark): write("=" * self.level) diff --git a/mwparserfromhell/nodes/html_entity.py b/mwparserfromhell/nodes/html_entity.py index 8b7f270..d5e9d73 100644 --- a/mwparserfromhell/nodes/html_entity.py +++ b/mwparserfromhell/nodes/html_entity.py @@ -58,8 +58,8 @@ class HTMLEntity(Node): return "&#{0}{1};".format(self.hex_char, self.value) return "&#{0};".format(self.value) - def __strip__(self, normalize, collapse): - if normalize: + def __strip__(self, **kwargs): + if kwargs.get("normalize"): return self.normalize() return self diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index d393e2c..f0611a6 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -98,9 +98,9 @@ class Tag(Node): if not self.self_closing and not self.wiki_markup and self.closing_tag: yield self.closing_tag - def __strip__(self, normalize, collapse): + def __strip__(self, **kwargs): if self.contents and is_visible(self.tag): - return self.contents.strip_code(normalize, collapse) + return 
self.contents.strip_code(**kwargs) return None def __showtree__(self, write, get, mark): diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index ccc63fd..9c89fbd 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -58,6 +58,12 @@ class Template(Node): yield param.name yield param.value + def __strip__(self, **kwargs): + if kwargs.get("keep_template_params"): + parts = [param.value.strip_code(**kwargs) for param in self.params] + return " ".join(part for part in parts if part) + return None + def __showtree__(self, write, get, mark): write("{{") get(self.name) diff --git a/mwparserfromhell/nodes/text.py b/mwparserfromhell/nodes/text.py index 08ac205..a49930f 100644 --- a/mwparserfromhell/nodes/text.py +++ b/mwparserfromhell/nodes/text.py @@ -37,7 +37,7 @@ class Text(Node): def __unicode__(self): return self.value - def __strip__(self, normalize, collapse): + def __strip__(self, **kwargs): return self def __showtree__(self, write, get, mark): diff --git a/mwparserfromhell/nodes/wikilink.py b/mwparserfromhell/nodes/wikilink.py index f71b5f6..8f4bf7d 100644 --- a/mwparserfromhell/nodes/wikilink.py +++ b/mwparserfromhell/nodes/wikilink.py @@ -46,10 +46,10 @@ class Wikilink(Node): if self.text is not None: yield self.text - def __strip__(self, normalize, collapse): + def __strip__(self, **kwargs): if self.text is not None: - return self.text.strip_code(normalize, collapse) - return self.title.strip_code(normalize, collapse) + return self.text.strip_code(**kwargs) + return self.title.strip_code(**kwargs) def __showtree__(self, write, get, mark): write("[[") diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index 447f6ff..73aea41 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -531,23 +531,33 @@ class Wikicode(StringMixIn): # Ensure that earlier sections are earlier in the returned list: return [section for i, section in sorted(sections)] - def 
strip_code(self, normalize=True, collapse=True): + def strip_code(self, normalize=True, collapse=True, + keep_template_params=False): """Return a rendered string without unprintable code such as templates. The way a node is stripped is handled by the :meth:`~.Node.__strip__` method of :class:`.Node` objects, which generally return a subset of their nodes or ``None``. For example, templates and tags are removed completely, links are stripped to just - their display part, headings are stripped to just their title. If - *normalize* is ``True``, various things may be done to strip code + their display part, headings are stripped to just their title. + + If *normalize* is ``True``, various things may be done to strip code further, such as converting HTML entities like ``&Sigma;``, ``&#931;``, and ``&#x3a3;`` to ``Σ``. If *collapse* is ``True``, we will try to remove excess whitespace as well (three or more newlines are converted - to two, for example). + to two, for example). If *keep_template_params* is ``True``, then + template parameters will be preserved in the output (normally, they are + removed completely). 
""" + kwargs = { + "normalize": normalize, + "collapse": collapse, + "keep_template_params": keep_template_params + } + nodes = [] for node in self.nodes: - stripped = node.__strip__(normalize, collapse) + stripped = node.__strip__(**kwargs) if stripped: nodes.append(str(stripped)) diff --git a/tests/test_argument.py b/tests/test_argument.py index de12eab..6209b2f 100644 --- a/tests/test_argument.py +++ b/tests/test_argument.py @@ -56,12 +56,10 @@ class TestArgument(TreeEqualityTestCase): def test_strip(self): """test Argument.__strip__()""" - node = Argument(wraptext("foobar")) + node1 = Argument(wraptext("foobar")) node2 = Argument(wraptext("foo"), wraptext("bar")) - for a in (True, False): - for b in (True, False): - self.assertIs(None, node.__strip__(a, b)) - self.assertEqual("bar", node2.__strip__(a, b)) + self.assertIs(None, node1.__strip__()) + self.assertEqual("bar", node2.__strip__()) def test_showtree(self): """test Argument.__showtree__()""" diff --git a/tests/test_comment.py b/tests/test_comment.py index 97a6503..27129c9 100644 --- a/tests/test_comment.py +++ b/tests/test_comment.py @@ -49,9 +49,7 @@ class TestComment(TreeEqualityTestCase): def test_strip(self): """test Comment.__strip__()""" node = Comment("foobar") - for a in (True, False): - for b in (True, False): - self.assertIs(None, node.__strip__(a, b)) + self.assertIs(None, node.__strip__()) def test_showtree(self): """test Comment.__showtree__()""" diff --git a/tests/test_external_link.py b/tests/test_external_link.py index 3432ae1..8cb3158 100644 --- a/tests/test_external_link.py +++ b/tests/test_external_link.py @@ -66,12 +66,11 @@ class TestExternalLink(TreeEqualityTestCase): node2 = ExternalLink(wraptext("http://example.com")) node3 = ExternalLink(wraptext("http://example.com"), wrap([])) node4 = ExternalLink(wraptext("http://example.com"), wraptext("Link")) - for a in (True, False): - for b in (True, False): - self.assertEqual("http://example.com", node1.__strip__(a, b)) - 
self.assertEqual(None, node2.__strip__(a, b)) - self.assertEqual(None, node3.__strip__(a, b)) - self.assertEqual("Link", node4.__strip__(a, b)) + + self.assertEqual("http://example.com", node1.__strip__()) + self.assertEqual(None, node2.__strip__()) + self.assertEqual(None, node3.__strip__()) + self.assertEqual("Link", node4.__strip__()) def test_showtree(self): """test ExternalLink.__showtree__()""" diff --git a/tests/test_heading.py b/tests/test_heading.py index cb7ac8b..5e6776a 100644 --- a/tests/test_heading.py +++ b/tests/test_heading.py @@ -52,9 +52,7 @@ class TestHeading(TreeEqualityTestCase): def test_strip(self): """test Heading.__strip__()""" node = Heading(wraptext("foobar"), 3) - for a in (True, False): - for b in (True, False): - self.assertEqual("foobar", node.__strip__(a, b)) + self.assertEqual("foobar", node.__strip__()) def test_showtree(self): """test Heading.__showtree__()""" diff --git a/tests/test_html_entity.py b/tests/test_html_entity.py index 4aa176f..4db1c13 100644 --- a/tests/test_html_entity.py +++ b/tests/test_html_entity.py @@ -57,13 +57,13 @@ class TestHTMLEntity(TreeEqualityTestCase): node1 = HTMLEntity("nbsp", named=True, hexadecimal=False) node2 = HTMLEntity("107", named=False, hexadecimal=False) node3 = HTMLEntity("e9", named=False, hexadecimal=True) - for a in (True, False): - self.assertEqual("\xa0", node1.__strip__(True, a)) - self.assertEqual(" ", node1.__strip__(False, a)) - self.assertEqual("k", node2.__strip__(True, a)) - self.assertEqual("k", node2.__strip__(False, a)) - self.assertEqual("é", node3.__strip__(True, a)) - self.assertEqual("é", node3.__strip__(False, a)) + + self.assertEqual("\xa0", node1.__strip__(normalize=True)) + self.assertEqual(" ", node1.__strip__(normalize=False)) + self.assertEqual("k", node2.__strip__(normalize=True)) + self.assertEqual("k", node2.__strip__(normalize=False)) + self.assertEqual("é", node3.__strip__(normalize=True)) + self.assertEqual("é", node3.__strip__(normalize=False)) def 
test_showtree(self): """test HTMLEntity.__showtree__()""" diff --git a/tests/test_tag.py b/tests/test_tag.py index 0ac75a9..2e6d8a3 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -103,11 +103,10 @@ class TestTag(TreeEqualityTestCase): node1 = Tag(wraptext("i"), wraptext("foobar")) node2 = Tag(wraptext("math"), wraptext("foobar")) node3 = Tag(wraptext("br"), self_closing=True) - for a in (True, False): - for b in (True, False): - self.assertEqual("foobar", node1.__strip__(a, b)) - self.assertEqual(None, node2.__strip__(a, b)) - self.assertEqual(None, node3.__strip__(a, b)) + + self.assertEqual("foobar", node1.__strip__()) + self.assertEqual(None, node2.__strip__()) + self.assertEqual(None, node3.__strip__()) def test_showtree(self): """test Tag.__showtree__()""" diff --git a/tests/test_template.py b/tests/test_template.py index a97d6de..76a45cf 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -67,12 +67,19 @@ class TestTemplate(TreeEqualityTestCase): def test_strip(self): """test Template.__strip__()""" node1 = Template(wraptext("foobar")) - node2 = Template(wraptext("foo"), - [pgenh("1", "bar"), pgens("abc", "def")]) - for a in (True, False): - for b in (True, False): - self.assertEqual(None, node1.__strip__(a, b)) - self.assertEqual(None, node2.__strip__(a, b)) + node2 = Template(wraptext("foo"), [ + pgenh("1", "bar"), pgens("foo", ""), pgens("abc", "def")]) + node3 = Template(wraptext("foo"), [ + pgenh("1", "foo"), + Parameter(wraptext("2"), wrap([Template(wraptext("hello"))]), + showkey=False), + pgenh("3", "bar")]) + + self.assertEqual(None, node1.__strip__(keep_template_params=False)) + self.assertEqual(None, node2.__strip__(keep_template_params=False)) + self.assertEqual("", node1.__strip__(keep_template_params=True)) + self.assertEqual("bar def", node2.__strip__(keep_template_params=True)) + self.assertEqual("foo bar", node3.__strip__(keep_template_params=True)) def test_showtree(self): """test Template.__showtree__()""" diff 
--git a/tests/test_text.py b/tests/test_text.py index d890323..aaf8db2 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -49,9 +49,7 @@ class TestText(unittest.TestCase): def test_strip(self): """test Text.__strip__()""" node = Text("foobar") - for a in (True, False): - for b in (True, False): - self.assertIs(node, node.__strip__(a, b)) + self.assertIs(node, node.__strip__()) def test_showtree(self): """test Text.__showtree__()""" diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py index d0c11fd..5457920 100644 --- a/tests/test_wikicode.py +++ b/tests/test_wikicode.py @@ -433,7 +433,7 @@ class TestWikicode(TreeEqualityTestCase): """test Wikicode.strip_code()""" # Since individual nodes have test cases for their __strip__ methods, # we're only going to do an integration test: - code = parse("Foo [[bar]]\n\n{{baz}}\n\n[[a|b]] Σ") + code = parse("Foo [[bar]]\n\n{{baz|hello}}\n\n[[a|b]] Σ") self.assertEqual("Foo bar\n\nb Σ", code.strip_code(normalize=True, collapse=True)) self.assertEqual("Foo bar\n\n\n\nb Σ", @@ -442,6 +442,9 @@ class TestWikicode(TreeEqualityTestCase): code.strip_code(normalize=False, collapse=True)) self.assertEqual("Foo bar\n\n\n\nb Σ", code.strip_code(normalize=False, collapse=False)) + self.assertEqual("Foo bar\n\nhello\n\nb Σ", + code.strip_code(normalize=True, collapse=True, + keep_template_params=True)) def test_get_tree(self): """test Wikicode.get_tree()""" diff --git a/tests/test_wikilink.py b/tests/test_wikilink.py index 965d8d5..80116ca 100644 --- a/tests/test_wikilink.py +++ b/tests/test_wikilink.py @@ -58,10 +58,8 @@ class TestWikilink(TreeEqualityTestCase): """test Wikilink.__strip__()""" node = Wikilink(wraptext("foobar")) node2 = Wikilink(wraptext("foo"), wraptext("bar")) - for a in (True, False): - for b in (True, False): - self.assertEqual("foobar", node.__strip__(a, b)) - self.assertEqual("bar", node2.__strip__(a, b)) + self.assertEqual("foobar", node.__strip__()) + self.assertEqual("bar", node2.__strip__()) def 
test_showtree(self): """test Wikilink.__showtree__()""" From d7c755f5263cbd5d57ff0631b95b8dfded94daf5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 3 Jun 2017 19:17:21 -0400 Subject: [PATCH 07/24] Add Wikicode.contains(), Wikicode.get_ancestors(), Wikicode.get_parent() (#177) --- CHANGELOG | 4 +++ docs/changelog.rst | 5 ++++ mwparserfromhell/wikicode.py | 61 ++++++++++++++++++++++++++++++++++++++++++++ tests/test_wikicode.py | 27 ++++++++++++++++++++ 4 files changed, 97 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index 3832524..7d34015 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,9 @@ v0.5 (unreleased): +- Added Wikicode.contains() to determine whether a Node or Wikicode object is + contained within another Wikicode object. +- Added Wikicode.get_ancestors() and Wikicode.get_parent() to find all + ancestors and the direct parent of a Node, respectively. - Made Template.remove(keep_field=True) behave more reasonably when the parameter is already empty. - Added the keep_template_params argument to Wikicode.strip_code(). If True, diff --git a/docs/changelog.rst b/docs/changelog.rst index 2c6be16..4d0d6fd 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,6 +7,11 @@ v0.5 Unreleased (`changes `__): +- Added :meth:`.Wikicode.contains` to determine whether a :class:`.Node` or + :class:`.Wikicode` object is contained within another :class:`.Wikicode` + object. +- Added :meth:`.Wikicode.get_ancestors` and :meth:`.Wikicode.get_parent` to + find all ancestors and the direct parent of a :class:`.Node`, respectively. - Made :meth:`Template.remove(keep_field=True) <.Template.remove>` behave more reasonably when the parameter is already empty. - Added the *keep_template_params* argument to :meth:`.Wikicode.strip_code`. 
diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index 73aea41..4379b0a 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -275,6 +275,21 @@ class Wikicode(StringMixIn): else: self.nodes.pop(index) + def contains(self, obj): + """Return whether this Wikicode object contains *obj*. + + If *obj* is a :class:`.Node` or :class:`.Wikicode` object, then we + search for it exactly among all of our children, recursively. + Otherwise, this method just uses :meth:`.__contains__` on the string. + """ + if not isinstance(obj, (Node, Wikicode)): + return obj in self + try: + self._do_strong_search(obj, recursive=True) + except ValueError: + return False + return True + def index(self, obj, recursive=False): """Return the index of *obj* in the list of nodes. @@ -294,6 +309,52 @@ class Wikicode(StringMixIn): return i raise ValueError(obj) + def get_ancestors(self, obj): + """Return a list of all ancestor nodes of the :class:`.Node` *obj*. + + The list is ordered from the most shallow ancestor (greatest great- + grandparent) to the direct parent. The node itself is not included in + the list. For example:: + + >>> text = "{{a|{{b|{{c|{{d}}}}}}}}" + >>> code = mwparserfromhell.parse(text) + >>> node = code.filter_templates(matches=lambda n: n == "{{d}}")[0] + >>> code.get_ancestors(node) + ['{{a|{{b|{{c|{{d}}}}}}}}', '{{b|{{c|{{d}}}}}}', '{{c|{{d}}}}'] + + Will return an empty list if *obj* is at the top level of this Wikicode + object. Will raise :exc:`ValueError` if it wasn't found. 
+ """ + def _get_ancestors(code, needle): + for node in code.nodes: + if node is needle: + return [] + for code in node.__children__(): + ancestors = _get_ancestors(code, needle) + if ancestors is not None: + return [node] + ancestors + + if isinstance(obj, Wikicode): + obj = obj.get(0) + elif not isinstance(obj, Node): + raise ValueError(obj) + + ancestors = _get_ancestors(self, obj) + if ancestors is None: + raise ValueError(obj) + return ancestors + + def get_parent(self, obj): + """Return the direct parent node of the :class:`.Node` *obj*. + + This function is equivalent to calling :meth:`.get_ancestors` and + taking the last element of the resulting list. Will return None if + the node exists but does not have a parent; i.e., it is at the top + level of the Wikicode object. + """ + ancestors = self.get_ancestors(obj) + return ancestors[-1] if ancestors else None + def insert(self, index, value): """Insert *value* at *index* in the list of nodes. diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py index 5457920..c77fdd2 100644 --- a/tests/test_wikicode.py +++ b/tests/test_wikicode.py @@ -85,6 +85,17 @@ class TestWikicode(TreeEqualityTestCase): self.assertRaises(IndexError, code.set, 3, "{{baz}}") self.assertRaises(IndexError, code.set, -4, "{{baz}}") + def test_contains(self): + """test Wikicode.contains()""" + code = parse("Here is {{aaa|{{bbb|xyz{{ccc}}}}}} and a [[page|link]]") + tmpl1, tmpl2, tmpl3 = code.filter_templates() + tmpl4 = parse("{{ccc}}").filter_templates()[0] + self.assertTrue(code.contains(tmpl1)) + self.assertTrue(code.contains(tmpl3)) + self.assertFalse(code.contains(tmpl4)) + self.assertTrue(code.contains(str(tmpl4))) + self.assertTrue(code.contains(tmpl2.params[0].value)) + def test_index(self): """test Wikicode.index()""" code = parse("Have a {{template}} and a [[page|link]]") @@ -102,6 +113,22 @@ class TestWikicode(TreeEqualityTestCase): self.assertRaises(ValueError, code.index, code.get(1).get(1).value, recursive=False) + def 
test_get_ancestors_parent(self): + """test Wikicode.get_ancestors() and Wikicode.get_parent()""" + code = parse("{{a|{{b|{{d|{{e}}{{f}}}}{{g}}}}}}{{c}}") + tmpl = code.filter_templates(matches=lambda n: n.name == "f")[0] + parent1 = code.filter_templates(matches=lambda n: n.name == "d")[0] + parent2 = code.filter_templates(matches=lambda n: n.name == "b")[0] + parent3 = code.filter_templates(matches=lambda n: n.name == "a")[0] + fake = parse("{{f}}").get(0) + + self.assertEqual([parent3, parent2, parent1], code.get_ancestors(tmpl)) + self.assertIs(parent1, code.get_parent(tmpl)) + self.assertEqual([], code.get_ancestors(parent3)) + self.assertIs(None, code.get_parent(parent3)) + self.assertRaises(ValueError, code.get_ancestors, fake) + self.assertRaises(ValueError, code.get_parent, fake) + def test_insert(self): """test Wikicode.insert()""" code = parse("Have a {{template}} and a [[page|link]]") From a25304dc444a769c1159ca736aa2bc5a1e68c06a Mon Sep 17 00:00:00 2001 From: Larivact Date: Sun, 4 Jun 2017 11:45:15 +0200 Subject: [PATCH 08/24] partially rewrite Caveats, external link caveat "inherent limitation in wikicode" sounds misleading it's about generating an AST instead of HTML. --- README.rst | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index b7d324c..86143c6 100644 --- a/README.rst +++ b/README.rst @@ -115,21 +115,24 @@ Likewise, use ``unicode(code)`` in Python 2. Caveats ------- +mwparserfromhell generates an abstract syntax tree instead of HTML. +This has several implications: -An inherent limitation in wikicode prevents us from generating complete parse -trees in certain cases. For example, the string ``{{echo|''Hello}}, world!''`` -produces the valid output ``<i>Hello, world!</i>`` in MediaWiki, assuming -``{{echo}}`` is a template that returns its first parameter.
But since -representing this in mwparserfromhell's node tree would be impossible, we -compromise by treating the first node (i.e., the template) as plain text, -parsing only the italics. +* Crossed constructs like ``{{echo|''Hello}}, world!''`` are not supported, + since they cannot be represented in the node tree. We compromise by treating + the first node (i.e. the template) as plain text, parsing only the italics. -The current workaround for cases where you are not interested in text -formatting is to pass ``skip_style_tags=True`` to ``mwparserfromhell.parse()``. -This treats ``''`` and ``'''`` like plain text. + The current workaround for cases where you are not interested in text + formatting is to pass ``skip_style_tags=True`` to ``mwparserfromhell.parse()``. + This treats ``''`` and ``'''`` like plain text. -A future version of mwparserfromhell will include multiple parsing modes to get -around this restriction. + A future version of mwparserfromhell will include multiple parsing modes to get + around this restriction. + +* Templates adjacent to external links e.g. ``http://example.com{{foo}}`` are + considered part of the link, since mwparserfromhell does not know the + definition of templates and even if it did the template could only be + partially part of the link which also couldn't be represented in the AST. Integration ----------- From 2d89f611be365e181d2fa3df2bfbab6fde2ab07c Mon Sep 17 00:00:00 2001 From: Larivact Date: Sun, 4 Jun 2017 22:37:05 +0200 Subject: [PATCH 09/24] rewrite Caveats >not supported, since they cannot be represented in the node tree. It's not that they cannot be represented, it's that they would have to be evaluated. --- README.rst | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/README.rst b/README.rst index 86143c6..5ac605a 100644 --- a/README.rst +++ b/README.rst @@ -115,12 +115,18 @@ Likewise, use ``unicode(code)`` in Python 2. 
Caveats ------- -mwparserfromhell generates an abstract syntax tree instead of HTML. +While the MediaWiki parser generates HTML, mwparserfromhell acts as an interface to +the source code. mwparserfromhell therefore is unaware of template definitions since +if it would substitute templates with their output you could no longer change the templates. This has several implications: -* Crossed constructs like ``{{echo|''Hello}}, world!''`` are not supported, - since they cannot be represented in the node tree. We compromise by treating - the first node (i.e. the template) as plain text, parsing only the italics. +* Start and end tags generated by templates aren't recognized e.g. ``<b>foobar{{bold-end}}``. + +* Templates adjacent to external links e.g. ``http://example.com{{foo}}`` are + considered part of the link. + +* Crossed constructs like ``{{echo|''Hello}}, world!''`` are not supported. + We compromise by treating the first node as plain text. The current workaround for cases where you are not interested in text formatting is to pass ``skip_style_tags=True`` to ``mwparserfromhell.parse()``. @@ -129,11 +135,6 @@ This has several implications: A future version of mwparserfromhell will include multiple parsing modes to get around this restriction. -* Templates adjacent to external links e.g. ``http://example.com{{foo}}`` are - considered part of the link, since mwparserfromhell does not know the - definition of templates and even if it did the template could only be - partially part of the link which also couldn't be represented in the AST.
- Integration ----------- From 4d4a25152e7f504f27e8deaa9dc60cbec1981ac1 Mon Sep 17 00:00:00 2001 From: Larivact Date: Mon, 5 Jun 2017 07:38:06 +0200 Subject: [PATCH 10/24] Caveats -> Limitations, add Config unawareness --- README.rst | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 5ac605a..00fbd0b 100644 --- a/README.rst +++ b/README.rst @@ -113,20 +113,20 @@ saving the page!) by calling ``str()`` on it:: Likewise, use ``unicode(code)`` in Python 2. -Caveats -------- +Limitations +----------- While the MediaWiki parser generates HTML, mwparserfromhell acts as an interface to the source code. mwparserfromhell therefore is unaware of template definitions since -if it would substitute templates with their output you could no longer change the templates. -This has several implications: +if it would substitute templates with their output you would no longer be working +with the source code. This has several implications: * Start and end tags generated by templates aren't recognized e.g. ``<b>foobar{{bold-end}}``. * Templates adjacent to external links e.g. ``http://example.com{{foo}}`` are considered part of the link. -* Crossed constructs like ``{{echo|''Hello}}, world!''`` are not supported. - We compromise by treating the first node as plain text. +* Crossed constructs like ``{{echo|''Hello}}, world!''`` are not supported, + the first node is treated as plain text. The current workaround for cases where you are not interested in text formatting is to pass ``skip_style_tags=True`` to ``mwparserfromhell.parse()``. @@ -135,6 +135,17 @@ This has several implications: A future version of mwparserfromhell will include multiple parsing modes to get around this restriction. +Configuration unawareness +------------------------- + +* `word-ending links`_ are not supported since the linktrail rules are language-specific. + +* Localized namespace names aren't recognized, e.g.
``[[File:...]]`` + links are treated as regular wikilinks. + +* Anything that looks like an XML tag is parsed as a tag since, + the available tags are extension-dependent. + Integration ----------- @@ -178,6 +189,7 @@ Python 3 code (via the API_):: .. _GitHub: https://github.com/earwig/mwparserfromhell .. _Python Package Index: http://pypi.python.org .. _get pip: http://pypi.python.org/pypi/pip +.. _word-ending links: https://www.mediawiki.org/wiki/Help:Links#linktrail .. _EarwigBot: https://github.com/earwig/earwigbot .. _Pywikibot: https://www.mediawiki.org/wiki/Manual:Pywikibot .. _API: http://mediawiki.org/wiki/API From 2e486f7544c607d0d4d966114f28c6ad651cca52 Mon Sep 17 00:00:00 2001 From: Larivact Date: Mon, 5 Jun 2017 11:44:27 +0200 Subject: [PATCH 11/24] fix comma --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 00fbd0b..6fd3be5 100644 --- a/README.rst +++ b/README.rst @@ -143,8 +143,8 @@ Configuration unawareness * Localized namespace names aren't recognized, e.g. ``[[File:...]]`` links are treated as regular wikilinks. -* Anything that looks like an XML tag is parsed as a tag since, - the available tags are extension-dependent. +* Anything that looks like an XML tag is parsed as a tag + since the available tags are extension-dependent. Integration ----------- From 784e5e7b8d72738faf2cd0d1ad212f436199dbd1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 12 Jun 2017 23:46:45 -0400 Subject: [PATCH 12/24] Revise/add to new limitations section. 
--- README.rst | 53 +++++++++++++++++++++++++++++++--------------------- docs/caveats.rst | 17 ----------------- docs/index.rst | 2 +- docs/limitations.rst | 45 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 39 deletions(-) delete mode 100644 docs/caveats.rst create mode 100644 docs/limitations.rst diff --git a/README.rst b/README.rst index 6fd3be5..6316ed9 100644 --- a/README.rst +++ b/README.rst @@ -115,36 +115,47 @@ Likewise, use ``unicode(code)`` in Python 2. Limitations ----------- -While the MediaWiki parser generates HTML, mwparserfromhell acts as an interface to -the source code. mwparserfromhell therefore is unaware of template definitions since -if it would substitute templates with their output you would no longer be working -with the source code. This has several implications: -* Start and end tags generated by templates aren't recognized e.g. ``<b>foobar{{bold-end}}``. +While the MediaWiki parser generates HTML and has access to the contents of +templates, among other things, mwparserfromhell acts as a direct interface to +the source code only. This has several implications: -* Templates adjacent to external links e.g. ``http://example.com{{foo}}`` are - considered part of the link. +* Syntax elements produced by a template transclusion cannot be detected. For + example, imagine a hypothetical page ``"Template:End-bold"`` that contained + the text ``</b>``. While MediaWiki would correctly understand that + ``<b>foobar{{end-bold}}`` translates to ``<b>foobar</b>``, mwparserfromhell + has no way of examining the contents of ``{{end-bold}}``. Instead, it would + treat the bold tag as unfinished, possibly extending further down the page. -* Crossed constructs like ``{{echo|''Hello}}, world!''`` are not supported, - the first node is treated as plain text. +* Templates adjacent to external links, as in ``http://example.com{{foo}}``, + are considered part of the link. In reality, this would depend on the + contents of the template.
- The current workaround for cases where you are not interested in text - formatting is to pass ``skip_style_tags=True`` to ``mwparserfromhell.parse()``. - This treats ``''`` and ``'''`` like plain text. +* When different syntax elements cross over each other, as in + ``{{echo|''Hello}}, world!''``, the parser gets confused because this cannot + be represented by an ordinary syntax tree. Instead, the parser will treat the + first syntax construct as plain text. In this case, only the italic tag would + be properly parsed. - A future version of mwparserfromhell will include multiple parsing modes to get - around this restriction. + **Workaround:** Since this commonly occurs with text formatting and text + formatting is often not of interest to users, you may pass + *skip_style_tags=True* to ``mwparserfromhell.parse()``. This treats ``''`` + and ``'''`` as plain text. -Configuration unawareness -------------------------- + A future version of mwparserfromhell may include multiple parsing modes to + get around this restriction more sensibly. -* `word-ending links`_ are not supported since the linktrail rules are language-specific. +Additionally, the parser lacks awareness of certain wiki-specific settings: -* Localized namespace names aren't recognized, e.g. ``[[File:...]]`` - links are treated as regular wikilinks. +* `word-ending links`_ are not supported, since the linktrail rules are + language-specific. -* Anything that looks like an XML tag is parsed as a tag - since the available tags are extension-dependent. +* Localized namespace names aren't recognized, so file links (such as + ``[[File:...]]``) are treated as regular wikilinks. + +* Anything that looks like an XML tag is treated as a tag, even if it is not a + recognized tag name, since the list of valid tags depends on loaded MediaWiki + extensions. 
Integration ----------- diff --git a/docs/caveats.rst b/docs/caveats.rst deleted file mode 100644 index 927aa54..0000000 --- a/docs/caveats.rst +++ /dev/null @@ -1,17 +0,0 @@ -Caveats -======= - -An inherent limitation in wikicode prevents us from generating complete parse -trees in certain cases. For example, the string ``{{echo|''Hello}}, world!''`` -produces the valid output ``<i>Hello, world!</i>`` in MediaWiki, assuming -``{{echo}}`` is a template that returns its first parameter. But since -representing this in mwparserfromhell's node tree would be impossible, we -compromise by treating the first node (i.e., the template) as plain text, -parsing only the italics. - -The current workaround for cases where you are not interested in text -formatting is to pass *skip_style_tags=True* to :func:`mwparserfromhell.parse`. -This treats ``''`` and ``'''`` like plain text. - -A future version of mwparserfromhell will include multiple parsing modes to get -around this restriction. diff --git a/docs/index.rst b/docs/index.rst index 6593881..06dc2f9 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -40,7 +40,7 @@ Contents :maxdepth: 2 usage - caveats + limitations integration changelog API Reference <api/modules> diff --git a/docs/limitations.rst b/docs/limitations.rst new file mode 100644 index 0000000..7d5f7e7 --- /dev/null +++ b/docs/limitations.rst @@ -0,0 +1,45 @@ +Limitations +=========== + +While the MediaWiki parser generates HTML and has access to the contents of +templates, among other things, mwparserfromhell acts as a direct interface to +the source code only. This has several implications: + +* Syntax elements produced by a template transclusion cannot be detected. For + example, imagine a hypothetical page ``"Template:End-bold"`` that contained + the text ``</b>``. While MediaWiki would correctly understand that + ``<b>foobar{{end-bold}}`` translates to ``<b>foobar</b>``, mwparserfromhell + has no way of examining the contents of ``{{end-bold}}``.
Instead, it would + treat the bold tag as unfinished, possibly extending further down the page. + +* Templates adjacent to external links, as in ``http://example.com{{foo}}``, + are considered part of the link. In reality, this would depend on the + contents of the template. + +* When different syntax elements cross over each other, as in + ``{{echo|''Hello}}, world!''``, the parser gets confused because this cannot + be represented by an ordinary syntax tree. Instead, the parser will treat the + first syntax construct as plain text. In this case, only the italic tag would + be properly parsed. + + **Workaround:** Since this commonly occurs with text formatting and text + formatting is often not of interest to users, you may pass + *skip_style_tags=True* to ``mwparserfromhell.parse()``. This treats ``''`` + and ``'''`` as plain text. + + A future version of mwparserfromhell may include multiple parsing modes to + get around this restriction more sensibly. + +Additionally, the parser lacks awareness of certain wiki-specific settings: + +* `word-ending links`_ are not supported, since the linktrail rules are + language-specific. + +* Localized namespace names aren't recognized, so file links (such as + ``[[File:...]]``) are treated as regular wikilinks. + +* Anything that looks like an XML tag is treated as a tag, even if it is not a + recognized tag name, since the list of valid tags depends on loaded MediaWiki + extensions. + +.. _word-ending links: https://www.mediawiki.org/wiki/Help:Links#linktrail From f01bdc51eef11412a7b50b687aea8e655e7fffe5 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 12 Jun 2017 23:47:48 -0400 Subject: [PATCH 13/24] Capitalization [ci skip] --- README.rst | 4 ++-- docs/limitations.rst | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 6316ed9..ab1bef9 100644 --- a/README.rst +++ b/README.rst @@ -147,7 +147,7 @@ the source code only. 
This has several implications: Additionally, the parser lacks awareness of certain wiki-specific settings: -* `word-ending links`_ are not supported, since the linktrail rules are +* `Word-ending links`_ are not supported, since the linktrail rules are language-specific. * Localized namespace names aren't recognized, so file links (such as @@ -200,7 +200,7 @@ Python 3 code (via the API_):: .. _GitHub: https://github.com/earwig/mwparserfromhell .. _Python Package Index: http://pypi.python.org .. _get pip: http://pypi.python.org/pypi/pip -.. _word-ending links: https://www.mediawiki.org/wiki/Help:Links#linktrail +.. _Word-ending links: https://www.mediawiki.org/wiki/Help:Links#linktrail .. _EarwigBot: https://github.com/earwig/earwigbot .. _Pywikibot: https://www.mediawiki.org/wiki/Manual:Pywikibot .. _API: http://mediawiki.org/wiki/API diff --git a/docs/limitations.rst b/docs/limitations.rst index 7d5f7e7..294f4c5 100644 --- a/docs/limitations.rst +++ b/docs/limitations.rst @@ -32,7 +32,7 @@ the source code only. This has several implications: Additionally, the parser lacks awareness of certain wiki-specific settings: -* `word-ending links`_ are not supported, since the linktrail rules are +* `Word-ending links`_ are not supported, since the linktrail rules are language-specific. * Localized namespace names aren't recognized, so file links (such as @@ -42,4 +42,4 @@ Additionally, the parser lacks awareness of certain wiki-specific settings: recognized tag name, since the list of valid tags depends on loaded MediaWiki extensions. -.. _word-ending links: https://www.mediawiki.org/wiki/Help:Links#linktrail +.. _Word-ending links: https://www.mediawiki.org/wiki/Help:Links#linktrail From 08e5f7e1a5a3f67d1be0a339f4d3596f57f71f9b Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 12 Jun 2017 23:51:11 -0400 Subject: [PATCH 14/24] Forgot version bump, sigh. 
--- appveyor.yml | 2 +- docs/conf.py | 2 +- mwparserfromhell/__init__.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index daec144..d60b14b 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1,6 @@ # This config file is used by appveyor.com to build Windows release binaries -version: 0.4.4-b{build} +version: 0.5.dev0-b{build} branches: only: diff --git a/docs/conf.py b/docs/conf.py index 8d48dff..3739429 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -42,7 +42,7 @@ master_doc = 'index' # General information about the project. project = u'mwparserfromhell' -copyright = u'2012, 2013, 2014, 2015, 2016 Ben Kurtovic' +copyright = u'2012, 2013, 2014, 2015, 2016, 2017 Ben Kurtovic' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 1d3c7d7..64f3681 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode. __author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2012, 2013, 2014, 2015, 2016 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.4.4" +__version__ = "0.5.dev0" __email__ = "ben.kurtovic@gmail.com" from . 
import (compat, definitions, nodes, parser, smart_list, string_mixin, From 8a9c9224be6cb2020ed4ad67a401081096dd21d1 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 01:08:19 -0400 Subject: [PATCH 15/24] Speed up parsing deeply nested syntax by caching bad routes (fixes #42) Also removed the max cycles stop-gap, allowing much more complex pages to be parsed quickly without losing nodes at the end Also fixes #65, fixes #102, fixes #165, fixes #183 Also fixes #81 (Rafael Nadal parsing bug) Also fixes #53, fixes #58, fixes #88, fixes #152 (duplicate issues) --- CHANGELOG | 4 + LICENSE | 2 +- docs/changelog.rst | 9 +- mwparserfromhell/parser/contexts.py | 6 +- mwparserfromhell/parser/ctokenizer/avl_tree.c | 789 +++++++++++++++++++++++ mwparserfromhell/parser/ctokenizer/avl_tree.h | 358 ++++++++++ mwparserfromhell/parser/ctokenizer/common.h | 19 +- mwparserfromhell/parser/ctokenizer/contexts.h | 4 +- mwparserfromhell/parser/ctokenizer/tok_parse.c | 53 +- mwparserfromhell/parser/ctokenizer/tok_support.c | 58 +- mwparserfromhell/parser/ctokenizer/tok_support.h | 10 +- mwparserfromhell/parser/ctokenizer/tokenizer.c | 14 +- mwparserfromhell/parser/tokenizer.py | 55 +- tests/tokenizer/integration.mwtest | 7 + tests/tokenizer/templates.mwtest | 2 +- 15 files changed, 1337 insertions(+), 53 deletions(-) create mode 100644 mwparserfromhell/parser/ctokenizer/avl_tree.c create mode 100644 mwparserfromhell/parser/ctokenizer/avl_tree.h diff --git a/CHANGELOG b/CHANGELOG index 7d34015..bebacbf 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -4,6 +4,10 @@ v0.5 (unreleased): contained within another Wikicode object. - Added Wikicode.get_ancestors() and Wikicode.get_parent() to find all ancestors and the direct parent of a Node, respectively. +- Fixed a long-standing performance issue with deeply nested, invalid syntax + (issue #42). The parser should be much faster on certain complex pages. 
The + "max cycle" restriction has also been removed, so some situations where + templates at the end of a page were being skipped are now resolved. - Made Template.remove(keep_field=True) behave more reasonably when the parameter is already empty. - Added the keep_template_params argument to Wikicode.strip_code(). If True, diff --git a/LICENSE b/LICENSE index 230bc5c..588e737 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,4 @@ -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/docs/changelog.rst b/docs/changelog.rst index 4d0d6fd..c558579 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -12,6 +12,11 @@ Unreleased object. - Added :meth:`.Wikicode.get_ancestors` and :meth:`.Wikicode.get_parent` to find all ancestors and the direct parent of a :class:`.Node`, respectively. +- Fixed a long-standing performance issue with deeply nested, invalid syntax + (`issue #42 `_). The + parser should be much faster on certain complex pages. The "max cycle" + restriction has also been removed, so some situations where templates at the + end of a page were being skipped are now resolved. - Made :meth:`Template.remove(keep_field=True) <.Template.remove>` behave more reasonably when the parameter is already empty. - Added the *keep_template_params* argument to :meth:`.Wikicode.strip_code`. @@ -54,7 +59,7 @@ v0.4.3 v0.4.2 ------ -`Released July 30, 2015 `_ +`Released July 30, 2015 `__ (`changes `__): - Fixed setup script not including header files in releases. 
@@ -63,7 +68,7 @@ v0.4.2 v0.4.1 ------ -`Released July 30, 2015 `_ +`Released July 30, 2015 `__ (`changes `__): - The process for building Windows binaries has been fixed, and these should be diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 405a027..af6dea6 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -100,6 +100,8 @@ Local (stack-specific) contexts: * :const:`TABLE_TH_LINE` * :const:`TABLE_CELL_LINE_CONTEXTS` +* :const:`HTML_ENTITY` + Global contexts: * :const:`GL_HEADING` @@ -176,6 +178,8 @@ TABLE_CELL_LINE_CONTEXTS = TABLE_TD_LINE + TABLE_TH_LINE + TABLE_CELL_STYLE TABLE = (TABLE_OPEN + TABLE_CELL_OPEN + TABLE_CELL_STYLE + TABLE_ROW_OPEN + TABLE_TD_LINE + TABLE_TH_LINE) +HTML_ENTITY = 1 << 37 + # Global contexts: GL_HEADING = 1 << 0 diff --git a/mwparserfromhell/parser/ctokenizer/avl_tree.c b/mwparserfromhell/parser/ctokenizer/avl_tree.c new file mode 100644 index 0000000..4fdff6f --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/avl_tree.c @@ -0,0 +1,789 @@ +/* + * avl_tree.c - intrusive, nonrecursive AVL tree data structure (self-balancing + * binary search tree), implementation file + * + * Written in 2014-2016 by Eric Biggers + * + * To the extent possible under law, the author(s) have dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide via the Creative Commons Zero 1.0 Universal Public Domain + * Dedication (the "CC0"). + * + * This software is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. 
See the CC0 for more details. + * + * You should have received a copy of the CC0 along with this software; if not + * see . + */ + +#include "avl_tree.h" + +/* Returns the left child (sign < 0) or the right child (sign > 0) of the + * specified AVL tree node. + * Note: for all calls of this, 'sign' is constant at compilation time, + * so the compiler can remove the conditional. */ +static AVL_INLINE struct avl_tree_node * +avl_get_child(const struct avl_tree_node *parent, int sign) +{ + if (sign < 0) + return parent->left; + else + return parent->right; +} + +static AVL_INLINE struct avl_tree_node * +avl_tree_first_or_last_in_order(const struct avl_tree_node *root, int sign) +{ + const struct avl_tree_node *first = root; + + if (first) + while (avl_get_child(first, +sign)) + first = avl_get_child(first, +sign); + return (struct avl_tree_node *)first; +} + +/* Starts an in-order traversal of the tree: returns the least-valued node, or + * NULL if the tree is empty. */ +struct avl_tree_node * +avl_tree_first_in_order(const struct avl_tree_node *root) +{ + return avl_tree_first_or_last_in_order(root, -1); +} + +/* Starts a *reverse* in-order traversal of the tree: returns the + * greatest-valued node, or NULL if the tree is empty. */ +struct avl_tree_node * +avl_tree_last_in_order(const struct avl_tree_node *root) +{ + return avl_tree_first_or_last_in_order(root, 1); +} + +static AVL_INLINE struct avl_tree_node * +avl_tree_next_or_prev_in_order(const struct avl_tree_node *node, int sign) +{ + const struct avl_tree_node *next; + + if (avl_get_child(node, +sign)) + for (next = avl_get_child(node, +sign); + avl_get_child(next, -sign); + next = avl_get_child(next, -sign)) + ; + else + for (next = avl_get_parent(node); + next && node == avl_get_child(next, +sign); + node = next, next = avl_get_parent(next)) + ; + return (struct avl_tree_node *)next; +} + +/* Continues an in-order traversal of the tree: returns the next-greatest-valued + * node, or NULL if there is none. 
*/ +struct avl_tree_node * +avl_tree_next_in_order(const struct avl_tree_node *node) +{ + return avl_tree_next_or_prev_in_order(node, 1); +} + +/* Continues a *reverse* in-order traversal of the tree: returns the + * previous-greatest-valued node, or NULL if there is none. */ +struct avl_tree_node * +avl_tree_prev_in_order(const struct avl_tree_node *node) +{ + return avl_tree_next_or_prev_in_order(node, -1); +} + +/* Starts a postorder traversal of the tree. */ +struct avl_tree_node * +avl_tree_first_in_postorder(const struct avl_tree_node *root) +{ + const struct avl_tree_node *first = root; + + if (first) + while (first->left || first->right) + first = first->left ? first->left : first->right; + + return (struct avl_tree_node *)first; +} + +/* Continues a postorder traversal of the tree. @prev will not be dereferenced as + * it's allowed that its memory has been freed; @prev_parent must be its saved + * parent node. Returns NULL if there are no more nodes (i.e. @prev was the + * root of the tree). */ +struct avl_tree_node * +avl_tree_next_in_postorder(const struct avl_tree_node *prev, + const struct avl_tree_node *prev_parent) +{ + const struct avl_tree_node *next = prev_parent; + + if (next && prev == next->left && next->right) + for (next = next->right; + next->left || next->right; + next = next->left ? next->left : next->right) + ; + return (struct avl_tree_node *)next; +} + +/* Sets the left child (sign < 0) or the right child (sign > 0) of the + * specified AVL tree node. + * Note: for all calls of this, 'sign' is constant at compilation time, + * so the compiler can remove the conditional. */ +static AVL_INLINE void +avl_set_child(struct avl_tree_node *parent, int sign, + struct avl_tree_node *child) +{ + if (sign < 0) + parent->left = child; + else + parent->right = child; +} + +/* Sets the parent and balance factor of the specified AVL tree node.
*/ +static AVL_INLINE void +avl_set_parent_balance(struct avl_tree_node *node, struct avl_tree_node *parent, + int balance_factor) +{ + node->parent_balance = (uintptr_t)parent | (balance_factor + 1); +} + +/* Sets the parent of the specified AVL tree node. */ +static AVL_INLINE void +avl_set_parent(struct avl_tree_node *node, struct avl_tree_node *parent) +{ + node->parent_balance = (uintptr_t)parent | (node->parent_balance & 3); +} + +/* Returns the balance factor of the specified AVL tree node --- that is, the + * height of its right subtree minus the height of its left subtree. */ +static AVL_INLINE int +avl_get_balance_factor(const struct avl_tree_node *node) +{ + return (int)(node->parent_balance & 3) - 1; +} + +/* Adds @amount to the balance factor of the specified AVL tree node. + * The caller must ensure this still results in a valid balance factor + * (-1, 0, or 1). */ +static AVL_INLINE void +avl_adjust_balance_factor(struct avl_tree_node *node, int amount) +{ + node->parent_balance += amount; +} + +static AVL_INLINE void +avl_replace_child(struct avl_tree_node **root_ptr, + struct avl_tree_node *parent, + struct avl_tree_node *old_child, + struct avl_tree_node *new_child) +{ + if (parent) { + if (old_child == parent->left) + parent->left = new_child; + else + parent->right = new_child; + } else { + *root_ptr = new_child; + } +} + +/* + * Template for performing a single rotation --- + * + * sign > 0: Rotate clockwise (right) rooted at A: + * + * P? P? + * | | + * A B + * / \ / \ + * B C? => D? A + * / \ / \ + * D? E? E? C? + * + * (nodes marked with ? may not exist) + * + * sign < 0: Rotate counterclockwise (left) rooted at A: + * + * P? P? + * | | + * A B + * / \ / \ + * C? B => A D? + * / \ / \ + * E? D? C? E? + * + * This updates pointers but not balance factors! 
+ */ +static AVL_INLINE void +avl_rotate(struct avl_tree_node ** const root_ptr, + struct avl_tree_node * const A, const int sign) +{ + struct avl_tree_node * const B = avl_get_child(A, -sign); + struct avl_tree_node * const E = avl_get_child(B, +sign); + struct avl_tree_node * const P = avl_get_parent(A); + + avl_set_child(A, -sign, E); + avl_set_parent(A, B); + + avl_set_child(B, +sign, A); + avl_set_parent(B, P); + + if (E) + avl_set_parent(E, A); + + avl_replace_child(root_ptr, P, A, B); +} + +/* + * Template for performing a double rotation --- + * + * sign > 0: Rotate counterclockwise (left) rooted at B, then + * clockwise (right) rooted at A: + * + * P? P? P? + * | | | + * A A E + * / \ / \ / \ + * B C? => E C? => B A + * / \ / \ / \ / \ + * D? E B G? D? F?G? C? + * / \ / \ + * F? G? D? F? + * + * (nodes marked with ? may not exist) + * + * sign < 0: Rotate clockwise (right) rooted at B, then + * counterclockwise (left) rooted at A: + * + * P? P? P? + * | | | + * A A E + * / \ / \ / \ + * C? B => C? E => A B + * / \ / \ / \ / \ + * E D? G? B C? G?F? D? + * / \ / \ + * G? F? F? D? + * + * Returns a pointer to E and updates balance factors. Except for those + * two things, this function is equivalent to: + * avl_rotate(root_ptr, B, -sign); + * avl_rotate(root_ptr, A, +sign); + * + * See comment in avl_handle_subtree_growth() for explanation of balance + * factor updates. + */ +static AVL_INLINE struct avl_tree_node * +avl_do_double_rotate(struct avl_tree_node ** const root_ptr, + struct avl_tree_node * const B, + struct avl_tree_node * const A, const int sign) +{ + struct avl_tree_node * const E = avl_get_child(B, +sign); + struct avl_tree_node * const F = avl_get_child(E, -sign); + struct avl_tree_node * const G = avl_get_child(E, +sign); + struct avl_tree_node * const P = avl_get_parent(A); + const int e = avl_get_balance_factor(E); + + avl_set_child(A, -sign, G); + avl_set_parent_balance(A, E, ((sign * e >= 0) ? 
0 : -e)); + + avl_set_child(B, +sign, F); + avl_set_parent_balance(B, E, ((sign * e <= 0) ? 0 : -e)); + + avl_set_child(E, +sign, A); + avl_set_child(E, -sign, B); + avl_set_parent_balance(E, P, 0); + + if (G) + avl_set_parent(G, A); + + if (F) + avl_set_parent(F, B); + + avl_replace_child(root_ptr, P, A, E); + + return E; +} + +/* + * This function handles the growth of a subtree due to an insertion. + * + * @root_ptr + * Location of the tree's root pointer. + * + * @node + * A subtree that has increased in height by 1 due to an insertion. + * + * @parent + * Parent of @node; must not be NULL. + * + * @sign + * -1 if @node is the left child of @parent; + * +1 if @node is the right child of @parent. + * + * This function will adjust @parent's balance factor, then do a (single + * or double) rotation if necessary. The return value will be %true if + * the full AVL tree is now adequately balanced, or %false if the subtree + * rooted at @parent is now adequately balanced but has increased in + * height by 1, so the caller should continue up the tree. + * + * Note that if %false is returned, no rotation will have been done. + * Indeed, a single node insertion cannot require that more than one + * (single or double) rotation be done. + */ +static AVL_INLINE bool +avl_handle_subtree_growth(struct avl_tree_node ** const root_ptr, + struct avl_tree_node * const node, + struct avl_tree_node * const parent, + const int sign) +{ + int old_balance_factor, new_balance_factor; + + old_balance_factor = avl_get_balance_factor(parent); + + if (old_balance_factor == 0) { + avl_adjust_balance_factor(parent, sign); + /* @parent is still sufficiently balanced (-1 or +1 + * balance factor), but must have increased in height. + * Continue up the tree. */ + return false; + } + + new_balance_factor = old_balance_factor + sign; + + if (new_balance_factor == 0) { + avl_adjust_balance_factor(parent, sign); + /* @parent is now perfectly balanced (0 balance factor). 
+ * It cannot have increased in height, so there is + * nothing more to do. */ + return true; + } + + /* @parent is too left-heavy (new_balance_factor == -2) or + * too right-heavy (new_balance_factor == +2). */ + + /* Test whether @node is left-heavy (-1 balance factor) or + * right-heavy (+1 balance factor). + * Note that it cannot be perfectly balanced (0 balance factor) + * because here we are under the invariant that @node has + * increased in height due to the insertion. */ + if (sign * avl_get_balance_factor(node) > 0) { + + /* @node (B below) is heavy in the same direction @parent + * (A below) is heavy. + * + * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * The comment, diagram, and equations below assume sign < 0. + * The other case is symmetric! + * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * + * Do a clockwise rotation rooted at @parent (A below): + * + * A B + * / \ / \ + * B C? => D A + * / \ / \ / \ + * D E? F? G?E? C? + * / \ + * F? G? + * + * Before the rotation: + * balance(A) = -2 + * balance(B) = -1 + * Let x = height(C). Then: + * height(B) = x + 2 + * height(D) = x + 1 + * height(E) = x + * max(height(F), height(G)) = x. + * + * After the rotation: + * height(D) = max(height(F), height(G)) + 1 + * = x + 1 + * height(A) = max(height(E), height(C)) + 1 + * = max(x, x) + 1 = x + 1 + * balance(B) = 0 + * balance(A) = 0 + */ + avl_rotate(root_ptr, parent, -sign); + + /* Equivalent to setting @parent's balance factor to 0. */ + avl_adjust_balance_factor(parent, -sign); /* A */ + + /* Equivalent to setting @node's balance factor to 0. */ + avl_adjust_balance_factor(node, -sign); /* B */ + } else { + /* @node (B below) is heavy in the direction opposite + * from the direction @parent (A below) is heavy. + * + * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * The comment, diagram, and equations below assume sign < 0. + * The other case is symmetric! 
+ * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * + * Do a counterclockwise rotation rooted at @node (B below), + * then a clockwise rotation rooted at @parent (A below): + * + * A A E + * / \ / \ / \ + * B C? => E C? => B A + * / \ / \ / \ / \ + * D? E B G? D? F?G? C? + * / \ / \ + * F? G? D? F? + * + * Before the rotation: + * balance(A) = -2 + * balance(B) = +1 + * Let x = height(C). Then: + * height(B) = x + 2 + * height(E) = x + 1 + * height(D) = x + * max(height(F), height(G)) = x + * + * After both rotations: + * height(A) = max(height(G), height(C)) + 1 + * = x + 1 + * balance(A) = balance(E{orig}) >= 0 ? 0 : -balance(E{orig}) + * height(B) = max(height(D), height(F)) + 1 + * = x + 1 + * balance(B) = balance(E{orig}) <= 0 ? 0 : -balance(E{orig}) + * + * height(E) = x + 2 + * balance(E) = 0 + */ + avl_do_double_rotate(root_ptr, node, parent, -sign); + } + + /* Height after rotation is unchanged; nothing more to do. */ + return true; +} + +/* Rebalance the tree after insertion of the specified node. */ +void +avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr, + struct avl_tree_node *inserted) +{ + struct avl_tree_node *node, *parent; + bool done; + + inserted->left = NULL; + inserted->right = NULL; + + node = inserted; + + /* Adjust balance factor of new node's parent. + * No rotation will need to be done at this level. */ + + parent = avl_get_parent(node); + if (!parent) + return; + + if (node == parent->left) + avl_adjust_balance_factor(parent, -1); + else + avl_adjust_balance_factor(parent, +1); + + if (avl_get_balance_factor(parent) == 0) + /* @parent did not change in height. Nothing more to do. */ + return; + + /* The subtree rooted at @parent increased in height by 1. */ + + do { + /* Adjust balance factor of next ancestor. */ + + node = parent; + parent = avl_get_parent(node); + if (!parent) + return; + + /* The subtree rooted at @node has increased in height by 1.
*/ + if (node == parent->left) + done = avl_handle_subtree_growth(root_ptr, node, + parent, -1); + else + done = avl_handle_subtree_growth(root_ptr, node, + parent, +1); + } while (!done); +} + +/* + * This function handles the shrinkage of a subtree due to a deletion. + * + * @root_ptr + * Location of the tree's root pointer. + * + * @parent + * A node in the tree, exactly one of whose subtrees has decreased + * in height by 1 due to a deletion. (This includes the case where + * one of the child pointers has become NULL, since we can consider + * the "NULL" subtree to have a height of 0.) + * + * @sign + * +1 if the left subtree of @parent has decreased in height by 1; + * -1 if the right subtree of @parent has decreased in height by 1. + * + * @left_deleted_ret + * If the return value is not NULL, this will be set to %true if the + * left subtree of the returned node has decreased in height by 1, + * or %false if the right subtree of the returned node has decreased + * in height by 1. + * + * This function will adjust @parent's balance factor, then do a (single + * or double) rotation if necessary. The return value will be NULL if + * the full AVL tree is now adequately balanced, or a pointer to the + * parent of @parent if @parent is now adequately balanced but has + * decreased in height by 1. Also in the latter case, *left_deleted_ret + * will be set. + */ +static AVL_INLINE struct avl_tree_node * +avl_handle_subtree_shrink(struct avl_tree_node ** const root_ptr, + struct avl_tree_node *parent, + const int sign, + bool * const left_deleted_ret) +{ + struct avl_tree_node *node; + int old_balance_factor, new_balance_factor; + + old_balance_factor = avl_get_balance_factor(parent); + + if (old_balance_factor == 0) { + /* Prior to the deletion, the subtree rooted at + * @parent was perfectly balanced. It's now + * unbalanced by 1, but that's okay and its height + * hasn't changed. Nothing more to do. 
*/ + avl_adjust_balance_factor(parent, sign); + return NULL; + } + + new_balance_factor = old_balance_factor + sign; + + if (new_balance_factor == 0) { + /* The subtree rooted at @parent is now perfectly + * balanced, whereas before the deletion it was + * unbalanced by 1. Its height must have decreased + * by 1. No rotation is needed at this location, + * but continue up the tree. */ + avl_adjust_balance_factor(parent, sign); + node = parent; + } else { + /* @parent is too left-heavy (new_balance_factor == -2) or + * too right-heavy (new_balance_factor == +2). */ + + node = avl_get_child(parent, sign); + + /* The rotations below are similar to those done during + * insertion (see avl_handle_subtree_growth()), so full + * comments are not provided. The only new case is the + * one where @node has a balance factor of 0, and that is + * commented. */ + + if (sign * avl_get_balance_factor(node) >= 0) { + + avl_rotate(root_ptr, parent, -sign); + + if (avl_get_balance_factor(node) == 0) { + /* + * @node (B below) is perfectly balanced. + * + * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * The comment, diagram, and equations + * below assume sign < 0. The other case + * is symmetric! + * @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + * + * Do a clockwise rotation rooted at + * @parent (A below): + * + * A B + * / \ / \ + * B C? => D A + * / \ / \ / \ + * D E F? G?E C? + * / \ + * F? G? + * + * Before the rotation: + * balance(A) = -2 + * balance(B) = 0 + * Let x = height(C). Then: + * height(B) = x + 2 + * height(D) = x + 1 + * height(E) = x + 1 + * max(height(F), height(G)) = x. + * + * After the rotation: + * height(D) = max(height(F), height(G)) + 1 + * = x + 1 + * height(A) = max(height(E), height(C)) + 1 + * = max(x + 1, x) + 1 = x + 2 + * balance(A) = -1 + * balance(B) = +1 + */ + + /* A: -2 => -1 (sign < 0) + * or +2 => +1 (sign > 0) + * No change needed --- that's the same as + * old_balance_factor. 
*/ + + /* B: 0 => +1 (sign < 0) + * or 0 => -1 (sign > 0) */ + avl_adjust_balance_factor(node, -sign); + + /* Height is unchanged; nothing more to do. */ + return NULL; + } else { + avl_adjust_balance_factor(parent, -sign); + avl_adjust_balance_factor(node, -sign); + } + } else { + node = avl_do_double_rotate(root_ptr, node, + parent, -sign); + } + } + parent = avl_get_parent(node); + if (parent) + *left_deleted_ret = (node == parent->left); + return parent; +} + +/* Swaps node X, which must have 2 children, with its in-order successor, then + * unlinks node X. Returns the parent of X just before unlinking, without its + * balance factor having been updated to account for the unlink. */ +static AVL_INLINE struct avl_tree_node * +avl_tree_swap_with_successor(struct avl_tree_node **root_ptr, + struct avl_tree_node *X, + bool *left_deleted_ret) +{ + struct avl_tree_node *Y, *ret; + + Y = X->right; + if (!Y->left) { + /* + * P? P? P? + * | | | + * X Y Y + * / \ / \ / \ + * A Y => A X => A B? + * / \ / \ + * (0) B? (0) B? + * + * [ X unlinked, Y returned ] + */ + ret = Y; + *left_deleted_ret = false; + } else { + struct avl_tree_node *Q; + + do { + Q = Y; + Y = Y->left; + } while (Y->left); + + /* + * P? P? P? + * | | | + * X Y Y + * / \ / \ / \ + * A ... => A ... => A ... + * | | | + * Q Q Q + * / / / + * Y X B? + * / \ / \ + * (0) B? (0) B? + * + * + * [ X unlinked, Q returned ] + */ + + Q->left = Y->right; + if (Q->left) + avl_set_parent(Q->left, Q); + Y->right = X->right; + avl_set_parent(X->right, Y); + ret = Q; + *left_deleted_ret = true; + } + + Y->left = X->left; + avl_set_parent(X->left, Y); + + Y->parent_balance = X->parent_balance; + avl_replace_child(root_ptr, avl_get_parent(X), X, Y); + + return ret; +} + +/* + * Removes an item from the specified AVL tree. + * + * @root_ptr + * Location of the AVL tree's root pointer. 
Indirection is needed + * because the root node may change if the tree needed to be rebalanced + * because of the deletion or if @node was the root node. + * + * @node + * Pointer to the `struct avl_tree_node' embedded in the item to + * remove from the tree. + * + * Note: This function *only* removes the node and rebalances the tree. + * It does not free any memory, nor does it do the equivalent of + * avl_tree_node_set_unlinked(). + */ +void +avl_tree_remove(struct avl_tree_node **root_ptr, struct avl_tree_node *node) +{ + struct avl_tree_node *parent; + bool left_deleted = false; + + if (node->left && node->right) { + /* @node is fully internal, with two children. Swap it + * with its in-order successor (which must exist in the + * right subtree of @node and can have, at most, a right + * child), then unlink @node. */ + parent = avl_tree_swap_with_successor(root_ptr, node, + &left_deleted); + /* @parent is now the parent of what was @node's in-order + * successor. It cannot be NULL, since @node itself was + * an ancestor of its in-order successor. + * @left_deleted has been set to %true if @node's + * in-order successor was the left child of @parent, + * otherwise %false. */ + } else { + struct avl_tree_node *child; + + /* @node is missing at least one child. Unlink it. Set + * @parent to @node's parent, and set @left_deleted to + * reflect which child of @parent @node was. Or, if + * @node was the root node, simply update the root node + * and return. */ + child = node->left ? node->left : node->right; + parent = avl_get_parent(node); + if (parent) { + if (node == parent->left) { + parent->left = child; + left_deleted = true; + } else { + parent->right = child; + left_deleted = false; + } + if (child) + avl_set_parent(child, parent); + } else { + if (child) + avl_set_parent(child, parent); + *root_ptr = child; + return; + } + } + + /* Rebalance the tree. 
*/ + do { + if (left_deleted) + parent = avl_handle_subtree_shrink(root_ptr, parent, + +1, &left_deleted); + else + parent = avl_handle_subtree_shrink(root_ptr, parent, + -1, &left_deleted); + } while (parent); +} diff --git a/mwparserfromhell/parser/ctokenizer/avl_tree.h b/mwparserfromhell/parser/ctokenizer/avl_tree.h new file mode 100644 index 0000000..86ade3f --- /dev/null +++ b/mwparserfromhell/parser/ctokenizer/avl_tree.h @@ -0,0 +1,358 @@ +/* + * avl_tree.h - intrusive, nonrecursive AVL tree data structure (self-balancing + * binary search tree), header file + * + * Written in 2014-2016 by Eric Biggers + * + * To the extent possible under law, the author(s) have dedicated all copyright + * and related and neighboring rights to this software to the public domain + * worldwide via the Creative Commons Zero 1.0 Universal Public Domain + * Dedication (the "CC0"). + * + * This software is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the CC0 for more details. + * + * You should have received a copy of the CC0 along with this software; if not + * see . + */ + +#ifndef _AVL_TREE_H_ +#define _AVL_TREE_H_ + +#include +#include +#include /* for uintptr_t */ + +#ifdef __GNUC__ +# define AVL_INLINE inline __attribute__((always_inline)) +#else +# define AVL_INLINE inline +#endif + +/* Node in an AVL tree. Embed this in some other data structure. */ +struct avl_tree_node { + + /* Pointer to left child or NULL */ + struct avl_tree_node *left; + + /* Pointer to right child or NULL */ + struct avl_tree_node *right; + + /* Pointer to parent combined with the balance factor. This saves 4 or + * 8 bytes of memory depending on the CPU architecture. + * + * Low 2 bits: One greater than the balance factor of this subtree, + * which is equal to height(right) - height(left). 
The mapping is: + * + * 00 => -1 + * 01 => 0 + * 10 => +1 + * 11 => undefined + * + * The rest of the bits are the pointer to the parent node. It must be + * 4-byte aligned, and it will be NULL if this is the root node and + * therefore has no parent. */ + uintptr_t parent_balance; +}; + +/* Cast an AVL tree node to the containing data structure. */ +#define avl_tree_entry(entry, type, member) \ + ((type*) ((char *)(entry) - offsetof(type, member))) + +/* Returns a pointer to the parent of the specified AVL tree node, or NULL if it + * is already the root of the tree. */ +static AVL_INLINE struct avl_tree_node * +avl_get_parent(const struct avl_tree_node *node) +{ + return (struct avl_tree_node *)(node->parent_balance & ~3); +} + +/* Marks the specified AVL tree node as unlinked from any tree. */ +static AVL_INLINE void +avl_tree_node_set_unlinked(struct avl_tree_node *node) +{ + node->parent_balance = (uintptr_t)node; +} + +/* Returns true iff the specified AVL tree node has been marked with + * avl_tree_node_set_unlinked() and has not subsequently been inserted into a + * tree. */ +static AVL_INLINE bool +avl_tree_node_is_unlinked(const struct avl_tree_node *node) +{ + return node->parent_balance == (uintptr_t)node; +} + +/* (Internal use only) */ +extern void +avl_tree_rebalance_after_insert(struct avl_tree_node **root_ptr, + struct avl_tree_node *inserted); + +/* + * Looks up an item in the specified AVL tree. + * + * @root + * Pointer to the root of the AVL tree. (This can be NULL --- that just + * means the tree is empty.) + * + * @cmp_ctx + * First argument to pass to the comparison callback. This generally + * should be a pointer to an object equal to the one being searched for. + * + * @cmp + * Comparison callback. Must return < 0, 0, or > 0 if the first argument + * is less than, equal to, or greater than the second argument, + * respectively. 
The first argument will be @cmp_ctx and the second + * argument will be a pointer to the AVL tree node of an item in the tree. + * + * Returns a pointer to the AVL tree node of the resulting item, or NULL if the + * item was not found. + * + * Example: + * + * struct int_wrapper { + * int data; + * struct avl_tree_node index_node; + * }; + * + * static int _avl_cmp_int_to_node(const void *intptr, + * const struct avl_tree_node *nodeptr) + * { + * int n1 = *(const int *)intptr; + * int n2 = avl_tree_entry(nodeptr, struct int_wrapper, index_node)->data; + * if (n1 < n2) + * return -1; + * else if (n1 > n2) + * return 1; + * else + * return 0; + * } + * + * bool contains_int(struct avl_tree_node *root, int n) + * { + * struct avl_tree_node *result; + * + * result = avl_tree_lookup(root, &n, _avl_cmp_int_to_node); + * return result ? true : false; + * } + */ +static AVL_INLINE struct avl_tree_node * +avl_tree_lookup(const struct avl_tree_node *root, + const void *cmp_ctx, + int (*cmp)(const void *, const struct avl_tree_node *)) +{ + const struct avl_tree_node *cur = root; + + while (cur) { + int res = (*cmp)(cmp_ctx, cur); + if (res < 0) + cur = cur->left; + else if (res > 0) + cur = cur->right; + else + break; + } + return (struct avl_tree_node*)cur; +} + +/* Same as avl_tree_lookup(), but uses a more specific type for the comparison + * function. Specifically, with this function the item being searched for is + * expected to be in the same format as those already in the tree, with an + * embedded 'struct avl_tree_node'. 
*/ +static AVL_INLINE struct avl_tree_node * +avl_tree_lookup_node(const struct avl_tree_node *root, + const struct avl_tree_node *node, + int (*cmp)(const struct avl_tree_node *, + const struct avl_tree_node *)) +{ + const struct avl_tree_node *cur = root; + + while (cur) { + int res = (*cmp)(node, cur); + if (res < 0) + cur = cur->left; + else if (res > 0) + cur = cur->right; + else + break; + } + return (struct avl_tree_node*)cur; +} + +/* + * Inserts an item into the specified AVL tree. + * + * @root_ptr + * Location of the AVL tree's root pointer. Indirection is needed because + * the root node may change as a result of rotations caused by the + * insertion. Initialize *root_ptr to NULL for an empty tree. + * + * @item + * Pointer to the `struct avl_tree_node' embedded in the item to insert. + * No members in it need be pre-initialized, although members in the + * containing structure should be pre-initialized so that @cmp can use them + * in comparisons. + * + * @cmp + * Comparison callback. Must return < 0, 0, or > 0 if the first argument + * is less than, equal to, or greater than the second argument, + * respectively. The first argument will be @item and the second + * argument will be a pointer to an AVL tree node embedded in some + * previously-inserted item to which @item is being compared. + * + * If no item in the tree is comparatively equal (via @cmp) to @item, inserts + * @item and returns NULL. Otherwise does nothing and returns a pointer to the + * AVL tree node embedded in the previously-inserted item which compared equal + * to @item. 
+ * + * Example: + * + * struct int_wrapper { + * int data; + * struct avl_tree_node index_node; + * }; + * + * #define GET_DATA(i) avl_tree_entry((i), struct int_wrapper, index_node)->data + * + * static int _avl_cmp_ints(const struct avl_tree_node *node1, + * const struct avl_tree_node *node2) + * { + * int n1 = GET_DATA(node1); + * int n2 = GET_DATA(node2); + * if (n1 < n2) + * return -1; + * else if (n1 > n2) + * return 1; + * else + * return 0; + * } + * + * bool insert_int(struct avl_tree_node **root_ptr, int data) + * { + * struct int_wrapper *i = malloc(sizeof(struct int_wrapper)); + * i->data = data; + * if (avl_tree_insert(root_ptr, &i->index_node, _avl_cmp_ints)) { + * // Duplicate. + * free(i); + * return false; + * } + * return true; + * } + */ +static AVL_INLINE struct avl_tree_node * +avl_tree_insert(struct avl_tree_node **root_ptr, + struct avl_tree_node *item, + int (*cmp)(const struct avl_tree_node *, + const struct avl_tree_node *)) +{ + struct avl_tree_node **cur_ptr = root_ptr, *cur = NULL; + int res; + + while (*cur_ptr) { + cur = *cur_ptr; + res = (*cmp)(item, cur); + if (res < 0) + cur_ptr = &cur->left; + else if (res > 0) + cur_ptr = &cur->right; + else + return cur; + } + *cur_ptr = item; + item->parent_balance = (uintptr_t)cur | 1; + avl_tree_rebalance_after_insert(root_ptr, item); + return NULL; +} + +/* Removes an item from the specified AVL tree. + * See implementation for details. 
*/ +extern void +avl_tree_remove(struct avl_tree_node **root_ptr, struct avl_tree_node *node); + +/* Nonrecursive AVL tree traversal functions */ + +extern struct avl_tree_node * +avl_tree_first_in_order(const struct avl_tree_node *root); + +extern struct avl_tree_node * +avl_tree_last_in_order(const struct avl_tree_node *root); + +extern struct avl_tree_node * +avl_tree_next_in_order(const struct avl_tree_node *node); + +extern struct avl_tree_node * +avl_tree_prev_in_order(const struct avl_tree_node *node); + +extern struct avl_tree_node * +avl_tree_first_in_postorder(const struct avl_tree_node *root); + +extern struct avl_tree_node * +avl_tree_next_in_postorder(const struct avl_tree_node *prev, + const struct avl_tree_node *prev_parent); + +/* + * Iterate through the nodes in an AVL tree in sorted order. + * You may not modify the tree during the iteration. + * + * @child_struct + * Variable that will receive a pointer to each struct inserted into the + * tree. + * @root + * Root of the AVL tree. + * @struct_name + * Type of *child_struct. + * @struct_member + * Member of @struct_name type that is the AVL tree node. + * + * Example: + * + * struct int_wrapper { + * int data; + * struct avl_tree_node index_node; + * }; + * + * void print_ints(struct avl_tree_node *root) + * { + * struct int_wrapper *i; + * + * avl_tree_for_each_in_order(i, root, struct int_wrapper, index_node) + * printf("%d\n", i->data); + * } + */ +#define avl_tree_for_each_in_order(child_struct, root, \ + struct_name, struct_member) \ + for (struct avl_tree_node *_cur = \ + avl_tree_first_in_order(root); \ + _cur && ((child_struct) = \ + avl_tree_entry(_cur, struct_name, \ + struct_member), 1); \ + _cur = avl_tree_next_in_order(_cur)) + +/* + * Like avl_tree_for_each_in_order(), but uses the reverse order. 
+ */ +#define avl_tree_for_each_in_reverse_order(child_struct, root, \ + struct_name, struct_member) \ + for (struct avl_tree_node *_cur = \ + avl_tree_last_in_order(root); \ + _cur && ((child_struct) = \ + avl_tree_entry(_cur, struct_name, \ + struct_member), 1); \ + _cur = avl_tree_prev_in_order(_cur)) + +/* + * Like avl_tree_for_each_in_order(), but iterates through the nodes in + * postorder, so the current node may be deleted or freed. + */ +#define avl_tree_for_each_in_postorder(child_struct, root, \ + struct_name, struct_member) \ + for (struct avl_tree_node *_cur = \ + avl_tree_first_in_postorder(root), *_parent; \ + _cur && ((child_struct) = \ + avl_tree_entry(_cur, struct_name, \ + struct_member), 1) \ + && (_parent = avl_get_parent(_cur), 1); \ + _cur = avl_tree_next_in_postorder(_cur, _parent)) + +#endif /* _AVL_TREE_H_ */ diff --git a/mwparserfromhell/parser/ctokenizer/common.h b/mwparserfromhell/parser/ctokenizer/common.h index 3bd22af..f3d51f4 100644 --- a/mwparserfromhell/parser/ctokenizer/common.h +++ b/mwparserfromhell/parser/ctokenizer/common.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -30,6 +30,8 @@ SOFTWARE. 
#include #include +#include "avl_tree.h" + /* Compatibility macros */ #if PY_MAJOR_VERSION >= 3 @@ -92,10 +94,16 @@ typedef struct { #endif } Textbuffer; +typedef struct { + Py_ssize_t head; + uint64_t context; +} StackIdent; + struct Stack { PyObject* stack; uint64_t context; Textbuffer* textbuffer; + StackIdent ident; struct Stack* next; }; typedef struct Stack Stack; @@ -111,6 +119,13 @@ typedef struct { #endif } TokenizerInput; +typedef struct avl_tree_node avl_tree; + +typedef struct { + StackIdent id; + struct avl_tree_node node; +} route_tree_node; + typedef struct { PyObject_HEAD TokenizerInput text; /* text to tokenize */ @@ -118,8 +133,8 @@ typedef struct { Py_ssize_t head; /* current position in text */ int global; /* global context */ int depth; /* stack recursion depth */ - int cycles; /* total number of stack recursions */ int route_state; /* whether a BadRoute has been triggered */ uint64_t route_context; /* context when the last BadRoute was triggered */ + avl_tree* bad_routes; /* stack idents for routes known to fail */ int skip_style_tags; /* temp fix for the sometimes broken tag parser */ } Tokenizer; diff --git a/mwparserfromhell/parser/ctokenizer/contexts.h b/mwparserfromhell/parser/ctokenizer/contexts.h index 96afd6c..2696925 100644 --- a/mwparserfromhell/parser/ctokenizer/contexts.h +++ b/mwparserfromhell/parser/ctokenizer/contexts.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -81,6 +81,8 @@ SOFTWARE. 
#define LC_TABLE_TD_LINE 0x0000000800000000 #define LC_TABLE_TH_LINE 0x0000001000000000 +#define LC_HTML_ENTITY 0x0000002000000000 + /* Global contexts */ #define GL_HEADING 0x1 diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index f4e9606..27eed67 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -445,6 +445,8 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) Unicode this; int slashes, i; + if (Tokenizer_check_route(self, LC_EXT_LINK_URI) < 0) + return 0; if (Tokenizer_push(self, LC_EXT_LINK_URI)) return -1; if (Tokenizer_read(self, 0) == '/' && Tokenizer_read(self, 1) == '/') { @@ -461,7 +463,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) while (1) { if (!valid[i]) goto end_of_loop; - if (this == valid[i]) + if (this == (Unicode) valid[i]) break; i++; } @@ -533,7 +535,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) FAIL_ROUTE(0); return 0; } - } while (chunk != valid[j++]); + } while (chunk != (Unicode) valid[j++]); Textbuffer_write(scheme_buffer, chunk); } end_of_loop: @@ -552,7 +554,12 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) return 0; } Py_DECREF(scheme); - if (Tokenizer_push(self, self->topstack->context | LC_EXT_LINK_URI)) { + uint64_t new_context = self->topstack->context | LC_EXT_LINK_URI; + if (Tokenizer_check_route(self, new_context) < 0) { + Textbuffer_dealloc(scheme_buffer); + return 0; + } + if (Tokenizer_push(self, new_context)) { Textbuffer_dealloc(scheme_buffer); return -1; } @@ -1000,7 +1007,7 @@ static int Tokenizer_really_parse_entity(Tokenizer* self) while (1) { if (!valid[j]) 
FAIL_ROUTE_AND_EXIT() - if (this == valid[j]) + if (this == (Unicode) valid[j]) break; j++; } @@ -1065,11 +1072,14 @@ static int Tokenizer_parse_entity(Tokenizer* self) Py_ssize_t reset = self->head; PyObject *tokenlist; - if (Tokenizer_push(self, 0)) + if (Tokenizer_check_route(self, LC_HTML_ENTITY) < 0) + goto on_bad_route; + if (Tokenizer_push(self, LC_HTML_ENTITY)) return -1; if (Tokenizer_really_parse_entity(self)) return -1; if (BAD_ROUTE) { + on_bad_route: RESET_ROUTE(); self->head = reset; if (Tokenizer_emit_char(self, '&')) @@ -1574,6 +1584,8 @@ static PyObject* Tokenizer_really_parse_tag(Tokenizer* self) if (!data) return NULL; + if (Tokenizer_check_route(self, LC_TAG_OPEN) < 0) + return NULL; if (Tokenizer_push(self, LC_TAG_OPEN)) { TagData_dealloc(data); return NULL; @@ -2191,14 +2203,17 @@ static PyObject* Tokenizer_handle_table_style(Tokenizer* self, Unicode end_token static int Tokenizer_parse_table(Tokenizer* self) { Py_ssize_t reset = self->head; - PyObject *style, *padding; + PyObject *style, *padding, *trash; PyObject *table = NULL; self->head += 2; - if(Tokenizer_push(self, LC_TABLE_OPEN)) + if (Tokenizer_check_route(self, LC_TABLE_OPEN) < 0) + goto on_bad_route; + if (Tokenizer_push(self, LC_TABLE_OPEN)) return -1; padding = Tokenizer_handle_table_style(self, '\n'); if (BAD_ROUTE) { + on_bad_route: RESET_ROUTE(); self->head = reset; if (Tokenizer_emit_char(self, '{')) @@ -2214,11 +2229,16 @@ static int Tokenizer_parse_table(Tokenizer* self) } self->head++; + StackIdent restore_point = self->topstack->ident; table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); if (BAD_ROUTE) { RESET_ROUTE(); Py_DECREF(padding); Py_DECREF(style); + while (!Tokenizer_IS_CURRENT_STACK(self, restore_point)) { + trash = Tokenizer_pop(self); + Py_XDECREF(trash); + } self->head = reset; if (Tokenizer_emit_char(self, '{')) return -1; @@ -2243,7 +2263,7 @@ static int Tokenizer_parse_table(Tokenizer* self) */ static int Tokenizer_handle_table_row(Tokenizer* self) { - PyObject 
*padding, *style, *row, *trash; + PyObject *padding, *style, *row; self->head += 2; if (!Tokenizer_CAN_RECURSE(self)) { @@ -2253,14 +2273,13 @@ static int Tokenizer_handle_table_row(Tokenizer* self) return 0; } - if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) + if (Tokenizer_check_route(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN) < 0) + return 0; + if (Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_ROW_OPEN)) return -1; padding = Tokenizer_handle_table_style(self, '\n'); - if (BAD_ROUTE) { - trash = Tokenizer_pop(self); - Py_XDECREF(trash); + if (BAD_ROUTE) return 0; - } if (!padding) return -1; style = Tokenizer_pop(self); @@ -2319,8 +2338,8 @@ Tokenizer_handle_table_cell(Tokenizer* self, const char *markup, if (cell_context & LC_TABLE_CELL_STYLE) { Py_DECREF(cell); self->head = reset; - if(Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | - line_context)) + if (Tokenizer_push(self, LC_TABLE_OPEN | LC_TABLE_CELL_OPEN | + line_context)) return -1; padding = Tokenizer_handle_table_style(self, '|'); if (!padding) @@ -2541,6 +2560,8 @@ PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) PyObject* temp; if (push) { + if (Tokenizer_check_route(self, context) < 0) + return NULL; if (Tokenizer_push(self, context)) return NULL; } diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.c b/mwparserfromhell/parser/ctokenizer/tok_support.c index 31c6bb9..08bfe9c 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.c +++ b/mwparserfromhell/parser/ctokenizer/tok_support.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -40,10 +40,11 @@ int Tokenizer_push(Tokenizer* self, uint64_t context) top->textbuffer = Textbuffer_new(&self->text); if (!top->textbuffer) return -1; + top->ident.head = self->head; + top->ident.context = 
context; top->next = self->topstack; self->topstack = top; self->depth++; - self->cycles++; return 0; } @@ -130,12 +131,38 @@ PyObject* Tokenizer_pop_keeping_context(Tokenizer* self) } /* + Compare two route_tree_nodes that are in their avl_tree_node forms. +*/ +static int compare_nodes( + const struct avl_tree_node* na, const struct avl_tree_node* nb) +{ + route_tree_node *a = avl_tree_entry(na, route_tree_node, node); + route_tree_node *b = avl_tree_entry(nb, route_tree_node, node); + + if (a->id.head < b->id.head) + return -1; + if (a->id.head > b->id.head) + return 1; + return (a->id.context > b->id.context) - (a->id.context < b->id.context); +} + +/* Fail the current tokenization route. Discards the current - stack/context/textbuffer and sets the BAD_ROUTE flag. + stack/context/textbuffer and sets the BAD_ROUTE flag. Also records the + ident of the failed stack so future parsing attempts down this route can be + stopped early. */ void* Tokenizer_fail_route(Tokenizer* self) { uint64_t context = self->topstack->context; + + route_tree_node *node = malloc(sizeof(route_tree_node)); + if (node) { + node->id = self->topstack->ident; + if (avl_tree_insert(&self->bad_routes, &node->node, compare_nodes)) + free(node); + } + PyObject* stack = Tokenizer_pop(self); Py_XDECREF(stack); @@ -144,6 +171,31 @@ void* Tokenizer_fail_route(Tokenizer* self) } /* + Check if pushing a new route here with the given context would definitely + fail, based on a previous call to Tokenizer_fail_route() with the same + stack. + + Return 0 if safe and -1 if unsafe. The BAD_ROUTE flag will be set in the + latter case. + + This function is not necessary to call and works as an optimization + implementation detail. (The Python tokenizer checks every route on push, + but this would introduce too much overhead in C tokenizer due to the need + to check for a bad route after every call to Tokenizer_push.) 
+*/ +int Tokenizer_check_route(Tokenizer* self, uint64_t context) +{ + StackIdent ident = {self->head, context}; + struct avl_tree_node *node = (struct avl_tree_node*) (&ident + 1); + + if (avl_tree_lookup_node(self->bad_routes, node, compare_nodes)) { + FAIL_ROUTE(context); + return -1; + } + return 0; +} + +/* Write a token to the current token stack. */ int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.h b/mwparserfromhell/parser/ctokenizer/tok_support.h index 182f9a0..ccc6af5 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.h +++ b/mwparserfromhell/parser/ctokenizer/tok_support.h @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -32,6 +32,7 @@ void Tokenizer_delete_top_of_stack(Tokenizer*); PyObject* Tokenizer_pop(Tokenizer*); PyObject* Tokenizer_pop_keeping_context(Tokenizer*); void* Tokenizer_fail_route(Tokenizer*); +int Tokenizer_check_route(Tokenizer*, uint64_t); int Tokenizer_emit_token(Tokenizer*, PyObject*, int); int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int); @@ -47,10 +48,11 @@ Unicode Tokenizer_read_backwards(Tokenizer*, Py_ssize_t); /* Macros */ #define MAX_DEPTH 40 -#define MAX_CYCLES 100000 - #define Tokenizer_CAN_RECURSE(self) \ - (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES) + (self->depth < MAX_DEPTH) +#define Tokenizer_IS_CURRENT_STACK(self, id) \ + (self->topstack->ident.head == (id).head && \ + self->topstack->ident.context == (id).context) #define Tokenizer_emit(self, token) \ Tokenizer_emit_token(self, token, 0) diff --git a/mwparserfromhell/parser/ctokenizer/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tokenizer.c index 47d2993..213c47b 100644 --- a/mwparserfromhell/parser/ctokenizer/tokenizer.c +++ 
b/mwparserfromhell/parser/ctokenizer/tokenizer.c @@ -1,5 +1,5 @@ /* -Copyright (C) 2012-2016 Ben Kurtovic +Copyright (C) 2012-2017 Ben Kurtovic Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in @@ -103,8 +103,9 @@ static int Tokenizer_init(Tokenizer* self, PyObject* args, PyObject* kwds) return -1; init_tokenizer_text(&self->text); self->topstack = NULL; - self->head = self->global = self->depth = self->cycles = 0; + self->head = self->global = self->depth = 0; self->route_context = self->route_state = 0; + self->bad_routes = NULL; self->skip_style_tags = 0; return 0; } @@ -158,10 +159,17 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) return NULL; } - self->head = self->global = self->depth = self->cycles = 0; + self->head = self->global = self->depth = 0; self->skip_style_tags = skip_style_tags; + self->bad_routes = NULL; + tokens = Tokenizer_parse(self, context, 1); + route_tree_node *n; + avl_tree_for_each_in_postorder(n, self->bad_routes, route_tree_node, node) + free(n); + self->bad_routes = NULL; + if (!tokens || self->topstack) { Py_XDECREF(tokens); if (PyErr_Occurred()) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 309d0d3..b3e5883 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright (C) 2012-2016 Ben Kurtovic +# Copyright (C) 2012-2017 Ben Kurtovic # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -65,7 +65,6 @@ class Tokenizer(object): MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", ":", "/", "-", "!", "\n", START, END] MAX_DEPTH = 40 - MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) tag_splitter = 
re.compile(r"([\s\"\'\\]+)") @@ -75,7 +74,8 @@ class Tokenizer(object): self._stacks = [] self._global = 0 self._depth = 0 - self._cycles = 0 + self._bad_routes = set() + self._skip_style_tags = False @property def _stack(self): @@ -100,11 +100,24 @@ class Tokenizer(object): def _textbuffer(self, value): self._stacks[-1][2] = value + @property + def _stack_ident(self): + """An identifier for the current stack. + + This is based on the starting head position and context. Stacks with + the same identifier are always parsed in the same way. This can be used + to cache intermediate parsing info. + """ + return self._stacks[-1][3] + def _push(self, context=0): """Add a new token stack, context, and textbuffer to the list.""" - self._stacks.append([[], context, []]) + new_ident = (self._head, context) + if new_ident in self._bad_routes: + raise BadRoute(context) + + self._stacks.append([[], context, [], new_ident]) self._depth += 1 - self._cycles += 1 def _push_textbuffer(self): """Push the textbuffer onto the stack as a Text node and clear it.""" @@ -129,7 +142,7 @@ class Tokenizer(object): def _can_recurse(self): """Return whether or not our max recursion depth has been exceeded.""" - return self._depth < self.MAX_DEPTH and self._cycles < self.MAX_CYCLES + return self._depth < self.MAX_DEPTH def _fail_route(self): """Fail the current tokenization route. @@ -138,6 +151,7 @@ class Tokenizer(object): :exc:`.BadRoute`. 
""" context = self._context + self._bad_routes.add(self._stack_ident) self._pop() raise BadRoute(context) @@ -609,8 +623,8 @@ class Tokenizer(object): def _parse_entity(self): """Parse an HTML entity at the head of the wikicode string.""" reset = self._head - self._push() try: + self._push(contexts.HTML_ENTITY) self._really_parse_entity() except BadRoute: self._head = reset @@ -650,8 +664,9 @@ class Tokenizer(object): self._emit_first(tokens.TagAttrQuote(char=data.quoter)) self._emit_all(self._pop()) buf = data.padding_buffer - self._emit_first(tokens.TagAttrStart(pad_first=buf["first"], - pad_before_eq=buf["before_eq"], pad_after_eq=buf["after_eq"])) + self._emit_first(tokens.TagAttrStart( + pad_first=buf["first"], pad_before_eq=buf["before_eq"], + pad_after_eq=buf["after_eq"])) self._emit_all(self._pop()) for key in data.padding_buffer: data.padding_buffer[key] = "" @@ -1076,8 +1091,8 @@ class Tokenizer(object): """Parse a wikicode table by starting with the first line.""" reset = self._head self._head += 2 - self._push(contexts.TABLE_OPEN) try: + self._push(contexts.TABLE_OPEN) padding = self._handle_table_style("\n") except BadRoute: self._head = reset @@ -1086,9 +1101,12 @@ class Tokenizer(object): style = self._pop() self._head += 1 + restore_point = self._stack_ident try: table = self._parse(contexts.TABLE_OPEN) except BadRoute: + while self._stack_ident != restore_point: + self._pop() self._head = reset self._emit_text("{") return @@ -1106,11 +1124,7 @@ class Tokenizer(object): return self._push(contexts.TABLE_OPEN | contexts.TABLE_ROW_OPEN) - try: - padding = self._handle_table_style("\n") - except BadRoute: - self._pop() - raise + padding = self._handle_table_style("\n") style = self._pop() # Don't parse the style separator: @@ -1348,7 +1362,8 @@ class Tokenizer(object): # Kill potential table contexts self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS # Start of table parsing - elif this == "{" and next == "|" and (self._read(-1) in ("\n", self.START) 
or + elif this == "{" and next == "|" and ( + self._read(-1) in ("\n", self.START) or (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): if self._can_recurse(): self._parse_table() @@ -1374,7 +1389,7 @@ class Tokenizer(object): self._context &= ~contexts.TABLE_CELL_LINE_CONTEXTS self._emit_text(this) elif (self._read(-1) in ("\n", self.START) or - (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): + (self._read(-2) in ("\n", self.START) and self._read(-1).isspace())): if this == "|" and next == "}": if self._context & contexts.TABLE_CELL_OPEN: return self._handle_table_cell_end() @@ -1406,10 +1421,12 @@ class Tokenizer(object): def tokenize(self, text, context=0, skip_style_tags=False): """Build a list of tokens from a string of wikicode and return it.""" - self._skip_style_tags = skip_style_tags split = self.regex.split(text) self._text = [segment for segment in split if segment] - self._head = self._global = self._depth = self._cycles = 0 + self._head = self._global = self._depth = 0 + self._bad_routes = set() + self._skip_style_tags = skip_style_tags + try: tokens = self._parse(context) except BadRoute: # pragma: no cover (untestable/exceptional case) diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index 831f4d0..7137c50 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -346,3 +346,10 @@ name: tables_in_templates_2 label: catch error handling mistakes when wikitables are inside templates input: "{{hello|test\n{|\n| }}" output: [TemplateOpen(), Text(text="hello"), TemplateParamSeparator(), Text(text="test\n{"), TemplateParamSeparator(), Text(text="\n"), TemplateParamSeparator(), Text(text=" "), TemplateClose()] + +--- + +name: many_invalid_nested_tags +label: many unending nested tags that should be treated as plain text, followed by valid wikitext (see issues #42, #183) +input: "[[{{x}}" +output: [Text(text="[["), TemplateOpen(), Text(text="x"), 
TemplateClose()] diff --git a/tests/tokenizer/templates.mwtest b/tests/tokenizer/templates.mwtest index dccee37..8d30069 100644 --- a/tests/tokenizer/templates.mwtest +++ b/tests/tokenizer/templates.mwtest @@ -694,4 +694,4 @@ output: [Text(text="{{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ {{ name: recursion_opens_and_closes label: test potentially dangerous recursion: template openings and closings input: "{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}" -output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), TemplateOpen(), Text(text="x"), TemplateParamSeparator(), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x"), TemplateParamSeparator(), Text(text="{{x"), TemplateClose(), Text(text="{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}{{x|{{x}}")] +output: [Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose(), Text(text="{{x|"), TemplateOpen(), Text(text="x"), TemplateClose()] From 6ee61789da11a23720e743e14d856f7d5ed1c234 Mon 
Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 01:26:37 -0400 Subject: [PATCH 16/24] Fix compilation issue on Travis since GCC uses C90 by default there. --- mwparserfromhell/parser/ctokenizer/tok_support.c | 17 +++++++++++++++++ mwparserfromhell/parser/ctokenizer/tok_support.h | 1 + mwparserfromhell/parser/ctokenizer/tokenizer.c | 6 ++---- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.c b/mwparserfromhell/parser/ctokenizer/tok_support.c index 08bfe9c..f3814ed 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.c +++ b/mwparserfromhell/parser/ctokenizer/tok_support.c @@ -196,6 +196,23 @@ int Tokenizer_check_route(Tokenizer* self, uint64_t context) } /* + Free the tokenizer's bad route cache tree. Intended to be called by the + main tokenizer function after parsing is finished. +*/ +void Tokenizer_free_bad_route_tree(Tokenizer *self) +{ + struct avl_tree_node *cur = avl_tree_first_in_postorder(self->bad_routes); + struct avl_tree_node *parent; + while (cur) { + route_tree_node *node = avl_tree_entry(cur, route_tree_node, node); + parent = avl_get_parent(cur); + free(node); + cur = avl_tree_next_in_postorder(cur, parent); + } + self->bad_routes = NULL; +} + +/* Write a token to the current token stack. 
*/ int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.h b/mwparserfromhell/parser/ctokenizer/tok_support.h index ccc6af5..57f4126 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.h +++ b/mwparserfromhell/parser/ctokenizer/tok_support.h @@ -33,6 +33,7 @@ PyObject* Tokenizer_pop(Tokenizer*); PyObject* Tokenizer_pop_keeping_context(Tokenizer*); void* Tokenizer_fail_route(Tokenizer*); int Tokenizer_check_route(Tokenizer*, uint64_t); +void Tokenizer_free_bad_route_tree(Tokenizer*); int Tokenizer_emit_token(Tokenizer*, PyObject*, int); int Tokenizer_emit_token_kwargs(Tokenizer*, PyObject*, PyObject*, int); diff --git a/mwparserfromhell/parser/ctokenizer/tokenizer.c b/mwparserfromhell/parser/ctokenizer/tokenizer.c index 213c47b..9017909 100644 --- a/mwparserfromhell/parser/ctokenizer/tokenizer.c +++ b/mwparserfromhell/parser/ctokenizer/tokenizer.c @@ -22,6 +22,7 @@ SOFTWARE. #include "tokenizer.h" #include "tok_parse.h" +#include "tok_support.h" #include "tokens.h" /* Globals */ @@ -165,10 +166,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) tokens = Tokenizer_parse(self, context, 1); - route_tree_node *n; - avl_tree_for_each_in_postorder(n, self->bad_routes, route_tree_node, node) - free(n); - self->bad_routes = NULL; + Tokenizer_free_bad_route_tree(self); if (!tokens || self->topstack) { Py_XDECREF(tokens); From 2593675651d76abda2b03b93a1dd24910974ca16 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 01:47:08 -0400 Subject: [PATCH 17/24] Remove stdbool.h from avl_tree since MSVC doesn't like it. 
--- mwparserfromhell/parser/ctokenizer/avl_tree.c | 6 ++++++ mwparserfromhell/parser/ctokenizer/avl_tree.h | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/ctokenizer/avl_tree.c b/mwparserfromhell/parser/ctokenizer/avl_tree.c index 4fdff6f..dd034b2 100644 --- a/mwparserfromhell/parser/ctokenizer/avl_tree.c +++ b/mwparserfromhell/parser/ctokenizer/avl_tree.c @@ -3,6 +3,7 @@ * binary search tree), implementation file * * Written in 2014-2016 by Eric Biggers + * Slight changes for compatibility by Ben Kurtovic * * To the extent possible under law, the author(s) have dedicated all copyright * and related and neighboring rights to this software to the public domain @@ -17,6 +18,11 @@ * see . */ +#define false 0 +#define true 1 + +typedef int bool; + #include "avl_tree.h" /* Returns the left child (sign < 0) or the right child (sign > 0) of the diff --git a/mwparserfromhell/parser/ctokenizer/avl_tree.h b/mwparserfromhell/parser/ctokenizer/avl_tree.h index 86ade3f..86e2c75 100644 --- a/mwparserfromhell/parser/ctokenizer/avl_tree.h +++ b/mwparserfromhell/parser/ctokenizer/avl_tree.h @@ -3,6 +3,7 @@ * binary search tree), header file * * Written in 2014-2016 by Eric Biggers + * Slight changes for compatibility by Ben Kurtovic * * To the extent possible under law, the author(s) have dedicated all copyright * and related and neighboring rights to this software to the public domain @@ -20,7 +21,6 @@ #ifndef _AVL_TREE_H_ #define _AVL_TREE_H_ -#include #include #include /* for uintptr_t */ @@ -78,7 +78,7 @@ avl_tree_node_set_unlinked(struct avl_tree_node *node) /* Returns true iff the specified AVL tree node has been marked with * avl_tree_node_set_unlinked() and has not subsequently been inserted into a * tree. 
*/ -static AVL_INLINE bool +static AVL_INLINE int avl_tree_node_is_unlinked(const struct avl_tree_node *node) { return node->parent_balance == (uintptr_t)node; From 6ad3b9fb2ab8d05bd842c079df0b351aeae45c20 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 01:55:26 -0400 Subject: [PATCH 18/24] inttypes.h doesn't exist on Windows, so try using stdint.h --- mwparserfromhell/parser/ctokenizer/avl_tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mwparserfromhell/parser/ctokenizer/avl_tree.h b/mwparserfromhell/parser/ctokenizer/avl_tree.h index 86e2c75..f4869a6 100644 --- a/mwparserfromhell/parser/ctokenizer/avl_tree.h +++ b/mwparserfromhell/parser/ctokenizer/avl_tree.h @@ -22,7 +22,7 @@ #define _AVL_TREE_H_ #include -#include /* for uintptr_t */ +#include #ifdef __GNUC__ # define AVL_INLINE inline __attribute__((always_inline)) From dc0b3ae44686f4d69c1043983ae8c1da720f8186 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 02:08:09 -0400 Subject: [PATCH 19/24] Enable Windows builds on Python 3.6; try to fix again. 
--- appveyor.yml | 8 ++++++++ mwparserfromhell/parser/ctokenizer/avl_tree.h | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/appveyor.yml b/appveyor.yml index d60b14b..afe1450 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -52,6 +52,14 @@ environment: PYTHON_VERSION: "3.5" PYTHON_ARCH: "64" + - PYTHON: "C:\\Python36" + PYTHON_VERSION: "3.6" + PYTHON_ARCH: "32" + + - PYTHON: "C:\\Python36-x64" + PYTHON_VERSION: "3.6" + PYTHON_ARCH: "64" + install: - "%PIP% install --disable-pip-version-check --user --upgrade pip" - "%PIP% install wheel twine" diff --git a/mwparserfromhell/parser/ctokenizer/avl_tree.h b/mwparserfromhell/parser/ctokenizer/avl_tree.h index f4869a6..8508411 100644 --- a/mwparserfromhell/parser/ctokenizer/avl_tree.h +++ b/mwparserfromhell/parser/ctokenizer/avl_tree.h @@ -22,10 +22,17 @@ #define _AVL_TREE_H_ #include + +#if defined(_MSC_VER) && (_MSC_VER < 1600) +typedef unsigned long uintptr_t; +#else #include +#endif #ifdef __GNUC__ # define AVL_INLINE inline __attribute__((always_inline)) +#elif defined(_MSC_VER) && (_MSC_VER < 1900) +# define AVL_INLINE __inline #else # define AVL_INLINE inline #endif From 0ef6a2ffbe78e1031b46a3ba463cd014fb9a995e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 02:17:29 -0400 Subject: [PATCH 20/24] Fix declarations for C89 compatibility (forgot MSVC needed that...) 
--- mwparserfromhell/parser/ctokenizer/avl_tree.h | 4 +--- mwparserfromhell/parser/ctokenizer/tok_parse.c | 6 ++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mwparserfromhell/parser/ctokenizer/avl_tree.h b/mwparserfromhell/parser/ctokenizer/avl_tree.h index 8508411..9caa2bc 100644 --- a/mwparserfromhell/parser/ctokenizer/avl_tree.h +++ b/mwparserfromhell/parser/ctokenizer/avl_tree.h @@ -23,9 +23,7 @@ #include -#if defined(_MSC_VER) && (_MSC_VER < 1600) -typedef unsigned long uintptr_t; -#else +#if !defined(_MSC_VER) || (_MSC_VER >= 1600) #include #endif diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index 27eed67..f8e52ec 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -519,6 +519,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Unicode chunk; Py_ssize_t i; int slashes, j; + uint64_t new_context; if (!scheme_buffer) return -1; @@ -554,7 +555,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) return 0; } Py_DECREF(scheme); - uint64_t new_context = self->topstack->context | LC_EXT_LINK_URI; + new_context = self->topstack->context | LC_EXT_LINK_URI; if (Tokenizer_check_route(self, new_context) < 0) { Textbuffer_dealloc(scheme_buffer); return 0; @@ -2205,6 +2206,7 @@ static int Tokenizer_parse_table(Tokenizer* self) Py_ssize_t reset = self->head; PyObject *style, *padding, *trash; PyObject *table = NULL; + StackIdent restore_point; self->head += 2; if (Tokenizer_check_route(self, LC_TABLE_OPEN) < 0) @@ -2229,7 +2231,7 @@ static int Tokenizer_parse_table(Tokenizer* self) } self->head++; - StackIdent restore_point = self->topstack->ident; + restore_point = self->topstack->ident; table = Tokenizer_parse(self, LC_TABLE_OPEN, 1); if (BAD_ROUTE) { RESET_ROUTE(); From 5a99597eb3333508e504eb0debaa42b5561c8cae Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 02:19:08 -0400 Subject: 
[PATCH 21/24] Another C89 fix for MSVC. --- mwparserfromhell/parser/ctokenizer/tok_support.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/ctokenizer/tok_support.c b/mwparserfromhell/parser/ctokenizer/tok_support.c index f3814ed..062c631 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_support.c +++ b/mwparserfromhell/parser/ctokenizer/tok_support.c @@ -155,6 +155,7 @@ static int compare_nodes( void* Tokenizer_fail_route(Tokenizer* self) { uint64_t context = self->topstack->context; + PyObject* stack; route_tree_node *node = malloc(sizeof(route_tree_node)); if (node) { @@ -163,8 +164,7 @@ void* Tokenizer_fail_route(Tokenizer* self) free(node); } - PyObject* stack = Tokenizer_pop(self); - + stack = Tokenizer_pop(self); Py_XDECREF(stack); FAIL_ROUTE(context); return NULL; From 7308c8055ec50475ccd3df146b76ee6b986f789c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 02:46:06 -0400 Subject: [PATCH 22/24] Not perfect, but slightly better template param space guessing (#155) --- mwparserfromhell/nodes/template.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mwparserfromhell/nodes/template.py b/mwparserfromhell/nodes/template.py index 9c89fbd..58d25ae 100644 --- a/mwparserfromhell/nodes/template.py +++ b/mwparserfromhell/nodes/template.py @@ -136,6 +136,11 @@ class Template(Node): component = str(param.value) match = re.search(r"^(\s*).*?(\s*)$", component, FLAGS) before, after = match.group(1), match.group(2) + if not use_names and component.isspace() and "\n" in before: + # If the value is empty, we expect newlines in the whitespace + # to be after the content, not before it: + before, after = before.split("\n", 1) + after = "\n" + after before_theories[before] += 1 after_theories[after] += 1 From cd4f90e663fa421b836e93ddc56e4a573eefb664 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 04:14:17 -0400 Subject: [PATCH 23/24] Fix a rare parsing bug involving nested broken tags. 
--- CHANGELOG | 2 ++ docs/changelog.rst | 2 ++ mwparserfromhell/parser/ctokenizer/tok_parse.c | 8 ++++++++ mwparserfromhell/parser/tokenizer.py | 6 ++++++ tests/tokenizer/tags.mwtest | 7 +++++++ 5 files changed, 25 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index bebacbf..b52a70f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -16,6 +16,8 @@ v0.5 (unreleased): on incompletely-constructed StringMixIn subclasses). - Fixed Wikicode.matches()'s behavior on iterables besides lists and tuples. - Fixed len() sometimes raising ValueError on empty node lists. +- Fixed a rare parsing bug involving self-closing tags inside the attributes of + unpaired tags. - Fixed release script after changes to PyPI. v0.4.4 (released December 30, 2016): diff --git a/docs/changelog.rst b/docs/changelog.rst index c558579..b02437f 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -27,6 +27,8 @@ Unreleased - Fixed :meth:`.Wikicode.matches`\ 's behavior on iterables besides lists and tuples. - Fixed ``len()`` sometimes raising ``ValueError`` on empty node lists. +- Fixed a rare parsing bug involving self-closing tags inside the attributes of + unpaired tags. - Fixed release script after changes to PyPI. 
v0.4.4 diff --git a/mwparserfromhell/parser/ctokenizer/tok_parse.c b/mwparserfromhell/parser/ctokenizer/tok_parse.c index f8e52ec..90ee19d 100644 --- a/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -1548,6 +1548,14 @@ static PyObject* Tokenizer_handle_single_tag_end(Tokenizer* self) if (depth == 0) break; } + is_instance = PyObject_IsInstance(token, TagCloseSelfclose); + if (is_instance == -1) + return NULL; + else if (is_instance == 1) { + depth--; + if (depth == 0) // Should never happen + return NULL; + } } if (!token || depth > 0) return NULL; diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index b3e5883..d7a0282 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -819,6 +819,12 @@ class Tokenizer(object): depth -= 1 if depth == 0: break + elif isinstance(token, tokens.TagCloseSelfclose): + depth -= 1 + if depth == 0: # pragma: no cover (untestable/exceptional) + raise ParserError( + "_handle_single_tag_end() got an unexpected " + "TagCloseSelfclose") else: # pragma: no cover (untestable/exceptional case) raise ParserError("_handle_single_tag_end() missed a TagCloseOpen") padding = stack[index].padding diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 3c07ac9..40815a6 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -646,3 +646,10 @@ name: non_ascii_full label: an open/close tag pair containing non-ASCII characters input: "<éxamplé></éxamplé>" output: [TagOpenOpen(), Text(text="éxamplé"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="éxamplé"), TagCloseClose()] + +--- + +name: single_nested_selfclosing +label: a single (unpaired) tag with a self-closing tag in the middle (see issue #147) +input: "<li a <br/> c>foobar" +output: [TagOpenOpen(), Text(text="li"), TagAttrStart(pad_first=" ", pad_after_eq="", pad_before_eq=" "), Text(text="a"), TagAttrStart(pad_first="", pad_after_eq="", pad_before_eq=" "), TagOpenOpen(), Text(text="br"), TagCloseSelfclose(padding=""), TagAttrStart(pad_first="", pad_after_eq="", pad_before_eq=""), Text(text="c"), TagCloseSelfclose(padding="", implicit=True), Text(text="foobar")] From 3ffc13bfd47edb2b96425f05e45d7c8a29cea126 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Jun 2017 04:31:18 -0400 Subject: [PATCH 24/24] release/0.5 --- CHANGELOG | 2 +- appveyor.yml | 2 +- docs/changelog.rst | 4 ++-- mwparserfromhell/__init__.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index b52a70f..bdcf906 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,4 @@ -v0.5 (unreleased): +v0.5 (released June 23, 2017): - Added Wikicode.contains() to determine whether a Node or Wikicode object is contained within another Wikicode object. diff --git a/appveyor.yml b/appveyor.yml index afe1450..ff2ef4a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,6 +1,6 @@ # This config file is used by appveyor.com to build Windows release binaries -version: 0.5.dev0-b{build} +version: 0.5-b{build} branches: only: diff --git a/docs/changelog.rst b/docs/changelog.rst index b02437f..cf4e31a 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -4,8 +4,8 @@ Changelog v0.5 ---- -Unreleased -(`changes `__): +`Released June 23, 2017 `_ +(`changes `__): - Added :meth:`.Wikicode.contains` to determine whether a :class:`.Node` or :class:`.Wikicode` object is contained within another :class:`.Wikicode` diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 64f3681..17f9e97 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -29,7 +29,7 @@ outrageously powerful parser for `MediaWiki `_ wikicode.
__author__ = "Ben Kurtovic" __copyright__ = "Copyright (C) 2012, 2013, 2014, 2015, 2016 Ben Kurtovic" __license__ = "MIT License" -__version__ = "0.5.dev0" +__version__ = "0.5" __email__ = "ben.kurtovic@gmail.com" from . import (compat, definitions, nodes, parser, smart_list, string_mixin,