From fac60dee48eb346f839c0145138151dbc5b7d999 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 16 May 2021 01:10:31 -0400 Subject: [PATCH] Fix parsing of nested wikilinks --- CHANGELOG | 1 + docs/changelog.rst | 1 + src/mwparserfromhell/parser/ctokenizer/tok_parse.c | 3 ++ src/mwparserfromhell/parser/tokenizer.py | 2 ++ tests/tokenizer/integration.mwtest | 7 +++++ tests/tokenizer/wikilinks.mwtest | 36 +++++++++++----------- 6 files changed, 32 insertions(+), 18 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 1500128..637180f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,7 @@ v0.7 (unreleased): - Improved parsing of external links. (#232) +- Fixed parsing of nested wikilinks. - Ported tests to pytest. (#237) - Moved mwparserfromhell package to src/ dir. diff --git a/docs/changelog.rst b/docs/changelog.rst index 496494d..68f548f 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -9,6 +9,7 @@ Unreleased - Improved parsing of external links. (`#232 <https://github.com/earwig/mwparserfromhell/issues/232>`_) +- Fixed parsing of nested wikilinks. - Ported tests to pytest. (`#237 <https://github.com/earwig/mwparserfromhell/issues/237>`_) - Moved mwparserfromhell package to src/ dir. 
diff --git a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c index d36ce56..6e9022d 100644 --- a/src/mwparserfromhell/parser/ctokenizer/tok_parse.c +++ b/src/mwparserfromhell/parser/ctokenizer/tok_parse.c @@ -2728,6 +2728,9 @@ PyObject* Tokenizer_parse(Tokenizer* self, uint64_t context, int push) return NULL; } else if (this == next && next == '[' && Tokenizer_CAN_RECURSE(self)) { + if (this_context & LC_WIKILINK_TEXT) { + return Tokenizer_fail_route(self); + } if (!(this_context & AGG_NO_WIKILINKS)) { if (Tokenizer_parse_wikilink(self)) return NULL; diff --git a/src/mwparserfromhell/parser/tokenizer.py b/src/mwparserfromhell/parser/tokenizer.py index 90f3425..76efd9b 100644 --- a/src/mwparserfromhell/parser/tokenizer.py +++ b/src/mwparserfromhell/parser/tokenizer.py @@ -1353,6 +1353,8 @@ class Tokenizer: return self._handle_argument_end() self._emit_text("}") elif this == nxt == "[" and self._can_recurse(): + if self._context & contexts.WIKILINK_TEXT: + self._fail_route() if not self._context & contexts.NO_WIKILINKS: self._parse_wikilink() else: diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index 58bcc74..8c41f8b 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -359,3 +359,10 @@ name: nested_templates_and_style_tags label: many nested templates and style tags, testing edge case behavior and error recovery near the recursion depth limit (see issue #224) input: "{{a|'''}}{{b|1='''c''}}{{d|1='''e''}}{{f|1='''g''}}{{h|1='''i''}}{{j|1='''k''}}{{l|1='''m''}}{{n|1='''o''}}{{p|1='''q''}}{{r|1=''s'''}}{{t|1='''u''}}{{v|1='''w''x'''y'''}}\n{|\n|-\n|'''\n|}" output: [TemplateOpen(), Text(text="a"), TemplateParamSeparator(), Text(text="'''"), TemplateClose(), TemplateOpen(), Text(text="b"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), 
Text(text="c"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="d"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="e"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="f"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="g"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="h"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="i"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="j"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="k"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="m"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="n"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="o''}}"), TemplateOpen(), Text(text="p"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="q"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), TemplateOpen(), Text(text="r"), TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="''s'''"), TemplateClose(), TemplateOpen(), Text(text="t"), 
TemplateParamSeparator(), Text(text="1"), TemplateParamEquals(), Text(text="'"), TagOpenOpen(wiki_markup="''"), Text(text="i"), TagCloseOpen(), Text(text="u"), TagOpenClose(), Text(text="i"), TagCloseClose(), TemplateClose(), Text(text="{{v|1="), TagOpenClose(), Text(text="b"), TagCloseClose(), Text(text="w''x"), TagOpenOpen(wiki_markup="'''"), Text(text="b"), TagCloseOpen(), Text(text="y"), TagOpenClose(), Text(text="b"), TagCloseClose(), TemplateClose(), Text(text="\n"), TagOpenOpen(wiki_markup="{|"), Text(text="table"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|-"), Text(text="tr"), TagCloseOpen(padding="\n"), TagOpenOpen(wiki_markup="|"), Text(text="td"), TagCloseOpen(padding=""), Text(text="'''\n"), TagOpenClose(wiki_markup=""), Text(text="td"), TagCloseClose(), TagOpenClose(wiki_markup=""), Text(text="tr"), TagCloseClose(), TagOpenClose(wiki_markup="|}"), Text(text="table"), TagCloseClose()] + +--- + +name: wikilink_nested_with_nowiki +label: wikilinks nested within the text of another, but surrounded by nowiki tags +input: [[foo|bar<nowiki>[[baz]][[qux]]</nowiki>]] +output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar"), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="[[baz]][[qux]]"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), WikilinkClose()] diff --git a/tests/tokenizer/wikilinks.mwtest b/tests/tokenizer/wikilinks.mwtest index c8b7c6f..1dab688 100644 --- a/tests/tokenizer/wikilinks.mwtest +++ b/tests/tokenizer/wikilinks.mwtest @@ -54,20 +54,6 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[b --- -name: nested -label: a wikilink nested within another -input: "[[foo|[[bar]]]]" -output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()] - ---- - -name: nested_padding -label: a wikilink nested within another, separated by other data -input: "[[foo|a[[b]]c]]" -output: [WikilinkOpen(), 
Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()] - ---- - name: invalid_newline label: invalid wikilink: newline as only content input: "[[\n]]" @@ -103,15 +89,29 @@ output: [Text(text="[[foo[bar]]")] --- -name: invalid_nested -label: invalid wikilink: trying to nest in the wrong context +name: invalid_nested_text +label: invalid wikilink: nested within the text of another +input: "[[foo|[[bar]]]]" +output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")] + +--- + +name: invalid_nested_text_2 +label: invalid wikilink: a wikilink nested within the text of another, with additional content +input: "[[foo|a[[b]]c]]" +output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")] + +--- + +name: invalid_nested_title +label: invalid wikilink: nested within the title of another input: "[[foo[[bar]]]]" output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")] --- -name: invalid_nested_padding -label: invalid wikilink: trying to nest in the wrong context, with a text param +name: invalid_nested_title_and_text +label: invalid wikilink: nested within the title of another, with a text param input: "[[foo[[bar]]|baz]]" output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="|baz]]")]