diff --git a/CHANGELOG b/CHANGELOG index 84dc148..2c94ebc 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -13,6 +13,7 @@ v0.4 (unreleased): - Given the frequency of issues with the (admittedly insufficient) tag parser, there's a temporary skip_style_tags argument to parse() that ignores '' and ''' until these issues are corrected. +- Fixed a parser bug involving nested wikilinks and external links. - C code cleanup and speed improvements. v0.3.2 (released September 1, 2013): diff --git a/docs/changelog.rst b/docs/changelog.rst index ada6e1e..6e1ce47 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -24,6 +24,7 @@ Unreleased there's a temporary *skip_style_tags* argument to :py:meth:`~mwparserfromhell.parse` that ignores ``''`` and ``'''`` until these issues are corrected. +- Fixed a parser bug involving nested wikilinks and external links. - C code cleanup and speed improvements. v0.3.2 diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 1d9adf1..28023b5 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -55,7 +55,6 @@ Local (stack-specific) contexts: * :py:const:`EXT_LINK_URI` * :py:const:`EXT_LINK_TITLE` - * :py:const:`EXT_LINK_BRACKETS` * :py:const:`HEADING` @@ -100,7 +99,8 @@ Aggregate contexts: * :py:const:`FAIL` * :py:const:`UNSAFE` * :py:const:`DOUBLE` -* :py:const:`INVALID_LINK` +* :py:const:`NO_WIKILINKS` +* :py:const:`NO_EXT_LINKS` """ @@ -121,38 +121,37 @@ WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT EXT_LINK_URI = 1 << 7 EXT_LINK_TITLE = 1 << 8 -EXT_LINK_BRACKETS = 1 << 9 -EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS - -HEADING_LEVEL_1 = 1 << 10 -HEADING_LEVEL_2 = 1 << 11 -HEADING_LEVEL_3 = 1 << 12 -HEADING_LEVEL_4 = 1 << 13 -HEADING_LEVEL_5 = 1 << 14 -HEADING_LEVEL_6 = 1 << 15 +EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + +HEADING_LEVEL_1 = 1 << 9 +HEADING_LEVEL_2 = 1 << 10 +HEADING_LEVEL_3 = 1 << 11 +HEADING_LEVEL_4 = 1 << 12 +HEADING_LEVEL_5 = 1 << 13 +HEADING_LEVEL_6 = 1 << 14 HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) -TAG_OPEN = 1 << 16 -TAG_ATTR = 1 << 17 -TAG_BODY = 1 << 18 -TAG_CLOSE = 1 << 19 +TAG_OPEN = 1 << 15 +TAG_ATTR = 1 << 16 +TAG_BODY = 1 << 17 +TAG_CLOSE = 1 << 18 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE -STYLE_ITALICS = 1 << 20 -STYLE_BOLD = 1 << 21 -STYLE_PASS_AGAIN = 1 << 22 -STYLE_SECOND_PASS = 1 << 23 +STYLE_ITALICS = 1 << 19 +STYLE_BOLD = 1 << 20 +STYLE_PASS_AGAIN = 1 << 21 +STYLE_SECOND_PASS = 1 << 22 STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS -DL_TERM = 1 << 24 +DL_TERM = 1 << 23 -HAS_TEXT = 1 << 25 -FAIL_ON_TEXT = 1 << 26 -FAIL_NEXT = 1 << 27 -FAIL_ON_LBRACE = 1 << 28 -FAIL_ON_RBRACE = 1 << 29 -FAIL_ON_EQUALS = 1 << 30 +HAS_TEXT = 1 << 24 +FAIL_ON_TEXT = 1 << 25 +FAIL_NEXT = 1 << 26 +FAIL_ON_LBRACE = 1 << 27 +FAIL_ON_RBRACE = 1 << 28 +FAIL_ON_EQUALS = 1 << 29 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) @@ -163,7 +162,8 @@ GL_HEADING = 1 << 0 # Aggregate contexts: FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE -UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + - ARGUMENT_NAME + TAG_CLOSE) +UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + EXT_LINK_TITLE + + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE -INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK +NO_WIKILINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URI +NO_EXT_LINKS = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 88f6490..de58e72 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1158,7 +1158,7 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) */ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) { - #define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK + #define INVALID_CONTEXT self->topstack->context & AGG_NO_EXT_LINKS #define NOT_A_LINK \ if (!brackets && self->topstack->context & LC_DLTERM) \ return Tokenizer_handle_dl_term(self); \ @@ -2440,10 +2440,8 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) { if (context & LC_FAIL_NEXT) return -1; - if (context & LC_WIKILINK) { - if (context & LC_WIKILINK_TEXT) - return (data == '[' && Tokenizer_READ(self, 1) == '[') ? -1 : 0; - else if (data == ']' || data == '{') + if (context & LC_WIKILINK_TITLE) { + if (data == ']' || data == '{') self->topstack->context |= LC_FAIL_NEXT; else if (data == '\n' || data == '[' || data == '}') return -1; @@ -2577,7 +2575,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) return NULL; } else if (this == next && next == '[' && Tokenizer_CAN_RECURSE(self)) { - if (!(this_context & AGG_INVALID_LINK)) { + if (!(this_context & AGG_NO_WIKILINKS)) { if (Tokenizer_parse_wikilink(self)) return NULL; } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 41d1e0b..032480d 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -121,40 +121,39 @@ static PyObject* TagCloseClose; #define LC_WIKILINK_TITLE 0x00000020 #define LC_WIKILINK_TEXT 0x00000040 -#define LC_EXT_LINK 0x00000380 +#define LC_EXT_LINK 0x00000180 #define LC_EXT_LINK_URI 0x00000080 #define LC_EXT_LINK_TITLE 0x00000100 -#define LC_EXT_LINK_BRACKETS 0x00000200 - -#define LC_HEADING 0x0000FC00 -#define LC_HEADING_LEVEL_1 0x00000400 -#define LC_HEADING_LEVEL_2 0x00000800 -#define LC_HEADING_LEVEL_3 0x00001000 -#define LC_HEADING_LEVEL_4 0x00002000 -#define LC_HEADING_LEVEL_5 0x00004000 -#define LC_HEADING_LEVEL_6 0x00008000 - -#define LC_TAG 0x000F0000 -#define LC_TAG_OPEN 0x00010000 -#define LC_TAG_ATTR 0x00020000 -#define LC_TAG_BODY 0x00040000 -#define LC_TAG_CLOSE 0x00080000 - -#define LC_STYLE 0x00F00000 -#define LC_STYLE_ITALICS 0x00100000 -#define LC_STYLE_BOLD 0x00200000 -#define LC_STYLE_PASS_AGAIN 0x00400000 -#define LC_STYLE_SECOND_PASS 0x00800000 - -#define LC_DLTERM 0x01000000 - -#define LC_SAFETY_CHECK 0x7E000000 -#define LC_HAS_TEXT 0x02000000 -#define LC_FAIL_ON_TEXT 0x04000000 -#define LC_FAIL_NEXT 0x08000000 -#define LC_FAIL_ON_LBRACE 0x10000000 -#define LC_FAIL_ON_RBRACE 0x20000000 -#define LC_FAIL_ON_EQUALS 0x40000000 + +#define LC_HEADING 0x00007E00 +#define LC_HEADING_LEVEL_1 0x00000200 +#define LC_HEADING_LEVEL_2 0x00000400 +#define LC_HEADING_LEVEL_3 0x00000800 +#define LC_HEADING_LEVEL_4 0x00001000 +#define LC_HEADING_LEVEL_5 0x00002000 +#define LC_HEADING_LEVEL_6 0x00004000 + +#define LC_TAG 0x00078000 +#define LC_TAG_OPEN 0x00008000 +#define LC_TAG_ATTR 0x00010000 +#define LC_TAG_BODY 0x00020000 +#define LC_TAG_CLOSE 0x00040000 + +#define LC_STYLE 0x00780000 +#define LC_STYLE_ITALICS 0x00080000 +#define LC_STYLE_BOLD 0x00100000 +#define LC_STYLE_PASS_AGAIN 0x00200000 +#define LC_STYLE_SECOND_PASS 0x00400000 + +#define LC_DLTERM 0x00800000 + +#define LC_SAFETY_CHECK 0x3F000000 +#define LC_HAS_TEXT 0x01000000 +#define LC_FAIL_ON_TEXT 0x02000000 +#define LC_FAIL_NEXT 0x04000000 +#define LC_FAIL_ON_LBRACE 0x08000000 +#define LC_FAIL_ON_RBRACE 0x10000000 +#define LC_FAIL_ON_EQUALS 0x20000000 /* Global contexts: */ @@ -163,9 +162,10 @@ static PyObject* TagCloseClose; /* Aggregate contexts: */ #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) -#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) -#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK) +#define AGG_NO_WIKILINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URI) +#define AGG_NO_EXT_LINKS (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK) /* Tag contexts: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 269cee2..29a7e25 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -467,7 +467,7 @@ class Tokenizer(object): reset = self._head self._head += 1 try: - bad_context = self._context & contexts.INVALID_LINK + bad_context = self._context & contexts.NO_EXT_LINKS if bad_context or not self._can_recurse(): raise BadRoute() link, extra, delta = self._really_parse_external_link(brackets) @@ -990,10 +990,8 @@ class Tokenizer(object): context = self._context if context & contexts.FAIL_NEXT: return False - if context & contexts.WIKILINK: - if context & contexts.WIKILINK_TEXT: - return not (this == self._read(1) == "[") - elif this == "]" or this == "{": + if context & contexts.WIKILINK_TITLE: + if this == "]" or this == "{": self._context |= contexts.FAIL_NEXT elif this == "\n" or this == "[" or this == "}": return False @@ -1083,7 +1081,7 @@ class Tokenizer(object): else: self._emit_text("}") elif this == next == "[" and self._can_recurse(): - if not self._context & contexts.INVALID_LINK: + if not self._context & contexts.NO_WIKILINKS: self._parse_wikilink() else: self._emit_text("[") diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index 37ef9f1..bf19f4d 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -150,3 +150,31 @@ name: comment_inside_bracketed_link label: an HTML comment inside a bracketed external link input: "[http://example.com/foobar]" output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), CommentStart(), Text(text="comment"), CommentEnd(), Text(text="bar"), ExternalLinkClose()] + +--- + +name: wikilink_inside_external_link +label: a wikilink inside an external link, which the parser considers valid (see issue #61) +input: "[http://example.com/foo Foo [[Bar]]]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo"), ExternalLinkSeparator(), Text(text="Foo "), WikilinkOpen(), Text(text="Bar"), WikilinkClose(), ExternalLinkClose()] + +--- + +name: external_link_inside_wikilink +label: an external link inside a wikilink, valid in the case of images (see issue #62) +input: "[[File:Example.png|thumb|http://example.com]]" +output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Text(text="thumb|"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), WikilinkClose()] + +--- + +name: external_link_inside_wikilink_brackets +label: an external link with brackets inside a wikilink +input: "[[File:Example.png|thumb|[http://example.com Example]]]" +output: [WikilinkOpen(), Text(text="File:Example.png"), WikilinkSeparator(), Text(text="thumb|"), ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose(), WikilinkClose()] + +--- + +name: external_link_inside_wikilink_title +label: an external link inside a wikilink title, which is invalid +input: "[[File:Example.png http://example.com]]" +output: [WikilinkOpen(), Text(text="File:Example.png http://example.com"), WikilinkClose()] diff --git a/tests/tokenizer/wikilinks.mwtest b/tests/tokenizer/wikilinks.mwtest index 8eb381a..ce0ec79 100644 --- a/tests/tokenizer/wikilinks.mwtest +++ b/tests/tokenizer/wikilinks.mwtest @@ -54,6 +54,20 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[b --- +name: nested +label: a wikilink nested within another +input: "[[foo|[[bar]]]]" +output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()] + +--- + +name: nested_padding +label: a wikilink nested within another, separated by other data +input: "[[foo|a[[b]]c]]" +output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()] + +--- + name: invalid_newline label: invalid wikilink: newline as only content input: "[[\n]]" @@ -103,27 +117,13 @@ output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), --- -name: invalid_nested_text -label: invalid wikilink: a wikilink nested within the value of another +name: invalid_nested_no_close +label: invalid wikilink: a wikilink nested within the value of another, missing a pair of closing brackets input: "[[foo|[[bar]]" output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()] --- -name: invalid_nested_text_2 -label: invalid wikilink: a wikilink nested within the value of another, two pairs of closing brackets -input: "[[foo|[[bar]]]]" -output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")] - ---- - -name: invalid_nested_text_padding -label: invalid wikilink: a wikilink nested within the value of another, separated by other data -input: "[[foo|a[[b]]c]]" -output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")] - ---- - name: incomplete_open_only label: incomplete wikilinks: just an open input: "[["