diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 38154bb..c6d2941 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -51,11 +51,11 @@ Local (stack-specific) contexts: * :py:const:`WIKILINK_TITLE` * :py:const:`WIKILINK_TEXT` -* :py:const:`EXTERNAL_LINK` +* :py:const:`EXT_LINK` - * :py:const:`EXTERNAL_LINK_URL` - * :py:const:`EXTERNAL_LINK_TITLE` - * :py:const:`EXTERNAL_LINK_BRACKETS` + * :py:const:`EXT_LINK_URL` + * :py:const:`EXT_LINK_TITLE` + * :py:const:`EXT_LINK_BRACKETS` * :py:const:`HEADING` @@ -100,6 +100,7 @@ Aggregate contexts: * :py:const:`FAIL` * :py:const:`UNSAFE` * :py:const:`DOUBLE` +* :py:const:`INVALID_LINK` """ @@ -118,10 +119,10 @@ WIKILINK_TITLE = 1 << 5 WIKILINK_TEXT = 1 << 6 WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT -EXTERNAL_LINK_URL = 1 << 7 -EXTERNAL_LINK_TITLE = 1 << 8 -EXTERNAL_LINK_BRACKETS = 1 << 9 -EXTERNAL_LINK = EXTERNAL_LINK_URL + EXTERNAL_LINK_TITLE +EXT_LINK_URL = 1 << 7 +EXT_LINK_TITLE = 1 << 8 +EXT_LINK_BRACKETS = 1 << 9 +EXT_LINK = EXT_LINK_URL + EXT_LINK_TITLE + EXT_LINK_BRACKETS HEADING_LEVEL_1 = 1 << 10 HEADING_LEVEL_2 = 1 << 11 @@ -161,7 +162,8 @@ GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXTERNAL_LINK + HEADING + TAG + STYLE +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK + HEADING + TAG + STYLE UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE +INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URL diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 67a4ae6..267e7c5 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2192,9 +2192,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) if (Tokenizer_emit_char(self, this)) return NULL; } - else if (this == next && next == *"[") { - if (!(this_context & LC_WIKILINK_TITLE) && - Tokenizer_CAN_RECURSE(self)) { + else if (this == next && next == *"[" && Tokenizer_CAN_RECURSE(self)) { + if (!(this_context & AGG_INVALID_LINK)) { if (Tokenizer_parse_wikilink(self)) return NULL; } @@ -2243,9 +2242,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) return NULL; } } - else if (this == *"<") { - if (!(this_context & LC_TAG_CLOSE) && - Tokenizer_CAN_RECURSE(self)) { + else if (this == *"<" && !(this_context & LC_TAG_CLOSE)) { + if (Tokenizer_CAN_RECURSE(self)) { if (Tokenizer_parse_tag(self)) return NULL; } @@ -2389,6 +2387,11 @@ static int load_tokens(void) WikilinkSeparator = PyObject_GetAttrString(tokens, "WikilinkSeparator"); WikilinkClose = PyObject_GetAttrString(tokens, "WikilinkClose"); + ExternalLinkOpen = PyObject_GetAttrString(tokens, "ExternalLinkOpen"); + ExternalLinkSeparator = PyObject_GetAttrString(tokens, + "ExternalLinkSeparator"); + ExternalLinkClose = PyObject_GetAttrString(tokens, "ExternalLinkClose"); + HTMLEntityStart = PyObject_GetAttrString(tokens, "HTMLEntityStart"); HTMLEntityNumeric = PyObject_GetAttrString(tokens, "HTMLEntityNumeric"); HTMLEntityHex = PyObject_GetAttrString(tokens, "HTMLEntityHex"); diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 264360e..16c76eb 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -82,6 +82,10 @@ static PyObject* WikilinkOpen; static PyObject* WikilinkSeparator; static PyObject* WikilinkClose; +static PyObject* ExternalLinkOpen; +static PyObject* ExternalLinkSeparator; +static PyObject* ExternalLinkClose; + static PyObject* HTMLEntityStart; static PyObject* HTMLEntityNumeric; static PyObject* HTMLEntityHex; @@ -104,48 +108,53 @@ static PyObject* TagCloseClose; /* Local contexts: */ -#define LC_TEMPLATE 0x0000007 -#define LC_TEMPLATE_NAME 0x0000001 -#define LC_TEMPLATE_PARAM_KEY 0x0000002 -#define LC_TEMPLATE_PARAM_VALUE 0x0000004 - -#define LC_ARGUMENT 0x0000018 -#define LC_ARGUMENT_NAME 0x0000008 -#define LC_ARGUMENT_DEFAULT 0x0000010 - -#define LC_WIKILINK 0x0000060 -#define LC_WIKILINK_TITLE 0x0000020 -#define LC_WIKILINK_TEXT 0x0000040 - -#define LC_HEADING 0x0001F80 -#define LC_HEADING_LEVEL_1 0x0000080 -#define LC_HEADING_LEVEL_2 0x0000100 -#define LC_HEADING_LEVEL_3 0x0000200 -#define LC_HEADING_LEVEL_4 0x0000400 -#define LC_HEADING_LEVEL_5 0x0000800 -#define LC_HEADING_LEVEL_6 0x0001000 - -#define LC_TAG 0x001E000 -#define LC_TAG_OPEN 0x0002000 -#define LC_TAG_ATTR 0x0004000 -#define LC_TAG_BODY 0x0008000 -#define LC_TAG_CLOSE 0x0010000 - -#define LC_STYLE 0x01E0000 -#define LC_STYLE_ITALICS 0x0020000 -#define LC_STYLE_BOLD 0x0040000 -#define LC_STYLE_PASS_AGAIN 0x0080000 -#define LC_STYLE_SECOND_PASS 0x0100000 - -#define LC_DLTERM 0x0200000 - -#define LC_SAFETY_CHECK 0xFC00000 -#define LC_HAS_TEXT 0x0400000 -#define LC_FAIL_ON_TEXT 0x0800000 -#define LC_FAIL_NEXT 0x1000000 -#define LC_FAIL_ON_LBRACE 0x2000000 -#define LC_FAIL_ON_RBRACE 0x4000000 -#define LC_FAIL_ON_EQUALS 0x8000000 +#define LC_TEMPLATE 0x00000007 +#define LC_TEMPLATE_NAME 0x00000001 +#define LC_TEMPLATE_PARAM_KEY 0x00000002 +#define LC_TEMPLATE_PARAM_VALUE 0x00000004 + +#define LC_ARGUMENT 0x00000018 +#define LC_ARGUMENT_NAME 0x00000008 +#define LC_ARGUMENT_DEFAULT 0x00000010 + +#define LC_WIKILINK 0x00000060 +#define LC_WIKILINK_TITLE 0x00000020 +#define LC_WIKILINK_TEXT 0x00000040 + +#define LC_EXT_LINK 0x00000380 +#define LC_EXT_LINK_URL 0x00000080 +#define LC_EXT_LINK_TITLE 0x00000100 +#define LC_EXT_LINK_BRACKETS 0x00000200 + +#define LC_HEADING 0x0000FC00 +#define LC_HEADING_LEVEL_1 0x00000400 +#define LC_HEADING_LEVEL_2 0x00000800 +#define LC_HEADING_LEVEL_3 0x00001000 +#define LC_HEADING_LEVEL_4 0x00002000 +#define LC_HEADING_LEVEL_5 0x00004000 +#define LC_HEADING_LEVEL_6 0x00008000 + +#define LC_TAG 0x000F0000 +#define LC_TAG_OPEN 0x00010000 +#define LC_TAG_ATTR 0x00020000 +#define LC_TAG_BODY 0x00040000 +#define LC_TAG_CLOSE 0x00080000 + +#define LC_STYLE 0x00F00000 +#define LC_STYLE_ITALICS 0x00100000 +#define LC_STYLE_BOLD 0x00200000 +#define LC_STYLE_PASS_AGAIN 0x00400000 +#define LC_STYLE_SECOND_PASS 0x00800000 + +#define LC_DLTERM 0x01000000 + +#define LC_SAFETY_CHECK 0x7E000000 +#define LC_HAS_TEXT 0x02000000 +#define LC_FAIL_ON_TEXT 0x04000000 +#define LC_FAIL_NEXT 0x08000000 +#define LC_FAIL_ON_LBRACE 0x10000000 +#define LC_FAIL_ON_RBRACE 0x20000000 +#define LC_FAIL_ON_EQUALS 0x40000000 /* Global contexts: */ @@ -153,9 +162,10 @@ static PyObject* TagCloseClose; /* Aggregate contexts: */ -#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) -#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) -#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) +#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URL) /* Tag contexts: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 583d2f8..9f675ac 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -311,6 +311,11 @@ class Tokenizer(object): self._head += 1 return self._pop() + def _parse_external_link(self, brackets): + """Parse an external link at the head of the wikicode string.""" + self._emit_text(self._read()) + # raise NotImplementedError() + def _parse_heading(self): """Parse a section heading at the head of the wikicode string.""" self._global |= contexts.GL_HEADING @@ -898,8 +903,8 @@ class Tokenizer(object): return self._handle_argument_end() else: self._emit_text("}") - elif this == next == "[": - if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): + elif this == next == "[" and self._can_recurse(): + if not self._context & contexts.INVALID_LINK: self._parse_wikilink() else: self._emit_text("[") @@ -907,6 +912,11 @@ class Tokenizer(object): self._handle_wikilink_separator() elif this == next == "]" and self._context & contexts.WIKILINK: return self._handle_wikilink_end() + elif this == "[" and not self._context & contexts.INVALID_LINK: ## or this == ":" + if self._can_recurse(): + self._parse_external_link(brackets=this == "[") + else: + self._emit_text("[") elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() @@ -928,8 +938,8 @@ class Tokenizer(object): self._handle_tag_open_close() else: self._handle_invalid_tag_start() - elif this == "<": - if not self._context & contexts.TAG_CLOSE and self._can_recurse(): + elif this == "<" and not self._context & contexts.TAG_CLOSE: + if self._can_recurse(): self._parse_tag() else: self._emit_text("<") diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index 0277a51..e4ff8c4 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -12,6 +12,13 @@ output: [TemplateOpen(), ArgumentOpen(), ArgumentOpen(), Text(text="foo"), Argum --- +name: link_in_template_name +label: a wikilink inside a template name, which breaks the template +input: "{{foo[[bar]]}}" +output: [Text(text="{{foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="}}")] + +--- + name: rich_heading label: a heading with templates/wikilinks in it input: "== Head{{ing}} [[with]] {{{funky|{{stuf}}}}} =="