From d42e05a554076d43dd53568bf383ec3e265c2fe2 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 22 Aug 2013 00:57:34 -0400
Subject: [PATCH] Implement improved wikilink handling.

---
 mwparserfromhell/parser/contexts.py  | 12 ++++++------
 mwparserfromhell/parser/tokenizer.c  | 13 ++++++-------
 mwparserfromhell/parser/tokenizer.h  |  4 ++--
 mwparserfromhell/parser/tokenizer.py | 10 ++++++++--
 4 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py
index c6d2941..0d25400 100644
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -53,7 +53,7 @@ Local (stack-specific) contexts:
 
 * :py:const:`EXT_LINK`
 
-    * :py:const:`EXT_LINK_URL`
+    * :py:const:`EXT_LINK_URI`
     * :py:const:`EXT_LINK_TITLE`
     * :py:const:`EXT_LINK_BRACKETS`
 
@@ -119,10 +119,10 @@ WIKILINK_TITLE = 1 << 5
 WIKILINK_TEXT = 1 << 6
 WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT
 
-EXT_LINK_URL = 1 << 7
+EXT_LINK_URI = 1 << 7
 EXT_LINK_TITLE = 1 << 8
 EXT_LINK_BRACKETS = 1 << 9
-EXT_LINK = EXT_LINK_URL + EXT_LINK_TITLE + EXT_LINK_BRACKETS
+EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS
 
 HEADING_LEVEL_1 = 1 << 10
 HEADING_LEVEL_2 = 1 << 11
@@ -163,7 +163,7 @@ GL_HEADING = 1 << 0
 # Aggregate contexts:
 
 FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK + HEADING + TAG + STYLE
-UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME +
-          TAG_CLOSE)
+UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY +
+          ARGUMENT_NAME + TAG_CLOSE)
 DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
-INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URL
+INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 2b74f6b..46df405 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -2050,18 +2050,17 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
     if (context & LC_FAIL_NEXT) {
         return -1;
     }
-    if (context & LC_WIKILINK_TITLE) {
-        if (data == *"]" || data == *"{")
+    if (context & LC_WIKILINK) {
+        if (context & LC_WIKILINK_TEXT)
+            return (data == *"[" && Tokenizer_READ(self, 1) == *"[") ? -1 : 0;
+        else if (data == *"]" || data == *"{")
             self->topstack->context |= LC_FAIL_NEXT;
         else if (data == *"\n" || data == *"[" || data == *"}")
             return -1;
         return 0;
     }
-    if (context & LC_TAG_CLOSE) {
-        if (data == *"<")
-            return -1;
-        return 0;
-    }
+    if (context & LC_TAG_CLOSE)
+        return (data == *"<") ? -1 : 0;
     if (context & LC_TEMPLATE_NAME) {
         if (data == *"{" || data == *"}" || data == *"[") {
             self->topstack->context |= LC_FAIL_NEXT;
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 41c1c1b..5961dcc 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -163,9 +163,9 @@ static PyObject* TagCloseClose;
 
 /* Aggregate contexts: */
 
 #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE)
-#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
+#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
 #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
-#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URL)
+#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URL)
 
 /* Tag contexts: */
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 07ae0b1..84de78e 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -835,12 +835,16 @@ class Tokenizer(object):
         context = self._context
         if context & contexts.FAIL_NEXT:
             return False
-        if context & contexts.WIKILINK_TITLE:
-            if this == "]" or this == "{":
+        if context & contexts.WIKILINK:
+            if context & contexts.WIKILINK_TEXT:
+                return not (this == self._read(1) == "[")
+            elif this == "]" or this == "{":
                 self._context |= contexts.FAIL_NEXT
             elif this == "\n" or this == "[" or this == "}":
                 return False
             return True
+        elif context & contexts.EXT_LINK_TITLE:
+            return this != "\n"
        elif context & contexts.TEMPLATE_NAME:
             if this == "{" or this == "}" or this == "[":
                 self._context |= contexts.FAIL_NEXT
@@ -936,6 +940,8 @@ class Tokenizer(object):
                     self._parse_external_link(True)
                 elif this == ":" and self._read(-1) not in self.MARKERS:
                     self._parse_external_link(False)
+                elif this == "]" and self._context & contexts.EXT_LINK_TITLE:
+                    return self._pop()
                 elif this == "=" and not self._global & contexts.GL_HEADING:
                     if self._read(-1) in ("\n", self.START):
                         self._parse_heading()
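
Not part of the patch: a rough, hand-written sketch of how the new wikilink
handling might be exercised from Python once the change is applied. It relies
only on the public mwparserfromhell.parse() entry point and the generated
filter_wikilinks() helper; the exact nodes produced for the nested markup
below are not asserted here.

    import mwparserfromhell

    # A plain wikilink with display text parses into a single Wikilink node.
    print(mwparserfromhell.parse("[[foo|bar]]").filter_wikilinks())

    # With this patch, encountering "[[" while inside a wikilink's text (or
    # title) trips the tokenizer's safety check, so the enclosing link route
    # is failed and the outer brackets fall back to plain text instead of
    # producing a malformed nested Wikilink node.
    print(mwparserfromhell.parse("[[foo|bar [[baz]] qux]]").filter_wikilinks())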