diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index e0a2adb..aa1c00b 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1037,17 +1037,12 @@ Tokenizer_is_free_link(Tokenizer* self, Py_UNICODE this, Py_UNICODE next) Py_UNICODE after = Tokenizer_READ(self, 2); int ctx = self->topstack->context; - return ((this == *"" || this == *"\n" || this == *"[" || this == *"]") || - (this == *"|" && ctx & LC_TEMPLATE) || - (this == *"=" && ctx & LC_TEMPLATE_PARAM_KEY) || - (this == *"}" && next == *"}" && ctx & LC_TEMPLATE) || - (this == *"}" && next == *"}" && after == *"}" - && ctx & LC_ARGUMENT) || - (this == *"=" && ctx & LC_HEADING) || - (this == *"<" && next == *"/" && after != *"") || - (this == *"<" && next != *"!" && !(ctx & LC_TAG_CLOSE)) || - (this == *">" && ctx & LC_TAG_CLOSE) || - (this == *"'" && next == *"'")); + return (this == *"" || this == *"\n" || this == *"[" || this == *"]" || + this == *"<" || this == *">" || (this == *"'" && next == *"'") || + (this == *"|" && ctx & LC_TEMPLATE) || + (this == *"=" && ctx & (LC_TEMPLATE_PARAM_KEY | LC_HEADING)) || + (this == *"}" && next == *"}" && + (ctx & LC_TEMPLATE || (after == *"}" && ctx & LC_ARGUMENT)))); } /* @@ -1073,7 +1068,19 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, while (1) { this = Tokenizer_READ(self, 0); next = Tokenizer_READ(self, 1); - if (!brackets && Tokenizer_is_free_link(self, this, next)) { + if (this == *"&") { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_entity(self)) + return NULL; + } + else if (this == *"<" && next == *"!" + && Tokenizer_READ(self, 2) == *"-" + && Tokenizer_READ(self, 3) == *"-") { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_comment(self)) + return NULL; + } + else if (!brackets && Tokenizer_is_free_link(self, this, next)) { self->head--; return Tokenizer_pop(self); } @@ -1086,18 +1093,6 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, } else if (this == *"]") return Tokenizer_pop(self); - else if (this == *"&") { - PUSH_TAIL_BUFFER(*extra, NULL) - if (Tokenizer_parse_entity(self)) - return NULL; - } - else if (this == *"<" && next == *"!" - && Tokenizer_READ(self, 2) == *"-" - && Tokenizer_READ(self, 3) == *"-") { - PUSH_TAIL_BUFFER(*extra, NULL) - if (Tokenizer_parse_comment(self)) - return NULL; - } else if (this == *" ") { if (brackets) { if (Tokenizer_emit(self, ExternalLinkSeparator)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index bcc00ef..2c28718 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -389,16 +389,13 @@ class Tokenizer(object): """Return whether the current head is the end of a free link.""" # Built from _parse()'s end sentinels: after, ctx = self._read(2), self._context - return (this is self.END or this in ("\n", "[", "]") or - this == "|" and ctx & contexts.TEMPLATE or - this == "=" and ctx & contexts.TEMPLATE_PARAM_KEY or - this == next == "}" and ctx & contexts.TEMPLATE or - this == next == after == "}" and ctx & contexts.ARGUMENT or - this == "=" and ctx & contexts.HEADING or - this == "<" and next == "/" and after is not self.END or - this == "<" and next != "!" and not ctx & contexts.TAG_CLOSE or - this == ">" and ctx & contexts.TAG_CLOSE or - this == next == "'") + equal_sign_contexts = contexts.TEMPLATE_PARAM_KEY | contexts.HEADING + return (this in (self.END, "\n", "[", "]", "<", ">") or + this == next == "'" or + (this == "|" and ctx & contexts.TEMPLATE) or + (this == "=" and ctx & equal_sign_contexts) or + (this == next == "}" and ctx & contexts.TEMPLATE) or + (this == next == after == "}" and ctx & contexts.ARGUMENT)) def _really_parse_external_link(self, brackets): """Really parse an external link.""" @@ -414,18 +411,7 @@ class Tokenizer(object): tail = "" while True: this, next = self._read(), self._read(1) - if not brackets and self._is_free_link_end(this, next): - return self._pop(), tail, -1 - elif this is self.END or this == "\n": - self._fail_route() - elif this == next == "{" and self._can_recurse(): - if tail: - self._emit_text(tail) - tail = "" - self._parse_template_or_argument() - elif this == "]": - return self._pop(), tail, 0 - elif this == "&": + if this == "&": if tail: self._emit_text(tail) tail = "" @@ -436,6 +422,17 @@ class Tokenizer(object): self._emit_text(tail) tail = "" self._parse_comment() + elif not brackets and self._is_free_link_end(this, next): + return self._pop(), tail, -1 + elif this is self.END or this == "\n": + self._fail_route() + elif this == next == "{" and self._can_recurse(): + if tail: + self._emit_text(tail) + tail = "" + self._parse_template_or_argument() + elif this == "]": + return self._pop(), tail, 0 elif " " in this: before, after = this.split(" ", 1) if brackets: