diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 84de78e..9acf32d 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -26,7 +26,8 @@ import re
 
 from . import contexts, tokens
 from ..compat import htmlentities
-from ..definitions import get_html_tag, is_parsable, is_single, is_single_only
+from ..definitions import (get_html_tag, is_parsable, is_single,
+                           is_single_only, is_scheme)
 
 __all__ = ["Tokenizer"]
 
@@ -313,8 +314,95 @@ class Tokenizer(object):
 
     def _really_parse_external_link(self, brackets):
         """Really parse an external link."""
-        # link = self._parse(contexts.EXT_LINK_URL)
-        raise BadRoute()
+        scheme_valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
+        if brackets:
+            self._push(contexts.EXT_LINK_URI)
+            if self._read() == self._read(1) == "/":
+                self._emit_text("//")
+                self._head += 2
+            else:
+                scheme = ""
+                while all(char in scheme_valid for char in self._read()):
+                    scheme += self._read()
+                    self._emit_text(self._read())
+                    self._head += 1
+                if self._read() != ":":
+                    self._fail_route()
+                self._emit_text(":")
+                self._head += 1
+                slashes = self._read() == self._read(1) == "/"
+                if slashes:
+                    self._emit_text("//")
+                    self._head += 2
+                if not is_scheme(scheme, slashes):
+                    self._fail_route()
+        else:
+            scheme = []
+            try:
+                # Ugly, but we have to backtrack through the textbuffer looking
+                # for our scheme since it was just parsed as text:
+                for i in range(-1, -len(self._textbuffer) - 1, -1):
+                    for char in reversed(self._textbuffer[i]):
+                        if char.isspace() or char in self.MARKERS:
+                            raise StopIteration()
+                        if char not in scheme_valid:
+                            raise BadRoute()
+                        scheme.append(char)
+            except StopIteration:
+                pass
+            scheme = "".join(reversed(scheme))
+            slashes = self._read() == self._read(1) == "/"
+            if not is_scheme(scheme, slashes):
+                raise BadRoute()
+            # Remove the scheme from the textbuffer, now that it's part of the
+            # external link:
+            length = len(scheme)
+            while length:
+                if length < len(self._textbuffer[-1]):
+                    self._textbuffer[-1] = self._textbuffer[-1][:-length]
+                    break
+                length -= len(self._textbuffer[-1])
+                self._textbuffer.pop()
+            self._push(contexts.EXT_LINK_URI)
+            self._emit_text(scheme)
+            self._emit_text(":")
+            if slashes:
+                self._emit_text("//")
+                self._head += 2
+        parentheses = False
+
+        while True:
+            this, next = self._read(), self._read(1)
+            if this is self.END or this == "\n":
+                if brackets:
+                    self._fail_route()
+                self._head -= 1
+                return self._pop(), None
+            elif this == next == "{" and self._can_recurse():
+                self._parse_template_or_argument()
+            elif this == "&":
+                self._parse_entity()
+            elif this == "]":
+                if not brackets:
+                    self._head -= 1
+                return self._pop(), None
+            elif this == "(" and not brackets and not parentheses:
+                parentheses = True
+                self._emit_text(this)
+            elif " " in this:  ## Should be a more general whitespace check
+                before, after = this.split(" ", 1)
+                self._emit_text(before)
+                if brackets:
+                    self._emit(tokens.ExternalLinkSeparator())
+                    self._emit_text(after)
+                    self._context ^= contexts.EXT_LINK_URI
+                    self._context |= contexts.EXT_LINK_TITLE
+                    self._head += 1
+                    return self._parse(push=False), None
+                return self._pop(), " " + after
+            else:
+                self._emit_text(this)
+            self._head += 1
 
     def _parse_external_link(self, brackets):
         """Parse an external link at the head of the wikicode string."""
@@ -324,7 +412,7 @@
             bad_context = self._context & contexts.INVALID_LINK
             if bad_context or not self._can_recurse():
                 raise BadRoute()
-            link = self._really_parse_external_link(brackets)
+            link, extra = self._really_parse_external_link(brackets)
         except BadRoute:
             self._head = reset
             if not brackets and self._context & contexts.DL_TERM:
@@ -332,9 +420,11 @@
             else:
                 self._emit_text(self._read())
         else:
-            self._emit(tokens.ExternalLinkOpen(brackets))
+            self._emit(tokens.ExternalLinkOpen(brackets=brackets))
            self._emit_all(link)
             self._emit(tokens.ExternalLinkClose())
+            if extra:
+                self._emit_text(extra)
 
     def _parse_heading(self):
         """Parse a section heading at the head of the wikicode string."""
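
For reference, a minimal sketch (not part of the patch) of how the new code paths could be exercised through the pure-Python tokenizer; the sample wikitext strings are made up for illustration:

    # Minimal sketch, assuming this branch is importable as `mwparserfromhell`.
    from mwparserfromhell.parser.tokenizer import Tokenizer

    # Bracketed link: the URI and the title should be split by an
    # ExternalLinkSeparator token once the first space is reached.
    print(Tokenizer().tokenize("[http://example.com/ Example]"))

    # Unbracketed link: _really_parse_external_link() returns the leftover
    # text after the link (the `extra` value), which _parse_external_link()
    # re-emits as plain text following ExternalLinkClose.
    print(Tokenizer().tokenize("See http://example.com/ for details."))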