|
|
@@ -26,7 +26,8 @@ import re |
|
|
|
|
|
|
|
from . import contexts, tokens |
|
|
|
from ..compat import htmlentities |
|
|
|
from ..definitions import get_html_tag, is_parsable, is_single, is_single_only |
|
|
|
from ..definitions import (get_html_tag, is_parsable, is_single, |
|
|
|
is_single_only, is_scheme) |
|
|
|
|
|
|
|
__all__ = ["Tokenizer"] |
|
|
|
|
|
|
@@ -313,8 +314,95 @@ class Tokenizer(object): |
|
|
|
|
|
|
|
def _really_parse_external_link(self, brackets): |
|
|
|
"""Really parse an external link.""" |
|
|
|
# link = self._parse(contexts.EXT_LINK_URL) |
|
|
|
raise BadRoute() |
|
|
|
scheme_valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" |
|
|
|
if brackets: |
|
|
|
self._push(contexts.EXT_LINK_URI) |
|
|
|
if self._read() == self._read(1) == "/": |
|
|
|
self._emit_text("//") |
|
|
|
self._head += 2 |
|
|
|
else: |
|
|
|
scheme = "" |
|
|
|
while all(char in scheme_valid for char in self._read()): |
|
|
|
scheme += self._read() |
|
|
|
self._emit_text(self._read()) |
|
|
|
self._head += 1 |
|
|
|
if self._read() != ":": |
|
|
|
self._fail_route() |
|
|
|
self._emit_text(":") |
|
|
|
self._head += 1 |
|
|
|
slashes = self._read() == self._read(1) == "/" |
|
|
|
if slashes: |
|
|
|
self._emit_text("//") |
|
|
|
self._head += 2 |
|
|
|
if not is_scheme(scheme, slashes): |
|
|
|
self._fail_route() |
|
|
|
else: |
|
|
|
scheme = [] |
|
|
|
try: |
|
|
|
# Ugly, but we have to backtrack through the textbuffer looking |
|
|
|
# for our scheme since it was just parsed as text: |
|
|
|
for i in range(-1, -len(self._textbuffer) - 1, -1): |
|
|
|
for char in reversed(self._textbuffer[i]): |
|
|
|
if char.isspace() or char in self.MARKERS: |
|
|
|
raise StopIteration() |
|
|
|
if char not in scheme_valid: |
|
|
|
raise BadRoute() |
|
|
|
scheme.append(char) |
|
|
|
except StopIteration: |
|
|
|
pass |
|
|
|
scheme = "".join(reversed(scheme)) |
|
|
|
slashes = self._read() == self._read(1) == "/" |
|
|
|
if not is_scheme(scheme, slashes): |
|
|
|
raise BadRoute() |
|
|
|
# Remove the scheme from the textbuffer, now that it's part of the |
|
|
|
# external link: |
|
|
|
length = len(scheme) |
|
|
|
while length: |
|
|
|
if length < len(self._textbuffer[-1]): |
|
|
|
self._textbuffer[-1] = self._textbuffer[-1][:-length] |
|
|
|
break |
|
|
|
length -= len(self._textbuffer[-1]) |
|
|
|
self._textbuffer.pop() |
|
|
|
self._push(contexts.EXT_LINK_URI) |
|
|
|
self._emit_text(scheme) |
|
|
|
self._emit_text(":") |
|
|
|
if slashes: |
|
|
|
self._emit_text("//") |
|
|
|
self._head += 2 |
|
|
|
parentheses = False |
|
|
|
|
|
|
|
while True: |
|
|
|
this, next = self._read(), self._read(1) |
|
|
|
if this is self.END or this == "\n": |
|
|
|
if brackets: |
|
|
|
self._fail_route() |
|
|
|
self._head -= 1 |
|
|
|
return self._pop(), None |
|
|
|
elif this == next == "{" and self._can_recurse(): |
|
|
|
self._parse_template_or_argument() |
|
|
|
elif this == "&": |
|
|
|
self._parse_entity() |
|
|
|
elif this == "]": |
|
|
|
if not brackets: |
|
|
|
self._head -= 1 |
|
|
|
return self._pop(), None |
|
|
|
elif this == "(" and not brackets and not parentheses: |
|
|
|
parentheses = True |
|
|
|
self._emit_text(this) |
|
|
|
elif " " in this: ## Should be a more general whitespace check |
|
|
|
before, after = this.split(" ", 1) |
|
|
|
self._emit_text(before) |
|
|
|
if brackets: |
|
|
|
self._emit(tokens.ExternalLinkSeparator()) |
|
|
|
self._emit_text(after) |
|
|
|
self._context ^= contexts.EXT_LINK_URI |
|
|
|
self._context |= contexts.EXT_LINK_TITLE |
|
|
|
self._head += 1 |
|
|
|
return self._parse(push=False), None |
|
|
|
return self._pop(), " " + after |
|
|
|
else: |
|
|
|
self._emit_text(this) |
|
|
|
self._head += 1 |
|
|
|
|
|
|
|
def _parse_external_link(self, brackets): |
|
|
|
"""Parse an external link at the head of the wikicode string.""" |
|
|
@@ -324,7 +412,7 @@ class Tokenizer(object): |
|
|
|
bad_context = self._context & contexts.INVALID_LINK |
|
|
|
if bad_context or not self._can_recurse(): |
|
|
|
raise BadRoute() |
|
|
|
link = self._really_parse_external_link(brackets) |
|
|
|
link, extra = self._really_parse_external_link(brackets) |
|
|
|
except BadRoute: |
|
|
|
self._head = reset |
|
|
|
if not brackets and self._context & contexts.DL_TERM: |
|
|
@@ -332,9 +420,11 @@ class Tokenizer(object): |
|
|
|
else: |
|
|
|
self._emit_text(self._read()) |
|
|
|
else: |
|
|
|
self._emit(tokens.ExternalLinkOpen(brackets)) |
|
|
|
self._emit(tokens.ExternalLinkOpen(brackets=brackets)) |
|
|
|
self._emit_all(link) |
|
|
|
self._emit(tokens.ExternalLinkClose()) |
|
|
|
if extra: |
|
|
|
self._emit_text(extra) |
|
|
|
|
|
|
|
def _parse_heading(self): |
|
|
|
"""Parse a section heading at the head of the wikicode string.""" |
|
|
|