Browse Source

Start implementation of external links in Python.

tags/v0.3
Ben Kurtovic 10 years ago
parent
commit
da272ae10a
1 changed file with 95 additions and 5 deletions
  1. +95
    -5
      mwparserfromhell/parser/tokenizer.py

+ 95
- 5
mwparserfromhell/parser/tokenizer.py View File

@@ -26,7 +26,8 @@ import re

from . import contexts, tokens
from ..compat import htmlentities
from ..definitions import get_html_tag, is_parsable, is_single, is_single_only
from ..definitions import (get_html_tag, is_parsable, is_single,
is_single_only, is_scheme)

__all__ = ["Tokenizer"]

@@ -313,8 +314,95 @@ class Tokenizer(object):

def _really_parse_external_link(self, brackets):
    """Really parse an external link.

    NOTE(review): the nesting below was reconstructed from a diff view that
    had lost its indentation, and the pre-change stub (a commented-out call
    followed by an unconditional ``raise BadRoute()``) was dropped because
    it made the rest of the body unreachable -- confirm against the
    repository before merging.

    If *brackets* is true the link is bracket-enclosed and the URI scheme
    is read forward from the head. Otherwise the link is a free link whose
    scheme was already emitted as plain text, so it is recovered by walking
    backwards through ``self._textbuffer``.

    Returns a 2-tuple ``(stack, extra)``. ``stack`` is the result of
    ``self._pop()`` (or of ``self._parse(push=False)`` once a bracketed
    link reaches its title). ``extra`` is ``None``, or, for a free link
    ended by a space, that space plus the rest of the text chunk, which is
    not part of the link.

    Raises ``BadRoute`` directly when a free link has no valid scheme.
    Failures inside a bracketed link go through ``self._fail_route()``
    (presumably raising ``BadRoute`` as well -- TODO confirm).
    """
    scheme_valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
    if brackets:
        self._push(contexts.EXT_LINK_URI)
        if self._read() == self._read(1) == "/":
            # Protocol-relative link: there is no scheme to validate.
            self._emit_text("//")
            self._head += 2
        else:
            scheme_parts = []
            while True:
                chunk = self._read()
                # _read() can hand back the END sentinel (see the main loop
                # below); stop scanning instead of trying to iterate over it.
                if chunk is self.END:
                    break
                if not all(char in scheme_valid for char in chunk):
                    break
                scheme_parts.append(chunk)
                self._emit_text(chunk)
                self._head += 1
            scheme = "".join(scheme_parts)
            if self._read() != ":":
                self._fail_route()
            self._emit_text(":")
            self._head += 1
            slashes = self._read() == self._read(1) == "/"
            if slashes:
                self._emit_text("//")
                self._head += 2
            if not is_scheme(scheme, slashes):
                self._fail_route()
    else:
        # Ugly, but we have to backtrack through the textbuffer looking for
        # our scheme since it was just parsed as text. Characters are
        # collected last-to-first until whitespace or a marker is reached.
        scheme_chars = []
        boundary_found = False
        for chunk in reversed(self._textbuffer):
            for char in reversed(chunk):
                if char.isspace() or char in self.MARKERS:
                    boundary_found = True
                    break
                if char not in scheme_valid:
                    raise BadRoute()
                scheme_chars.append(char)
            if boundary_found:
                break
        scheme = "".join(reversed(scheme_chars))
        slashes = self._read() == self._read(1) == "/"
        if not is_scheme(scheme, slashes):
            raise BadRoute()
        # Remove the scheme from the textbuffer, now that it's part of the
        # external link. It may span several trailing chunks.
        length = len(scheme)
        while length:
            if length < len(self._textbuffer[-1]):
                self._textbuffer[-1] = self._textbuffer[-1][:-length]
                break
            length -= len(self._textbuffer[-1])
            self._textbuffer.pop()
        self._push(contexts.EXT_LINK_URI)
        self._emit_text(scheme)
        self._emit_text(":")
        if slashes:
            self._emit_text("//")
            self._head += 2

    # Tracks whether an opening parenthesis was seen in a free link. For now
    # it only changes which branch emits the "(": both emit it unchanged.
    parentheses = False

    while True:
        this, following = self._read(), self._read(1)
        if this is self.END or this == "\n":
            # A bracketed link must be closed before the line or input ends.
            if brackets:
                self._fail_route()
            self._head -= 1
            return self._pop(), None
        elif this == following == "{" and self._can_recurse():
            self._parse_template_or_argument()
        elif this == "&":
            self._parse_entity()
        elif this == "]":
            # For a free link the "]" is not part of the link; step back so
            # the caller sees it again.
            if not brackets:
                self._head -= 1
            return self._pop(), None
        elif this == "(" and not brackets and not parentheses:
            parentheses = True
            self._emit_text(this)
        elif " " in this:
            # TODO: should be a more general whitespace check.
            before, after = this.split(" ", 1)
            self._emit_text(before)
            if brackets:
                # The space separates the URI from the link title; switch
                # contexts and let the regular parser consume the title.
                self._emit(tokens.ExternalLinkSeparator())
                self._emit_text(after)
                self._context ^= contexts.EXT_LINK_URI
                self._context |= contexts.EXT_LINK_TITLE
                self._head += 1
                return self._parse(push=False), None
            # A free link ends at the space; return the rest of the chunk so
            # the caller can emit it as ordinary text.
            return self._pop(), " " + after
        else:
            self._emit_text(this)
        self._head += 1

def _parse_external_link(self, brackets):
"""Parse an external link at the head of the wikicode string."""
@@ -324,7 +412,7 @@ class Tokenizer(object):
bad_context = self._context & contexts.INVALID_LINK
if bad_context or not self._can_recurse():
raise BadRoute()
link = self._really_parse_external_link(brackets)
link, extra = self._really_parse_external_link(brackets)
except BadRoute:
self._head = reset
if not brackets and self._context & contexts.DL_TERM:
@@ -332,9 +420,11 @@ class Tokenizer(object):
else:
self._emit_text(self._read())
else:
self._emit(tokens.ExternalLinkOpen(brackets))
self._emit(tokens.ExternalLinkOpen(brackets=brackets))
self._emit_all(link)
self._emit(tokens.ExternalLinkClose())
if extra:
self._emit_text(extra)

def _parse_heading(self):
"""Parse a section heading at the head of the wikicode string."""


Loading…
Cancel
Save