Browse Source

Fix some external links; refactor into different methods.

tags/v0.3
Ben Kurtovic 10 years ago
parent
commit
432198547a
1 changed files with 68 additions and 53 deletions
  1. +68
    -53
      mwparserfromhell/parser/tokenizer.py

+ 68
- 53
mwparserfromhell/parser/tokenizer.py View File

@@ -312,65 +312,67 @@ class Tokenizer(object):
self._head += 1
return self._pop()

def _really_parse_external_link(self, brackets):
"""Really parse an external link."""
scheme_valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
if brackets:
self._push(contexts.EXT_LINK_URI)
if self._read() == self._read(1) == "/":
self._emit_text("//")
self._head += 2
else:
scheme = ""
while all(char in scheme_valid for char in self._read()):
scheme += self._read()
self._emit_text(self._read())
self._head += 1
if self._read() != ":":
self._fail_route()
self._emit_text(":")
self._head += 1
slashes = self._read() == self._read(1) == "/"
if slashes:
self._emit_text("//")
self._head += 2
if not is_scheme(scheme, slashes):
self._fail_route()
def _parse_bracketed_uri_scheme(self):
"""Parse the URI scheme of a bracket-enclosed external link."""
self._push(contexts.EXT_LINK_URI)
if self._read() == self._read(1) == "/":
self._emit_text("//")
self._head += 2
else:
scheme = []
try:
# Ugly, but we have to backtrack through the textbuffer looking
# for our scheme since it was just parsed as text:
for i in range(-1, -len(self._textbuffer) - 1, -1):
for char in reversed(self._textbuffer[i]):
if char.isspace() or char in self.MARKERS:
raise StopIteration()
if char not in scheme_valid:
raise BadRoute()
scheme.append(char)
except StopIteration:
pass
scheme = "".join(reversed(scheme))
slashes = self._read() == self._read(1) == "/"
if not is_scheme(scheme, slashes):
raise BadRoute()
# Remove the scheme from the textbuffer, now that it's part of the
# external link:
length = len(scheme)
while length:
if length < len(self._textbuffer[-1]):
self._textbuffer[-1] = self._textbuffer[-1][:-length]
break
length -= len(self._textbuffer[-1])
self._textbuffer.pop()
self._push(contexts.EXT_LINK_URI)
self._emit_text(scheme)
valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
all_valid = lambda: all(char in valid for char in self._read())
scheme = ""
while self._read() is not self.END and all_valid():
scheme += self._read()
self._emit_text(self._read())
self._head += 1
if self._read() != ":":
self._fail_route()
self._emit_text(":")
self._head += 1
slashes = self._read() == self._read(1) == "/"
if slashes:
self._emit_text("//")
self._head += 2
parentheses = False
if not is_scheme(scheme, slashes):
self._fail_route()

def _parse_free_uri_scheme(self):
"""Parse the URI scheme of a free (no brackets) external link."""
valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
scheme = []
try:
# Ugly, but we have to backtrack through the textbuffer looking for
# our scheme since it was just parsed as text:
for i in range(-1, -len(self._textbuffer) - 1, -1):
for char in reversed(self._textbuffer[i]):
if char.isspace() or char in self.MARKERS:
raise StopIteration()
if char not in valid:
raise BadRoute()
scheme.append(char)
except StopIteration:
pass
scheme = "".join(reversed(scheme))
slashes = self._read() == self._read(1) == "/"
if not is_scheme(scheme, slashes):
raise BadRoute()
parentheses = False
self._push(contexts.EXT_LINK_URI)
self._emit_text(scheme)
self._emit_text(":")
if slashes:
self._emit_text("//")
self._head += 2

def _really_parse_external_link(self, brackets):
"""Really parse an external link."""
if brackets:
self._parse_bracketed_uri_scheme()
else:
self._parse_free_uri_scheme()
if self._read() in self.MARKERS or self._read()[0].isspace(): ## Should actually check for valid chars
self._fail_route()
while True:
this, next = self._read(), self._read(1)
if this is self.END or this == "\n":
@@ -404,6 +406,16 @@ class Tokenizer(object):
self._emit_text(this)
self._head += 1

def _remove_uri_scheme_from_textbuffer(self, scheme):
"""Remove the URI scheme of a new external link from the textbuffer."""
length = len(scheme)
while length:
if length < len(self._textbuffer[-1]):
self._textbuffer[-1] = self._textbuffer[-1][:-length]
break
length -= len(self._textbuffer[-1])
self._textbuffer.pop()

def _parse_external_link(self, brackets):
"""Parse an external link at the head of the wikicode string."""
reset = self._head
@@ -420,6 +432,9 @@ class Tokenizer(object):
else:
self._emit_text(self._read())
else:
if not brackets:
scheme = link[0].text.split(":", 1)[0]
self._remove_uri_scheme_from_textbuffer(scheme)
self._emit(tokens.ExternalLinkOpen(brackets=brackets))
self._emit_all(link)
self._emit(tokens.ExternalLinkClose())


Loading…
Cancel
Save