diff --git a/CHANGELOG b/CHANGELOG
index 8922738..122247f 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,8 +1,10 @@
v0.3 (unreleased):
-- Added complete support for HTML Tags, along with appropriate unit tests. This
- includes forms like <ref name="foo">[foo]</ref>, <br/>, and wiki-markup tags
- like bold ('''), italics (''), and lists (*, #, ; and :).
+- Added complete support for HTML Tags, including forms like
+ <ref name="foo">[foo]</ref>, <br/>, and wiki-markup tags like bold ('''),
+ italics (''), and lists (*, #, ; and :).
+- Added support for ExternalLinks (http://example.com/ and
+ [http://example.com/ Example]).
- Wikicode's filter methods are now passed 'recursive=True' by default instead
of False. This is a breaking change if you rely on any filter() methods being
non-recursive by default.
@@ -14,7 +16,7 @@ v0.3 (unreleased):
- Renamed Template.has_param() to has() for consistency with Template's other
methods; has_param() is now an alias.
- The C tokenizer extension now works on Python 3 in addition to Python 2.7.
-- Various fixes and cleanup.
+- Various bugfixes, internal changes, and cleanup.
v0.2 (released June 20, 2013):
diff --git a/docs/api/mwparserfromhell.nodes.rst b/docs/api/mwparserfromhell.nodes.rst
index a093c17..7043070 100644
--- a/docs/api/mwparserfromhell.nodes.rst
+++ b/docs/api/mwparserfromhell.nodes.rst
@@ -25,6 +25,14 @@ nodes Package
:undoc-members:
:show-inheritance:
+:mod:`external_link` Module
+---------------------------
+
+.. automodule:: mwparserfromhell.nodes.external_link
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
:mod:`heading` Module
---------------------
diff --git a/docs/api/mwparserfromhell.rst b/docs/api/mwparserfromhell.rst
index b682139..0da522e 100644
--- a/docs/api/mwparserfromhell.rst
+++ b/docs/api/mwparserfromhell.rst
@@ -30,10 +30,10 @@ mwparserfromhell Package
:members:
:undoc-members:
-:mod:`tag_defs` Module
-----------------------
+:mod:`definitions` Module
+-------------------------
-.. automodule:: mwparserfromhell.tag_defs
+.. automodule:: mwparserfromhell.definitions
:members:
:mod:`utils` Module
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 86dfd78..f43a3c9 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -7,10 +7,11 @@ v0.3
Unreleased
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.2...develop>`__):
-- Added complete support for HTML :py:class:`Tags <.Tag>`, along with
-  appropriate unit tests. This includes forms like
-  ``<ref name="foo">[foo]</ref>``, ``<br/>``, and wiki-markup tags like bold
-  (``'''``), italics (``''``), and lists (``*``, ``#``, ``;`` and ``:``).
+- Added complete support for HTML :py:class:`Tags <.Tag>`, including forms like
+  ``<ref name="foo">[foo]</ref>``, ``<br/>``, and wiki-markup tags like bold
+  (``'''``), italics (``''``), and lists (``*``, ``#``, ``;`` and ``:``).
+- Added support for :py:class:`.ExternalLink`\ s (``http://example.com/`` and
+ ``[http://example.com/ Example]``).
- :py:class:`Wikicode's <.Wikicode>` :py:meth:`.filter` methods are now passed
*recursive=True* by default instead of *False*. **This is a breaking change
if you rely on any filter() methods being non-recursive by default.**
@@ -25,7 +26,7 @@ Unreleased
:py:meth:`~.Template.has` for consistency with :py:class:`~.Template`\ 's
other methods; :py:meth:`~.has_param` is now an alias.
- The C tokenizer extension now works on Python 3 in addition to Python 2.7.
-- Various fixes and cleanup.
+- Various bugfixes, internal changes, and cleanup.
v0.2
----
diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py
index 738d4c2..74e1616 100644
--- a/mwparserfromhell/__init__.py
+++ b/mwparserfromhell/__init__.py
@@ -34,6 +34,7 @@ __license__ = "MIT License"
__version__ = "0.3.dev"
__email__ = "ben.kurtovic@verizon.net"
-from . import compat, nodes, parser, smart_list, string_mixin, utils, wikicode
+from . import (compat, definitions, nodes, parser, smart_list, string_mixin,
+ utils, wikicode)
parse = utils.parse_anything
diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/definitions.py
similarity index 73%
rename from mwparserfromhell/tag_defs.py
rename to mwparserfromhell/definitions.py
index 2395fc6..9449bcb 100644
--- a/mwparserfromhell/tag_defs.py
+++ b/mwparserfromhell/definitions.py
@@ -20,12 +20,22 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
-"""Contains data regarding certain HTML tags."""
+"""Contains data about certain markup, like HTML tags and external links."""
from __future__ import unicode_literals
__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
- "is_single_only"]
+ "is_single_only", "is_scheme"]
+
+URI_SCHEMES = {
+ # [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943
+ "http": True, "https": True, "ftp": True, "ftps": True, "ssh": True,
+ "sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False,
+ "sips": False, "gopher": True, "telnet": True, "nntp": True,
+ "worldwind": True, "mailto": False, "tel": False, "sms": False,
+ "news": False, "svn": True, "git": True, "mms": True, "bitcoin": False,
+ "magnet": False, "urn": False, "geo": False
+}
PARSER_BLACKLIST = [
# enwiki extensions @ 2013-06-28
@@ -70,3 +80,12 @@ def is_single(tag):
def is_single_only(tag):
"""Return whether or not the given *tag* must exist without a close tag."""
return tag.lower() in SINGLE_ONLY
+
+def is_scheme(scheme, slashes=True, reverse=False):
+ """Return whether *scheme* is valid for external links."""
+ if reverse: # Convenience for C
+ scheme = scheme[::-1]
+ scheme = scheme.lower()
+ if slashes:
+ return scheme in URI_SCHEMES
+ return scheme in URI_SCHEMES and not URI_SCHEMES[scheme]
diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py
index faaa0b2..ba97b3f 100644
--- a/mwparserfromhell/nodes/__init__.py
+++ b/mwparserfromhell/nodes/__init__.py
@@ -69,6 +69,7 @@ from . import extras
from .text import Text
from .argument import Argument
from .comment import Comment
+from .external_link import ExternalLink
from .heading import Heading
from .html_entity import HTMLEntity
from .tag import Tag
diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py
new file mode 100644
index 0000000..d74f6b3
--- /dev/null
+++ b/mwparserfromhell/nodes/external_link.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2012-2013 Ben Kurtovic
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import unicode_literals
+
+from . import Node
+from ..compat import str
+from ..utils import parse_anything
+
+__all__ = ["ExternalLink"]
+
+class ExternalLink(Node):
+ """Represents an external link, like ``[http://example.com/ Example]``."""
+
+ def __init__(self, url, title=None, brackets=True):
+ super(ExternalLink, self).__init__()
+ self._url = url
+ self._title = title
+ self._brackets = brackets
+
+ def __unicode__(self):
+ if self.brackets:
+ if self.title is not None:
+ return "[" + str(self.url) + " " + str(self.title) + "]"
+ return "[" + str(self.url) + "]"
+ return str(self.url)
+
+ def __iternodes__(self, getter):
+ yield None, self
+ for child in getter(self.url):
+ yield self.url, child
+ if self.title is not None:
+ for child in getter(self.title):
+ yield self.title, child
+
+ def __strip__(self, normalize, collapse):
+ if self.brackets:
+ if self.title:
+ return self.title.strip_code(normalize, collapse)
+ return None
+ return self.url.strip_code(normalize, collapse)
+
+ def __showtree__(self, write, get, mark):
+ if self.brackets:
+ write("[")
+ get(self.url)
+ if self.title is not None:
+ get(self.title)
+ if self.brackets:
+ write("]")
+
+ @property
+ def url(self):
+ """The URL of the link target, as a :py:class:`~.Wikicode` object."""
+ return self._url
+
+ @property
+ def title(self):
+ """The link title (if given), as a :py:class:`~.Wikicode` object."""
+ return self._title
+
+ @property
+ def brackets(self):
+ """Whether to enclose the URL in brackets or display it straight."""
+ return self._brackets
+
+ @url.setter
+ def url(self, value):
+ from ..parser import contexts
+ self._url = parse_anything(value, contexts.EXT_LINK_URI)
+
+ @title.setter
+ def title(self, value):
+ self._title = None if value is None else parse_anything(value)
+
+ @brackets.setter
+ def brackets(self, value):
+ self._brackets = bool(value)
diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index b4aec3e..80b8a88 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -25,7 +25,7 @@ from __future__ import unicode_literals
from . import Node, Text
from .extras import Attribute
from ..compat import str
-from ..tag_defs import is_visible
+from ..definitions import is_visible
from ..utils import parse_anything
__all__ = ["Tag"]
@@ -152,7 +152,7 @@ class Tag(Node):
This makes the tag look like a lone close tag. It is technically
invalid and is only parsable Wikicode when the tag itself is
single-only, like ``
`` and ````. See
- :py:func:`.tag_defs.is_single_only`.
+ :py:func:`.definitions.is_single_only`.
"""
return self._invalid
@@ -161,7 +161,7 @@ class Tag(Node):
"""Whether the tag is implicitly self-closing, with no ending slash.
This is only possible for specific "single" tags like ``<br>`` and
-    ``<li>``. See :py:func:`.tag_defs.is_single`. This field only has an
+    ``<li>``. See :py:func:`.definitions.is_single`. This field only has an
effect if :py:attr:`self_closing` is also ``True``.
"""
return self._implicit
diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py
index 1fb95b5..22c3dc2 100644
--- a/mwparserfromhell/parser/__init__.py
+++ b/mwparserfromhell/parser/__init__.py
@@ -46,16 +46,15 @@ class Parser(object):
:py:class:`~.Node`\ s by the :py:class:`~.Builder`.
"""
- def __init__(self, text):
- self.text = text
+ def __init__(self):
if use_c and CTokenizer:
self._tokenizer = CTokenizer()
else:
self._tokenizer = Tokenizer()
self._builder = Builder()
- def parse(self):
- """Return a string as a parsed :py:class:`~.Wikicode` object tree."""
- tokens = self._tokenizer.tokenize(self.text)
+ def parse(self, text, context=0):
+ """Parse *text*, returning a :py:class:`~.Wikicode` object tree."""
+ tokens = self._tokenizer.tokenize(text, context)
code = self._builder.build(tokens)
return code
diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py
index 196ef14..d31f450 100644
--- a/mwparserfromhell/parser/builder.py
+++ b/mwparserfromhell/parser/builder.py
@@ -24,8 +24,8 @@ from __future__ import unicode_literals
from . import tokens
from ..compat import str
-from ..nodes import (Argument, Comment, Heading, HTMLEntity, Tag, Template,
- Text, Wikilink)
+from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
+ Template, Text, Wikilink)
from ..nodes.extras import Attribute, Parameter
from ..smart_list import SmartList
from ..wikicode import Wikicode
@@ -142,6 +142,22 @@ class Builder(object):
else:
self._write(self._handle_token(token))
+ def _handle_external_link(self, token):
+ """Handle when an external link is at the head of the tokens."""
+ brackets, url = token.brackets, None
+ self._push()
+ while self._tokens:
+ token = self._tokens.pop()
+ if isinstance(token, tokens.ExternalLinkSeparator):
+ url = self._pop()
+ self._push()
+ elif isinstance(token, tokens.ExternalLinkClose):
+ if url is not None:
+ return ExternalLink(url, self._pop(), brackets)
+ return ExternalLink(self._pop(), brackets=brackets)
+ else:
+ self._write(self._handle_token(token))
+
def _handle_entity(self):
"""Handle a case where an HTML entity is at the head of the tokens."""
token = self._tokens.pop()
@@ -244,6 +260,8 @@ class Builder(object):
return self._handle_argument()
elif isinstance(token, tokens.WikilinkOpen):
return self._handle_wikilink()
+ elif isinstance(token, tokens.ExternalLinkOpen):
+ return self._handle_external_link(token)
elif isinstance(token, tokens.HTMLEntityStart):
return self._handle_entity()
elif isinstance(token, tokens.HeadingStart):
diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py
index a1b67be..33da8f7 100644
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -51,6 +51,12 @@ Local (stack-specific) contexts:
* :py:const:`WIKILINK_TITLE`
* :py:const:`WIKILINK_TEXT`
+* :py:const:`EXT_LINK`
+
+ * :py:const:`EXT_LINK_URI`
+ * :py:const:`EXT_LINK_TITLE`
+ * :py:const:`EXT_LINK_BRACKETS`
+
* :py:const:`HEADING`
* :py:const:`HEADING_LEVEL_1`
@@ -94,6 +100,7 @@ Aggregate contexts:
* :py:const:`FAIL`
* :py:const:`UNSAFE`
* :py:const:`DOUBLE`
+* :py:const:`INVALID_LINK`
"""
@@ -112,35 +119,40 @@ WIKILINK_TITLE = 1 << 5
WIKILINK_TEXT = 1 << 6
WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT
-HEADING_LEVEL_1 = 1 << 7
-HEADING_LEVEL_2 = 1 << 8
-HEADING_LEVEL_3 = 1 << 9
-HEADING_LEVEL_4 = 1 << 10
-HEADING_LEVEL_5 = 1 << 11
-HEADING_LEVEL_6 = 1 << 12
+EXT_LINK_URI = 1 << 7
+EXT_LINK_TITLE = 1 << 8
+EXT_LINK_BRACKETS = 1 << 9
+EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS
+
+HEADING_LEVEL_1 = 1 << 10
+HEADING_LEVEL_2 = 1 << 11
+HEADING_LEVEL_3 = 1 << 12
+HEADING_LEVEL_4 = 1 << 13
+HEADING_LEVEL_5 = 1 << 14
+HEADING_LEVEL_6 = 1 << 15
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 +
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6)
-TAG_OPEN = 1 << 13
-TAG_ATTR = 1 << 14
-TAG_BODY = 1 << 15
-TAG_CLOSE = 1 << 16
+TAG_OPEN = 1 << 16
+TAG_ATTR = 1 << 17
+TAG_BODY = 1 << 18
+TAG_CLOSE = 1 << 19
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE
-STYLE_ITALICS = 1 << 17
-STYLE_BOLD = 1 << 18
-STYLE_PASS_AGAIN = 1 << 19
-STYLE_SECOND_PASS = 1 << 20
+STYLE_ITALICS = 1 << 20
+STYLE_BOLD = 1 << 21
+STYLE_PASS_AGAIN = 1 << 22
+STYLE_SECOND_PASS = 1 << 23
STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS
-DL_TERM = 1 << 21
+DL_TERM = 1 << 24
-HAS_TEXT = 1 << 22
-FAIL_ON_TEXT = 1 << 23
-FAIL_NEXT = 1 << 24
-FAIL_ON_LBRACE = 1 << 25
-FAIL_ON_RBRACE = 1 << 26
-FAIL_ON_EQUALS = 1 << 27
+HAS_TEXT = 1 << 25
+FAIL_ON_TEXT = 1 << 26
+FAIL_NEXT = 1 << 27
+FAIL_ON_LBRACE = 1 << 28
+FAIL_ON_RBRACE = 1 << 29
+FAIL_ON_EQUALS = 1 << 30
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS)
@@ -150,7 +162,8 @@ GL_HEADING = 1 << 0
# Aggregate contexts:
-FAIL = TEMPLATE + ARGUMENT + WIKILINK + HEADING + TAG + STYLE
-UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME +
- TAG_CLOSE)
+FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE
+UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY +
+ ARGUMENT_NAME + TAG_CLOSE)
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
+INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 67a4ae6..07d3988 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -24,6 +24,20 @@ SOFTWARE.
#include "tokenizer.h"
/*
+ Determine whether the given Py_UNICODE is a marker.
+*/
+static int is_marker(Py_UNICODE this)
+{
+ int i;
+
+ for (i = 0; i < NUM_MARKERS; i++) {
+ if (*MARKERS[i] == this)
+ return 1;
+ }
+ return 0;
+}
+
+/*
Given a context, return the heading level encoded within it.
*/
static int heading_level_from_context(int n)
@@ -37,13 +51,14 @@ static int heading_level_from_context(int n)
}
/*
- Call the given function in tag_defs, using 'tag' as a parameter, and return
- its output as a bool.
+ Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as
+ parameters, and return its output as a bool.
*/
-static int call_tag_def_func(const char* funcname, PyObject* tag)
+static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2,
+ PyObject* in3)
{
- PyObject* func = PyObject_GetAttrString(tag_defs, funcname);
- PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL);
+ PyObject* func = PyObject_GetAttrString(definitions, funcname);
+ PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL);
int ans = (result == Py_True) ? 1 : 0;
Py_DECREF(func);
@@ -65,7 +80,7 @@ static PyObject* strip_tag_name(PyObject* token)
Py_DECREF(text);
if (!rstripped)
return NULL;
- lowered = PyObject_CallMethod(rstripped, "rstrip", NULL);
+ lowered = PyObject_CallMethod(rstripped, "lower", NULL);
Py_DECREF(rstripped);
return lowered;
}
@@ -85,7 +100,7 @@ static Textbuffer* Textbuffer_new(void)
PyErr_NoMemory();
return NULL;
}
- buffer->next = NULL;
+ buffer->prev = buffer->next = NULL;
return buffer;
}
@@ -113,10 +128,10 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE code)
if (!new)
return -1;
new->next = self;
+ self->prev = new;
*this = self = new;
}
- self->data[self->size] = code;
- self->size++;
+ self->data[self->size++] = code;
return 0;
}
@@ -345,7 +360,7 @@ static void* Tokenizer_fail_route(Tokenizer* self)
}
/*
- Write a token to the end of the current token stack.
+ Write a token to the current token stack.
*/
static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first)
{
@@ -366,7 +381,8 @@ static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first)
}
/*
- Write a token to the end of the current token stack.
+ Write a token to the current token stack, with kwargs. Steals a reference
+ to kwargs.
*/
static int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token,
PyObject* kwargs, int first)
@@ -417,6 +433,42 @@ static int Tokenizer_emit_text(Tokenizer* self, const char* text)
}
/*
+ Write the contents of another textbuffer to the current textbuffer,
+ deallocating it in the process.
+*/
+static int
+Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse)
+{
+ Textbuffer *original = buffer;
+ int i;
+
+ if (reverse) {
+ do {
+ for (i = buffer->size - 1; i >= 0; i--) {
+ if (Tokenizer_emit_char(self, buffer->data[i])) {
+ Textbuffer_dealloc(original);
+ return -1;
+ }
+ }
+ } while ((buffer = buffer->next));
+ }
+ else {
+ while (buffer->next)
+ buffer = buffer->next;
+ do {
+ for (i = 0; i < buffer->size; i++) {
+ if (Tokenizer_emit_char(self, buffer->data[i])) {
+ Textbuffer_dealloc(original);
+ return -1;
+ }
+ }
+ } while ((buffer = buffer->prev));
+ }
+ Textbuffer_dealloc(original);
+ return 0;
+}
+
+/*
Write a series of tokens to the current stack at once.
*/
static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist)
@@ -808,6 +860,353 @@ static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self)
}
/*
+ Parse the URI scheme of a bracket-enclosed external link.
+*/
+static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self)
+{
+ static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-";
+ Textbuffer* buffer;
+ PyObject* scheme;
+ Py_UNICODE this;
+ int slashes, i;
+
+ if (Tokenizer_push(self, LC_EXT_LINK_URI))
+ return -1;
+ if (Tokenizer_READ(self, 0) == *"/" && Tokenizer_READ(self, 1) == *"/") {
+ if (Tokenizer_emit_text(self, "//"))
+ return -1;
+ self->head += 2;
+ }
+ else {
+ buffer = Textbuffer_new();
+ if (!buffer)
+ return -1;
+ while ((this = Tokenizer_READ(self, 0)) != *"") {
+ i = 0;
+ while (1) {
+ if (!valid[i])
+ goto end_of_loop;
+ if (this == valid[i])
+ break;
+ i++;
+ }
+ Textbuffer_write(&buffer, this);
+ if (Tokenizer_emit_char(self, this)) {
+ Textbuffer_dealloc(buffer);
+ return -1;
+ }
+ self->head++;
+ }
+ end_of_loop:
+ if (this != *":") {
+ Textbuffer_dealloc(buffer);
+ Tokenizer_fail_route(self);
+ return 0;
+ }
+ if (Tokenizer_emit_char(self, *":")) {
+ Textbuffer_dealloc(buffer);
+ return -1;
+ }
+ self->head++;
+ slashes = (Tokenizer_READ(self, 0) == *"/" &&
+ Tokenizer_READ(self, 1) == *"/");
+ if (slashes) {
+ if (Tokenizer_emit_text(self, "//")) {
+ Textbuffer_dealloc(buffer);
+ return -1;
+ }
+ self->head += 2;
+ }
+ scheme = Textbuffer_render(buffer);
+ Textbuffer_dealloc(buffer);
+ if (!scheme)
+ return -1;
+ if (!IS_SCHEME(scheme, slashes, 0)) {
+ Py_DECREF(scheme);
+ Tokenizer_fail_route(self);
+ return 0;
+ }
+ Py_DECREF(scheme);
+ }
+ return 0;
+}
+
+/*
+ Parse the URI scheme of a free (no brackets) external link.
+*/
+static int Tokenizer_parse_free_uri_scheme(Tokenizer* self)
+{
+ static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-";
+ Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer;
+ PyObject *scheme;
+ Py_UNICODE chunk;
+ int slashes, i, j;
+
+ if (!scheme_buffer)
+ return -1;
+ // We have to backtrack through the textbuffer looking for our scheme since
+ // it was just parsed as text:
+ temp_buffer = self->topstack->textbuffer;
+ while (temp_buffer) {
+ for (i = temp_buffer->size - 1; i >= 0; i--) {
+ chunk = temp_buffer->data[i];
+ if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk))
+ goto end_of_loop;
+ j = 0;
+ while (1) {
+ if (!valid[j]) {
+ Textbuffer_dealloc(scheme_buffer);
+ FAIL_ROUTE(0);
+ return 0;
+ }
+ if (chunk == valid[j])
+ break;
+ j++;
+ }
+ Textbuffer_write(&scheme_buffer, chunk);
+ }
+ temp_buffer = temp_buffer->next;
+ }
+ end_of_loop:
+ scheme = Textbuffer_render(scheme_buffer);
+ if (!scheme) {
+ Textbuffer_dealloc(scheme_buffer);
+ return -1;
+ }
+ slashes = (Tokenizer_READ(self, 0) == *"/" &&
+ Tokenizer_READ(self, 1) == *"/");
+ if (!IS_SCHEME(scheme, slashes, 1)) {
+ Py_DECREF(scheme);
+ Textbuffer_dealloc(scheme_buffer);
+ FAIL_ROUTE(0);
+ return 0;
+ }
+ Py_DECREF(scheme);
+ if (Tokenizer_push(self, LC_EXT_LINK_URI)) {
+ Textbuffer_dealloc(scheme_buffer);
+ return -1;
+ }
+ if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1))
+ return -1;
+ if (Tokenizer_emit_char(self, *":"))
+ return -1;
+ if (slashes) {
+ if (Tokenizer_emit_text(self, "//"))
+ return -1;
+ self->head += 2;
+ }
+ return 0;
+}
+
+/*
+ Handle text in a free external link, including trailing punctuation.
+*/
+static int
+Tokenizer_handle_free_link_text(Tokenizer* self, int* parens,
+ Textbuffer** tail, Py_UNICODE this)
+{
+ #define PUSH_TAIL_BUFFER(tail, error) \
+ if ((tail)->size || (tail)->next) { \
+ if (Tokenizer_emit_textbuffer(self, tail, 0)) \
+ return error; \
+ tail = Textbuffer_new(); \
+ if (!(tail)) \
+ return error; \
+ }
+
+ if (this == *"(" && !(*parens)) {
+ *parens = 1;
+ PUSH_TAIL_BUFFER(*tail, -1)
+ }
+ else if (this == *"," || this == *";" || this == *"\\" || this == *"." ||
+ this == *":" || this == *"!" || this == *"?" ||
+ (!(*parens) && this == *")"))
+ return Textbuffer_write(tail, this);
+ else
+ PUSH_TAIL_BUFFER(*tail, -1)
+ return Tokenizer_emit_char(self, this);
+}
+
+/*
+ Really parse an external link.
+*/
+static PyObject*
+Tokenizer_really_parse_external_link(Tokenizer* self, int brackets,
+ Textbuffer** extra)
+{
+ Py_UNICODE this, next;
+ int parens = 0;
+
+ if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) :
+ Tokenizer_parse_free_uri_scheme(self))
+ return NULL;
+ if (BAD_ROUTE)
+ return NULL;
+ this = Tokenizer_READ(self, 0);
+ if (this == *"" || this == *"\n" || this == *" " || this == *"]")
+ return Tokenizer_fail_route(self);
+ if (!brackets && this == *"[")
+ return Tokenizer_fail_route(self);
+ while (1) {
+ this = Tokenizer_READ(self, 0);
+ next = Tokenizer_READ(self, 1);
+ if (this == *"" || this == *"\n") {
+ if (brackets)
+ return Tokenizer_fail_route(self);
+ self->head--;
+ return Tokenizer_pop(self);
+ }
+ if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) {
+ PUSH_TAIL_BUFFER(*extra, NULL)
+ if (Tokenizer_parse_template_or_argument(self))
+ return NULL;
+ }
+ else if (this == *"[") {
+ if (!brackets) {
+ self->head--;
+ return Tokenizer_pop(self);
+ }
+ if (Tokenizer_emit_char(self, *"["))
+ return NULL;
+ }
+ else if (this == *"]") {
+ if (!brackets)
+ self->head--;
+ return Tokenizer_pop(self);
+ }
+ else if (this == *"&") {
+ PUSH_TAIL_BUFFER(*extra, NULL)
+ if (Tokenizer_parse_entity(self))
+ return NULL;
+ }
+ else if (this == *" ") {
+ if (brackets) {
+ if (Tokenizer_emit(self, ExternalLinkSeparator))
+ return NULL;
+ self->topstack->context ^= LC_EXT_LINK_URI;
+ self->topstack->context |= LC_EXT_LINK_TITLE;
+ self->head++;
+ return Tokenizer_parse(self, 0, 0);
+ }
+ if (Textbuffer_write(extra, *" "))
+ return NULL;
+ return Tokenizer_pop(self);
+ }
+ else if (!brackets) {
+ if (Tokenizer_handle_free_link_text(self, &parens, extra, this))
+ return NULL;
+ }
+ else {
+ if (Tokenizer_emit_char(self, this))
+ return NULL;
+ }
+ self->head++;
+ }
+}
+
+/*
+ Remove the URI scheme of a new external link from the textbuffer.
+*/
+static int
+Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
+{
+ PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"),
+ *split, *scheme;
+ Py_ssize_t length;
+ Textbuffer* temp;
+
+ if (!text)
+ return -1;
+ split = PyObject_CallMethod(text, "split", "si", ":", 1);
+ Py_DECREF(text);
+ if (!split)
+ return -1;
+ scheme = PyList_GET_ITEM(split, 0);
+ length = PyUnicode_GET_SIZE(scheme);
+ while (length) {
+ temp = self->topstack->textbuffer;
+ if (length <= temp->size) {
+ temp->size -= length;
+ break;
+ }
+ length -= temp->size;
+ self->topstack->textbuffer = temp->next;
+ free(temp->data);
+ free(temp);
+ }
+ Py_DECREF(split);
+ return 0;
+}
+
+/*
+ Parse an external link at the head of the wikicode string.
+*/
+static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
+{
+ #define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK
+ #define NOT_A_LINK \
+ if (!brackets && self->topstack->context & LC_DLTERM) \
+ return Tokenizer_handle_dl_term(self); \
+ return Tokenizer_emit_char(self, Tokenizer_READ(self, 0))
+
+ Py_ssize_t reset = self->head;
+ PyObject *link, *kwargs;
+ Textbuffer *extra = 0;
+
+ if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
+ NOT_A_LINK;
+ }
+ extra = Textbuffer_new();
+ if (!extra)
+ return -1;
+ self->head++;
+ link = Tokenizer_really_parse_external_link(self, brackets, &extra);
+ if (BAD_ROUTE) {
+ RESET_ROUTE();
+ self->head = reset;
+ Textbuffer_dealloc(extra);
+ NOT_A_LINK;
+ }
+ if (!link) {
+ Textbuffer_dealloc(extra);
+ return -1;
+ }
+ if (!brackets) {
+ if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) {
+ Textbuffer_dealloc(extra);
+ Py_DECREF(link);
+ return -1;
+ }
+ }
+ kwargs = PyDict_New();
+ if (!kwargs) {
+ Textbuffer_dealloc(extra);
+ Py_DECREF(link);
+ return -1;
+ }
+ PyDict_SetItemString(kwargs, "brackets", brackets ? Py_True : Py_False);
+ if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) {
+ Textbuffer_dealloc(extra);
+ Py_DECREF(link);
+ return -1;
+ }
+ if (Tokenizer_emit_all(self, link)) {
+ Textbuffer_dealloc(extra);
+ Py_DECREF(link);
+ return -1;
+ }
+ Py_DECREF(link);
+ if (Tokenizer_emit(self, ExternalLinkClose)) {
+ Textbuffer_dealloc(extra);
+ return -1;
+ }
+ if (extra->size || extra->next)
+ return Tokenizer_emit_textbuffer(self, extra, 0);
+ Textbuffer_dealloc(extra);
+ return 0;
+}
+
+/*
Parse a section heading at the head of the wikicode string.
*/
static int Tokenizer_parse_heading(Tokenizer* self)
@@ -1238,15 +1637,8 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text)
static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text)
{
Py_UNICODE next = Tokenizer_READ(self, 1);
- int i, is_marker = 0;
- for (i = 0; i < NUM_MARKERS; i++) {
- if (*MARKERS[i] == text) {
- is_marker = 1;
- break;
- }
- }
- if (!is_marker || !Tokenizer_CAN_RECURSE(self))
+ if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self))
return Tokenizer_emit_char(self, text);
else if (text == next && next == *"{")
return Tokenizer_parse_template_or_argument(self);
@@ -1264,17 +1656,11 @@ static int
Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk)
{
PyObject *trash;
- int first_time, i, is_marker = 0, escaped;
+ int first_time, escaped;
if (data->context & TAG_NAME) {
first_time = !(data->context & TAG_NOTE_SPACE);
- for (i = 0; i < NUM_MARKERS; i++) {
- if (*MARKERS[i] == chunk) {
- is_marker = 1;
- break;
- }
- }
- if (is_marker || (Py_UNICODE_ISSPACE(chunk) && first_time)) {
+ if (is_marker(chunk) || (Py_UNICODE_ISSPACE(chunk) && first_time)) {
// Tags must start with text, not spaces
Tokenizer_fail_route(self);
return 0;
@@ -1623,7 +2009,6 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
Textbuffer* buf;
PyObject *name, *tag;
Py_UNICODE this;
- int is_marker, i;
self->head += 2;
buf = Textbuffer_new();
@@ -1631,14 +2016,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self)
return -1;
while (1) {
this = Tokenizer_READ(self, pos);
- is_marker = 0;
- for (i = 0; i < NUM_MARKERS; i++) {
- if (*MARKERS[i] == this) {
- is_marker = 1;
- break;
- }
- }
- if (is_marker) {
+ if (is_marker(this)) {
name = Textbuffer_render(buf);
if (!name) {
Textbuffer_dealloc(buf);
@@ -1985,9 +2363,9 @@ static int Tokenizer_handle_hr(Tokenizer* self)
self->head++;
}
markup = Textbuffer_render(buffer);
+ Textbuffer_dealloc(buffer);
if (!markup)
return -1;
- Textbuffer_dealloc(buffer);
kwargs = PyDict_New();
if (!kwargs)
return -1;
@@ -2047,21 +2425,21 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context)
*/
static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
{
- if (context & LC_FAIL_NEXT) {
+ if (context & LC_FAIL_NEXT)
return -1;
- }
- if (context & LC_WIKILINK_TITLE) {
- if (data == *"]" || data == *"{")
+ if (context & LC_WIKILINK) {
+ if (context & LC_WIKILINK_TEXT)
+ return (data == *"[" && Tokenizer_READ(self, 1) == *"[") ? -1 : 0;
+ else if (data == *"]" || data == *"{")
self->topstack->context |= LC_FAIL_NEXT;
else if (data == *"\n" || data == *"[" || data == *"}")
return -1;
return 0;
}
- if (context & LC_TAG_CLOSE) {
- if (data == *"<")
- return -1;
- return 0;
- }
+ if (context & LC_EXT_LINK_TITLE)
+ return (data == *"\n") ? -1 : 0;
+ if (context & LC_TAG_CLOSE)
+ return (data == *"<") ? -1 : 0;
if (context & LC_TEMPLATE_NAME) {
if (data == *"{" || data == *"}" || data == *"[") {
self->topstack->context |= LC_FAIL_NEXT;
@@ -2126,7 +2504,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data)
*/
static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
{
- int this_context, is_marker, i;
+ int this_context;
Py_UNICODE this, next, next_next, last;
PyObject* temp;
@@ -2146,14 +2524,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
return Tokenizer_fail_route(self);
}
}
- is_marker = 0;
- for (i = 0; i < NUM_MARKERS; i++) {
- if (*MARKERS[i] == this) {
- is_marker = 1;
- break;
- }
- }
- if (!is_marker) {
+ if (!is_marker(this)) {
if (Tokenizer_emit_char(self, this))
return NULL;
self->head++;
@@ -2192,9 +2563,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
if (Tokenizer_emit_char(self, this))
return NULL;
}
- else if (this == next && next == *"[") {
- if (!(this_context & LC_WIKILINK_TITLE) &&
- Tokenizer_CAN_RECURSE(self)) {
+ else if (this == next && next == *"[" && Tokenizer_CAN_RECURSE(self)) {
+ if (!(this_context & AGG_INVALID_LINK)) {
if (Tokenizer_parse_wikilink(self))
return NULL;
}
@@ -2207,6 +2577,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
}
else if (this == next && next == *"]" && this_context & LC_WIKILINK)
return Tokenizer_handle_wikilink_end(self);
+ else if (this == *"[") {
+ if (Tokenizer_parse_external_link(self, 1))
+ return NULL;
+ }
+ else if (this == *":" && !is_marker(last)) {
+ if (Tokenizer_parse_external_link(self, 0))
+ return NULL;
+ }
+ else if (this == *"]" && this_context & LC_EXT_LINK_TITLE)
+ return Tokenizer_pop(self);
else if (this == *"=" && !(self->global & GL_HEADING)) {
if (last == *"\n" || last == *"") {
if (Tokenizer_parse_heading(self))
@@ -2243,9 +2623,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
return NULL;
}
}
- else if (this == *"<") {
- if (!(this_context & LC_TAG_CLOSE) &&
- Tokenizer_CAN_RECURSE(self)) {
+ else if (this == *"<" && !(this_context & LC_TAG_CLOSE)) {
+ if (Tokenizer_CAN_RECURSE(self)) {
if (Tokenizer_parse_tag(self))
return NULL;
}
@@ -2289,8 +2668,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push)
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
{
PyObject *text, *temp;
+ int context = 0;
- if (PyArg_ParseTuple(args, "U", &text)) {
+ if (PyArg_ParseTuple(args, "U|i", &text, &context)) {
Py_XDECREF(self->text);
self->text = PySequence_Fast(text, "expected a sequence");
}
@@ -2299,7 +2679,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
Py_ssize_t size;
/* Failed to parse a Unicode object; try a string instead. */
PyErr_Clear();
- if (!PyArg_ParseTuple(args, "s#", &encoded, &size))
+ if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context))
return NULL;
temp = PyUnicode_FromStringAndSize(encoded, size);
if (!text)
@@ -2311,7 +2691,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args)
}
self->head = self->global = self->depth = self->cycles = 0;
self->length = PyList_GET_SIZE(self->text);
- return Tokenizer_parse(self, 0, 1);
+ return Tokenizer_parse(self, context, 1);
}
static int load_entitydefs(void)
@@ -2389,6 +2769,11 @@ static int load_tokens(void)
WikilinkSeparator = PyObject_GetAttrString(tokens, "WikilinkSeparator");
WikilinkClose = PyObject_GetAttrString(tokens, "WikilinkClose");
+ ExternalLinkOpen = PyObject_GetAttrString(tokens, "ExternalLinkOpen");
+ ExternalLinkSeparator = PyObject_GetAttrString(tokens,
+ "ExternalLinkSeparator");
+ ExternalLinkClose = PyObject_GetAttrString(tokens, "ExternalLinkClose");
+
HTMLEntityStart = PyObject_GetAttrString(tokens, "HTMLEntityStart");
HTMLEntityNumeric = PyObject_GetAttrString(tokens, "HTMLEntityNumeric");
HTMLEntityHex = PyObject_GetAttrString(tokens, "HTMLEntityHex");
@@ -2413,13 +2798,13 @@ static int load_tokens(void)
return 0;
}
-static int load_tag_defs(void)
+static int load_definitions(void)
{
PyObject *tempmod,
*globals = PyEval_GetGlobals(),
*locals = PyEval_GetLocals(),
*fromlist = PyList_New(1),
- *modname = IMPORT_NAME_FUNC("tag_defs");
+ *modname = IMPORT_NAME_FUNC("definitions");
char *name = "mwparserfromhell";
if (!fromlist || !modname)
@@ -2429,7 +2814,7 @@ static int load_tag_defs(void)
Py_DECREF(fromlist);
if (!tempmod)
return -1;
- tag_defs = PyObject_GetAttrString(tempmod, "tag_defs");
+ definitions = PyObject_GetAttrString(tempmod, "definitions");
Py_DECREF(tempmod);
return 0;
}
@@ -2452,7 +2837,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void)
NOARGS = PyTuple_New(0);
if (!EMPTY || !NOARGS)
INIT_ERROR;
- if (load_entitydefs() || load_tokens() || load_tag_defs())
+ if (load_entitydefs() || load_tokens() || load_definitions())
INIT_ERROR;
#ifdef IS_PY3K
return module;
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 264360e..da3c57a 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -62,7 +62,7 @@ static char** entitydefs;
static PyObject* EMPTY;
static PyObject* NOARGS;
-static PyObject* tag_defs;
+static PyObject* definitions;
/* Tokens: */
@@ -82,6 +82,10 @@ static PyObject* WikilinkOpen;
static PyObject* WikilinkSeparator;
static PyObject* WikilinkClose;
+static PyObject* ExternalLinkOpen;
+static PyObject* ExternalLinkSeparator;
+static PyObject* ExternalLinkClose;
+
static PyObject* HTMLEntityStart;
static PyObject* HTMLEntityNumeric;
static PyObject* HTMLEntityHex;
@@ -104,48 +108,53 @@ static PyObject* TagCloseClose;
/* Local contexts: */
-#define LC_TEMPLATE 0x0000007
-#define LC_TEMPLATE_NAME 0x0000001
-#define LC_TEMPLATE_PARAM_KEY 0x0000002
-#define LC_TEMPLATE_PARAM_VALUE 0x0000004
-
-#define LC_ARGUMENT 0x0000018
-#define LC_ARGUMENT_NAME 0x0000008
-#define LC_ARGUMENT_DEFAULT 0x0000010
-
-#define LC_WIKILINK 0x0000060
-#define LC_WIKILINK_TITLE 0x0000020
-#define LC_WIKILINK_TEXT 0x0000040
-
-#define LC_HEADING 0x0001F80
-#define LC_HEADING_LEVEL_1 0x0000080
-#define LC_HEADING_LEVEL_2 0x0000100
-#define LC_HEADING_LEVEL_3 0x0000200
-#define LC_HEADING_LEVEL_4 0x0000400
-#define LC_HEADING_LEVEL_5 0x0000800
-#define LC_HEADING_LEVEL_6 0x0001000
-
-#define LC_TAG 0x001E000
-#define LC_TAG_OPEN 0x0002000
-#define LC_TAG_ATTR 0x0004000
-#define LC_TAG_BODY 0x0008000
-#define LC_TAG_CLOSE 0x0010000
-
-#define LC_STYLE 0x01E0000
-#define LC_STYLE_ITALICS 0x0020000
-#define LC_STYLE_BOLD 0x0040000
-#define LC_STYLE_PASS_AGAIN 0x0080000
-#define LC_STYLE_SECOND_PASS 0x0100000
-
-#define LC_DLTERM 0x0200000
-
-#define LC_SAFETY_CHECK 0xFC00000
-#define LC_HAS_TEXT 0x0400000
-#define LC_FAIL_ON_TEXT 0x0800000
-#define LC_FAIL_NEXT 0x1000000
-#define LC_FAIL_ON_LBRACE 0x2000000
-#define LC_FAIL_ON_RBRACE 0x4000000
-#define LC_FAIL_ON_EQUALS 0x8000000
+#define LC_TEMPLATE 0x00000007
+#define LC_TEMPLATE_NAME 0x00000001
+#define LC_TEMPLATE_PARAM_KEY 0x00000002
+#define LC_TEMPLATE_PARAM_VALUE 0x00000004
+
+#define LC_ARGUMENT 0x00000018
+#define LC_ARGUMENT_NAME 0x00000008
+#define LC_ARGUMENT_DEFAULT 0x00000010
+
+#define LC_WIKILINK 0x00000060
+#define LC_WIKILINK_TITLE 0x00000020
+#define LC_WIKILINK_TEXT 0x00000040
+
+#define LC_EXT_LINK 0x00000380
+#define LC_EXT_LINK_URI 0x00000080
+#define LC_EXT_LINK_TITLE 0x00000100
+#define LC_EXT_LINK_BRACKETS 0x00000200
+
+#define LC_HEADING 0x0000FC00
+#define LC_HEADING_LEVEL_1 0x00000400
+#define LC_HEADING_LEVEL_2 0x00000800
+#define LC_HEADING_LEVEL_3 0x00001000
+#define LC_HEADING_LEVEL_4 0x00002000
+#define LC_HEADING_LEVEL_5 0x00004000
+#define LC_HEADING_LEVEL_6 0x00008000
+
+#define LC_TAG 0x000F0000
+#define LC_TAG_OPEN 0x00010000
+#define LC_TAG_ATTR 0x00020000
+#define LC_TAG_BODY 0x00040000
+#define LC_TAG_CLOSE 0x00080000
+
+#define LC_STYLE 0x00F00000
+#define LC_STYLE_ITALICS 0x00100000
+#define LC_STYLE_BOLD 0x00200000
+#define LC_STYLE_PASS_AGAIN 0x00400000
+#define LC_STYLE_SECOND_PASS 0x00800000
+
+#define LC_DLTERM 0x01000000
+
+#define LC_SAFETY_CHECK 0x7E000000
+#define LC_HAS_TEXT 0x02000000
+#define LC_FAIL_ON_TEXT 0x04000000
+#define LC_FAIL_NEXT 0x08000000
+#define LC_FAIL_ON_LBRACE 0x10000000
+#define LC_FAIL_ON_RBRACE 0x20000000
+#define LC_FAIL_ON_EQUALS 0x40000000
/* Global contexts: */
@@ -153,9 +162,10 @@ static PyObject* TagCloseClose;
/* Aggregate contexts: */
-#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE)
-#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
-#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
+#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE)
+#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME)
+#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE)
+#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK)
/* Tag contexts: */
@@ -174,6 +184,7 @@ static PyObject* TagCloseClose;
struct Textbuffer {
Py_ssize_t size;
Py_UNICODE* data;
+ struct Textbuffer* prev;
struct Textbuffer* next;
};
@@ -228,12 +239,14 @@ typedef struct {
#define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1)
-/* Macros for accessing HTML tag definitions: */
+/* Macros for accessing definitions: */
#define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li")
-#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag))
-#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag))
-#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag))
+#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL))
+#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL))
+#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL))
+#define IS_SCHEME(scheme, slashes, reverse) \
+ (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False))
/* Function prototypes: */
@@ -247,6 +260,8 @@ static void TagData_dealloc(TagData*);
static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static void Tokenizer_dealloc(Tokenizer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
+static int Tokenizer_parse_entity(Tokenizer*);
+static int Tokenizer_handle_dl_term(Tokenizer*);
static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 583d2f8..6ab549a 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -26,7 +26,8 @@ import re
from . import contexts, tokens
from ..compat import htmlentities
-from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only
+from ..definitions import (get_html_tag, is_parsable, is_single,
+ is_single_only, is_scheme)
__all__ = ["Tokenizer"]
@@ -60,7 +61,7 @@ class Tokenizer(object):
START = object()
END = object()
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";",
- ":", "/", "-", "\n", END]
+ ":", "/", "-", "\n", START, END]
MAX_DEPTH = 40
MAX_CYCLES = 100000
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
@@ -311,6 +312,168 @@ class Tokenizer(object):
self._head += 1
return self._pop()
+ def _parse_bracketed_uri_scheme(self):
+ """Parse the URI scheme of a bracket-enclosed external link."""
+ self._push(contexts.EXT_LINK_URI)
+ if self._read() == self._read(1) == "/":
+ self._emit_text("//")
+ self._head += 2
+ else:
+ valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
+ all_valid = lambda: all(char in valid for char in self._read())
+ scheme = ""
+ while self._read() is not self.END and all_valid():
+ scheme += self._read()
+ self._emit_text(self._read())
+ self._head += 1
+ if self._read() != ":":
+ self._fail_route()
+ self._emit_text(":")
+ self._head += 1
+ slashes = self._read() == self._read(1) == "/"
+ if slashes:
+ self._emit_text("//")
+ self._head += 2
+ if not is_scheme(scheme, slashes):
+ self._fail_route()
+
+ def _parse_free_uri_scheme(self):
+ """Parse the URI scheme of a free (no brackets) external link."""
+ valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
+ scheme = []
+ try:
+ # We have to backtrack through the textbuffer looking for our
+ # scheme since it was just parsed as text:
+ for chunk in reversed(self._textbuffer):
+ for char in reversed(chunk):
+ if char.isspace() or char in self.MARKERS:
+ raise StopIteration()
+ if char not in valid:
+ raise BadRoute()
+ scheme.append(char)
+ except StopIteration:
+ pass
+ scheme = "".join(reversed(scheme))
+ slashes = self._read() == self._read(1) == "/"
+ if not is_scheme(scheme, slashes):
+ raise BadRoute()
+ self._push(contexts.EXT_LINK_URI)
+ self._emit_text(scheme)
+ self._emit_text(":")
+ if slashes:
+ self._emit_text("//")
+ self._head += 2
+
+ def _handle_free_link_text(self, punct, tail, this):
+ """Handle text in a free ext link, including trailing punctuation."""
+ if "(" in this and ")" in punct:
+            punct = punct[:-1]  # ')' is no longer valid punctuation
+ if this.endswith(punct):
+ for i in reversed(range(-len(this), 0)):
+ if i == -len(this) or this[i - 1] not in punct:
+ break
+ stripped = this[:i]
+ if stripped and tail:
+ self._emit_text(tail)
+ tail = ""
+ tail += this[i:]
+ this = stripped
+ elif tail:
+ self._emit_text(tail)
+ tail = ""
+ self._emit_text(this)
+ return punct, tail
+
+ def _really_parse_external_link(self, brackets):
+ """Really parse an external link."""
+ if brackets:
+ self._parse_bracketed_uri_scheme()
+ invalid = ("\n", " ", "]")
+ else:
+ self._parse_free_uri_scheme()
+ invalid = ("\n", " ", "[", "]")
+ punct = tuple(",;\.:!?)")
+ if self._read() is self.END or self._read()[0] in invalid:
+ self._fail_route()
+ tail = ""
+ while True:
+ this, next = self._read(), self._read(1)
+ if this is self.END or this == "\n":
+ if brackets:
+ self._fail_route()
+ return self._pop(), tail, -1
+ elif this == next == "{" and self._can_recurse():
+ if tail:
+ self._emit_text(tail)
+ tail = ""
+ self._parse_template_or_argument()
+ elif this == "[":
+ if brackets:
+ self._emit_text("[")
+ else:
+ return self._pop(), tail, -1
+ elif this == "]":
+ return self._pop(), tail, 0 if brackets else -1
+ elif this == "&":
+ if tail:
+ self._emit_text(tail)
+ tail = ""
+ self._parse_entity()
+ elif " " in this:
+ before, after = this.split(" ", 1)
+ if brackets:
+ self._emit_text(before)
+ self._emit(tokens.ExternalLinkSeparator())
+ if after:
+ self._emit_text(after)
+ self._context ^= contexts.EXT_LINK_URI
+ self._context |= contexts.EXT_LINK_TITLE
+ self._head += 1
+ return self._parse(push=False), None, 0
+ punct, tail = self._handle_free_link_text(punct, tail, before)
+ return self._pop(), tail + " " + after, 0
+ elif not brackets:
+ punct, tail = self._handle_free_link_text(punct, tail, this)
+ else:
+ self._emit_text(this)
+ self._head += 1
+
+ def _remove_uri_scheme_from_textbuffer(self, scheme):
+ """Remove the URI scheme of a new external link from the textbuffer."""
+ length = len(scheme)
+ while length:
+ if length < len(self._textbuffer[-1]):
+ self._textbuffer[-1] = self._textbuffer[-1][:-length]
+ break
+ length -= len(self._textbuffer[-1])
+ self._textbuffer.pop()
+
+ def _parse_external_link(self, brackets):
+ """Parse an external link at the head of the wikicode string."""
+ reset = self._head
+ self._head += 1
+ try:
+ bad_context = self._context & contexts.INVALID_LINK
+ if bad_context or not self._can_recurse():
+ raise BadRoute()
+ link, extra, delta = self._really_parse_external_link(brackets)
+ except BadRoute:
+ self._head = reset
+ if not brackets and self._context & contexts.DL_TERM:
+ self._handle_dl_term()
+ else:
+ self._emit_text(self._read())
+ else:
+ if not brackets:
+ scheme = link[0].text.split(":", 1)[0]
+ self._remove_uri_scheme_from_textbuffer(scheme)
+ self._emit(tokens.ExternalLinkOpen(brackets=brackets))
+ self._emit_all(link)
+ self._emit(tokens.ExternalLinkClose())
+ self._head += delta
+ if extra:
+ self._emit_text(extra)
+
def _parse_heading(self):
"""Parse a section heading at the head of the wikicode string."""
self._global |= contexts.GL_HEADING
@@ -810,12 +973,16 @@ class Tokenizer(object):
context = self._context
if context & contexts.FAIL_NEXT:
return False
- if context & contexts.WIKILINK_TITLE:
- if this == "]" or this == "{":
+ if context & contexts.WIKILINK:
+ if context & contexts.WIKILINK_TEXT:
+ return not (this == self._read(1) == "[")
+ elif this == "]" or this == "{":
self._context |= contexts.FAIL_NEXT
elif this == "\n" or this == "[" or this == "}":
return False
return True
+ elif context & contexts.EXT_LINK_TITLE:
+ return this != "\n"
elif context & contexts.TEMPLATE_NAME:
if this == "{" or this == "}" or this == "[":
self._context |= contexts.FAIL_NEXT
@@ -898,8 +1065,8 @@ class Tokenizer(object):
return self._handle_argument_end()
else:
self._emit_text("}")
- elif this == next == "[":
- if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
+ elif this == next == "[" and self._can_recurse():
+ if not self._context & contexts.INVALID_LINK:
self._parse_wikilink()
else:
self._emit_text("[")
@@ -907,6 +1074,12 @@ class Tokenizer(object):
self._handle_wikilink_separator()
elif this == next == "]" and self._context & contexts.WIKILINK:
return self._handle_wikilink_end()
+ elif this == "[":
+ self._parse_external_link(True)
+ elif this == ":" and self._read(-1) not in self.MARKERS:
+ self._parse_external_link(False)
+ elif this == "]" and self._context & contexts.EXT_LINK_TITLE:
+ return self._pop()
elif this == "=" and not self._global & contexts.GL_HEADING:
if self._read(-1) in ("\n", self.START):
self._parse_heading()
@@ -928,8 +1101,8 @@ class Tokenizer(object):
self._handle_tag_open_close()
else:
self._handle_invalid_tag_start()
- elif this == "<":
- if not self._context & contexts.TAG_CLOSE and self._can_recurse():
+ elif this == "<" and not self._context & contexts.TAG_CLOSE:
+ if self._can_recurse():
self._parse_tag()
else:
self._emit_text("<")
@@ -952,8 +1125,9 @@ class Tokenizer(object):
self._emit_text(this)
self._head += 1
- def tokenize(self, text):
+ def tokenize(self, text, context=0):
"""Build a list of tokens from a string of wikicode and return it."""
split = self.regex.split(text)
self._text = [segment for segment in split if segment]
- return self._parse()
+ self._head = self._global = self._depth = self._cycles = 0
+ return self._parse(context)
diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py
index 0ffac86..57308ea 100644
--- a/mwparserfromhell/parser/tokens.py
+++ b/mwparserfromhell/parser/tokens.py
@@ -84,6 +84,10 @@ WikilinkOpen = make("WikilinkOpen") # [[
WikilinkSeparator = make("WikilinkSeparator") # |
WikilinkClose = make("WikilinkClose") # ]]
+ExternalLinkOpen = make("ExternalLinkOpen") # [
+ExternalLinkSeparator = make("ExternalLinkSeparator") #
+ExternalLinkClose = make("ExternalLinkClose") # ]
+
HTMLEntityStart = make("HTMLEntityStart") # &
HTMLEntityNumeric = make("HTMLEntityNumeric") # #
HTMLEntityHex = make("HTMLEntityHex") # x
diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py
index 31e5ba0..758e751 100644
--- a/mwparserfromhell/utils.py
+++ b/mwparserfromhell/utils.py
@@ -33,7 +33,7 @@ from .smart_list import SmartList
__all__ = ["parse_anything"]
-def parse_anything(value):
+def parse_anything(value, context=0):
"""Return a :py:class:`~.Wikicode` for *value*, allowing multiple types.
This differs from :py:meth:`.Parser.parse` in that we accept more than just
@@ -44,6 +44,12 @@ def parse_anything(value):
on-the-fly by various methods of :py:class:`~.Wikicode` and others like
:py:class:`~.Template`, such as :py:meth:`wikicode.insert()
<.Wikicode.insert>` or setting :py:meth:`template.name <.Template.name>`.
+
+ If given, *context* will be passed as a starting context to the parser.
+ This is helpful when this function is used inside node attribute setters.
+ For example, :py:class:`~.ExternalLink`\ 's :py:attr:`~.ExternalLink.url`
+ setter sets *context* to :py:mod:`contexts.EXT_LINK_URI <.contexts>` to
+ prevent the URL itself from becoming an :py:class:`~.ExternalLink`.
"""
from .parser import Parser
from .wikicode import Wikicode
@@ -53,17 +59,17 @@ def parse_anything(value):
elif isinstance(value, Node):
return Wikicode(SmartList([value]))
elif isinstance(value, str):
- return Parser(value).parse()
+ return Parser().parse(value, context)
elif isinstance(value, bytes):
- return Parser(value.decode("utf8")).parse()
+ return Parser().parse(value.decode("utf8"), context)
elif isinstance(value, int):
- return Parser(str(value)).parse()
+ return Parser().parse(str(value), context)
elif value is None:
return Wikicode(SmartList())
try:
nodelist = SmartList()
for item in value:
- nodelist += parse_anything(item).nodes
+ nodelist += parse_anything(item, context).nodes
except TypeError:
error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}"
raise ValueError(error.format(type(value).__name__, value))
diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py
index b5e854d..c3249d9 100644
--- a/mwparserfromhell/wikicode.py
+++ b/mwparserfromhell/wikicode.py
@@ -24,8 +24,8 @@ from __future__ import unicode_literals
import re
from .compat import maxsize, py3k, str
-from .nodes import (Argument, Comment, Heading, HTMLEntity, Node, Tag,
- Template, Text, Wikilink)
+from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity,
+ Node, Tag, Template, Text, Wikilink)
from .string_mixin import StringMixIn
from .utils import parse_anything
@@ -509,6 +509,6 @@ class Wikicode(StringMixIn):
return "\n".join(self._get_tree(self, [], marker, 0))
Wikicode._build_filter_methods(
- arguments=Argument, comments=Comment, headings=Heading,
- html_entities=HTMLEntity, tags=Tag, templates=Template, text=Text,
- wikilinks=Wikilink)
+ arguments=Argument, comments=Comment, external_links=ExternalLink,
+ headings=Heading, html_entities=HTMLEntity, tags=Tag, templates=Template,
+ text=Text, wikilinks=Wikilink)
diff --git a/tests/test_builder.py b/tests/test_builder.py
index 29ae65a..152ab53 100644
--- a/tests/test_builder.py
+++ b/tests/test_builder.py
@@ -23,8 +23,8 @@
from __future__ import unicode_literals
import unittest
-from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity,
- Tag, Template, Text, Wikilink)
+from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading,
+ HTMLEntity, Tag, Template, Text, Wikilink)
from mwparserfromhell.nodes.extras import Attribute, Parameter
from mwparserfromhell.parser import tokens
from mwparserfromhell.parser.builder import Builder
@@ -150,6 +150,48 @@ class TestBuilder(TreeEqualityTestCase):
for test, valid in tests:
self.assertWikicodeEqual(valid, self.builder.build(test))
+ def test_external_link(self):
+ """tests for building ExternalLink nodes"""
+ tests = [
+ ([tokens.ExternalLinkOpen(brackets=False),
+ tokens.Text(text="http://example.com/"),
+ tokens.ExternalLinkClose()],
+ wrap([ExternalLink(wraptext("http://example.com/"),
+ brackets=False)])),
+
+ ([tokens.ExternalLinkOpen(brackets=True),
+ tokens.Text(text="http://example.com/"),
+ tokens.ExternalLinkClose()],
+ wrap([ExternalLink(wraptext("http://example.com/"))])),
+
+ ([tokens.ExternalLinkOpen(brackets=True),
+ tokens.Text(text="http://example.com/"),
+ tokens.ExternalLinkSeparator(), tokens.ExternalLinkClose()],
+ wrap([ExternalLink(wraptext("http://example.com/"), wrap([]))])),
+
+ ([tokens.ExternalLinkOpen(brackets=True),
+ tokens.Text(text="http://example.com/"),
+ tokens.ExternalLinkSeparator(), tokens.Text(text="Example"),
+ tokens.ExternalLinkClose()],
+ wrap([ExternalLink(wraptext("http://example.com/"),
+ wraptext("Example"))])),
+
+ ([tokens.ExternalLinkOpen(brackets=False),
+ tokens.Text(text="http://example"), tokens.Text(text=".com/foo"),
+ tokens.ExternalLinkClose()],
+ wrap([ExternalLink(wraptext("http://example", ".com/foo"),
+ brackets=False)])),
+
+ ([tokens.ExternalLinkOpen(brackets=True),
+ tokens.Text(text="http://example"), tokens.Text(text=".com/foo"),
+ tokens.ExternalLinkSeparator(), tokens.Text(text="Example"),
+ tokens.Text(text=" Web Page"), tokens.ExternalLinkClose()],
+ wrap([ExternalLink(wraptext("http://example", ".com/foo"),
+ wraptext("Example", " Web Page"))])),
+ ]
+ for test, valid in tests:
+ self.assertWikicodeEqual(valid, self.builder.build(test))
+
def test_html_entity(self):
"""tests for building HTMLEntity nodes"""
tests = [
diff --git a/tests/test_external_link.py b/tests/test_external_link.py
new file mode 100644
index 0000000..13a82bf
--- /dev/null
+++ b/tests/test_external_link.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2012-2013 Ben Kurtovic
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from __future__ import unicode_literals
+import unittest
+
+from mwparserfromhell.compat import str
+from mwparserfromhell.nodes import ExternalLink, Text
+
+from ._test_tree_equality import TreeEqualityTestCase, getnodes, wrap, wraptext
+
+class TestExternalLink(TreeEqualityTestCase):
+ """Test cases for the ExternalLink node."""
+
+ def test_unicode(self):
+ """test ExternalLink.__unicode__()"""
+ node = ExternalLink(wraptext("http://example.com/"), brackets=False)
+ self.assertEqual("http://example.com/", str(node))
+ node2 = ExternalLink(wraptext("http://example.com/"))
+ self.assertEqual("[http://example.com/]", str(node2))
+ node3 = ExternalLink(wraptext("http://example.com/"), wrap([]))
+ self.assertEqual("[http://example.com/ ]", str(node3))
+ node4 = ExternalLink(wraptext("http://example.com/"),
+ wraptext("Example Web Page"))
+ self.assertEqual("[http://example.com/ Example Web Page]", str(node4))
+
+ def test_iternodes(self):
+ """test ExternalLink.__iternodes__()"""
+ node1n1 = Text("http://example.com/")
+ node2n1 = Text("http://example.com/")
+ node2n2, node2n3 = Text("Example"), Text("Page")
+ node1 = ExternalLink(wrap([node1n1]), brackets=False)
+ node2 = ExternalLink(wrap([node2n1]), wrap([node2n2, node2n3]))
+ gen1 = node1.__iternodes__(getnodes)
+ gen2 = node2.__iternodes__(getnodes)
+ self.assertEqual((None, node1), next(gen1))
+ self.assertEqual((None, node2), next(gen2))
+ self.assertEqual((node1.url, node1n1), next(gen1))
+ self.assertEqual((node2.url, node2n1), next(gen2))
+ self.assertEqual((node2.title, node2n2), next(gen2))
+ self.assertEqual((node2.title, node2n3), next(gen2))
+ self.assertRaises(StopIteration, next, gen1)
+ self.assertRaises(StopIteration, next, gen2)
+
+ def test_strip(self):
+ """test ExternalLink.__strip__()"""
+ node1 = ExternalLink(wraptext("http://example.com"), brackets=False)
+ node2 = ExternalLink(wraptext("http://example.com"))
+ node3 = ExternalLink(wraptext("http://example.com"), wrap([]))
+ node4 = ExternalLink(wraptext("http://example.com"), wraptext("Link"))
+ for a in (True, False):
+ for b in (True, False):
+ self.assertEqual("http://example.com", node1.__strip__(a, b))
+ self.assertEqual(None, node2.__strip__(a, b))
+ self.assertEqual(None, node3.__strip__(a, b))
+ self.assertEqual("Link", node4.__strip__(a, b))
+
+ def test_showtree(self):
+ """test ExternalLink.__showtree__()"""
+ output = []
+ getter, marker = object(), object()
+ get = lambda code: output.append((getter, code))
+ mark = lambda: output.append(marker)
+ node1 = ExternalLink(wraptext("http://example.com"), brackets=False)
+ node2 = ExternalLink(wraptext("http://example.com"), wraptext("Link"))
+ node1.__showtree__(output.append, get, mark)
+ node2.__showtree__(output.append, get, mark)
+ valid = [
+ (getter, node1.url), "[", (getter, node2.url),
+ (getter, node2.title), "]"]
+ self.assertEqual(valid, output)
+
+ def test_url(self):
+ """test getter/setter for the url attribute"""
+ url = wraptext("http://example.com/")
+ node1 = ExternalLink(url, brackets=False)
+ node2 = ExternalLink(url, wraptext("Example"))
+ self.assertIs(url, node1.url)
+ self.assertIs(url, node2.url)
+ node1.url = "mailto:héhehé@spam.com"
+ node2.url = "mailto:héhehé@spam.com"
+ self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"), node1.url)
+ self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"), node2.url)
+
+ def test_title(self):
+ """test getter/setter for the title attribute"""
+ title = wraptext("Example!")
+ node1 = ExternalLink(wraptext("http://example.com/"), brackets=False)
+ node2 = ExternalLink(wraptext("http://example.com/"), title)
+ self.assertIs(None, node1.title)
+ self.assertIs(title, node2.title)
+ node2.title = None
+ self.assertIs(None, node2.title)
+ node2.title = "My Website"
+ self.assertWikicodeEqual(wraptext("My Website"), node2.title)
+
+ def test_brackets(self):
+ """test getter/setter for the brackets attribute"""
+ node1 = ExternalLink(wraptext("http://example.com/"), brackets=False)
+ node2 = ExternalLink(wraptext("http://example.com/"), wraptext("Link"))
+ self.assertFalse(node1.brackets)
+ self.assertTrue(node2.brackets)
+ node1.brackets = True
+ node2.brackets = False
+ self.assertTrue(node1.brackets)
+ self.assertFalse(node2.brackets)
+ self.assertEqual("[http://example.com/]", str(node1))
+ self.assertEqual("http://example.com/", str(node2))
+
+if __name__ == "__main__":
+ unittest.main(verbosity=2)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index ec5f065..8760c0e 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -36,9 +36,9 @@ class TestParser(TreeEqualityTestCase):
def test_use_c(self):
"""make sure the correct tokenizer is used"""
if parser.use_c:
- self.assertTrue(parser.Parser(None)._tokenizer.USES_C)
+ self.assertTrue(parser.Parser()._tokenizer.USES_C)
parser.use_c = False
- self.assertFalse(parser.Parser(None)._tokenizer.USES_C)
+ self.assertFalse(parser.Parser()._tokenizer.USES_C)
def test_parsing(self):
"""integration test for parsing overall"""
@@ -59,7 +59,7 @@ class TestParser(TreeEqualityTestCase):
]))
])
])
- actual = parser.Parser(text).parse()
+ actual = parser.Parser().parse(text)
self.assertWikicodeEqual(expected, actual)
if __name__ == "__main__":
diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py
index 08cf93c..14d801c 100644
--- a/tests/test_wikicode.py
+++ b/tests/test_wikicode.py
@@ -276,6 +276,7 @@ class TestWikicode(TreeEqualityTestCase):
self.assertEqual(["{{{e}}}"], get_filter("arguments"))
self.assertIs(code.get(4), get_filter("arguments")[0])
self.assertEqual([], get_filter("comments"))
+ self.assertEqual([], get_filter("external_links"))
self.assertEqual([], get_filter("headings"))
self.assertEqual([], get_filter("html_entities"))
self.assertEqual([], get_filter("tags"))
diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest
new file mode 100644
index 0000000..af7a570
--- /dev/null
+++ b/tests/tokenizer/external_links.mwtest
@@ -0,0 +1,473 @@
+name: basic
+label: basic external link
+input: "http://example.com/"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose()]
+
+---
+
+name: basic_brackets
+label: basic external link in brackets
+input: "[http://example.com/]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose()]
+
+---
+
+name: brackets_space
+label: basic external link in brackets, with a space after
+input: "[http://example.com/ ]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), ExternalLinkClose()]
+
+---
+
+name: brackets_title
+label: basic external link in brackets, with a title
+input: "[http://example.com/ Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: brackets_multiword_title
+label: basic external link in brackets, with a multi-word title
+input: "[http://example.com/ Example Web Page]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example Web Page"), ExternalLinkClose()]
+
+---
+
+name: brackets_adjacent
+label: three adjacent bracket-enclosed external links
+input: "[http://foo.com/ Foo][http://bar.com/ Bar]\n[http://baz.com/ Baz]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://foo.com/"), ExternalLinkSeparator(), Text(text="Foo"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://bar.com/"), ExternalLinkSeparator(), Text(text="Bar"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=True), Text(text="http://baz.com/"), ExternalLinkSeparator(), Text(text="Baz"), ExternalLinkClose()]
+
+---
+
+name: brackets_newline_before
+label: bracket-enclosed link with a newline before the title
+input: "[http://example.com/ \nExample]"
+output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" \nExample]")]
+
+---
+
+name: brackets_newline_inside
+label: bracket-enclosed link with a newline in the title
+input: "[http://example.com/ Example \nWeb Page]"
+output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example \nWeb Page]")]
+
+---
+
+name: brackets_newline_after
+label: bracket-enclosed link with a newline after the title
+input: "[http://example.com/ Example\n]"
+output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example\n]")]
+
+---
+
+name: brackets_space_before
+label: bracket-enclosed link with a space before the URL
+input: "[ http://example.com Example]"
+output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" Example]")]
+
+---
+
+name: brackets_title_like_url
+label: bracket-enclosed link with a title that looks like a URL
+input: "[http://example.com http://example.com]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="http://example.com"), ExternalLinkClose()]
+
+---
+
+name: brackets_recursive
+label: bracket-enclosed link with a bracket-enclosed link as the title
+input: "[http://example.com [http://example.com]]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="[http://example.com"), ExternalLinkClose(), Text(text="]")]
+
+---
+
+name: period_after
+label: a period after a free link that is excluded
+input: "http://example.com."
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=".")]
+
+---
+
+name: colons_after
+label: colons after a free link that are excluded
+input: "http://example.com/foo:bar.:;baz!?,"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo:bar.:;baz"), ExternalLinkClose(), Text(text="!?,")]
+
+---
+
+name: close_paren_after_excluded
+label: a closing parenthesis after a free link that is excluded
+input: "http://example.)com)"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.)com"), ExternalLinkClose(), Text(text=")")]
+
+---
+
+name: close_paren_after_included
+label: a closing parenthesis after a free link that is included because of an opening parenthesis in the URL
+input: "http://example.(com)"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.(com)"), ExternalLinkClose()]
+
+---
+
+name: open_bracket_inside
+label: an open bracket inside a free link that causes it to be ended abruptly
+input: "http://foobar[baz.com"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://foobar"), ExternalLinkClose(), Text(text="[baz.com")]
+
+---
+
+name: brackets_period_after
+label: a period after a bracket-enclosed link that is included
+input: "[http://example.com. Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com."), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: brackets_colons_after
+label: colons after a bracket-enclosed link that are included
+input: "[http://example.com/foo:bar.:;baz!?, Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo:bar.:;baz!?,"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: brackets_close_paren_after_included
+label: a closing parenthesis after a bracket-enclosed link that is included
+input: "[http://example.)com) Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.)com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: brackets_close_paren_after_included_2
+label: a closing parenthesis after a bracket-enclosed link that is also included
+input: "[http://example.(com) Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: brackets_open_bracket_inside
+label: an open bracket inside a bracket-enclosed link that is also included
+input: "[http://foobar[baz.com Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: adjacent_space
+label: two free links separated by a space
+input: "http://example.com http://example.com"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()]
+
+---
+
+name: adjacent_newline
+label: two free links separated by a newline
+input: "http://example.com\nhttp://example.com"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()]
+
+---
+
+name: adjacent_close_bracket
+label: two free links separated by a close bracket
+input: "http://example.com]http://example.com"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="]"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()]
+
+---
+
+name: html_entity_in_url
+label: an HTML entity parsed correctly inside a free link
+input: "http://exa mple.com/"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="mple.com/"), ExternalLinkClose()]
+
+---
+
+name: template_in_url
+label: a template parsed correctly inside a free link
+input: "http://exa{{template}}mple.com/"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), TemplateOpen(), Text(text="template"), TemplateClose(), Text(text="mple.com/"), ExternalLinkClose()]
+
+---
+
+name: argument_in_url
+label: an argument parsed correctly inside a free link
+input: "http://exa{{{argument}}}mple.com/"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ArgumentOpen(), Text(text="argument"), ArgumentClose(), Text(text="mple.com/"), ExternalLinkClose()]
+
+---
+
+name: wikilink_in_url
+label: a wikilink that destroys a free link
+input: "http://exa[[wikilink]]mple.com/"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ExternalLinkClose(), WikilinkOpen(), Text(text="wikilink"), WikilinkClose(), Text(text="mple.com/")]
+
+---
+
+name: external_link_in_url
+label: a bracketed link that destroys a free link
+input: "http://exa[http://example.com/]mple.com/"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose(), Text(text="mple.com/")]
+
+---
+
+name: spaces_padding
+label: spaces padding a free link
+input: " http://example.com "
+output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" ")]
+
+---
+
+name: text_and_spaces_padding
+label: text and spaces padding a free link
+input: "x http://example.com x"
+output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" x")]
+
+---
+
+name: template_before
+label: a template before a free link
+input: "{{foo}}http://example.com"
+output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()]
+
+---
+
+name: spaces_padding_no_slashes
+label: spaces padding a free link with no slashes after the colon
+input: " mailto:example@example.com "
+output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" ")]
+
+---
+
+name: text_and_spaces_padding_no_slashes
+label: text and spaces padding a free link with no slashes after the colon
+input: "x mailto:example@example.com x"
+output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" x")]
+
+---
+
+name: template_before_no_slashes
+label: a template before a free link with no slashes after the colon
+input: "{{foo}}mailto:example@example.com"
+output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()]
+
+---
+
+name: no_slashes
+label: a free link with no slashes after the colon
+input: "mailto:example@example.com"
+output: [ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()]
+
+---
+
+name: slashes_optional
+label: a free link using a scheme that doesn't need slashes, but has them anyway
+input: "mailto://example@example.com"
+output: [ExternalLinkOpen(brackets=False), Text(text="mailto://example@example.com"), ExternalLinkClose()]
+
+---
+
+name: short
+label: a very short free link
+input: "mailto://abc"
+output: [ExternalLinkOpen(brackets=False), Text(text="mailto://abc"), ExternalLinkClose()]
+
+---
+
+name: slashes_missing
+label: slashes missing from a free link with a scheme that requires them
+input: "http:example@example.com"
+output: [Text(text="http:example@example.com")]
+
+---
+
+name: no_scheme_but_slashes
+label: no scheme in a free link, but slashes (protocol-relative free links are not supported)
+input: "//example.com"
+output: [Text(text="//example.com")]
+
+---
+
+name: no_scheme_but_colon
+label: no scheme in a free link, but a colon
+input: " :example.com"
+output: [Text(text=" :example.com")]
+
+---
+
+name: no_scheme_but_colon_and_slashes
+label: no scheme in a free link, but a colon and slashes
+input: " ://example.com"
+output: [Text(text=" ://example.com")]
+
+---
+
+name: fake_scheme_no_slashes
+label: a nonexistent scheme in a free link, without slashes
+input: "fake:example.com"
+output: [Text(text="fake:example.com")]
+
+---
+
+name: fake_scheme_slashes
+label: a nonexistent scheme in a free link, with slashes
+input: "fake://example.com"
+output: [Text(text="fake://example.com")]
+
+---
+
+name: fake_scheme_brackets_no_slashes
+label: a nonexistent scheme in a bracketed link, without slashes
+input: "[fake:example.com]"
+output: [Text(text="[fake:example.com]")]
+
+---
+
+name: fake_scheme_brackets_slashes
+label: a nonexistent scheme in a bracketed link, with slashes
+input: "[fake://example.com]"
+output: [Text(text="[fake://example.com]")]
+
+---
+
+name: interrupted_scheme
+label: an otherwise valid scheme with something in the middle of it, in a free link
+input: "ht?tp://example.com"
+output: [Text(text="ht?tp://example.com")]
+
+---
+
+name: interrupted_scheme_brackets
+label: an otherwise valid scheme with something in the middle of it, in a bracketed link
+input: "[ht?tp://example.com]"
+output: [Text(text="[ht?tp://example.com]")]
+
+---
+
+name: no_slashes_brackets
+label: no slashes after the colon in a bracketed link
+input: "[mailto:example@example.com Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="mailto:example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: space_before_no_slashes_brackets
+label: a space before a bracketed link with no slashes after the colon
+input: "[ mailto:example@example.com Example]"
+output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" Example]")]
+
+---
+
+name: slashes_optional_brackets
+label: a bracketed link using a scheme that doesn't need slashes, but has them anyway
+input: "[mailto://example@example.com Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="mailto://example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: short_brackets
+label: a very short link in brackets
+input: "[mailto://abc Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="mailto://abc"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: slashes_missing_brackets
+label: slashes missing from a scheme that requires them in a bracketed link
+input: "[http:example@example.com Example]"
+output: [Text(text="[http:example@example.com Example]")]
+
+---
+
+name: protocol_relative
+label: a protocol-relative link (in brackets)
+input: "[//example.com Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="//example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: scheme_missing_but_colon_brackets
+label: scheme missing from a bracketed link, but with a colon
+input: "[:example.com Example]"
+output: [Text(text="[:example.com Example]")]
+
+---
+
+name: scheme_missing_but_colon_slashes_brackets
+label: scheme missing from a bracketed link, but with a colon and slashes
+input: "[://example.com Example]"
+output: [Text(text="[://example.com Example]")]
+
+---
+
+name: unclosed_protocol_relative
+label: an unclosed protocol-relative bracketed link
+input: "[//example.com"
+output: [Text(text="[//example.com")]
+
+---
+
+name: space_before_protocol_relative
+label: a space before a protocol-relative bracketed link
+input: "[ //example.com]"
+output: [Text(text="[ //example.com]")]
+
+---
+
+name: unclosed_just_scheme
+label: an unclosed bracketed link, ending after the scheme
+input: "[http"
+output: [Text(text="[http")]
+
+---
+
+name: unclosed_scheme_colon
+label: an unclosed bracketed link, ending after the colon
+input: "[http:"
+output: [Text(text="[http:")]
+
+---
+
+name: unclosed_scheme_colon_slashes
+label: an unclosed bracketed link, ending after the slashes
+input: "[http://"
+output: [Text(text="[http://")]
+
+---
+
+name: incomplete_bracket
+label: just an open bracket
+input: "["
+output: [Text(text="[")]
+
+---
+
+name: incomplete_scheme_colon
+label: a free link with just a scheme and a colon
+input: "http:"
+output: [Text(text="http:")]
+
+---
+
+name: incomplete_scheme_colon_slashes
+label: a free link with just a scheme, colon, and slashes
+input: "http://"
+output: [Text(text="http://")]
+
+---
+
+name: brackets_scheme_but_no_url
+label: brackets around a scheme and a colon
+input: "[mailto:]"
+output: [Text(text="[mailto:]")]
+
+---
+
+name: brackets_scheme_slashes_but_no_url
+label: brackets around a scheme, colon, and slashes
+input: "[http://]"
+output: [Text(text="[http://]")]
+
+---
+
+name: brackets_scheme_title_but_no_url
+label: brackets around a scheme, colon, and slashes, with a title
+input: "[http:// Example]"
+output: [Text(text="[http:// Example]")]
diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest
index 0277a51..083b12c 100644
--- a/tests/tokenizer/integration.mwtest
+++ b/tests/tokenizer/integration.mwtest
@@ -12,6 +12,13 @@ output: [TemplateOpen(), ArgumentOpen(), ArgumentOpen(), Text(text="foo"), Argum
---
+name: link_in_template_name
+label: a wikilink inside a template name, which breaks the template
+input: "{{foo[[bar]]}}"
+output: [Text(text="{{foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="}}")]
+
+---
+
name: rich_heading
label: a heading with templates/wikilinks in it
input: "== Head{{ing}} [[with]] {{{funky|{{stuf}}}}} =="
@@ -51,3 +58,17 @@ name: wildcard_redux
label: an even wilder assortment of various things
input: "{{a|b|{{c|[[d]]{{{e}}}}}}}[[f|{{{g}}}]]{{i|j= }}"
output: [TemplateOpen(), Text(text="a"), TemplateParamSeparator(), Text(text="b"), TemplateParamSeparator(), TemplateOpen(), Text(text="c"), TemplateParamSeparator(), WikilinkOpen(), Text(text="d"), WikilinkClose(), ArgumentOpen(), Text(text="e"), ArgumentClose(), TemplateClose(), TemplateClose(), WikilinkOpen(), Text(text="f"), WikilinkSeparator(), ArgumentOpen(), Text(text="g"), ArgumentClose(), CommentStart(), Text(text="h"), CommentEnd(), WikilinkClose(), TemplateOpen(), Text(text="i"), TemplateParamSeparator(), Text(text="j"), TemplateParamEquals(), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), TemplateClose()]
+
+---
+
+name: link_inside_dl
+label: an external link inside a def list, such that the external link is parsed
+input: ";;;mailto:example"
+output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example"), ExternalLinkClose()]
+
+---
+
+name: link_inside_dl_2
+label: an external link inside a def list, such that the external link is not parsed
+input: ";;;malito:example"
+output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")]
diff --git a/tests/tokenizer/wikilinks.mwtest b/tests/tokenizer/wikilinks.mwtest
index 0682ef1..8eb381a 100644
--- a/tests/tokenizer/wikilinks.mwtest
+++ b/tests/tokenizer/wikilinks.mwtest
@@ -40,17 +40,17 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar|b
---
-name: nested
-label: a wikilink nested within the value of another
-input: "[[foo|[[bar]]]]"
-output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()]
+name: newline_text
+label: a newline in the middle of the text
+input: "[[foo|foo\nbar]]"
+output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="foo\nbar"), WikilinkClose()]
---
-name: nested_with_text
-label: a wikilink nested within the value of another, separated by other data
-input: "[[foo|a[[b]]c]]"
-output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()]
+name: bracket_text
+label: a left bracket in the middle of the text
+input: "[[foo|bar[baz]]"
+output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[baz"), WikilinkClose()]
---
@@ -96,13 +96,34 @@ output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(),
---
-name: invalid_nested_text
+name: invalid_nested_padding
label: invalid wikilink: trying to nest in the wrong context, with a text param
input: "[[foo[[bar]]|baz]]"
output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="|baz]]")]
---
+name: invalid_nested_text
+label: invalid wikilink: a wikilink nested within the value of another
+input: "[[foo|[[bar]]"
+output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()]
+
+---
+
+name: invalid_nested_text_2
+label: invalid wikilink: a wikilink nested within the value of another, two pairs of closing brackets
+input: "[[foo|[[bar]]]]"
+output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")]
+
+---
+
+name: invalid_nested_text_padding
+label: invalid wikilink: a wikilink nested within the value of another, separated by other data
+input: "[[foo|a[[b]]c]]"
+output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")]
+
+---
+
name: incomplete_open_only
label: incomplete wikilinks: just an open
input: "[["