@@ -1,8 +1,10 @@ | |||
v0.3 (unreleased): | |||
- Added complete support for HTML Tags, along with appropriate unit tests. This | |||
includes forms like <ref>foo</ref>, <ref name="bar"/>, and wiki-markup tags | |||
like bold ('''), italics (''), and lists (*, #, ; and :). | |||
- Added complete support for HTML Tags, including forms like <ref>foo</ref>, | |||
<ref name="bar"/>, and wiki-markup tags like bold ('''), italics (''), and | |||
lists (*, #, ; and :). | |||
- Added support for ExternalLinks (http://example.com/ and | |||
[http://example.com/ Example]). | |||
- Wikicode's filter methods are now passed 'recursive=True' by default instead | |||
of False. This is a breaking change if you rely on any filter() methods being | |||
non-recursive by default. | |||
@@ -14,7 +16,7 @@ v0.3 (unreleased): | |||
- Renamed Template.has_param() to has() for consistency with Template's other | |||
methods; has_param() is now an alias. | |||
- The C tokenizer extension now works on Python 3 in addition to Python 2.7. | |||
- Various fixes and cleanup. | |||
- Various bugfixes, internal changes, and cleanup. | |||
v0.2 (released June 20, 2013): | |||
@@ -25,6 +25,14 @@ nodes Package | |||
:undoc-members: | |||
:show-inheritance: | |||
:mod:`external_link` Module | |||
--------------------------- | |||
.. automodule:: mwparserfromhell.nodes.external_link | |||
:members: | |||
:undoc-members: | |||
:show-inheritance: | |||
:mod:`heading` Module | |||
--------------------- | |||
@@ -30,10 +30,10 @@ mwparserfromhell Package | |||
:members: | |||
:undoc-members: | |||
:mod:`tag_defs` Module | |||
:mod:`definitions` Module | |||
------------------------- | |||
.. automodule:: mwparserfromhell.tag_defs | |||
.. automodule:: mwparserfromhell.definitions | |||
:members: | |||
:mod:`utils` Module | |||
@@ -7,10 +7,11 @@ v0.3 | |||
Unreleased | |||
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.2...develop>`__): | |||
- Added complete support for HTML :py:class:`Tags <.Tag>`, along with | |||
appropriate unit tests. This includes forms like ``<ref>foo</ref>``, | |||
``<ref name="bar"/>``, and wiki-markup tags like bold (``'''``), italics | |||
(``''``), and lists (``*``, ``#``, ``;`` and ``:``). | |||
- Added complete support for HTML :py:class:`Tags <.Tag>`, including forms like | |||
``<ref>foo</ref>``, ``<ref name="bar"/>``, and wiki-markup tags like bold | |||
(``'''``), italics (``''``), and lists (``*``, ``#``, ``;`` and ``:``). | |||
- Added support for :py:class:`.ExternalLink`\ s (``http://example.com/`` and | |||
``[http://example.com/ Example]``). | |||
- :py:class:`Wikicode's <.Wikicode>` :py:meth:`.filter` methods are now passed | |||
*recursive=True* by default instead of *False*. **This is a breaking change | |||
if you rely on any filter() methods being non-recursive by default.** | |||
@@ -25,7 +26,7 @@ Unreleased | |||
:py:meth:`~.Template.has` for consistency with :py:class:`~.Template`\ 's | |||
other methods; :py:meth:`~.has_param` is now an alias. | |||
- The C tokenizer extension now works on Python 3 in addition to Python 2.7. | |||
- Various fixes and cleanup. | |||
- Various bugfixes, internal changes, and cleanup. | |||
v0.2 | |||
---- | |||
@@ -34,6 +34,7 @@ __license__ = "MIT License" | |||
__version__ = "0.3.dev" | |||
__email__ = "ben.kurtovic@verizon.net" | |||
from . import compat, nodes, parser, smart_list, string_mixin, utils, wikicode | |||
from . import (compat, definitions, nodes, parser, smart_list, string_mixin, | |||
utils, wikicode) | |||
parse = utils.parse_anything |
@@ -20,12 +20,22 @@ | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
"""Contains data regarding certain HTML tags.""" | |||
"""Contains data about certain markup, like HTML tags and external links.""" | |||
from __future__ import unicode_literals | |||
__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", | |||
"is_single_only"] | |||
"is_single_only", "is_scheme"] | |||
URI_SCHEMES = { | |||
# [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943 | |||
"http": True, "https": True, "ftp": True, "ftps": True, "ssh": True, | |||
"sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False, | |||
"sips": False, "gopher": True, "telnet": True, "nntp": True, | |||
"worldwind": True, "mailto": False, "tel": False, "sms": False, | |||
"news": False, "svn": True, "git": True, "mms": True, "bitcoin": False, | |||
"magnet": False, "urn": False, "geo": False | |||
} | |||
PARSER_BLACKLIST = [ | |||
# enwiki extensions @ 2013-06-28 | |||
@@ -70,3 +80,12 @@ def is_single(tag): | |||
def is_single_only(tag): | |||
"""Return whether or not the given *tag* must exist without a close tag.""" | |||
return tag.lower() in SINGLE_ONLY | |||
def is_scheme(scheme, slashes=True, reverse=False): | |||
"""Return whether *scheme* is valid for external links.""" | |||
if reverse: # Convenience for C | |||
scheme = scheme[::-1] | |||
scheme = scheme.lower() | |||
if slashes: | |||
return scheme in URI_SCHEMES | |||
return scheme in URI_SCHEMES and not URI_SCHEMES[scheme] |
@@ -69,6 +69,7 @@ from . import extras | |||
from .text import Text | |||
from .argument import Argument | |||
from .comment import Comment | |||
from .external_link import ExternalLink | |||
from .heading import Heading | |||
from .html_entity import HTMLEntity | |||
from .tag import Tag | |||
@@ -0,0 +1,97 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
from . import Node | |||
from ..compat import str | |||
from ..utils import parse_anything | |||
__all__ = ["ExternalLink"] | |||
class ExternalLink(Node): | |||
"""Represents an external link, like ``[http://example.com/ Example]``.""" | |||
def __init__(self, url, title=None, brackets=True): | |||
super(ExternalLink, self).__init__() | |||
self._url = url | |||
self._title = title | |||
self._brackets = brackets | |||
def __unicode__(self): | |||
if self.brackets: | |||
if self.title is not None: | |||
return "[" + str(self.url) + " " + str(self.title) + "]" | |||
return "[" + str(self.url) + "]" | |||
return str(self.url) | |||
def __iternodes__(self, getter): | |||
yield None, self | |||
for child in getter(self.url): | |||
yield self.url, child | |||
if self.title is not None: | |||
for child in getter(self.title): | |||
yield self.title, child | |||
def __strip__(self, normalize, collapse): | |||
if self.brackets: | |||
if self.title: | |||
return self.title.strip_code(normalize, collapse) | |||
return None | |||
return self.url.strip_code(normalize, collapse) | |||
def __showtree__(self, write, get, mark): | |||
if self.brackets: | |||
write("[") | |||
get(self.url) | |||
if self.title is not None: | |||
get(self.title) | |||
if self.brackets: | |||
write("]") | |||
@property | |||
def url(self): | |||
"""The URL of the link target, as a :py:class:`~.Wikicode` object.""" | |||
return self._url | |||
@property | |||
def title(self): | |||
"""The link title (if given), as a :py:class:`~.Wikicode` object.""" | |||
return self._title | |||
@property | |||
def brackets(self): | |||
"""Whether to enclose the URL in brackets or display it straight.""" | |||
return self._brackets | |||
@url.setter | |||
def url(self, value): | |||
from ..parser import contexts | |||
self._url = parse_anything(value, contexts.EXT_LINK_URI) | |||
@title.setter | |||
def title(self, value): | |||
self._title = None if value is None else parse_anything(value) | |||
@brackets.setter | |||
def brackets(self, value): | |||
self._brackets = bool(value) |
@@ -25,7 +25,7 @@ from __future__ import unicode_literals | |||
from . import Node, Text | |||
from .extras import Attribute | |||
from ..compat import str | |||
from ..tag_defs import is_visible | |||
from ..definitions import is_visible | |||
from ..utils import parse_anything | |||
__all__ = ["Tag"] | |||
@@ -152,7 +152,7 @@ class Tag(Node): | |||
This makes the tag look like a lone close tag. It is technically | |||
invalid and is only parsable Wikicode when the tag itself is | |||
single-only, like ``<br>`` and ``<img>``. See | |||
:py:func:`.tag_defs.is_single_only`. | |||
:py:func:`.definitions.is_single_only`. | |||
""" | |||
return self._invalid | |||
@@ -161,7 +161,7 @@ class Tag(Node): | |||
"""Whether the tag is implicitly self-closing, with no ending slash. | |||
This is only possible for specific "single" tags like ``<br>`` and | |||
``<li>``. See :py:func:`.tag_defs.is_single`. This field only has an | |||
``<li>``. See :py:func:`.definitions.is_single`. This field only has an | |||
effect if :py:attr:`self_closing` is also ``True``. | |||
""" | |||
return self._implicit | |||
@@ -46,16 +46,15 @@ class Parser(object): | |||
:py:class:`~.Node`\ s by the :py:class:`~.Builder`. | |||
""" | |||
def __init__(self, text): | |||
self.text = text | |||
def __init__(self): | |||
if use_c and CTokenizer: | |||
self._tokenizer = CTokenizer() | |||
else: | |||
self._tokenizer = Tokenizer() | |||
self._builder = Builder() | |||
def parse(self): | |||
"""Return a string as a parsed :py:class:`~.Wikicode` object tree.""" | |||
tokens = self._tokenizer.tokenize(self.text) | |||
def parse(self, text, context=0): | |||
"""Parse *text*, returning a :py:class:`~.Wikicode` object tree.""" | |||
tokens = self._tokenizer.tokenize(text, context) | |||
code = self._builder.build(tokens) | |||
return code |
@@ -24,8 +24,8 @@ from __future__ import unicode_literals | |||
from . import tokens | |||
from ..compat import str | |||
from ..nodes import (Argument, Comment, Heading, HTMLEntity, Tag, Template, | |||
Text, Wikilink) | |||
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, | |||
Template, Text, Wikilink) | |||
from ..nodes.extras import Attribute, Parameter | |||
from ..smart_list import SmartList | |||
from ..wikicode import Wikicode | |||
@@ -142,6 +142,22 @@ class Builder(object): | |||
else: | |||
self._write(self._handle_token(token)) | |||
def _handle_external_link(self, token): | |||
"""Handle when an external link is at the head of the tokens.""" | |||
brackets, url = token.brackets, None | |||
self._push() | |||
while self._tokens: | |||
token = self._tokens.pop() | |||
if isinstance(token, tokens.ExternalLinkSeparator): | |||
url = self._pop() | |||
self._push() | |||
elif isinstance(token, tokens.ExternalLinkClose): | |||
if url is not None: | |||
return ExternalLink(url, self._pop(), brackets) | |||
return ExternalLink(self._pop(), brackets=brackets) | |||
else: | |||
self._write(self._handle_token(token)) | |||
def _handle_entity(self): | |||
"""Handle a case where an HTML entity is at the head of the tokens.""" | |||
token = self._tokens.pop() | |||
@@ -244,6 +260,8 @@ class Builder(object): | |||
return self._handle_argument() | |||
elif isinstance(token, tokens.WikilinkOpen): | |||
return self._handle_wikilink() | |||
elif isinstance(token, tokens.ExternalLinkOpen): | |||
return self._handle_external_link(token) | |||
elif isinstance(token, tokens.HTMLEntityStart): | |||
return self._handle_entity() | |||
elif isinstance(token, tokens.HeadingStart): | |||
@@ -51,6 +51,12 @@ Local (stack-specific) contexts: | |||
* :py:const:`WIKILINK_TITLE` | |||
* :py:const:`WIKILINK_TEXT` | |||
* :py:const:`EXT_LINK` | |||
* :py:const:`EXT_LINK_URI` | |||
* :py:const:`EXT_LINK_TITLE` | |||
* :py:const:`EXT_LINK_BRACKETS` | |||
* :py:const:`HEADING` | |||
* :py:const:`HEADING_LEVEL_1` | |||
@@ -94,6 +100,7 @@ Aggregate contexts: | |||
* :py:const:`FAIL` | |||
* :py:const:`UNSAFE` | |||
* :py:const:`DOUBLE` | |||
* :py:const:`INVALID_LINK` | |||
""" | |||
@@ -112,35 +119,40 @@ WIKILINK_TITLE = 1 << 5 | |||
WIKILINK_TEXT = 1 << 6 | |||
WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT | |||
HEADING_LEVEL_1 = 1 << 7 | |||
HEADING_LEVEL_2 = 1 << 8 | |||
HEADING_LEVEL_3 = 1 << 9 | |||
HEADING_LEVEL_4 = 1 << 10 | |||
HEADING_LEVEL_5 = 1 << 11 | |||
HEADING_LEVEL_6 = 1 << 12 | |||
EXT_LINK_URI = 1 << 7 | |||
EXT_LINK_TITLE = 1 << 8 | |||
EXT_LINK_BRACKETS = 1 << 9 | |||
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS | |||
HEADING_LEVEL_1 = 1 << 10 | |||
HEADING_LEVEL_2 = 1 << 11 | |||
HEADING_LEVEL_3 = 1 << 12 | |||
HEADING_LEVEL_4 = 1 << 13 | |||
HEADING_LEVEL_5 = 1 << 14 | |||
HEADING_LEVEL_6 = 1 << 15 | |||
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + | |||
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) | |||
TAG_OPEN = 1 << 13 | |||
TAG_ATTR = 1 << 14 | |||
TAG_BODY = 1 << 15 | |||
TAG_CLOSE = 1 << 16 | |||
TAG_OPEN = 1 << 16 | |||
TAG_ATTR = 1 << 17 | |||
TAG_BODY = 1 << 18 | |||
TAG_CLOSE = 1 << 19 | |||
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | |||
STYLE_ITALICS = 1 << 17 | |||
STYLE_BOLD = 1 << 18 | |||
STYLE_PASS_AGAIN = 1 << 19 | |||
STYLE_SECOND_PASS = 1 << 20 | |||
STYLE_ITALICS = 1 << 20 | |||
STYLE_BOLD = 1 << 21 | |||
STYLE_PASS_AGAIN = 1 << 22 | |||
STYLE_SECOND_PASS = 1 << 23 | |||
STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS | |||
DL_TERM = 1 << 21 | |||
DL_TERM = 1 << 24 | |||
HAS_TEXT = 1 << 22 | |||
FAIL_ON_TEXT = 1 << 23 | |||
FAIL_NEXT = 1 << 24 | |||
FAIL_ON_LBRACE = 1 << 25 | |||
FAIL_ON_RBRACE = 1 << 26 | |||
FAIL_ON_EQUALS = 1 << 27 | |||
HAS_TEXT = 1 << 25 | |||
FAIL_ON_TEXT = 1 << 26 | |||
FAIL_NEXT = 1 << 27 | |||
FAIL_ON_LBRACE = 1 << 28 | |||
FAIL_ON_RBRACE = 1 << 29 | |||
FAIL_ON_EQUALS = 1 << 30 | |||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | |||
FAIL_ON_RBRACE + FAIL_ON_EQUALS) | |||
@@ -150,7 +162,8 @@ GL_HEADING = 1 << 0 | |||
# Aggregate contexts: | |||
FAIL = TEMPLATE + ARGUMENT + WIKILINK + HEADING + TAG + STYLE | |||
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + | |||
TAG_CLOSE) | |||
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE | |||
UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + | |||
ARGUMENT_NAME + TAG_CLOSE) | |||
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE | |||
INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK |
@@ -24,6 +24,20 @@ SOFTWARE. | |||
#include "tokenizer.h" | |||
/* | |||
Determine whether the given Py_UNICODE is a marker. | |||
*/ | |||
static int is_marker(Py_UNICODE this) | |||
{ | |||
int i; | |||
for (i = 0; i < NUM_MARKERS; i++) { | |||
if (*MARKERS[i] == this) | |||
return 1; | |||
} | |||
return 0; | |||
} | |||
/* | |||
Given a context, return the heading level encoded within it. | |||
*/ | |||
static int heading_level_from_context(int n) | |||
@@ -37,13 +51,14 @@ static int heading_level_from_context(int n) | |||
} | |||
/* | |||
Call the given function in tag_defs, using 'tag' as a parameter, and return | |||
its output as a bool. | |||
Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as | |||
parameters, and return its output as a bool. | |||
*/ | |||
static int call_tag_def_func(const char* funcname, PyObject* tag) | |||
static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, | |||
PyObject* in3) | |||
{ | |||
PyObject* func = PyObject_GetAttrString(tag_defs, funcname); | |||
PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL); | |||
PyObject* func = PyObject_GetAttrString(definitions, funcname); | |||
PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL); | |||
int ans = (result == Py_True) ? 1 : 0; | |||
Py_DECREF(func); | |||
@@ -65,7 +80,7 @@ static PyObject* strip_tag_name(PyObject* token) | |||
Py_DECREF(text); | |||
if (!rstripped) | |||
return NULL; | |||
lowered = PyObject_CallMethod(rstripped, "rstrip", NULL); | |||
lowered = PyObject_CallMethod(rstripped, "lower", NULL); | |||
Py_DECREF(rstripped); | |||
return lowered; | |||
} | |||
@@ -85,7 +100,7 @@ static Textbuffer* Textbuffer_new(void) | |||
PyErr_NoMemory(); | |||
return NULL; | |||
} | |||
buffer->next = NULL; | |||
buffer->prev = buffer->next = NULL; | |||
return buffer; | |||
} | |||
@@ -113,10 +128,10 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) | |||
if (!new) | |||
return -1; | |||
new->next = self; | |||
self->prev = new; | |||
*this = self = new; | |||
} | |||
self->data[self->size] = code; | |||
self->size++; | |||
self->data[self->size++] = code; | |||
return 0; | |||
} | |||
@@ -345,7 +360,7 @@ static void* Tokenizer_fail_route(Tokenizer* self) | |||
} | |||
/* | |||
Write a token to the end of the current token stack. | |||
Write a token to the current token stack. | |||
*/ | |||
static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) | |||
{ | |||
@@ -366,7 +381,8 @@ static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) | |||
} | |||
/* | |||
Write a token to the end of the current token stack. | |||
Write a token to the current token stack, with kwargs. Steals a reference | |||
to kwargs. | |||
*/ | |||
static int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token, | |||
PyObject* kwargs, int first) | |||
@@ -417,6 +433,42 @@ static int Tokenizer_emit_text(Tokenizer* self, const char* text) | |||
} | |||
/* | |||
Write the contents of another textbuffer to the current textbuffer, | |||
deallocating it in the process. | |||
*/ | |||
static int | |||
Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse) | |||
{ | |||
Textbuffer *original = buffer; | |||
int i; | |||
if (reverse) { | |||
do { | |||
for (i = buffer->size - 1; i >= 0; i--) { | |||
if (Tokenizer_emit_char(self, buffer->data[i])) { | |||
Textbuffer_dealloc(original); | |||
return -1; | |||
} | |||
} | |||
} while ((buffer = buffer->next)); | |||
} | |||
else { | |||
while (buffer->next) | |||
buffer = buffer->next; | |||
do { | |||
for (i = 0; i < buffer->size; i++) { | |||
if (Tokenizer_emit_char(self, buffer->data[i])) { | |||
Textbuffer_dealloc(original); | |||
return -1; | |||
} | |||
} | |||
} while ((buffer = buffer->prev)); | |||
} | |||
Textbuffer_dealloc(original); | |||
return 0; | |||
} | |||
/* | |||
Write a series of tokens to the current stack at once. | |||
*/ | |||
static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) | |||
@@ -808,6 +860,353 @@ static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) | |||
} | |||
/* | |||
Parse the URI scheme of a bracket-enclosed external link. | |||
*/ | |||
static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | |||
{ | |||
static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; | |||
Textbuffer* buffer; | |||
PyObject* scheme; | |||
Py_UNICODE this; | |||
int slashes, i; | |||
if (Tokenizer_push(self, LC_EXT_LINK_URI)) | |||
return -1; | |||
if (Tokenizer_READ(self, 0) == *"/" && Tokenizer_READ(self, 1) == *"/") { | |||
if (Tokenizer_emit_text(self, "//")) | |||
return -1; | |||
self->head += 2; | |||
} | |||
else { | |||
buffer = Textbuffer_new(); | |||
if (!buffer) | |||
return -1; | |||
while ((this = Tokenizer_READ(self, 0)) != *"") { | |||
i = 0; | |||
while (1) { | |||
if (!valid[i]) | |||
goto end_of_loop; | |||
if (this == valid[i]) | |||
break; | |||
i++; | |||
} | |||
Textbuffer_write(&buffer, this); | |||
if (Tokenizer_emit_char(self, this)) { | |||
Textbuffer_dealloc(buffer); | |||
return -1; | |||
} | |||
self->head++; | |||
} | |||
end_of_loop: | |||
if (this != *":") { | |||
Textbuffer_dealloc(buffer); | |||
Tokenizer_fail_route(self); | |||
return 0; | |||
} | |||
if (Tokenizer_emit_char(self, *":")) { | |||
Textbuffer_dealloc(buffer); | |||
return -1; | |||
} | |||
self->head++; | |||
slashes = (Tokenizer_READ(self, 0) == *"/" && | |||
Tokenizer_READ(self, 1) == *"/"); | |||
if (slashes) { | |||
if (Tokenizer_emit_text(self, "//")) { | |||
Textbuffer_dealloc(buffer); | |||
return -1; | |||
} | |||
self->head += 2; | |||
} | |||
scheme = Textbuffer_render(buffer); | |||
Textbuffer_dealloc(buffer); | |||
if (!scheme) | |||
return -1; | |||
if (!IS_SCHEME(scheme, slashes, 0)) { | |||
Py_DECREF(scheme); | |||
Tokenizer_fail_route(self); | |||
return 0; | |||
} | |||
Py_DECREF(scheme); | |||
} | |||
return 0; | |||
} | |||
/* | |||
Parse the URI scheme of a free (no brackets) external link. | |||
*/ | |||
static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||
{ | |||
static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; | |||
Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; | |||
PyObject *scheme; | |||
Py_UNICODE chunk; | |||
int slashes, i, j; | |||
if (!scheme_buffer) | |||
return -1; | |||
// We have to backtrack through the textbuffer looking for our scheme since | |||
// it was just parsed as text: | |||
temp_buffer = self->topstack->textbuffer; | |||
while (temp_buffer) { | |||
for (i = temp_buffer->size - 1; i >= 0; i--) { | |||
chunk = temp_buffer->data[i]; | |||
if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) | |||
goto end_of_loop; | |||
j = 0; | |||
while (1) { | |||
if (!valid[j]) { | |||
Textbuffer_dealloc(scheme_buffer); | |||
FAIL_ROUTE(0); | |||
return 0; | |||
} | |||
if (chunk == valid[j]) | |||
break; | |||
j++; | |||
} | |||
Textbuffer_write(&scheme_buffer, chunk); | |||
} | |||
temp_buffer = temp_buffer->next; | |||
} | |||
end_of_loop: | |||
scheme = Textbuffer_render(scheme_buffer); | |||
if (!scheme) { | |||
Textbuffer_dealloc(scheme_buffer); | |||
return -1; | |||
} | |||
slashes = (Tokenizer_READ(self, 0) == *"/" && | |||
Tokenizer_READ(self, 1) == *"/"); | |||
if (!IS_SCHEME(scheme, slashes, 1)) { | |||
Py_DECREF(scheme); | |||
Textbuffer_dealloc(scheme_buffer); | |||
FAIL_ROUTE(0); | |||
return 0; | |||
} | |||
Py_DECREF(scheme); | |||
if (Tokenizer_push(self, LC_EXT_LINK_URI)) { | |||
Textbuffer_dealloc(scheme_buffer); | |||
return -1; | |||
} | |||
if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1)) | |||
return -1; | |||
if (Tokenizer_emit_char(self, *":")) | |||
return -1; | |||
if (slashes) { | |||
if (Tokenizer_emit_text(self, "//")) | |||
return -1; | |||
self->head += 2; | |||
} | |||
return 0; | |||
} | |||
/* | |||
Handle text in a free external link, including trailing punctuation. | |||
*/ | |||
static int | |||
Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, | |||
Textbuffer** tail, Py_UNICODE this) | |||
{ | |||
#define PUSH_TAIL_BUFFER(tail, error) \ | |||
if ((tail)->size || (tail)->next) { \ | |||
if (Tokenizer_emit_textbuffer(self, tail, 0)) \ | |||
return error; \ | |||
tail = Textbuffer_new(); \ | |||
if (!(tail)) \ | |||
return error; \ | |||
} | |||
if (this == *"(" && !(*parens)) { | |||
*parens = 1; | |||
PUSH_TAIL_BUFFER(*tail, -1) | |||
} | |||
else if (this == *"," || this == *";" || this == *"\\" || this == *"." || | |||
this == *":" || this == *"!" || this == *"?" || | |||
(!(*parens) && this == *")")) | |||
return Textbuffer_write(tail, this); | |||
else | |||
PUSH_TAIL_BUFFER(*tail, -1) | |||
return Tokenizer_emit_char(self, this); | |||
} | |||
/* | |||
Really parse an external link. | |||
*/ | |||
static PyObject* | |||
Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||
Textbuffer** extra) | |||
{ | |||
Py_UNICODE this, next; | |||
int parens = 0; | |||
if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) : | |||
Tokenizer_parse_free_uri_scheme(self)) | |||
return NULL; | |||
if (BAD_ROUTE) | |||
return NULL; | |||
this = Tokenizer_READ(self, 0); | |||
if (this == *"" || this == *"\n" || this == *" " || this == *"]") | |||
return Tokenizer_fail_route(self); | |||
if (!brackets && this == *"[") | |||
return Tokenizer_fail_route(self); | |||
while (1) { | |||
this = Tokenizer_READ(self, 0); | |||
next = Tokenizer_READ(self, 1); | |||
if (this == *"" || this == *"\n") { | |||
if (brackets) | |||
return Tokenizer_fail_route(self); | |||
self->head--; | |||
return Tokenizer_pop(self); | |||
} | |||
if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) { | |||
PUSH_TAIL_BUFFER(*extra, NULL) | |||
if (Tokenizer_parse_template_or_argument(self)) | |||
return NULL; | |||
} | |||
else if (this == *"[") { | |||
if (!brackets) { | |||
self->head--; | |||
return Tokenizer_pop(self); | |||
} | |||
if (Tokenizer_emit_char(self, *"[")) | |||
return NULL; | |||
} | |||
else if (this == *"]") { | |||
if (!brackets) | |||
self->head--; | |||
return Tokenizer_pop(self); | |||
} | |||
else if (this == *"&") { | |||
PUSH_TAIL_BUFFER(*extra, NULL) | |||
if (Tokenizer_parse_entity(self)) | |||
return NULL; | |||
} | |||
else if (this == *" ") { | |||
if (brackets) { | |||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||
return NULL; | |||
self->topstack->context ^= LC_EXT_LINK_URI; | |||
self->topstack->context |= LC_EXT_LINK_TITLE; | |||
self->head++; | |||
return Tokenizer_parse(self, 0, 0); | |||
} | |||
if (Textbuffer_write(extra, *" ")) | |||
return NULL; | |||
return Tokenizer_pop(self); | |||
} | |||
else if (!brackets) { | |||
if (Tokenizer_handle_free_link_text(self, &parens, extra, this)) | |||
return NULL; | |||
} | |||
else { | |||
if (Tokenizer_emit_char(self, this)) | |||
return NULL; | |||
} | |||
self->head++; | |||
} | |||
} | |||
/* | |||
Remove the URI scheme of a new external link from the textbuffer. | |||
*/ | |||
static int | |||
Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) | |||
{ | |||
PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"), | |||
*split, *scheme; | |||
Py_ssize_t length; | |||
Textbuffer* temp; | |||
if (!text) | |||
return -1; | |||
split = PyObject_CallMethod(text, "split", "si", ":", 1); | |||
Py_DECREF(text); | |||
if (!split) | |||
return -1; | |||
scheme = PyList_GET_ITEM(split, 0); | |||
length = PyUnicode_GET_SIZE(scheme); | |||
while (length) { | |||
temp = self->topstack->textbuffer; | |||
if (length <= temp->size) { | |||
temp->size -= length; | |||
break; | |||
} | |||
length -= temp->size; | |||
self->topstack->textbuffer = temp->next; | |||
free(temp->data); | |||
free(temp); | |||
} | |||
Py_DECREF(split); | |||
return 0; | |||
} | |||
/* | |||
Parse an external link at the head of the wikicode string. | |||
*/ | |||
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) | |||
{ | |||
#define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK | |||
#define NOT_A_LINK \ | |||
if (!brackets && self->topstack->context & LC_DLTERM) \ | |||
return Tokenizer_handle_dl_term(self); \ | |||
return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)) | |||
Py_ssize_t reset = self->head; | |||
PyObject *link, *kwargs; | |||
Textbuffer *extra = 0; | |||
if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { | |||
NOT_A_LINK; | |||
} | |||
extra = Textbuffer_new(); | |||
if (!extra) | |||
return -1; | |||
self->head++; | |||
link = Tokenizer_really_parse_external_link(self, brackets, &extra); | |||
if (BAD_ROUTE) { | |||
RESET_ROUTE(); | |||
self->head = reset; | |||
Textbuffer_dealloc(extra); | |||
NOT_A_LINK; | |||
} | |||
if (!link) { | |||
Textbuffer_dealloc(extra); | |||
return -1; | |||
} | |||
if (!brackets) { | |||
if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) { | |||
Textbuffer_dealloc(extra); | |||
Py_DECREF(link); | |||
return -1; | |||
} | |||
} | |||
kwargs = PyDict_New(); | |||
if (!kwargs) { | |||
Textbuffer_dealloc(extra); | |||
Py_DECREF(link); | |||
return -1; | |||
} | |||
PyDict_SetItemString(kwargs, "brackets", brackets ? Py_True : Py_False); | |||
if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) { | |||
Textbuffer_dealloc(extra); | |||
Py_DECREF(link); | |||
return -1; | |||
} | |||
if (Tokenizer_emit_all(self, link)) { | |||
Textbuffer_dealloc(extra); | |||
Py_DECREF(link); | |||
return -1; | |||
} | |||
Py_DECREF(link); | |||
if (Tokenizer_emit(self, ExternalLinkClose)) { | |||
Textbuffer_dealloc(extra); | |||
return -1; | |||
} | |||
if (extra->size || extra->next) | |||
return Tokenizer_emit_textbuffer(self, extra, 0); | |||
Textbuffer_dealloc(extra); | |||
return 0; | |||
} | |||
/* | |||
Parse a section heading at the head of the wikicode string. | |||
*/ | |||
static int Tokenizer_parse_heading(Tokenizer* self) | |||
@@ -1238,15 +1637,8 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) | |||
static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text) | |||
{ | |||
Py_UNICODE next = Tokenizer_READ(self, 1); | |||
int i, is_marker = 0; | |||
for (i = 0; i < NUM_MARKERS; i++) { | |||
if (*MARKERS[i] == text) { | |||
is_marker = 1; | |||
break; | |||
} | |||
} | |||
if (!is_marker || !Tokenizer_CAN_RECURSE(self)) | |||
if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self)) | |||
return Tokenizer_emit_char(self, text); | |||
else if (text == next && next == *"{") | |||
return Tokenizer_parse_template_or_argument(self); | |||
@@ -1264,17 +1656,11 @@ static int | |||
Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk) | |||
{ | |||
PyObject *trash; | |||
int first_time, i, is_marker = 0, escaped; | |||
int first_time, escaped; | |||
if (data->context & TAG_NAME) { | |||
first_time = !(data->context & TAG_NOTE_SPACE); | |||
for (i = 0; i < NUM_MARKERS; i++) { | |||
if (*MARKERS[i] == chunk) { | |||
is_marker = 1; | |||
break; | |||
} | |||
} | |||
if (is_marker || (Py_UNICODE_ISSPACE(chunk) && first_time)) { | |||
if (is_marker(chunk) || (Py_UNICODE_ISSPACE(chunk) && first_time)) { | |||
// Tags must start with text, not spaces | |||
Tokenizer_fail_route(self); | |||
return 0; | |||
@@ -1623,7 +2009,6 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) | |||
Textbuffer* buf; | |||
PyObject *name, *tag; | |||
Py_UNICODE this; | |||
int is_marker, i; | |||
self->head += 2; | |||
buf = Textbuffer_new(); | |||
@@ -1631,14 +2016,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) | |||
return -1; | |||
while (1) { | |||
this = Tokenizer_READ(self, pos); | |||
is_marker = 0; | |||
for (i = 0; i < NUM_MARKERS; i++) { | |||
if (*MARKERS[i] == this) { | |||
is_marker = 1; | |||
break; | |||
} | |||
} | |||
if (is_marker) { | |||
if (is_marker(this)) { | |||
name = Textbuffer_render(buf); | |||
if (!name) { | |||
Textbuffer_dealloc(buf); | |||
@@ -1985,9 +2363,9 @@ static int Tokenizer_handle_hr(Tokenizer* self) | |||
self->head++; | |||
} | |||
markup = Textbuffer_render(buffer); | |||
Textbuffer_dealloc(buffer); | |||
if (!markup) | |||
return -1; | |||
Textbuffer_dealloc(buffer); | |||
kwargs = PyDict_New(); | |||
if (!kwargs) | |||
return -1; | |||
@@ -2047,21 +2425,21 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) | |||
*/ | |||
static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||
{ | |||
if (context & LC_FAIL_NEXT) { | |||
if (context & LC_FAIL_NEXT) | |||
return -1; | |||
} | |||
if (context & LC_WIKILINK_TITLE) { | |||
if (data == *"]" || data == *"{") | |||
if (context & LC_WIKILINK) { | |||
if (context & LC_WIKILINK_TEXT) | |||
return (data == *"[" && Tokenizer_READ(self, 1) == *"[") ? -1 : 0; | |||
else if (data == *"]" || data == *"{") | |||
self->topstack->context |= LC_FAIL_NEXT; | |||
else if (data == *"\n" || data == *"[" || data == *"}") | |||
return -1; | |||
return 0; | |||
} | |||
if (context & LC_TAG_CLOSE) { | |||
if (data == *"<") | |||
return -1; | |||
return 0; | |||
} | |||
if (context & LC_EXT_LINK_TITLE) | |||
return (data == *"\n") ? -1 : 0; | |||
if (context & LC_TAG_CLOSE) | |||
return (data == *"<") ? -1 : 0; | |||
if (context & LC_TEMPLATE_NAME) { | |||
if (data == *"{" || data == *"}" || data == *"[") { | |||
self->topstack->context |= LC_FAIL_NEXT; | |||
@@ -2126,7 +2504,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||
*/ | |||
static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
{ | |||
int this_context, is_marker, i; | |||
int this_context; | |||
Py_UNICODE this, next, next_next, last; | |||
PyObject* temp; | |||
@@ -2146,14 +2524,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
return Tokenizer_fail_route(self); | |||
} | |||
} | |||
is_marker = 0; | |||
for (i = 0; i < NUM_MARKERS; i++) { | |||
if (*MARKERS[i] == this) { | |||
is_marker = 1; | |||
break; | |||
} | |||
} | |||
if (!is_marker) { | |||
if (!is_marker(this)) { | |||
if (Tokenizer_emit_char(self, this)) | |||
return NULL; | |||
self->head++; | |||
@@ -2192,9 +2563,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
if (Tokenizer_emit_char(self, this)) | |||
return NULL; | |||
} | |||
else if (this == next && next == *"[") { | |||
if (!(this_context & LC_WIKILINK_TITLE) && | |||
Tokenizer_CAN_RECURSE(self)) { | |||
else if (this == next && next == *"[" && Tokenizer_CAN_RECURSE(self)) { | |||
if (!(this_context & AGG_INVALID_LINK)) { | |||
if (Tokenizer_parse_wikilink(self)) | |||
return NULL; | |||
} | |||
@@ -2207,6 +2577,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
} | |||
else if (this == next && next == *"]" && this_context & LC_WIKILINK) | |||
return Tokenizer_handle_wikilink_end(self); | |||
else if (this == *"[") { | |||
if (Tokenizer_parse_external_link(self, 1)) | |||
return NULL; | |||
} | |||
else if (this == *":" && !is_marker(last)) { | |||
if (Tokenizer_parse_external_link(self, 0)) | |||
return NULL; | |||
} | |||
else if (this == *"]" && this_context & LC_EXT_LINK_TITLE) | |||
return Tokenizer_pop(self); | |||
else if (this == *"=" && !(self->global & GL_HEADING)) { | |||
if (last == *"\n" || last == *"") { | |||
if (Tokenizer_parse_heading(self)) | |||
@@ -2243,9 +2623,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
return NULL; | |||
} | |||
} | |||
else if (this == *"<") { | |||
if (!(this_context & LC_TAG_CLOSE) && | |||
Tokenizer_CAN_RECURSE(self)) { | |||
else if (this == *"<" && !(this_context & LC_TAG_CLOSE)) { | |||
if (Tokenizer_CAN_RECURSE(self)) { | |||
if (Tokenizer_parse_tag(self)) | |||
return NULL; | |||
} | |||
@@ -2289,8 +2668,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||
{ | |||
PyObject *text, *temp; | |||
int context = 0; | |||
if (PyArg_ParseTuple(args, "U", &text)) { | |||
if (PyArg_ParseTuple(args, "U|i", &text, &context)) { | |||
Py_XDECREF(self->text); | |||
self->text = PySequence_Fast(text, "expected a sequence"); | |||
} | |||
@@ -2299,7 +2679,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||
Py_ssize_t size; | |||
/* Failed to parse a Unicode object; try a string instead. */ | |||
PyErr_Clear(); | |||
if (!PyArg_ParseTuple(args, "s#", &encoded, &size)) | |||
if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context)) | |||
return NULL; | |||
temp = PyUnicode_FromStringAndSize(encoded, size); | |||
if (!text) | |||
@@ -2311,7 +2691,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||
} | |||
self->head = self->global = self->depth = self->cycles = 0; | |||
self->length = PyList_GET_SIZE(self->text); | |||
return Tokenizer_parse(self, 0, 1); | |||
return Tokenizer_parse(self, context, 1); | |||
} | |||
static int load_entitydefs(void) | |||
@@ -2389,6 +2769,11 @@ static int load_tokens(void) | |||
WikilinkSeparator = PyObject_GetAttrString(tokens, "WikilinkSeparator"); | |||
WikilinkClose = PyObject_GetAttrString(tokens, "WikilinkClose"); | |||
ExternalLinkOpen = PyObject_GetAttrString(tokens, "ExternalLinkOpen"); | |||
ExternalLinkSeparator = PyObject_GetAttrString(tokens, | |||
"ExternalLinkSeparator"); | |||
ExternalLinkClose = PyObject_GetAttrString(tokens, "ExternalLinkClose"); | |||
HTMLEntityStart = PyObject_GetAttrString(tokens, "HTMLEntityStart"); | |||
HTMLEntityNumeric = PyObject_GetAttrString(tokens, "HTMLEntityNumeric"); | |||
HTMLEntityHex = PyObject_GetAttrString(tokens, "HTMLEntityHex"); | |||
@@ -2413,13 +2798,13 @@ static int load_tokens(void) | |||
return 0; | |||
} | |||
static int load_tag_defs(void) | |||
static int load_definitions(void) | |||
{ | |||
PyObject *tempmod, | |||
*globals = PyEval_GetGlobals(), | |||
*locals = PyEval_GetLocals(), | |||
*fromlist = PyList_New(1), | |||
*modname = IMPORT_NAME_FUNC("tag_defs"); | |||
*modname = IMPORT_NAME_FUNC("definitions"); | |||
char *name = "mwparserfromhell"; | |||
if (!fromlist || !modname) | |||
@@ -2429,7 +2814,7 @@ static int load_tag_defs(void) | |||
Py_DECREF(fromlist); | |||
if (!tempmod) | |||
return -1; | |||
tag_defs = PyObject_GetAttrString(tempmod, "tag_defs"); | |||
definitions = PyObject_GetAttrString(tempmod, "definitions"); | |||
Py_DECREF(tempmod); | |||
return 0; | |||
} | |||
@@ -2452,7 +2837,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void) | |||
NOARGS = PyTuple_New(0); | |||
if (!EMPTY || !NOARGS) | |||
INIT_ERROR; | |||
if (load_entitydefs() || load_tokens() || load_tag_defs()) | |||
if (load_entitydefs() || load_tokens() || load_definitions()) | |||
INIT_ERROR; | |||
#ifdef IS_PY3K | |||
return module; | |||
@@ -62,7 +62,7 @@ static char** entitydefs; | |||
static PyObject* EMPTY; | |||
static PyObject* NOARGS; | |||
static PyObject* tag_defs; | |||
static PyObject* definitions; | |||
/* Tokens: */ | |||
@@ -82,6 +82,10 @@ static PyObject* WikilinkOpen; | |||
static PyObject* WikilinkSeparator; | |||
static PyObject* WikilinkClose; | |||
static PyObject* ExternalLinkOpen; | |||
static PyObject* ExternalLinkSeparator; | |||
static PyObject* ExternalLinkClose; | |||
static PyObject* HTMLEntityStart; | |||
static PyObject* HTMLEntityNumeric; | |||
static PyObject* HTMLEntityHex; | |||
@@ -104,48 +108,53 @@ static PyObject* TagCloseClose; | |||
/* Local contexts: */ | |||
#define LC_TEMPLATE 0x0000007 | |||
#define LC_TEMPLATE_NAME 0x0000001 | |||
#define LC_TEMPLATE_PARAM_KEY 0x0000002 | |||
#define LC_TEMPLATE_PARAM_VALUE 0x0000004 | |||
#define LC_ARGUMENT 0x0000018 | |||
#define LC_ARGUMENT_NAME 0x0000008 | |||
#define LC_ARGUMENT_DEFAULT 0x0000010 | |||
#define LC_WIKILINK 0x0000060 | |||
#define LC_WIKILINK_TITLE 0x0000020 | |||
#define LC_WIKILINK_TEXT 0x0000040 | |||
#define LC_HEADING 0x0001F80 | |||
#define LC_HEADING_LEVEL_1 0x0000080 | |||
#define LC_HEADING_LEVEL_2 0x0000100 | |||
#define LC_HEADING_LEVEL_3 0x0000200 | |||
#define LC_HEADING_LEVEL_4 0x0000400 | |||
#define LC_HEADING_LEVEL_5 0x0000800 | |||
#define LC_HEADING_LEVEL_6 0x0001000 | |||
#define LC_TAG 0x001E000 | |||
#define LC_TAG_OPEN 0x0002000 | |||
#define LC_TAG_ATTR 0x0004000 | |||
#define LC_TAG_BODY 0x0008000 | |||
#define LC_TAG_CLOSE 0x0010000 | |||
#define LC_STYLE 0x01E0000 | |||
#define LC_STYLE_ITALICS 0x0020000 | |||
#define LC_STYLE_BOLD 0x0040000 | |||
#define LC_STYLE_PASS_AGAIN 0x0080000 | |||
#define LC_STYLE_SECOND_PASS 0x0100000 | |||
#define LC_DLTERM 0x0200000 | |||
#define LC_SAFETY_CHECK 0xFC00000 | |||
#define LC_HAS_TEXT 0x0400000 | |||
#define LC_FAIL_ON_TEXT 0x0800000 | |||
#define LC_FAIL_NEXT 0x1000000 | |||
#define LC_FAIL_ON_LBRACE 0x2000000 | |||
#define LC_FAIL_ON_RBRACE 0x4000000 | |||
#define LC_FAIL_ON_EQUALS 0x8000000 | |||
#define LC_TEMPLATE 0x00000007 | |||
#define LC_TEMPLATE_NAME 0x00000001 | |||
#define LC_TEMPLATE_PARAM_KEY 0x00000002 | |||
#define LC_TEMPLATE_PARAM_VALUE 0x00000004 | |||
#define LC_ARGUMENT 0x00000018 | |||
#define LC_ARGUMENT_NAME 0x00000008 | |||
#define LC_ARGUMENT_DEFAULT 0x00000010 | |||
#define LC_WIKILINK 0x00000060 | |||
#define LC_WIKILINK_TITLE 0x00000020 | |||
#define LC_WIKILINK_TEXT 0x00000040 | |||
#define LC_EXT_LINK 0x00000380 | |||
#define LC_EXT_LINK_URI 0x00000080 | |||
#define LC_EXT_LINK_TITLE 0x00000100 | |||
#define LC_EXT_LINK_BRACKETS 0x00000200 | |||
#define LC_HEADING 0x0000FC00 | |||
#define LC_HEADING_LEVEL_1 0x00000400 | |||
#define LC_HEADING_LEVEL_2 0x00000800 | |||
#define LC_HEADING_LEVEL_3 0x00001000 | |||
#define LC_HEADING_LEVEL_4 0x00002000 | |||
#define LC_HEADING_LEVEL_5 0x00004000 | |||
#define LC_HEADING_LEVEL_6 0x00008000 | |||
#define LC_TAG 0x000F0000 | |||
#define LC_TAG_OPEN 0x00010000 | |||
#define LC_TAG_ATTR 0x00020000 | |||
#define LC_TAG_BODY 0x00040000 | |||
#define LC_TAG_CLOSE 0x00080000 | |||
#define LC_STYLE 0x00F00000 | |||
#define LC_STYLE_ITALICS 0x00100000 | |||
#define LC_STYLE_BOLD 0x00200000 | |||
#define LC_STYLE_PASS_AGAIN 0x00400000 | |||
#define LC_STYLE_SECOND_PASS 0x00800000 | |||
#define LC_DLTERM 0x01000000 | |||
#define LC_SAFETY_CHECK 0x7E000000 | |||
#define LC_HAS_TEXT 0x02000000 | |||
#define LC_FAIL_ON_TEXT 0x04000000 | |||
#define LC_FAIL_NEXT 0x08000000 | |||
#define LC_FAIL_ON_LBRACE 0x10000000 | |||
#define LC_FAIL_ON_RBRACE 0x20000000 | |||
#define LC_FAIL_ON_EQUALS 0x40000000 | |||
/* Global contexts: */ | |||
@@ -153,9 +162,10 @@ static PyObject* TagCloseClose; | |||
/* Aggregate contexts: */ | |||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) | |||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | |||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) | |||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) | |||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | |||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) | |||
#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK) | |||
/* Tag contexts: */ | |||
@@ -174,6 +184,7 @@ static PyObject* TagCloseClose; | |||
struct Textbuffer { | |||
Py_ssize_t size; | |||
Py_UNICODE* data; | |||
struct Textbuffer* prev; | |||
struct Textbuffer* next; | |||
}; | |||
@@ -228,12 +239,14 @@ typedef struct { | |||
#define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1) | |||
/* Macros for accessing HTML tag definitions: */ | |||
/* Macros for accessing definitions: */ | |||
#define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li") | |||
#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag)) | |||
#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag)) | |||
#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag)) | |||
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL)) | |||
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL)) | |||
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL)) | |||
#define IS_SCHEME(scheme, slashes, reverse) \ | |||
(call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False)) | |||
/* Function prototypes: */ | |||
@@ -247,6 +260,8 @@ static void TagData_dealloc(TagData*); | |||
static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); | |||
static void Tokenizer_dealloc(Tokenizer*); | |||
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | |||
static int Tokenizer_parse_entity(Tokenizer*); | |||
static int Tokenizer_handle_dl_term(Tokenizer*); | |||
static int Tokenizer_parse_tag(Tokenizer*); | |||
static PyObject* Tokenizer_parse(Tokenizer*, int, int); | |||
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | |||
@@ -26,7 +26,8 @@ import re | |||
from . import contexts, tokens | |||
from ..compat import htmlentities | |||
from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only | |||
from ..definitions import (get_html_tag, is_parsable, is_single, | |||
is_single_only, is_scheme) | |||
__all__ = ["Tokenizer"] | |||
@@ -60,7 +61,7 @@ class Tokenizer(object): | |||
START = object() | |||
END = object() | |||
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", | |||
":", "/", "-", "\n", END] | |||
":", "/", "-", "\n", START, END] | |||
MAX_DEPTH = 40 | |||
MAX_CYCLES = 100000 | |||
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) | |||
@@ -311,6 +312,168 @@ class Tokenizer(object): | |||
self._head += 1 | |||
return self._pop() | |||
def _parse_bracketed_uri_scheme(self): | |||
"""Parse the URI scheme of a bracket-enclosed external link.""" | |||
self._push(contexts.EXT_LINK_URI) | |||
if self._read() == self._read(1) == "/": | |||
self._emit_text("//") | |||
self._head += 2 | |||
else: | |||
valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" | |||
all_valid = lambda: all(char in valid for char in self._read()) | |||
scheme = "" | |||
while self._read() is not self.END and all_valid(): | |||
scheme += self._read() | |||
self._emit_text(self._read()) | |||
self._head += 1 | |||
if self._read() != ":": | |||
self._fail_route() | |||
self._emit_text(":") | |||
self._head += 1 | |||
slashes = self._read() == self._read(1) == "/" | |||
if slashes: | |||
self._emit_text("//") | |||
self._head += 2 | |||
if not is_scheme(scheme, slashes): | |||
self._fail_route() | |||
def _parse_free_uri_scheme(self): | |||
"""Parse the URI scheme of a free (no brackets) external link.""" | |||
valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" | |||
scheme = [] | |||
try: | |||
# We have to backtrack through the textbuffer looking for our | |||
# scheme since it was just parsed as text: | |||
for chunk in reversed(self._textbuffer): | |||
for char in reversed(chunk): | |||
if char.isspace() or char in self.MARKERS: | |||
raise StopIteration() | |||
if char not in valid: | |||
raise BadRoute() | |||
scheme.append(char) | |||
except StopIteration: | |||
pass | |||
scheme = "".join(reversed(scheme)) | |||
slashes = self._read() == self._read(1) == "/" | |||
if not is_scheme(scheme, slashes): | |||
raise BadRoute() | |||
self._push(contexts.EXT_LINK_URI) | |||
self._emit_text(scheme) | |||
self._emit_text(":") | |||
if slashes: | |||
self._emit_text("//") | |||
self._head += 2 | |||
def _handle_free_link_text(self, punct, tail, this): | |||
"""Handle text in a free ext link, including trailing punctuation.""" | |||
if "(" in this and ")" in punct: | |||
punct = punct[:-1] # ')' is not longer valid punctuation | |||
if this.endswith(punct): | |||
for i in reversed(range(-len(this), 0)): | |||
if i == -len(this) or this[i - 1] not in punct: | |||
break | |||
stripped = this[:i] | |||
if stripped and tail: | |||
self._emit_text(tail) | |||
tail = "" | |||
tail += this[i:] | |||
this = stripped | |||
elif tail: | |||
self._emit_text(tail) | |||
tail = "" | |||
self._emit_text(this) | |||
return punct, tail | |||
def _really_parse_external_link(self, brackets): | |||
"""Really parse an external link.""" | |||
if brackets: | |||
self._parse_bracketed_uri_scheme() | |||
invalid = ("\n", " ", "]") | |||
else: | |||
self._parse_free_uri_scheme() | |||
invalid = ("\n", " ", "[", "]") | |||
punct = tuple(",;\.:!?)") | |||
if self._read() is self.END or self._read()[0] in invalid: | |||
self._fail_route() | |||
tail = "" | |||
while True: | |||
this, next = self._read(), self._read(1) | |||
if this is self.END or this == "\n": | |||
if brackets: | |||
self._fail_route() | |||
return self._pop(), tail, -1 | |||
elif this == next == "{" and self._can_recurse(): | |||
if tail: | |||
self._emit_text(tail) | |||
tail = "" | |||
self._parse_template_or_argument() | |||
elif this == "[": | |||
if brackets: | |||
self._emit_text("[") | |||
else: | |||
return self._pop(), tail, -1 | |||
elif this == "]": | |||
return self._pop(), tail, 0 if brackets else -1 | |||
elif this == "&": | |||
if tail: | |||
self._emit_text(tail) | |||
tail = "" | |||
self._parse_entity() | |||
elif " " in this: | |||
before, after = this.split(" ", 1) | |||
if brackets: | |||
self._emit_text(before) | |||
self._emit(tokens.ExternalLinkSeparator()) | |||
if after: | |||
self._emit_text(after) | |||
self._context ^= contexts.EXT_LINK_URI | |||
self._context |= contexts.EXT_LINK_TITLE | |||
self._head += 1 | |||
return self._parse(push=False), None, 0 | |||
punct, tail = self._handle_free_link_text(punct, tail, before) | |||
return self._pop(), tail + " " + after, 0 | |||
elif not brackets: | |||
punct, tail = self._handle_free_link_text(punct, tail, this) | |||
else: | |||
self._emit_text(this) | |||
self._head += 1 | |||
def _remove_uri_scheme_from_textbuffer(self, scheme): | |||
"""Remove the URI scheme of a new external link from the textbuffer.""" | |||
length = len(scheme) | |||
while length: | |||
if length < len(self._textbuffer[-1]): | |||
self._textbuffer[-1] = self._textbuffer[-1][:-length] | |||
break | |||
length -= len(self._textbuffer[-1]) | |||
self._textbuffer.pop() | |||
def _parse_external_link(self, brackets): | |||
"""Parse an external link at the head of the wikicode string.""" | |||
reset = self._head | |||
self._head += 1 | |||
try: | |||
bad_context = self._context & contexts.INVALID_LINK | |||
if bad_context or not self._can_recurse(): | |||
raise BadRoute() | |||
link, extra, delta = self._really_parse_external_link(brackets) | |||
except BadRoute: | |||
self._head = reset | |||
if not brackets and self._context & contexts.DL_TERM: | |||
self._handle_dl_term() | |||
else: | |||
self._emit_text(self._read()) | |||
else: | |||
if not brackets: | |||
scheme = link[0].text.split(":", 1)[0] | |||
self._remove_uri_scheme_from_textbuffer(scheme) | |||
self._emit(tokens.ExternalLinkOpen(brackets=brackets)) | |||
self._emit_all(link) | |||
self._emit(tokens.ExternalLinkClose()) | |||
self._head += delta | |||
if extra: | |||
self._emit_text(extra) | |||
def _parse_heading(self): | |||
"""Parse a section heading at the head of the wikicode string.""" | |||
self._global |= contexts.GL_HEADING | |||
@@ -810,12 +973,16 @@ class Tokenizer(object): | |||
context = self._context | |||
if context & contexts.FAIL_NEXT: | |||
return False | |||
if context & contexts.WIKILINK_TITLE: | |||
if this == "]" or this == "{": | |||
if context & contexts.WIKILINK: | |||
if context & contexts.WIKILINK_TEXT: | |||
return not (this == self._read(1) == "[") | |||
elif this == "]" or this == "{": | |||
self._context |= contexts.FAIL_NEXT | |||
elif this == "\n" or this == "[" or this == "}": | |||
return False | |||
return True | |||
elif context & contexts.EXT_LINK_TITLE: | |||
return this != "\n" | |||
elif context & contexts.TEMPLATE_NAME: | |||
if this == "{" or this == "}" or this == "[": | |||
self._context |= contexts.FAIL_NEXT | |||
@@ -898,8 +1065,8 @@ class Tokenizer(object): | |||
return self._handle_argument_end() | |||
else: | |||
self._emit_text("}") | |||
elif this == next == "[": | |||
if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): | |||
elif this == next == "[" and self._can_recurse(): | |||
if not self._context & contexts.INVALID_LINK: | |||
self._parse_wikilink() | |||
else: | |||
self._emit_text("[") | |||
@@ -907,6 +1074,12 @@ class Tokenizer(object): | |||
self._handle_wikilink_separator() | |||
elif this == next == "]" and self._context & contexts.WIKILINK: | |||
return self._handle_wikilink_end() | |||
elif this == "[": | |||
self._parse_external_link(True) | |||
elif this == ":" and self._read(-1) not in self.MARKERS: | |||
self._parse_external_link(False) | |||
elif this == "]" and self._context & contexts.EXT_LINK_TITLE: | |||
return self._pop() | |||
elif this == "=" and not self._global & contexts.GL_HEADING: | |||
if self._read(-1) in ("\n", self.START): | |||
self._parse_heading() | |||
@@ -928,8 +1101,8 @@ class Tokenizer(object): | |||
self._handle_tag_open_close() | |||
else: | |||
self._handle_invalid_tag_start() | |||
elif this == "<": | |||
if not self._context & contexts.TAG_CLOSE and self._can_recurse(): | |||
elif this == "<" and not self._context & contexts.TAG_CLOSE: | |||
if self._can_recurse(): | |||
self._parse_tag() | |||
else: | |||
self._emit_text("<") | |||
@@ -952,8 +1125,9 @@ class Tokenizer(object): | |||
self._emit_text(this) | |||
self._head += 1 | |||
def tokenize(self, text): | |||
def tokenize(self, text, context=0): | |||
"""Build a list of tokens from a string of wikicode and return it.""" | |||
split = self.regex.split(text) | |||
self._text = [segment for segment in split if segment] | |||
return self._parse() | |||
self._head = self._global = self._depth = self._cycles = 0 | |||
return self._parse(context) |
@@ -84,6 +84,10 @@ WikilinkOpen = make("WikilinkOpen") # [[ | |||
WikilinkSeparator = make("WikilinkSeparator") # | | |||
WikilinkClose = make("WikilinkClose") # ]] | |||
ExternalLinkOpen = make("ExternalLinkOpen") # [ | |||
ExternalLinkSeparator = make("ExternalLinkSeparator") # | |||
ExternalLinkClose = make("ExternalLinkClose") # ] | |||
HTMLEntityStart = make("HTMLEntityStart") # & | |||
HTMLEntityNumeric = make("HTMLEntityNumeric") # # | |||
HTMLEntityHex = make("HTMLEntityHex") # x | |||
@@ -33,7 +33,7 @@ from .smart_list import SmartList | |||
__all__ = ["parse_anything"] | |||
def parse_anything(value): | |||
def parse_anything(value, context=0): | |||
"""Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. | |||
This differs from :py:meth:`.Parser.parse` in that we accept more than just | |||
@@ -44,6 +44,12 @@ def parse_anything(value): | |||
on-the-fly by various methods of :py:class:`~.Wikicode` and others like | |||
:py:class:`~.Template`, such as :py:meth:`wikicode.insert() | |||
<.Wikicode.insert>` or setting :py:meth:`template.name <.Template.name>`. | |||
If given, *context* will be passed as a starting context to the parser. | |||
This is helpful when this function is used inside node attribute setters. | |||
For example, :py:class:`~.ExternalLink`\ 's :py:attr:`~.ExternalLink.url` | |||
setter sets *context* to :py:mod:`contexts.EXT_LINK_URI <.contexts>` to | |||
prevent the URL itself from becoming an :py:class:`~.ExternalLink`. | |||
""" | |||
from .parser import Parser | |||
from .wikicode import Wikicode | |||
@@ -53,17 +59,17 @@ def parse_anything(value): | |||
elif isinstance(value, Node): | |||
return Wikicode(SmartList([value])) | |||
elif isinstance(value, str): | |||
return Parser(value).parse() | |||
return Parser().parse(value, context) | |||
elif isinstance(value, bytes): | |||
return Parser(value.decode("utf8")).parse() | |||
return Parser().parse(value.decode("utf8"), context) | |||
elif isinstance(value, int): | |||
return Parser(str(value)).parse() | |||
return Parser().parse(str(value), context) | |||
elif value is None: | |||
return Wikicode(SmartList()) | |||
try: | |||
nodelist = SmartList() | |||
for item in value: | |||
nodelist += parse_anything(item).nodes | |||
nodelist += parse_anything(item, context).nodes | |||
except TypeError: | |||
error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" | |||
raise ValueError(error.format(type(value).__name__, value)) | |||
@@ -24,8 +24,8 @@ from __future__ import unicode_literals | |||
import re | |||
from .compat import maxsize, py3k, str | |||
from .nodes import (Argument, Comment, Heading, HTMLEntity, Node, Tag, | |||
Template, Text, Wikilink) | |||
from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, | |||
Node, Tag, Template, Text, Wikilink) | |||
from .string_mixin import StringMixIn | |||
from .utils import parse_anything | |||
@@ -509,6 +509,6 @@ class Wikicode(StringMixIn): | |||
return "\n".join(self._get_tree(self, [], marker, 0)) | |||
Wikicode._build_filter_methods( | |||
arguments=Argument, comments=Comment, headings=Heading, | |||
html_entities=HTMLEntity, tags=Tag, templates=Template, text=Text, | |||
wikilinks=Wikilink) | |||
arguments=Argument, comments=Comment, external_links=ExternalLink, | |||
headings=Heading, html_entities=HTMLEntity, tags=Tag, templates=Template, | |||
text=Text, wikilinks=Wikilink) |
@@ -23,8 +23,8 @@ | |||
from __future__ import unicode_literals | |||
import unittest | |||
from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity, | |||
Tag, Template, Text, Wikilink) | |||
from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, | |||
HTMLEntity, Tag, Template, Text, Wikilink) | |||
from mwparserfromhell.nodes.extras import Attribute, Parameter | |||
from mwparserfromhell.parser import tokens | |||
from mwparserfromhell.parser.builder import Builder | |||
@@ -150,6 +150,48 @@ class TestBuilder(TreeEqualityTestCase): | |||
for test, valid in tests: | |||
self.assertWikicodeEqual(valid, self.builder.build(test)) | |||
def test_external_link(self): | |||
"""tests for building ExternalLink nodes""" | |||
tests = [ | |||
([tokens.ExternalLinkOpen(brackets=False), | |||
tokens.Text(text="http://example.com/"), | |||
tokens.ExternalLinkClose()], | |||
wrap([ExternalLink(wraptext("http://example.com/"), | |||
brackets=False)])), | |||
([tokens.ExternalLinkOpen(brackets=True), | |||
tokens.Text(text="http://example.com/"), | |||
tokens.ExternalLinkClose()], | |||
wrap([ExternalLink(wraptext("http://example.com/"))])), | |||
([tokens.ExternalLinkOpen(brackets=True), | |||
tokens.Text(text="http://example.com/"), | |||
tokens.ExternalLinkSeparator(), tokens.ExternalLinkClose()], | |||
wrap([ExternalLink(wraptext("http://example.com/"), wrap([]))])), | |||
([tokens.ExternalLinkOpen(brackets=True), | |||
tokens.Text(text="http://example.com/"), | |||
tokens.ExternalLinkSeparator(), tokens.Text(text="Example"), | |||
tokens.ExternalLinkClose()], | |||
wrap([ExternalLink(wraptext("http://example.com/"), | |||
wraptext("Example"))])), | |||
([tokens.ExternalLinkOpen(brackets=False), | |||
tokens.Text(text="http://example"), tokens.Text(text=".com/foo"), | |||
tokens.ExternalLinkClose()], | |||
wrap([ExternalLink(wraptext("http://example", ".com/foo"), | |||
brackets=False)])), | |||
([tokens.ExternalLinkOpen(brackets=True), | |||
tokens.Text(text="http://example"), tokens.Text(text=".com/foo"), | |||
tokens.ExternalLinkSeparator(), tokens.Text(text="Example"), | |||
tokens.Text(text=" Web Page"), tokens.ExternalLinkClose()], | |||
wrap([ExternalLink(wraptext("http://example", ".com/foo"), | |||
wraptext("Example", " Web Page"))])), | |||
] | |||
for test, valid in tests: | |||
self.assertWikicodeEqual(valid, self.builder.build(test)) | |||
def test_html_entity(self): | |||
"""tests for building HTMLEntity nodes""" | |||
tests = [ | |||
@@ -0,0 +1,130 @@ | |||
# -*- coding: utf-8 -*- | |||
# | |||
# Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net> | |||
# | |||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||
# of this software and associated documentation files (the "Software"), to deal | |||
# in the Software without restriction, including without limitation the rights | |||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
# copies of the Software, and to permit persons to whom the Software is | |||
# furnished to do so, subject to the following conditions: | |||
# | |||
# The above copyright notice and this permission notice shall be included in | |||
# all copies or substantial portions of the Software. | |||
# | |||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
# SOFTWARE. | |||
from __future__ import unicode_literals | |||
import unittest | |||
from mwparserfromhell.compat import str | |||
from mwparserfromhell.nodes import ExternalLink, Text | |||
from ._test_tree_equality import TreeEqualityTestCase, getnodes, wrap, wraptext | |||
class TestExternalLink(TreeEqualityTestCase): | |||
"""Test cases for the ExternalLink node.""" | |||
def test_unicode(self): | |||
"""test ExternalLink.__unicode__()""" | |||
node = ExternalLink(wraptext("http://example.com/"), brackets=False) | |||
self.assertEqual("http://example.com/", str(node)) | |||
node2 = ExternalLink(wraptext("http://example.com/")) | |||
self.assertEqual("[http://example.com/]", str(node2)) | |||
node3 = ExternalLink(wraptext("http://example.com/"), wrap([])) | |||
self.assertEqual("[http://example.com/ ]", str(node3)) | |||
node4 = ExternalLink(wraptext("http://example.com/"), | |||
wraptext("Example Web Page")) | |||
self.assertEqual("[http://example.com/ Example Web Page]", str(node4)) | |||
def test_iternodes(self): | |||
"""test ExternalLink.__iternodes__()""" | |||
node1n1 = Text("http://example.com/") | |||
node2n1 = Text("http://example.com/") | |||
node2n2, node2n3 = Text("Example"), Text("Page") | |||
node1 = ExternalLink(wrap([node1n1]), brackets=False) | |||
node2 = ExternalLink(wrap([node2n1]), wrap([node2n2, node2n3])) | |||
gen1 = node1.__iternodes__(getnodes) | |||
gen2 = node2.__iternodes__(getnodes) | |||
self.assertEqual((None, node1), next(gen1)) | |||
self.assertEqual((None, node2), next(gen2)) | |||
self.assertEqual((node1.url, node1n1), next(gen1)) | |||
self.assertEqual((node2.url, node2n1), next(gen2)) | |||
self.assertEqual((node2.title, node2n2), next(gen2)) | |||
self.assertEqual((node2.title, node2n3), next(gen2)) | |||
self.assertRaises(StopIteration, next, gen1) | |||
self.assertRaises(StopIteration, next, gen2) | |||
def test_strip(self): | |||
"""test ExternalLink.__strip__()""" | |||
node1 = ExternalLink(wraptext("http://example.com"), brackets=False) | |||
node2 = ExternalLink(wraptext("http://example.com")) | |||
node3 = ExternalLink(wraptext("http://example.com"), wrap([])) | |||
node4 = ExternalLink(wraptext("http://example.com"), wraptext("Link")) | |||
for a in (True, False): | |||
for b in (True, False): | |||
self.assertEqual("http://example.com", node1.__strip__(a, b)) | |||
self.assertEqual(None, node2.__strip__(a, b)) | |||
self.assertEqual(None, node3.__strip__(a, b)) | |||
self.assertEqual("Link", node4.__strip__(a, b)) | |||
def test_showtree(self): | |||
"""test ExternalLink.__showtree__()""" | |||
output = [] | |||
getter, marker = object(), object() | |||
get = lambda code: output.append((getter, code)) | |||
mark = lambda: output.append(marker) | |||
node1 = ExternalLink(wraptext("http://example.com"), brackets=False) | |||
node2 = ExternalLink(wraptext("http://example.com"), wraptext("Link")) | |||
node1.__showtree__(output.append, get, mark) | |||
node2.__showtree__(output.append, get, mark) | |||
valid = [ | |||
(getter, node1.url), "[", (getter, node2.url), | |||
(getter, node2.title), "]"] | |||
self.assertEqual(valid, output) | |||
def test_url(self):
    """test getter/setter for the url attribute"""
    url = wraptext("http://example.com/")
    bare = ExternalLink(url, brackets=False)
    titled = ExternalLink(url, wraptext("Example"))
    # The getter returns the exact Wikicode object passed to the constructor.
    for node in (bare, titled):
        self.assertIs(url, node.url)
    # Assigning a plain string should parse it into an equivalent Wikicode.
    for node in (bare, titled):
        node.url = "mailto:héhehé@spam.com"
        self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"), node.url)
def test_title(self):
    """test getter/setter for the title attribute"""
    title = wraptext("Example!")
    untitled = ExternalLink(wraptext("http://example.com/"), brackets=False)
    titled = ExternalLink(wraptext("http://example.com/"), title)
    # A link constructed without a title has none; otherwise the getter
    # returns the exact Wikicode object given.
    self.assertIs(None, untitled.title)
    self.assertIs(title, titled.title)
    # The title can be removed by assigning None...
    titled.title = None
    self.assertIs(None, titled.title)
    # ...and a plain string is parsed into an equivalent Wikicode.
    titled.title = "My Website"
    self.assertWikicodeEqual(wraptext("My Website"), titled.title)
def test_brackets(self):
    """test getter/setter for the brackets attribute"""
    free = ExternalLink(wraptext("http://example.com/"), brackets=False)
    bracketed = ExternalLink(wraptext("http://example.com/"), wraptext("Link"))
    self.assertFalse(free.brackets)
    self.assertTrue(bracketed.brackets)
    # Toggle the flag on each node and confirm both the attribute and the
    # rendered wikitext reflect the change. Note that the formerly
    # bracketed node loses its title in the string form once brackets
    # are disabled.
    free.brackets = True
    bracketed.brackets = False
    self.assertTrue(free.brackets)
    self.assertFalse(bracketed.brackets)
    self.assertEqual("[http://example.com/]", str(free))
    self.assertEqual("http://example.com/", str(bracketed))
# Allow running this test module directly; verbosity=2 prints each test name.
if __name__ == "__main__":
unittest.main(verbosity=2)
@@ -36,9 +36,9 @@ class TestParser(TreeEqualityTestCase): | |||
def test_use_c(self): | |||
"""make sure the correct tokenizer is used""" | |||
if parser.use_c: | |||
self.assertTrue(parser.Parser(None)._tokenizer.USES_C) | |||
self.assertTrue(parser.Parser()._tokenizer.USES_C) | |||
parser.use_c = False | |||
self.assertFalse(parser.Parser(None)._tokenizer.USES_C) | |||
self.assertFalse(parser.Parser()._tokenizer.USES_C) | |||
def test_parsing(self): | |||
"""integration test for parsing overall""" | |||
@@ -59,7 +59,7 @@ class TestParser(TreeEqualityTestCase): | |||
])) | |||
]) | |||
]) | |||
actual = parser.Parser(text).parse() | |||
actual = parser.Parser().parse(text) | |||
self.assertWikicodeEqual(expected, actual) | |||
if __name__ == "__main__": | |||
@@ -276,6 +276,7 @@ class TestWikicode(TreeEqualityTestCase): | |||
self.assertEqual(["{{{e}}}"], get_filter("arguments")) | |||
self.assertIs(code.get(4), get_filter("arguments")[0]) | |||
self.assertEqual([], get_filter("comments")) | |||
self.assertEqual([], get_filter("external_links")) | |||
self.assertEqual([], get_filter("headings")) | |||
self.assertEqual([], get_filter("html_entities")) | |||
self.assertEqual([], get_filter("tags")) | |||
@@ -0,0 +1,473 @@ | |||
name: basic | |||
label: basic external link | |||
input: "http://example.com/" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose()] | |||
--- | |||
name: basic_brackets | |||
label: basic external link in brackets | |||
input: "[http://example.com/]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose()] | |||
--- | |||
name: brackets_space | |||
label: basic external link in brackets, with a space after | |||
input: "[http://example.com/ ]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), ExternalLinkClose()] | |||
--- | |||
name: brackets_title | |||
label: basic external link in brackets, with a title | |||
input: "[http://example.com/ Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
--- | |||
name: brackets_multiword_title | |||
label: basic external link in brackets, with a multi-word title | |||
input: "[http://example.com/ Example Web Page]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example Web Page"), ExternalLinkClose()] | |||
--- | |||
name: brackets_adjacent | |||
label: three adjacent bracket-enclosed external links | |||
input: "[http://foo.com/ Foo][http://bar.com/ Bar]\n[http://baz.com/ Baz]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo.com/"), ExternalLinkSeparator(), Text(text="Foo"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://bar.com/"), ExternalLinkSeparator(), Text(text="Bar"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=True), Text(text="http://baz.com/"), ExternalLinkSeparator(), Text(text="Baz"), ExternalLinkClose()] | |||
--- | |||
name: brackets_newline_before | |||
label: bracket-enclosed link with a newline before the title | |||
input: "[http://example.com/ \nExample]" | |||
output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" \nExample]")] | |||
--- | |||
name: brackets_newline_inside | |||
label: bracket-enclosed link with a newline in the title | |||
input: "[http://example.com/ Example \nWeb Page]" | |||
output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example \nWeb Page]")] | |||
--- | |||
name: brackets_newline_after | |||
label: bracket-enclosed link with a newline after the title | |||
input: "[http://example.com/ Example\n]" | |||
output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example\n]")] | |||
--- | |||
name: brackets_space_before | |||
label: bracket-enclosed link with a space before the URL | |||
input: "[ http://example.com Example]" | |||
output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" Example]")] | |||
--- | |||
name: brackets_title_like_url | |||
label: bracket-enclosed link with a title that looks like a URL | |||
input: "[http://example.com http://example.com]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="http://example.com"), ExternalLinkClose()] | |||
--- | |||
name: brackets_recursive | |||
label: bracket-enclosed link with a bracket-enclosed link as the title | |||
input: "[http://example.com [http://example.com]]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="[http://example.com"), ExternalLinkClose(), Text(text="]")] | |||
--- | |||
name: period_after | |||
label: a period after a free link that is excluded | |||
input: "http://example.com." | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=".")] | |||
--- | |||
name: colons_after | |||
label: colons after a free link that are excluded | |||
input: "http://example.com/foo:bar.:;baz!?," | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo:bar.:;baz"), ExternalLinkClose(), Text(text="!?,")] | |||
--- | |||
name: close_paren_after_excluded | |||
label: a closing parenthesis after a free link that is excluded | |||
input: "http://example.)com)" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.)com"), ExternalLinkClose(), Text(text=")")] | |||
--- | |||
name: close_paren_after_included | |||
label: a closing parenthesis after a free link that is included because of an opening parenthesis in the URL | |||
input: "http://example.(com)" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.(com)"), ExternalLinkClose()] | |||
--- | |||
name: open_bracket_inside | |||
label: an open bracket inside a free link that causes it to be ended abruptly | |||
input: "http://foobar[baz.com" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foobar"), ExternalLinkClose(), Text(text="[baz.com")] | |||
--- | |||
name: brackets_period_after | |||
label: a period after a bracket-enclosed link that is included | |||
input: "[http://example.com. Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com."), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
--- | |||
name: brackets_colons_after | |||
label: colons after a bracket-enclosed link that are included | |||
input: "[http://example.com/foo:bar.:;baz!?, Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo:bar.:;baz!?,"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
--- | |||
name: brackets_close_paren_after_included | |||
label: a closing parenthesis after a bracket-enclosed link that is included | |||
input: "[http://example.)com) Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.)com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
--- | |||
name: brackets_close_paren_after_included_2 | |||
label: a closing parenthesis after a bracket-enclosed link that is also included | |||
input: "[http://example.(com) Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
--- | |||
name: brackets_open_bracket_inside | |||
label: an open bracket inside a bracket-enclosed link that is also included | |||
input: "[http://foobar[baz.com Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
--- | |||
name: adjacent_space | |||
label: two free links separated by a space | |||
input: "http://example.com http://example.com" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] | |||
--- | |||
name: adjacent_newline | |||
label: two free links separated by a newline | |||
input: "http://example.com\nhttp://example.com" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] | |||
--- | |||
name: adjacent_close_bracket | |||
label: two free links separated by a close bracket | |||
input: "http://example.com]http://example.com" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="]"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] | |||
--- | |||
name: html_entity_in_url | |||
label: a HTML entity parsed correctly inside a free link | |||
input: "http://exa&nbsp;mple.com/"
output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="mple.com/"), ExternalLinkClose()] | |||
--- | |||
name: template_in_url | |||
label: a template parsed correctly inside a free link | |||
input: "http://exa{{template}}mple.com/" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), TemplateOpen(), Text(text="template"), TemplateClose(), Text(text="mple.com/"), ExternalLinkClose()] | |||
--- | |||
name: argument_in_url | |||
label: an argument parsed correctly inside a free link | |||
input: "http://exa{{{argument}}}mple.com/" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ArgumentOpen(), Text(text="argument"), ArgumentClose(), Text(text="mple.com/"), ExternalLinkClose()] | |||
--- | |||
name: wikilink_in_url | |||
label: a wikilink that destroys a free link | |||
input: "http://exa[[wikilink]]mple.com/" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ExternalLinkClose(), WikilinkOpen(), Text(text="wikilink"), WikilinkClose(), Text(text="mple.com/")] | |||
--- | |||
name: external_link_in_url | |||
label: a bracketed link that destroys a free link | |||
input: "http://exa[http://example.com/]mple.com/" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose(), Text(text="mple.com/")] | |||
--- | |||
name: spaces_padding | |||
label: spaces padding a free link | |||
input: " http://example.com " | |||
output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" ")] | |||
--- | |||
name: text_and_spaces_padding | |||
label: text and spaces padding a free link | |||
input: "x http://example.com x" | |||
output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" x")] | |||
--- | |||
name: template_before | |||
label: a template before a free link | |||
input: "{{foo}}http://example.com" | |||
output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] | |||
--- | |||
name: spaces_padding_no_slashes | |||
label: spaces padding a free link with no slashes after the colon | |||
input: " mailto:example@example.com " | |||
output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" ")] | |||
--- | |||
name: text_and_spaces_padding_no_slashes | |||
label: text and spaces padding a free link with no slashes after the colon | |||
input: "x mailto:example@example.com x" | |||
output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" x")] | |||
--- | |||
name: template_before_no_slashes | |||
label: a template before a free link with no slashes after the colon | |||
input: "{{foo}}mailto:example@example.com" | |||
output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()] | |||
--- | |||
name: no_slashes | |||
label: a free link with no slashes after the colon | |||
input: "mailto:example@example.com" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()] | |||
--- | |||
name: slashes_optional | |||
label: a free link using a scheme that doesn't need slashes, but has them anyway | |||
input: "mailto://example@example.com" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="mailto://example@example.com"), ExternalLinkClose()] | |||
--- | |||
name: short | |||
label: a very short free link | |||
input: "mailto://abc" | |||
output: [ExternalLinkOpen(brackets=False), Text(text="mailto://abc"), ExternalLinkClose()] | |||
--- | |||
name: slashes_missing | |||
label: slashes missing from a free link with a scheme that requires them | |||
input: "http:example@example.com" | |||
output: [Text(text="http:example@example.com")] | |||
--- | |||
name: no_scheme_but_slashes | |||
label: no scheme in a free link, but slashes (protocol-relative free links are not supported) | |||
input: "//example.com" | |||
output: [Text(text="//example.com")] | |||
--- | |||
name: no_scheme_but_colon | |||
label: no scheme in a free link, but a colon | |||
input: " :example.com" | |||
output: [Text(text=" :example.com")] | |||
--- | |||
name: no_scheme_but_colon_and_slashes | |||
label: no scheme in a free link, but a colon and slashes | |||
input: " ://example.com" | |||
output: [Text(text=" ://example.com")] | |||
--- | |||
name: fake_scheme_no_slashes | |||
label: a nonexistent scheme in a free link, without slashes | |||
input: "fake:example.com" | |||
output: [Text(text="fake:example.com")] | |||
--- | |||
name: fake_scheme_slashes | |||
label: a nonexistent scheme in a free link, with slashes | |||
input: "fake://example.com" | |||
output: [Text(text="fake://example.com")] | |||
--- | |||
name: fake_scheme_brackets_no_slashes | |||
label: a nonexistent scheme in a bracketed link, without slashes | |||
input: "[fake:example.com]" | |||
output: [Text(text="[fake:example.com]")] | |||
--- | |||
name: fake_scheme_brackets_slashes | |||
label: a nonexistent scheme in a bracketed link, with slashes
input: "[fake://example.com]" | |||
output: [Text(text="[fake://example.com]")] | |||
--- | |||
name: interrupted_scheme | |||
label: an otherwise valid scheme with something in the middle of it, in a free link | |||
input: "ht?tp://example.com" | |||
output: [Text(text="ht?tp://example.com")] | |||
--- | |||
name: interrupted_scheme_brackets | |||
label: an otherwise valid scheme with something in the middle of it, in a bracketed link | |||
input: "[ht?tp://example.com]" | |||
output: [Text(text="[ht?tp://example.com]")] | |||
--- | |||
name: no_slashes_brackets | |||
label: no slashes after the colon in a bracketed link | |||
input: "[mailto:example@example.com Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="mailto:example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
--- | |||
name: space_before_no_slashes_brackets | |||
label: a space before a bracketed link with no slashes after the colon | |||
input: "[ mailto:example@example.com Example]" | |||
output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" Example]")] | |||
--- | |||
name: slashes_optional_brackets | |||
label: a bracketed link using a scheme that doesn't need slashes, but has them anyway | |||
input: "[mailto://example@example.com Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="mailto://example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
--- | |||
name: short_brackets | |||
label: a very short link in brackets | |||
input: "[mailto://abc Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="mailto://abc"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
--- | |||
name: slashes_missing_brackets | |||
label: slashes missing from a scheme that requires them in a bracketed link | |||
input: "[http:example@example.com Example]" | |||
output: [Text(text="[http:example@example.com Example]")] | |||
--- | |||
name: protocol_relative
label: a protocol-relative link (in brackets) | |||
input: "[//example.com Example]" | |||
output: [ExternalLinkOpen(brackets=True), Text(text="//example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||
--- | |||
name: scheme_missing_but_colon_brackets | |||
label: scheme missing from a bracketed link, but with a colon | |||
input: "[:example.com Example]" | |||
output: [Text(text="[:example.com Example]")] | |||
--- | |||
name: scheme_missing_but_colon_slashes_brackets | |||
label: scheme missing from a bracketed link, but with a colon and slashes | |||
input: "[://example.com Example]" | |||
output: [Text(text="[://example.com Example]")] | |||
--- | |||
name: unclosed_protocol_relative | |||
label: an unclosed protocol-relative bracketed link | |||
input: "[//example.com" | |||
output: [Text(text="[//example.com")] | |||
--- | |||
name: space_before_protocol_relative
label: a space before a protocol-relative bracketed link | |||
input: "[ //example.com]" | |||
output: [Text(text="[ //example.com]")] | |||
--- | |||
name: unclosed_just_scheme | |||
label: an unclosed bracketed link, ending after the scheme | |||
input: "[http" | |||
output: [Text(text="[http")] | |||
--- | |||
name: unclosed_scheme_colon | |||
label: an unclosed bracketed link, ending after the colon | |||
input: "[http:" | |||
output: [Text(text="[http:")] | |||
--- | |||
name: unclosed_scheme_colon_slashes | |||
label: an unclosed bracketed link, ending after the slashes | |||
input: "[http://" | |||
output: [Text(text="[http://")] | |||
--- | |||
name: incomplete_bracket | |||
label: just an open bracket | |||
input: "[" | |||
output: [Text(text="[")] | |||
--- | |||
name: incomplete_scheme_colon | |||
label: a free link with just a scheme and a colon | |||
input: "http:" | |||
output: [Text(text="http:")] | |||
--- | |||
name: incomplete_scheme_colon_slashes | |||
label: a free link with just a scheme, colon, and slashes | |||
input: "http://" | |||
output: [Text(text="http://")] | |||
--- | |||
name: brackets_scheme_but_no_url | |||
label: brackets around a scheme and a colon | |||
input: "[mailto:]" | |||
output: [Text(text="[mailto:]")] | |||
--- | |||
name: brackets_scheme_slashes_but_no_url | |||
label: brackets around a scheme, colon, and slashes | |||
input: "[http://]" | |||
output: [Text(text="[http://]")] | |||
--- | |||
name: brackets_scheme_title_but_no_url | |||
label: brackets around a scheme, colon, and slashes, with a title | |||
input: "[http:// Example]" | |||
output: [Text(text="[http:// Example]")] |
@@ -12,6 +12,13 @@ output: [TemplateOpen(), ArgumentOpen(), ArgumentOpen(), Text(text="foo"), Argum | |||
--- | |||
name: link_in_template_name | |||
label: a wikilink inside a template name, which breaks the template | |||
input: "{{foo[[bar]]}}" | |||
output: [Text(text="{{foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="}}")] | |||
--- | |||
name: rich_heading | |||
label: a heading with templates/wikilinks in it | |||
input: "== Head{{ing}} [[with]] {{{funky|{{stuf}}}}} ==" | |||
@@ -51,3 +58,17 @@ name: wildcard_redux | |||
label: an even wilder assortment of various things | |||
input: "{{a|b|{{c|[[d]]{{{e}}}}}}}[[f|{{{g}}}<!--h-->]]{{i|j= }}" | |||
output: [TemplateOpen(), Text(text="a"), TemplateParamSeparator(), Text(text="b"), TemplateParamSeparator(), TemplateOpen(), Text(text="c"), TemplateParamSeparator(), WikilinkOpen(), Text(text="d"), WikilinkClose(), ArgumentOpen(), Text(text="e"), ArgumentClose(), TemplateClose(), TemplateClose(), WikilinkOpen(), Text(text="f"), WikilinkSeparator(), ArgumentOpen(), Text(text="g"), ArgumentClose(), CommentStart(), Text(text="h"), CommentEnd(), WikilinkClose(), TemplateOpen(), Text(text="i"), TemplateParamSeparator(), Text(text="j"), TemplateParamEquals(), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), TemplateClose()] | |||
--- | |||
name: link_inside_dl | |||
label: an external link inside a def list, such that the external link is parsed | |||
input: ";;;mailto:example" | |||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example"), ExternalLinkClose()] | |||
--- | |||
name: link_inside_dl_2 | |||
label: an external link inside a def list, such that the external link is not parsed | |||
input: ";;;malito:example" | |||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")] |
@@ -40,17 +40,17 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar|b | |||
--- | |||
name: nested | |||
label: a wikilink nested within the value of another | |||
input: "[[foo|[[bar]]]]" | |||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()]
---
name: newline_text
label: a newline in the middle of the text | |||
input: "[[foo|foo\nbar]]" | |||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="foo\nbar"), WikilinkClose()] | |||
--- | |||
name: nested_with_text | |||
label: a wikilink nested within the value of another, separated by other data | |||
input: "[[foo|a[[b]]c]]" | |||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()]
---
name: nested_with_text
label: a left bracket in the middle of the text | |||
input: "[[foo|bar[baz]]" | |||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[baz"), WikilinkClose()] | |||
--- | |||
@@ -96,13 +96,34 @@ output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), | |||
--- | |||
name: invalid_nested_padding
label: invalid wikilink: trying to nest in the wrong context, with a text param | |||
input: "[[foo[[bar]]|baz]]" | |||
output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="|baz]]")] | |||
--- | |||
name: invalid_nested_text | |||
label: invalid wikilink: a wikilink nested within the value of another | |||
input: "[[foo|[[bar]]" | |||
output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()] | |||
--- | |||
name: invalid_nested_text_2 | |||
label: invalid wikilink: a wikilink nested within the value of another, two pairs of closing brackets | |||
input: "[[foo|[[bar]]]]" | |||
output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")] | |||
--- | |||
name: invalid_nested_text_padding | |||
label: invalid wikilink: a wikilink nested within the value of another, separated by other data | |||
input: "[[foo|a[[b]]c]]" | |||
output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")] | |||
--- | |||
name: incomplete_open_only | |||
label: incomplete wikilinks: just an open | |||
input: "[[" | |||