diff --git a/CHANGELOG b/CHANGELOG index 8922738..122247f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,8 +1,10 @@ v0.3 (unreleased): -- Added complete support for HTML Tags, along with appropriate unit tests. This - includes forms like foo, , and wiki-markup tags - like bold ('''), italics (''), and lists (*, #, ; and :). +- Added complete support for HTML Tags, including forms like foo, + , and wiki-markup tags like bold ('''), italics (''), and + lists (*, #, ; and :). +- Added support for ExternalLinks (http://example.com/ and + [http://example.com/ Example]). - Wikicode's filter methods are now passed 'recursive=True' by default instead of False. This is a breaking change if you rely on any filter() methods being non-recursive by default. @@ -14,7 +16,7 @@ v0.3 (unreleased): - Renamed Template.has_param() to has() for consistency with Template's other methods; has_param() is now an alias. - The C tokenizer extension now works on Python 3 in addition to Python 2.7. -- Various fixes and cleanup. +- Various bugfixes, internal changes, and cleanup. v0.2 (released June 20, 2013): diff --git a/docs/api/mwparserfromhell.nodes.rst b/docs/api/mwparserfromhell.nodes.rst index a093c17..7043070 100644 --- a/docs/api/mwparserfromhell.nodes.rst +++ b/docs/api/mwparserfromhell.nodes.rst @@ -25,6 +25,14 @@ nodes Package :undoc-members: :show-inheritance: +:mod:`external_link` Module +--------------------------- + +.. automodule:: mwparserfromhell.nodes.external_link + :members: + :undoc-members: + :show-inheritance: + :mod:`heading` Module --------------------- diff --git a/docs/api/mwparserfromhell.rst b/docs/api/mwparserfromhell.rst index b682139..0da522e 100644 --- a/docs/api/mwparserfromhell.rst +++ b/docs/api/mwparserfromhell.rst @@ -30,10 +30,10 @@ mwparserfromhell Package :members: :undoc-members: -:mod:`tag_defs` Module ----------------------- +:mod:`definitions` Module +------------------------- -.. automodule:: mwparserfromhell.tag_defs +.. 
automodule:: mwparserfromhell.definitions :members: :mod:`utils` Module diff --git a/docs/changelog.rst b/docs/changelog.rst index 86dfd78..f43a3c9 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -7,10 +7,11 @@ v0.3 Unreleased (`changes `__): -- Added complete support for HTML :py:class:`Tags <.Tag>`, along with - appropriate unit tests. This includes forms like ``foo``, - ````, and wiki-markup tags like bold (``'''``), italics - (``''``), and lists (``*``, ``#``, ``;`` and ``:``). +- Added complete support for HTML :py:class:`Tags <.Tag>`, including forms like + ``foo``, ````, and wiki-markup tags like bold + (``'''``), italics (``''``), and lists (``*``, ``#``, ``;`` and ``:``). +- Added support for :py:class:`.ExternalLink`\ s (``http://example.com/`` and + ``[http://example.com/ Example]``). - :py:class:`Wikicode's <.Wikicode>` :py:meth:`.filter` methods are now passed *recursive=True* by default instead of *False*. **This is a breaking change if you rely on any filter() methods being non-recursive by default.** @@ -25,7 +26,7 @@ Unreleased :py:meth:`~.Template.has` for consistency with :py:class:`~.Template`\ 's other methods; :py:meth:`~.has_param` is now an alias. - The C tokenizer extension now works on Python 3 in addition to Python 2.7. -- Various fixes and cleanup. +- Various bugfixes, internal changes, and cleanup. v0.2 ---- diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 738d4c2..74e1616 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -34,6 +34,7 @@ __license__ = "MIT License" __version__ = "0.3.dev" __email__ = "ben.kurtovic@verizon.net" -from . import compat, nodes, parser, smart_list, string_mixin, utils, wikicode +from . 
import (compat, definitions, nodes, parser, smart_list, string_mixin, + utils, wikicode) parse = utils.parse_anything diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/definitions.py similarity index 73% rename from mwparserfromhell/tag_defs.py rename to mwparserfromhell/definitions.py index 2395fc6..9449bcb 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/definitions.py @@ -20,12 +20,22 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -"""Contains data regarding certain HTML tags.""" +"""Contains data about certain markup, like HTML tags and external links.""" from __future__ import unicode_literals __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", - "is_single_only"] + "is_single_only", "is_scheme"] + +URI_SCHEMES = { + # [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943 + "http": True, "https": True, "ftp": True, "ftps": True, "ssh": True, + "sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False, + "sips": False, "gopher": True, "telnet": True, "nntp": True, + "worldwind": True, "mailto": False, "tel": False, "sms": False, + "news": False, "svn": True, "git": True, "mms": True, "bitcoin": False, + "magnet": False, "urn": False, "geo": False +} PARSER_BLACKLIST = [ # enwiki extensions @ 2013-06-28 @@ -70,3 +80,12 @@ def is_single(tag): def is_single_only(tag): """Return whether or not the given *tag* must exist without a close tag.""" return tag.lower() in SINGLE_ONLY + +def is_scheme(scheme, slashes=True, reverse=False): + """Return whether *scheme* is valid for external links.""" + if reverse: # Convenience for C + scheme = scheme[::-1] + scheme = scheme.lower() + if slashes: + return scheme in URI_SCHEMES + return scheme in URI_SCHEMES and not URI_SCHEMES[scheme] diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index faaa0b2..ba97b3f 100644 --- a/mwparserfromhell/nodes/__init__.py +++ 
b/mwparserfromhell/nodes/__init__.py @@ -69,6 +69,7 @@ from . import extras from .text import Text from .argument import Argument from .comment import Comment +from .external_link import ExternalLink from .heading import Heading from .html_entity import HTMLEntity from .tag import Tag diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py new file mode 100644 index 0000000..d74f6b3 --- /dev/null +++ b/mwparserfromhell/nodes/external_link.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012-2013 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +from . 
import Node +from ..compat import str +from ..utils import parse_anything + +__all__ = ["ExternalLink"] + +class ExternalLink(Node): + """Represents an external link, like ``[http://example.com/ Example]``.""" + + def __init__(self, url, title=None, brackets=True): + super(ExternalLink, self).__init__() + self._url = url + self._title = title + self._brackets = brackets + + def __unicode__(self): + if self.brackets: + if self.title is not None: + return "[" + str(self.url) + " " + str(self.title) + "]" + return "[" + str(self.url) + "]" + return str(self.url) + + def __iternodes__(self, getter): + yield None, self + for child in getter(self.url): + yield self.url, child + if self.title is not None: + for child in getter(self.title): + yield self.title, child + + def __strip__(self, normalize, collapse): + if self.brackets: + if self.title: + return self.title.strip_code(normalize, collapse) + return None + return self.url.strip_code(normalize, collapse) + + def __showtree__(self, write, get, mark): + if self.brackets: + write("[") + get(self.url) + if self.title is not None: + get(self.title) + if self.brackets: + write("]") + + @property + def url(self): + """The URL of the link target, as a :py:class:`~.Wikicode` object.""" + return self._url + + @property + def title(self): + """The link title (if given), as a :py:class:`~.Wikicode` object.""" + return self._title + + @property + def brackets(self): + """Whether to enclose the URL in brackets or display it straight.""" + return self._brackets + + @url.setter + def url(self, value): + from ..parser import contexts + self._url = parse_anything(value, contexts.EXT_LINK_URI) + + @title.setter + def title(self, value): + self._title = None if value is None else parse_anything(value) + + @brackets.setter + def brackets(self, value): + self._brackets = bool(value) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index b4aec3e..80b8a88 100644 --- a/mwparserfromhell/nodes/tag.py +++ 
b/mwparserfromhell/nodes/tag.py @@ -25,7 +25,7 @@ from __future__ import unicode_literals from . import Node, Text from .extras import Attribute from ..compat import str -from ..tag_defs import is_visible +from ..definitions import is_visible from ..utils import parse_anything __all__ = ["Tag"] @@ -152,7 +152,7 @@ class Tag(Node): This makes the tag look like a lone close tag. It is technically invalid and is only parsable Wikicode when the tag itself is single-only, like ``
`` and ````. See - :py:func:`.tag_defs.is_single_only`. + :py:func:`.definitions.is_single_only`. """ return self._invalid @@ -161,7 +161,7 @@ class Tag(Node): """Whether the tag is implicitly self-closing, with no ending slash. This is only possible for specific "single" tags like ``
`` and - ``
  • ``. See :py:func:`.tag_defs.is_single`. This field only has an + ``
  • ``. See :py:func:`.definitions.is_single`. This field only has an effect if :py:attr:`self_closing` is also ``True``. """ return self._implicit diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 1fb95b5..22c3dc2 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -46,16 +46,15 @@ class Parser(object): :py:class:`~.Node`\ s by the :py:class:`~.Builder`. """ - def __init__(self, text): - self.text = text + def __init__(self): if use_c and CTokenizer: self._tokenizer = CTokenizer() else: self._tokenizer = Tokenizer() self._builder = Builder() - def parse(self): - """Return a string as a parsed :py:class:`~.Wikicode` object tree.""" - tokens = self._tokenizer.tokenize(self.text) + def parse(self, text, context=0): + """Parse *text*, returning a :py:class:`~.Wikicode` object tree.""" + tokens = self._tokenizer.tokenize(text, context) code = self._builder.build(tokens) return code diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 196ef14..d31f450 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -24,8 +24,8 @@ from __future__ import unicode_literals from . 
import tokens from ..compat import str -from ..nodes import (Argument, Comment, Heading, HTMLEntity, Tag, Template, - Text, Wikilink) +from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, + Template, Text, Wikilink) from ..nodes.extras import Attribute, Parameter from ..smart_list import SmartList from ..wikicode import Wikicode @@ -142,6 +142,22 @@ class Builder(object): else: self._write(self._handle_token(token)) + def _handle_external_link(self, token): + """Handle when an external link is at the head of the tokens.""" + brackets, url = token.brackets, None + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.ExternalLinkSeparator): + url = self._pop() + self._push() + elif isinstance(token, tokens.ExternalLinkClose): + if url is not None: + return ExternalLink(url, self._pop(), brackets) + return ExternalLink(self._pop(), brackets=brackets) + else: + self._write(self._handle_token(token)) + def _handle_entity(self): """Handle a case where an HTML entity is at the head of the tokens.""" token = self._tokens.pop() @@ -244,6 +260,8 @@ class Builder(object): return self._handle_argument() elif isinstance(token, tokens.WikilinkOpen): return self._handle_wikilink() + elif isinstance(token, tokens.ExternalLinkOpen): + return self._handle_external_link(token) elif isinstance(token, tokens.HTMLEntityStart): return self._handle_entity() elif isinstance(token, tokens.HeadingStart): diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index a1b67be..33da8f7 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -51,6 +51,12 @@ Local (stack-specific) contexts: * :py:const:`WIKILINK_TITLE` * :py:const:`WIKILINK_TEXT` +* :py:const:`EXT_LINK` + + * :py:const:`EXT_LINK_URI` + * :py:const:`EXT_LINK_TITLE` + * :py:const:`EXT_LINK_BRACKETS` + * :py:const:`HEADING` * :py:const:`HEADING_LEVEL_1` @@ -94,6 +100,7 @@ Aggregate contexts: * 
:py:const:`FAIL` * :py:const:`UNSAFE` * :py:const:`DOUBLE` +* :py:const:`INVALID_LINK` """ @@ -112,35 +119,40 @@ WIKILINK_TITLE = 1 << 5 WIKILINK_TEXT = 1 << 6 WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT -HEADING_LEVEL_1 = 1 << 7 -HEADING_LEVEL_2 = 1 << 8 -HEADING_LEVEL_3 = 1 << 9 -HEADING_LEVEL_4 = 1 << 10 -HEADING_LEVEL_5 = 1 << 11 -HEADING_LEVEL_6 = 1 << 12 +EXT_LINK_URI = 1 << 7 +EXT_LINK_TITLE = 1 << 8 +EXT_LINK_BRACKETS = 1 << 9 +EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS + +HEADING_LEVEL_1 = 1 << 10 +HEADING_LEVEL_2 = 1 << 11 +HEADING_LEVEL_3 = 1 << 12 +HEADING_LEVEL_4 = 1 << 13 +HEADING_LEVEL_5 = 1 << 14 +HEADING_LEVEL_6 = 1 << 15 HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) -TAG_OPEN = 1 << 13 -TAG_ATTR = 1 << 14 -TAG_BODY = 1 << 15 -TAG_CLOSE = 1 << 16 +TAG_OPEN = 1 << 16 +TAG_ATTR = 1 << 17 +TAG_BODY = 1 << 18 +TAG_CLOSE = 1 << 19 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE -STYLE_ITALICS = 1 << 17 -STYLE_BOLD = 1 << 18 -STYLE_PASS_AGAIN = 1 << 19 -STYLE_SECOND_PASS = 1 << 20 +STYLE_ITALICS = 1 << 20 +STYLE_BOLD = 1 << 21 +STYLE_PASS_AGAIN = 1 << 22 +STYLE_SECOND_PASS = 1 << 23 STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS -DL_TERM = 1 << 21 +DL_TERM = 1 << 24 -HAS_TEXT = 1 << 22 -FAIL_ON_TEXT = 1 << 23 -FAIL_NEXT = 1 << 24 -FAIL_ON_LBRACE = 1 << 25 -FAIL_ON_RBRACE = 1 << 26 -FAIL_ON_EQUALS = 1 << 27 +HAS_TEXT = 1 << 25 +FAIL_ON_TEXT = 1 << 26 +FAIL_NEXT = 1 << 27 +FAIL_ON_LBRACE = 1 << 28 +FAIL_ON_RBRACE = 1 << 29 +FAIL_ON_EQUALS = 1 << 30 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) @@ -150,7 +162,8 @@ GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + HEADING + TAG + STYLE -UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + - TAG_CLOSE) +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + 
STYLE +UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE +INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 67a4ae6..07d3988 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -24,6 +24,20 @@ SOFTWARE. #include "tokenizer.h" /* + Determine whether the given Py_UNICODE is a marker. +*/ +static int is_marker(Py_UNICODE this) +{ + int i; + + for (i = 0; i < NUM_MARKERS; i++) { + if (*MARKERS[i] == this) + return 1; + } + return 0; +} + +/* Given a context, return the heading level encoded within it. */ static int heading_level_from_context(int n) @@ -37,13 +51,14 @@ static int heading_level_from_context(int n) } /* - Call the given function in tag_defs, using 'tag' as a parameter, and return - its output as a bool. + Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as + parameters, and return its output as a bool. */ -static int call_tag_def_func(const char* funcname, PyObject* tag) +static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, + PyObject* in3) { - PyObject* func = PyObject_GetAttrString(tag_defs, funcname); - PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL); + PyObject* func = PyObject_GetAttrString(definitions, funcname); + PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL); int ans = (result == Py_True) ? 
1 : 0; Py_DECREF(func); @@ -65,7 +80,7 @@ static PyObject* strip_tag_name(PyObject* token) Py_DECREF(text); if (!rstripped) return NULL; - lowered = PyObject_CallMethod(rstripped, "rstrip", NULL); + lowered = PyObject_CallMethod(rstripped, "lower", NULL); Py_DECREF(rstripped); return lowered; } @@ -85,7 +100,7 @@ static Textbuffer* Textbuffer_new(void) PyErr_NoMemory(); return NULL; } - buffer->next = NULL; + buffer->prev = buffer->next = NULL; return buffer; } @@ -113,10 +128,10 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) if (!new) return -1; new->next = self; + self->prev = new; *this = self = new; } - self->data[self->size] = code; - self->size++; + self->data[self->size++] = code; return 0; } @@ -345,7 +360,7 @@ static void* Tokenizer_fail_route(Tokenizer* self) } /* - Write a token to the end of the current token stack. + Write a token to the current token stack. */ static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) { @@ -366,7 +381,8 @@ static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) } /* - Write a token to the end of the current token stack. + Write a token to the current token stack, with kwargs. Steals a reference + to kwargs. */ static int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token, PyObject* kwargs, int first) @@ -417,6 +433,42 @@ static int Tokenizer_emit_text(Tokenizer* self, const char* text) } /* + Write the contents of another textbuffer to the current textbuffer, + deallocating it in the process. 
+*/ +static int +Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse) +{ + Textbuffer *original = buffer; + int i; + + if (reverse) { + do { + for (i = buffer->size - 1; i >= 0; i--) { + if (Tokenizer_emit_char(self, buffer->data[i])) { + Textbuffer_dealloc(original); + return -1; + } + } + } while ((buffer = buffer->next)); + } + else { + while (buffer->next) + buffer = buffer->next; + do { + for (i = 0; i < buffer->size; i++) { + if (Tokenizer_emit_char(self, buffer->data[i])) { + Textbuffer_dealloc(original); + return -1; + } + } + } while ((buffer = buffer->prev)); + } + Textbuffer_dealloc(original); + return 0; +} + +/* Write a series of tokens to the current stack at once. */ static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) @@ -808,6 +860,353 @@ static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) } /* + Parse the URI scheme of a bracket-enclosed external link. +*/ +static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer* buffer; + PyObject* scheme; + Py_UNICODE this; + int slashes, i; + + if (Tokenizer_push(self, LC_EXT_LINK_URI)) + return -1; + if (Tokenizer_READ(self, 0) == *"/" && Tokenizer_READ(self, 1) == *"/") { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } + else { + buffer = Textbuffer_new(); + if (!buffer) + return -1; + while ((this = Tokenizer_READ(self, 0)) != *"") { + i = 0; + while (1) { + if (!valid[i]) + goto end_of_loop; + if (this == valid[i]) + break; + i++; + } + Textbuffer_write(&buffer, this); + if (Tokenizer_emit_char(self, this)) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + } + end_of_loop: + if (this != *":") { + Textbuffer_dealloc(buffer); + Tokenizer_fail_route(self); + return 0; + } + if (Tokenizer_emit_char(self, *":")) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + slashes = (Tokenizer_READ(self, 0) == *"/" && + 
Tokenizer_READ(self, 1) == *"/"); + if (slashes) { + if (Tokenizer_emit_text(self, "//")) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head += 2; + } + scheme = Textbuffer_render(buffer); + Textbuffer_dealloc(buffer); + if (!scheme) + return -1; + if (!IS_SCHEME(scheme, slashes, 0)) { + Py_DECREF(scheme); + Tokenizer_fail_route(self); + return 0; + } + Py_DECREF(scheme); + } + return 0; +} + +/* + Parse the URI scheme of a free (no brackets) external link. +*/ +static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; + PyObject *scheme; + Py_UNICODE chunk; + int slashes, i, j; + + if (!scheme_buffer) + return -1; + // We have to backtrack through the textbuffer looking for our scheme since + // it was just parsed as text: + temp_buffer = self->topstack->textbuffer; + while (temp_buffer) { + for (i = temp_buffer->size - 1; i >= 0; i--) { + chunk = temp_buffer->data[i]; + if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + goto end_of_loop; + j = 0; + while (1) { + if (!valid[j]) { + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + if (chunk == valid[j]) + break; + j++; + } + Textbuffer_write(&scheme_buffer, chunk); + } + temp_buffer = temp_buffer->next; + } + end_of_loop: + scheme = Textbuffer_render(scheme_buffer); + if (!scheme) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + slashes = (Tokenizer_READ(self, 0) == *"/" && + Tokenizer_READ(self, 1) == *"/"); + if (!IS_SCHEME(scheme, slashes, 1)) { + Py_DECREF(scheme); + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + Py_DECREF(scheme); + if (Tokenizer_push(self, LC_EXT_LINK_URI)) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1)) + return -1; + if (Tokenizer_emit_char(self, *":")) + return -1; + if (slashes) { + if (Tokenizer_emit_text(self, "//")) + 
return -1; + self->head += 2; + } + return 0; +} + +/* + Handle text in a free external link, including trailing punctuation. +*/ +static int +Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, + Textbuffer** tail, Py_UNICODE this) +{ + #define PUSH_TAIL_BUFFER(tail, error) \ + if ((tail)->size || (tail)->next) { \ + if (Tokenizer_emit_textbuffer(self, tail, 0)) \ + return error; \ + tail = Textbuffer_new(); \ + if (!(tail)) \ + return error; \ + } + + if (this == *"(" && !(*parens)) { + *parens = 1; + PUSH_TAIL_BUFFER(*tail, -1) + } + else if (this == *"," || this == *";" || this == *"\\" || this == *"." || + this == *":" || this == *"!" || this == *"?" || + (!(*parens) && this == *")")) + return Textbuffer_write(tail, this); + else + PUSH_TAIL_BUFFER(*tail, -1) + return Tokenizer_emit_char(self, this); +} + +/* + Really parse an external link. +*/ +static PyObject* +Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, + Textbuffer** extra) +{ + Py_UNICODE this, next; + int parens = 0; + + if (brackets ? 
Tokenizer_parse_bracketed_uri_scheme(self) : + Tokenizer_parse_free_uri_scheme(self)) + return NULL; + if (BAD_ROUTE) + return NULL; + this = Tokenizer_READ(self, 0); + if (this == *"" || this == *"\n" || this == *" " || this == *"]") + return Tokenizer_fail_route(self); + if (!brackets && this == *"[") + return Tokenizer_fail_route(self); + while (1) { + this = Tokenizer_READ(self, 0); + next = Tokenizer_READ(self, 1); + if (this == *"" || this == *"\n") { + if (brackets) + return Tokenizer_fail_route(self); + self->head--; + return Tokenizer_pop(self); + } + if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_template_or_argument(self)) + return NULL; + } + else if (this == *"[") { + if (!brackets) { + self->head--; + return Tokenizer_pop(self); + } + if (Tokenizer_emit_char(self, *"[")) + return NULL; + } + else if (this == *"]") { + if (!brackets) + self->head--; + return Tokenizer_pop(self); + } + else if (this == *"&") { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_entity(self)) + return NULL; + } + else if (this == *" ") { + if (brackets) { + if (Tokenizer_emit(self, ExternalLinkSeparator)) + return NULL; + self->topstack->context ^= LC_EXT_LINK_URI; + self->topstack->context |= LC_EXT_LINK_TITLE; + self->head++; + return Tokenizer_parse(self, 0, 0); + } + if (Textbuffer_write(extra, *" ")) + return NULL; + return Tokenizer_pop(self); + } + else if (!brackets) { + if (Tokenizer_handle_free_link_text(self, &parens, extra, this)) + return NULL; + } + else { + if (Tokenizer_emit_char(self, this)) + return NULL; + } + self->head++; + } +} + +/* + Remove the URI scheme of a new external link from the textbuffer. 
+*/ +static int +Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) +{ + PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"), + *split, *scheme; + Py_ssize_t length; + Textbuffer* temp; + + if (!text) + return -1; + split = PyObject_CallMethod(text, "split", "si", ":", 1); + Py_DECREF(text); + if (!split) + return -1; + scheme = PyList_GET_ITEM(split, 0); + length = PyUnicode_GET_SIZE(scheme); + while (length) { + temp = self->topstack->textbuffer; + if (length <= temp->size) { + temp->size -= length; + break; + } + length -= temp->size; + self->topstack->textbuffer = temp->next; + free(temp->data); + free(temp); + } + Py_DECREF(split); + return 0; +} + +/* + Parse an external link at the head of the wikicode string. +*/ +static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) +{ + #define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK + #define NOT_A_LINK \ + if (!brackets && self->topstack->context & LC_DLTERM) \ + return Tokenizer_handle_dl_term(self); \ + return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)) + + Py_ssize_t reset = self->head; + PyObject *link, *kwargs; + Textbuffer *extra = 0; + + if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { + NOT_A_LINK; + } + extra = Textbuffer_new(); + if (!extra) + return -1; + self->head++; + link = Tokenizer_really_parse_external_link(self, brackets, &extra); + if (BAD_ROUTE) { + RESET_ROUTE(); + self->head = reset; + Textbuffer_dealloc(extra); + NOT_A_LINK; + } + if (!link) { + Textbuffer_dealloc(extra); + return -1; + } + if (!brackets) { + if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) { + Textbuffer_dealloc(extra); + Py_DECREF(link); + return -1; + } + } + kwargs = PyDict_New(); + if (!kwargs) { + Textbuffer_dealloc(extra); + Py_DECREF(link); + return -1; + } + PyDict_SetItemString(kwargs, "brackets", brackets ? 
Py_True : Py_False); + if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) { + Textbuffer_dealloc(extra); + Py_DECREF(link); + return -1; + } + if (Tokenizer_emit_all(self, link)) { + Textbuffer_dealloc(extra); + Py_DECREF(link); + return -1; + } + Py_DECREF(link); + if (Tokenizer_emit(self, ExternalLinkClose)) { + Textbuffer_dealloc(extra); + return -1; + } + if (extra->size || extra->next) + return Tokenizer_emit_textbuffer(self, extra, 0); + Textbuffer_dealloc(extra); + return 0; +} + +/* Parse a section heading at the head of the wikicode string. */ static int Tokenizer_parse_heading(Tokenizer* self) @@ -1238,15 +1637,8 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text) { Py_UNICODE next = Tokenizer_READ(self, 1); - int i, is_marker = 0; - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == text) { - is_marker = 1; - break; - } - } - if (!is_marker || !Tokenizer_CAN_RECURSE(self)) + if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self)) return Tokenizer_emit_char(self, text); else if (text == next && next == *"{") return Tokenizer_parse_template_or_argument(self); @@ -1264,17 +1656,11 @@ static int Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk) { PyObject *trash; - int first_time, i, is_marker = 0, escaped; + int first_time, escaped; if (data->context & TAG_NAME) { first_time = !(data->context & TAG_NOTE_SPACE); - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == chunk) { - is_marker = 1; - break; - } - } - if (is_marker || (Py_UNICODE_ISSPACE(chunk) && first_time)) { + if (is_marker(chunk) || (Py_UNICODE_ISSPACE(chunk) && first_time)) { // Tags must start with text, not spaces Tokenizer_fail_route(self); return 0; @@ -1623,7 +2009,6 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) Textbuffer* buf; PyObject *name, *tag; Py_UNICODE this; - int is_marker, i; self->head += 2; buf = Textbuffer_new(); @@ 
-1631,14 +2016,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) return -1; while (1) { this = Tokenizer_READ(self, pos); - is_marker = 0; - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == this) { - is_marker = 1; - break; - } - } - if (is_marker) { + if (is_marker(this)) { name = Textbuffer_render(buf); if (!name) { Textbuffer_dealloc(buf); @@ -1985,9 +2363,9 @@ static int Tokenizer_handle_hr(Tokenizer* self) self->head++; } markup = Textbuffer_render(buffer); + Textbuffer_dealloc(buffer); if (!markup) return -1; - Textbuffer_dealloc(buffer); kwargs = PyDict_New(); if (!kwargs) return -1; @@ -2047,21 +2425,21 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) */ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) { - if (context & LC_FAIL_NEXT) { + if (context & LC_FAIL_NEXT) return -1; - } - if (context & LC_WIKILINK_TITLE) { - if (data == *"]" || data == *"{") + if (context & LC_WIKILINK) { + if (context & LC_WIKILINK_TEXT) + return (data == *"[" && Tokenizer_READ(self, 1) == *"[") ? -1 : 0; + else if (data == *"]" || data == *"{") self->topstack->context |= LC_FAIL_NEXT; else if (data == *"\n" || data == *"[" || data == *"}") return -1; return 0; } - if (context & LC_TAG_CLOSE) { - if (data == *"<") - return -1; - return 0; - } + if (context & LC_EXT_LINK_TITLE) + return (data == *"\n") ? -1 : 0; + if (context & LC_TAG_CLOSE) + return (data == *"<") ? 
-1 : 0; if (context & LC_TEMPLATE_NAME) { if (data == *"{" || data == *"}" || data == *"[") { self->topstack->context |= LC_FAIL_NEXT; @@ -2126,7 +2504,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) */ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) { - int this_context, is_marker, i; + int this_context; Py_UNICODE this, next, next_next, last; PyObject* temp; @@ -2146,14 +2524,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) return Tokenizer_fail_route(self); } } - is_marker = 0; - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == this) { - is_marker = 1; - break; - } - } - if (!is_marker) { + if (!is_marker(this)) { if (Tokenizer_emit_char(self, this)) return NULL; self->head++; @@ -2192,9 +2563,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) if (Tokenizer_emit_char(self, this)) return NULL; } - else if (this == next && next == *"[") { - if (!(this_context & LC_WIKILINK_TITLE) && - Tokenizer_CAN_RECURSE(self)) { + else if (this == next && next == *"[" && Tokenizer_CAN_RECURSE(self)) { + if (!(this_context & AGG_INVALID_LINK)) { if (Tokenizer_parse_wikilink(self)) return NULL; } @@ -2207,6 +2577,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) } else if (this == next && next == *"]" && this_context & LC_WIKILINK) return Tokenizer_handle_wikilink_end(self); + else if (this == *"[") { + if (Tokenizer_parse_external_link(self, 1)) + return NULL; + } + else if (this == *":" && !is_marker(last)) { + if (Tokenizer_parse_external_link(self, 0)) + return NULL; + } + else if (this == *"]" && this_context & LC_EXT_LINK_TITLE) + return Tokenizer_pop(self); else if (this == *"=" && !(self->global & GL_HEADING)) { if (last == *"\n" || last == *"") { if (Tokenizer_parse_heading(self)) @@ -2243,9 +2623,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) return NULL; } } - else if (this == *"<") { - 
if (!(this_context & LC_TAG_CLOSE) && - Tokenizer_CAN_RECURSE(self)) { + else if (this == *"<" && !(this_context & LC_TAG_CLOSE)) { + if (Tokenizer_CAN_RECURSE(self)) { if (Tokenizer_parse_tag(self)) return NULL; } @@ -2289,8 +2668,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) { PyObject *text, *temp; + int context = 0; - if (PyArg_ParseTuple(args, "U", &text)) { + if (PyArg_ParseTuple(args, "U|i", &text, &context)) { Py_XDECREF(self->text); self->text = PySequence_Fast(text, "expected a sequence"); } @@ -2299,7 +2679,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) Py_ssize_t size; /* Failed to parse a Unicode object; try a string instead. */ PyErr_Clear(); - if (!PyArg_ParseTuple(args, "s#", &encoded, &size)) + if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context)) return NULL; temp = PyUnicode_FromStringAndSize(encoded, size); if (!text) @@ -2311,7 +2691,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) } self->head = self->global = self->depth = self->cycles = 0; self->length = PyList_GET_SIZE(self->text); - return Tokenizer_parse(self, 0, 1); + return Tokenizer_parse(self, context, 1); } static int load_entitydefs(void) @@ -2389,6 +2769,11 @@ static int load_tokens(void) WikilinkSeparator = PyObject_GetAttrString(tokens, "WikilinkSeparator"); WikilinkClose = PyObject_GetAttrString(tokens, "WikilinkClose"); + ExternalLinkOpen = PyObject_GetAttrString(tokens, "ExternalLinkOpen"); + ExternalLinkSeparator = PyObject_GetAttrString(tokens, + "ExternalLinkSeparator"); + ExternalLinkClose = PyObject_GetAttrString(tokens, "ExternalLinkClose"); + HTMLEntityStart = PyObject_GetAttrString(tokens, "HTMLEntityStart"); HTMLEntityNumeric = PyObject_GetAttrString(tokens, "HTMLEntityNumeric"); HTMLEntityHex = PyObject_GetAttrString(tokens, "HTMLEntityHex"); @@ -2413,13 +2798,13 @@ static int load_tokens(void) return 0; } 
-static int load_tag_defs(void) +static int load_definitions(void) { PyObject *tempmod, *globals = PyEval_GetGlobals(), *locals = PyEval_GetLocals(), *fromlist = PyList_New(1), - *modname = IMPORT_NAME_FUNC("tag_defs"); + *modname = IMPORT_NAME_FUNC("definitions"); char *name = "mwparserfromhell"; if (!fromlist || !modname) @@ -2429,7 +2814,7 @@ static int load_tag_defs(void) Py_DECREF(fromlist); if (!tempmod) return -1; - tag_defs = PyObject_GetAttrString(tempmod, "tag_defs"); + definitions = PyObject_GetAttrString(tempmod, "definitions"); Py_DECREF(tempmod); return 0; } @@ -2452,7 +2837,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void) NOARGS = PyTuple_New(0); if (!EMPTY || !NOARGS) INIT_ERROR; - if (load_entitydefs() || load_tokens() || load_tag_defs()) + if (load_entitydefs() || load_tokens() || load_definitions()) INIT_ERROR; #ifdef IS_PY3K return module; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 264360e..da3c57a 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -62,7 +62,7 @@ static char** entitydefs; static PyObject* EMPTY; static PyObject* NOARGS; -static PyObject* tag_defs; +static PyObject* definitions; /* Tokens: */ @@ -82,6 +82,10 @@ static PyObject* WikilinkOpen; static PyObject* WikilinkSeparator; static PyObject* WikilinkClose; +static PyObject* ExternalLinkOpen; +static PyObject* ExternalLinkSeparator; +static PyObject* ExternalLinkClose; + static PyObject* HTMLEntityStart; static PyObject* HTMLEntityNumeric; static PyObject* HTMLEntityHex; @@ -104,48 +108,53 @@ static PyObject* TagCloseClose; /* Local contexts: */ -#define LC_TEMPLATE 0x0000007 -#define LC_TEMPLATE_NAME 0x0000001 -#define LC_TEMPLATE_PARAM_KEY 0x0000002 -#define LC_TEMPLATE_PARAM_VALUE 0x0000004 - -#define LC_ARGUMENT 0x0000018 -#define LC_ARGUMENT_NAME 0x0000008 -#define LC_ARGUMENT_DEFAULT 0x0000010 - -#define LC_WIKILINK 0x0000060 -#define LC_WIKILINK_TITLE 0x0000020 -#define LC_WIKILINK_TEXT 
0x0000040 - -#define LC_HEADING 0x0001F80 -#define LC_HEADING_LEVEL_1 0x0000080 -#define LC_HEADING_LEVEL_2 0x0000100 -#define LC_HEADING_LEVEL_3 0x0000200 -#define LC_HEADING_LEVEL_4 0x0000400 -#define LC_HEADING_LEVEL_5 0x0000800 -#define LC_HEADING_LEVEL_6 0x0001000 - -#define LC_TAG 0x001E000 -#define LC_TAG_OPEN 0x0002000 -#define LC_TAG_ATTR 0x0004000 -#define LC_TAG_BODY 0x0008000 -#define LC_TAG_CLOSE 0x0010000 - -#define LC_STYLE 0x01E0000 -#define LC_STYLE_ITALICS 0x0020000 -#define LC_STYLE_BOLD 0x0040000 -#define LC_STYLE_PASS_AGAIN 0x0080000 -#define LC_STYLE_SECOND_PASS 0x0100000 - -#define LC_DLTERM 0x0200000 - -#define LC_SAFETY_CHECK 0xFC00000 -#define LC_HAS_TEXT 0x0400000 -#define LC_FAIL_ON_TEXT 0x0800000 -#define LC_FAIL_NEXT 0x1000000 -#define LC_FAIL_ON_LBRACE 0x2000000 -#define LC_FAIL_ON_RBRACE 0x4000000 -#define LC_FAIL_ON_EQUALS 0x8000000 +#define LC_TEMPLATE 0x00000007 +#define LC_TEMPLATE_NAME 0x00000001 +#define LC_TEMPLATE_PARAM_KEY 0x00000002 +#define LC_TEMPLATE_PARAM_VALUE 0x00000004 + +#define LC_ARGUMENT 0x00000018 +#define LC_ARGUMENT_NAME 0x00000008 +#define LC_ARGUMENT_DEFAULT 0x00000010 + +#define LC_WIKILINK 0x00000060 +#define LC_WIKILINK_TITLE 0x00000020 +#define LC_WIKILINK_TEXT 0x00000040 + +#define LC_EXT_LINK 0x00000380 +#define LC_EXT_LINK_URI 0x00000080 +#define LC_EXT_LINK_TITLE 0x00000100 +#define LC_EXT_LINK_BRACKETS 0x00000200 + +#define LC_HEADING 0x0000FC00 +#define LC_HEADING_LEVEL_1 0x00000400 +#define LC_HEADING_LEVEL_2 0x00000800 +#define LC_HEADING_LEVEL_3 0x00001000 +#define LC_HEADING_LEVEL_4 0x00002000 +#define LC_HEADING_LEVEL_5 0x00004000 +#define LC_HEADING_LEVEL_6 0x00008000 + +#define LC_TAG 0x000F0000 +#define LC_TAG_OPEN 0x00010000 +#define LC_TAG_ATTR 0x00020000 +#define LC_TAG_BODY 0x00040000 +#define LC_TAG_CLOSE 0x00080000 + +#define LC_STYLE 0x00F00000 +#define LC_STYLE_ITALICS 0x00100000 +#define LC_STYLE_BOLD 0x00200000 +#define LC_STYLE_PASS_AGAIN 0x00400000 +#define LC_STYLE_SECOND_PASS 
0x00800000 + +#define LC_DLTERM 0x01000000 + +#define LC_SAFETY_CHECK 0x7E000000 +#define LC_HAS_TEXT 0x02000000 +#define LC_FAIL_ON_TEXT 0x04000000 +#define LC_FAIL_NEXT 0x08000000 +#define LC_FAIL_ON_LBRACE 0x10000000 +#define LC_FAIL_ON_RBRACE 0x20000000 +#define LC_FAIL_ON_EQUALS 0x40000000 /* Global contexts: */ @@ -153,9 +162,10 @@ static PyObject* TagCloseClose; /* Aggregate contexts: */ -#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) -#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) -#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) +#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK) /* Tag contexts: */ @@ -174,6 +184,7 @@ static PyObject* TagCloseClose; struct Textbuffer { Py_ssize_t size; Py_UNICODE* data; + struct Textbuffer* prev; struct Textbuffer* next; }; @@ -228,12 +239,14 @@ typedef struct { #define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1) -/* Macros for accessing HTML tag definitions: */ +/* Macros for accessing definitions: */ #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? 
"dt" : "li") -#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag)) -#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag)) -#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag)) +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL)) +#define IS_SCHEME(scheme, slashes, reverse) \ + (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False)) /* Function prototypes: */ @@ -247,6 +260,8 @@ static void TagData_dealloc(TagData*); static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); static void Tokenizer_dealloc(Tokenizer*); static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); +static int Tokenizer_parse_entity(Tokenizer*); +static int Tokenizer_handle_dl_term(Tokenizer*); static int Tokenizer_parse_tag(Tokenizer*); static PyObject* Tokenizer_parse(Tokenizer*, int, int); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 583d2f8..6ab549a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,7 +26,8 @@ import re from . 
import contexts, tokens from ..compat import htmlentities -from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only +from ..definitions import (get_html_tag, is_parsable, is_single, + is_single_only, is_scheme) __all__ = ["Tokenizer"] @@ -60,7 +61,7 @@ class Tokenizer(object): START = object() END = object() MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", - ":", "/", "-", "\n", END] + ":", "/", "-", "\n", START, END] MAX_DEPTH = 40 MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) @@ -311,6 +312,168 @@ class Tokenizer(object): self._head += 1 return self._pop() + def _parse_bracketed_uri_scheme(self): + """Parse the URI scheme of a bracket-enclosed external link.""" + self._push(contexts.EXT_LINK_URI) + if self._read() == self._read(1) == "/": + self._emit_text("//") + self._head += 2 + else: + valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + all_valid = lambda: all(char in valid for char in self._read()) + scheme = "" + while self._read() is not self.END and all_valid(): + scheme += self._read() + self._emit_text(self._read()) + self._head += 1 + if self._read() != ":": + self._fail_route() + self._emit_text(":") + self._head += 1 + slashes = self._read() == self._read(1) == "/" + if slashes: + self._emit_text("//") + self._head += 2 + if not is_scheme(scheme, slashes): + self._fail_route() + + def _parse_free_uri_scheme(self): + """Parse the URI scheme of a free (no brackets) external link.""" + valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + scheme = [] + try: + # We have to backtrack through the textbuffer looking for our + # scheme since it was just parsed as text: + for chunk in reversed(self._textbuffer): + for char in reversed(chunk): + if char.isspace() or char in self.MARKERS: + raise StopIteration() + if char not in valid: + raise BadRoute() + scheme.append(char) + except StopIteration: + pass + scheme = "".join(reversed(scheme)) + slashes = self._read() == 
self._read(1) == "/" + if not is_scheme(scheme, slashes): + raise BadRoute() + self._push(contexts.EXT_LINK_URI) + self._emit_text(scheme) + self._emit_text(":") + if slashes: + self._emit_text("//") + self._head += 2 + + def _handle_free_link_text(self, punct, tail, this): + """Handle text in a free ext link, including trailing punctuation.""" + if "(" in this and ")" in punct: + punct = punct[:-1] # ')' is not longer valid punctuation + if this.endswith(punct): + for i in reversed(range(-len(this), 0)): + if i == -len(this) or this[i - 1] not in punct: + break + stripped = this[:i] + if stripped and tail: + self._emit_text(tail) + tail = "" + tail += this[i:] + this = stripped + elif tail: + self._emit_text(tail) + tail = "" + self._emit_text(this) + return punct, tail + + def _really_parse_external_link(self, brackets): + """Really parse an external link.""" + if brackets: + self._parse_bracketed_uri_scheme() + invalid = ("\n", " ", "]") + else: + self._parse_free_uri_scheme() + invalid = ("\n", " ", "[", "]") + punct = tuple(",;\.:!?)") + if self._read() is self.END or self._read()[0] in invalid: + self._fail_route() + tail = "" + while True: + this, next = self._read(), self._read(1) + if this is self.END or this == "\n": + if brackets: + self._fail_route() + return self._pop(), tail, -1 + elif this == next == "{" and self._can_recurse(): + if tail: + self._emit_text(tail) + tail = "" + self._parse_template_or_argument() + elif this == "[": + if brackets: + self._emit_text("[") + else: + return self._pop(), tail, -1 + elif this == "]": + return self._pop(), tail, 0 if brackets else -1 + elif this == "&": + if tail: + self._emit_text(tail) + tail = "" + self._parse_entity() + elif " " in this: + before, after = this.split(" ", 1) + if brackets: + self._emit_text(before) + self._emit(tokens.ExternalLinkSeparator()) + if after: + self._emit_text(after) + self._context ^= contexts.EXT_LINK_URI + self._context |= contexts.EXT_LINK_TITLE + self._head += 1 + return 
self._parse(push=False), None, 0 + punct, tail = self._handle_free_link_text(punct, tail, before) + return self._pop(), tail + " " + after, 0 + elif not brackets: + punct, tail = self._handle_free_link_text(punct, tail, this) + else: + self._emit_text(this) + self._head += 1 + + def _remove_uri_scheme_from_textbuffer(self, scheme): + """Remove the URI scheme of a new external link from the textbuffer.""" + length = len(scheme) + while length: + if length < len(self._textbuffer[-1]): + self._textbuffer[-1] = self._textbuffer[-1][:-length] + break + length -= len(self._textbuffer[-1]) + self._textbuffer.pop() + + def _parse_external_link(self, brackets): + """Parse an external link at the head of the wikicode string.""" + reset = self._head + self._head += 1 + try: + bad_context = self._context & contexts.INVALID_LINK + if bad_context or not self._can_recurse(): + raise BadRoute() + link, extra, delta = self._really_parse_external_link(brackets) + except BadRoute: + self._head = reset + if not brackets and self._context & contexts.DL_TERM: + self._handle_dl_term() + else: + self._emit_text(self._read()) + else: + if not brackets: + scheme = link[0].text.split(":", 1)[0] + self._remove_uri_scheme_from_textbuffer(scheme) + self._emit(tokens.ExternalLinkOpen(brackets=brackets)) + self._emit_all(link) + self._emit(tokens.ExternalLinkClose()) + self._head += delta + if extra: + self._emit_text(extra) + def _parse_heading(self): """Parse a section heading at the head of the wikicode string.""" self._global |= contexts.GL_HEADING @@ -810,12 +973,16 @@ class Tokenizer(object): context = self._context if context & contexts.FAIL_NEXT: return False - if context & contexts.WIKILINK_TITLE: - if this == "]" or this == "{": + if context & contexts.WIKILINK: + if context & contexts.WIKILINK_TEXT: + return not (this == self._read(1) == "[") + elif this == "]" or this == "{": self._context |= contexts.FAIL_NEXT elif this == "\n" or this == "[" or this == "}": return False return True 
+ elif context & contexts.EXT_LINK_TITLE: + return this != "\n" elif context & contexts.TEMPLATE_NAME: if this == "{" or this == "}" or this == "[": self._context |= contexts.FAIL_NEXT @@ -898,8 +1065,8 @@ class Tokenizer(object): return self._handle_argument_end() else: self._emit_text("}") - elif this == next == "[": - if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): + elif this == next == "[" and self._can_recurse(): + if not self._context & contexts.INVALID_LINK: self._parse_wikilink() else: self._emit_text("[") @@ -907,6 +1074,12 @@ class Tokenizer(object): self._handle_wikilink_separator() elif this == next == "]" and self._context & contexts.WIKILINK: return self._handle_wikilink_end() + elif this == "[": + self._parse_external_link(True) + elif this == ":" and self._read(-1) not in self.MARKERS: + self._parse_external_link(False) + elif this == "]" and self._context & contexts.EXT_LINK_TITLE: + return self._pop() elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() @@ -928,8 +1101,8 @@ class Tokenizer(object): self._handle_tag_open_close() else: self._handle_invalid_tag_start() - elif this == "<": - if not self._context & contexts.TAG_CLOSE and self._can_recurse(): + elif this == "<" and not self._context & contexts.TAG_CLOSE: + if self._can_recurse(): self._parse_tag() else: self._emit_text("<") @@ -952,8 +1125,9 @@ class Tokenizer(object): self._emit_text(this) self._head += 1 - def tokenize(self, text): + def tokenize(self, text, context=0): """Build a list of tokens from a string of wikicode and return it.""" split = self.regex.split(text) self._text = [segment for segment in split if segment] - return self._parse() + self._head = self._global = self._depth = self._cycles = 0 + return self._parse(context) diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 0ffac86..57308ea 100644 --- a/mwparserfromhell/parser/tokens.py +++ 
b/mwparserfromhell/parser/tokens.py @@ -84,6 +84,10 @@ WikilinkOpen = make("WikilinkOpen") # [[ WikilinkSeparator = make("WikilinkSeparator") # | WikilinkClose = make("WikilinkClose") # ]] +ExternalLinkOpen = make("ExternalLinkOpen") # [ +ExternalLinkSeparator = make("ExternalLinkSeparator") # +ExternalLinkClose = make("ExternalLinkClose") # ] + HTMLEntityStart = make("HTMLEntityStart") # & HTMLEntityNumeric = make("HTMLEntityNumeric") # # HTMLEntityHex = make("HTMLEntityHex") # x diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index 31e5ba0..758e751 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -33,7 +33,7 @@ from .smart_list import SmartList __all__ = ["parse_anything"] -def parse_anything(value): +def parse_anything(value, context=0): """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. This differs from :py:meth:`.Parser.parse` in that we accept more than just @@ -44,6 +44,12 @@ def parse_anything(value): on-the-fly by various methods of :py:class:`~.Wikicode` and others like :py:class:`~.Template`, such as :py:meth:`wikicode.insert() <.Wikicode.insert>` or setting :py:meth:`template.name <.Template.name>`. + + If given, *context* will be passed as a starting context to the parser. + This is helpful when this function is used inside node attribute setters. + For example, :py:class:`~.ExternalLink`\ 's :py:attr:`~.ExternalLink.url` + setter sets *context* to :py:mod:`contexts.EXT_LINK_URI <.contexts>` to + prevent the URL itself from becoming an :py:class:`~.ExternalLink`. 
""" from .parser import Parser from .wikicode import Wikicode @@ -53,17 +59,17 @@ def parse_anything(value): elif isinstance(value, Node): return Wikicode(SmartList([value])) elif isinstance(value, str): - return Parser(value).parse() + return Parser().parse(value, context) elif isinstance(value, bytes): - return Parser(value.decode("utf8")).parse() + return Parser().parse(value.decode("utf8"), context) elif isinstance(value, int): - return Parser(str(value)).parse() + return Parser().parse(str(value), context) elif value is None: return Wikicode(SmartList()) try: nodelist = SmartList() for item in value: - nodelist += parse_anything(item).nodes + nodelist += parse_anything(item, context).nodes except TypeError: error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" raise ValueError(error.format(type(value).__name__, value)) diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index b5e854d..c3249d9 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -24,8 +24,8 @@ from __future__ import unicode_literals import re from .compat import maxsize, py3k, str -from .nodes import (Argument, Comment, Heading, HTMLEntity, Node, Tag, - Template, Text, Wikilink) +from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, + Node, Tag, Template, Text, Wikilink) from .string_mixin import StringMixIn from .utils import parse_anything @@ -509,6 +509,6 @@ class Wikicode(StringMixIn): return "\n".join(self._get_tree(self, [], marker, 0)) Wikicode._build_filter_methods( - arguments=Argument, comments=Comment, headings=Heading, - html_entities=HTMLEntity, tags=Tag, templates=Template, text=Text, - wikilinks=Wikilink) + arguments=Argument, comments=Comment, external_links=ExternalLink, + headings=Heading, html_entities=HTMLEntity, tags=Tag, templates=Template, + text=Text, wikilinks=Wikilink) diff --git a/tests/test_builder.py b/tests/test_builder.py index 29ae65a..152ab53 100644 --- 
a/tests/test_builder.py +++ b/tests/test_builder.py @@ -23,8 +23,8 @@ from __future__ import unicode_literals import unittest -from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity, - Tag, Template, Text, Wikilink) +from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, + HTMLEntity, Tag, Template, Text, Wikilink) from mwparserfromhell.nodes.extras import Attribute, Parameter from mwparserfromhell.parser import tokens from mwparserfromhell.parser.builder import Builder @@ -150,6 +150,48 @@ class TestBuilder(TreeEqualityTestCase): for test, valid in tests: self.assertWikicodeEqual(valid, self.builder.build(test)) + def test_external_link(self): + """tests for building ExternalLink nodes""" + tests = [ + ([tokens.ExternalLinkOpen(brackets=False), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"), + brackets=False)])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"))])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkSeparator(), tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"), wrap([]))])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkSeparator(), tokens.Text(text="Example"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"), + wraptext("Example"))])), + + ([tokens.ExternalLinkOpen(brackets=False), + tokens.Text(text="http://example"), tokens.Text(text=".com/foo"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example", ".com/foo"), + brackets=False)])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example"), tokens.Text(text=".com/foo"), + tokens.ExternalLinkSeparator(), 
tokens.Text(text="Example"), + tokens.Text(text=" Web Page"), tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example", ".com/foo"), + wraptext("Example", " Web Page"))])), + ] + for test, valid in tests: + self.assertWikicodeEqual(valid, self.builder.build(test)) + def test_html_entity(self): """tests for building HTMLEntity nodes""" tests = [ diff --git a/tests/test_external_link.py b/tests/test_external_link.py new file mode 100644 index 0000000..13a82bf --- /dev/null +++ b/tests/test_external_link.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012-2013 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from __future__ import unicode_literals +import unittest + +from mwparserfromhell.compat import str +from mwparserfromhell.nodes import ExternalLink, Text + +from ._test_tree_equality import TreeEqualityTestCase, getnodes, wrap, wraptext + +class TestExternalLink(TreeEqualityTestCase): + """Test cases for the ExternalLink node.""" + + def test_unicode(self): + """test ExternalLink.__unicode__()""" + node = ExternalLink(wraptext("http://example.com/"), brackets=False) + self.assertEqual("http://example.com/", str(node)) + node2 = ExternalLink(wraptext("http://example.com/")) + self.assertEqual("[http://example.com/]", str(node2)) + node3 = ExternalLink(wraptext("http://example.com/"), wrap([])) + self.assertEqual("[http://example.com/ ]", str(node3)) + node4 = ExternalLink(wraptext("http://example.com/"), + wraptext("Example Web Page")) + self.assertEqual("[http://example.com/ Example Web Page]", str(node4)) + + def test_iternodes(self): + """test ExternalLink.__iternodes__()""" + node1n1 = Text("http://example.com/") + node2n1 = Text("http://example.com/") + node2n2, node2n3 = Text("Example"), Text("Page") + node1 = ExternalLink(wrap([node1n1]), brackets=False) + node2 = ExternalLink(wrap([node2n1]), wrap([node2n2, node2n3])) + gen1 = node1.__iternodes__(getnodes) + gen2 = node2.__iternodes__(getnodes) + self.assertEqual((None, node1), next(gen1)) + self.assertEqual((None, node2), next(gen2)) + self.assertEqual((node1.url, node1n1), next(gen1)) + self.assertEqual((node2.url, node2n1), next(gen2)) + self.assertEqual((node2.title, node2n2), next(gen2)) + self.assertEqual((node2.title, node2n3), next(gen2)) + self.assertRaises(StopIteration, next, gen1) + self.assertRaises(StopIteration, next, gen2) + + def test_strip(self): + """test ExternalLink.__strip__()""" + node1 = ExternalLink(wraptext("http://example.com"), brackets=False) + node2 = ExternalLink(wraptext("http://example.com")) + node3 = ExternalLink(wraptext("http://example.com"), wrap([])) + node4 = 
ExternalLink(wraptext("http://example.com"), wraptext("Link")) + for a in (True, False): + for b in (True, False): + self.assertEqual("http://example.com", node1.__strip__(a, b)) + self.assertEqual(None, node2.__strip__(a, b)) + self.assertEqual(None, node3.__strip__(a, b)) + self.assertEqual("Link", node4.__strip__(a, b)) + + def test_showtree(self): + """test ExternalLink.__showtree__()""" + output = [] + getter, marker = object(), object() + get = lambda code: output.append((getter, code)) + mark = lambda: output.append(marker) + node1 = ExternalLink(wraptext("http://example.com"), brackets=False) + node2 = ExternalLink(wraptext("http://example.com"), wraptext("Link")) + node1.__showtree__(output.append, get, mark) + node2.__showtree__(output.append, get, mark) + valid = [ + (getter, node1.url), "[", (getter, node2.url), + (getter, node2.title), "]"] + self.assertEqual(valid, output) + + def test_url(self): + """test getter/setter for the url attribute""" + url = wraptext("http://example.com/") + node1 = ExternalLink(url, brackets=False) + node2 = ExternalLink(url, wraptext("Example")) + self.assertIs(url, node1.url) + self.assertIs(url, node2.url) + node1.url = "mailto:héhehé@spam.com" + node2.url = "mailto:héhehé@spam.com" + self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"), node1.url) + self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"), node2.url) + + def test_title(self): + """test getter/setter for the title attribute""" + title = wraptext("Example!") + node1 = ExternalLink(wraptext("http://example.com/"), brackets=False) + node2 = ExternalLink(wraptext("http://example.com/"), title) + self.assertIs(None, node1.title) + self.assertIs(title, node2.title) + node2.title = None + self.assertIs(None, node2.title) + node2.title = "My Website" + self.assertWikicodeEqual(wraptext("My Website"), node2.title) + + def test_brackets(self): + """test getter/setter for the brackets attribute""" + node1 = ExternalLink(wraptext("http://example.com/"), 
brackets=False) + node2 = ExternalLink(wraptext("http://example.com/"), wraptext("Link")) + self.assertFalse(node1.brackets) + self.assertTrue(node2.brackets) + node1.brackets = True + node2.brackets = False + self.assertTrue(node1.brackets) + self.assertFalse(node2.brackets) + self.assertEqual("[http://example.com/]", str(node1)) + self.assertEqual("http://example.com/", str(node2)) + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/tests/test_parser.py b/tests/test_parser.py index ec5f065..8760c0e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -36,9 +36,9 @@ class TestParser(TreeEqualityTestCase): def test_use_c(self): """make sure the correct tokenizer is used""" if parser.use_c: - self.assertTrue(parser.Parser(None)._tokenizer.USES_C) + self.assertTrue(parser.Parser()._tokenizer.USES_C) parser.use_c = False - self.assertFalse(parser.Parser(None)._tokenizer.USES_C) + self.assertFalse(parser.Parser()._tokenizer.USES_C) def test_parsing(self): """integration test for parsing overall""" @@ -59,7 +59,7 @@ class TestParser(TreeEqualityTestCase): ])) ]) ]) - actual = parser.Parser(text).parse() + actual = parser.Parser().parse(text) self.assertWikicodeEqual(expected, actual) if __name__ == "__main__": diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py index 08cf93c..14d801c 100644 --- a/tests/test_wikicode.py +++ b/tests/test_wikicode.py @@ -276,6 +276,7 @@ class TestWikicode(TreeEqualityTestCase): self.assertEqual(["{{{e}}}"], get_filter("arguments")) self.assertIs(code.get(4), get_filter("arguments")[0]) self.assertEqual([], get_filter("comments")) + self.assertEqual([], get_filter("external_links")) self.assertEqual([], get_filter("headings")) self.assertEqual([], get_filter("html_entities")) self.assertEqual([], get_filter("tags")) diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest new file mode 100644 index 0000000..af7a570 --- /dev/null +++ 
b/tests/tokenizer/external_links.mwtest @@ -0,0 +1,473 @@ +name: basic +label: basic external link +input: "http://example.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose()] + +--- + +name: basic_brackets +label: basic external link in brackets +input: "[http://example.com/]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose()] + +--- + +name: brackets_space +label: basic external link in brackets, with a space after +input: "[http://example.com/ ]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), ExternalLinkClose()] + +--- + +name: brackets_title +label: basic external link in brackets, with a title +input: "[http://example.com/ Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_multiword_title +label: basic external link in brackets, with a multi-word title +input: "[http://example.com/ Example Web Page]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example Web Page"), ExternalLinkClose()] + +--- + +name: brackets_adjacent +label: three adjacent bracket-enclosed external links +input: "[http://foo.com/ Foo][http://bar.com/ Bar]\n[http://baz.com/ Baz]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo.com/"), ExternalLinkSeparator(), Text(text="Foo"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://bar.com/"), ExternalLinkSeparator(), Text(text="Bar"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=True), Text(text="http://baz.com/"), ExternalLinkSeparator(), Text(text="Baz"), ExternalLinkClose()] + +--- + +name: brackets_newline_before +label: bracket-enclosed link with a newline before the title +input: "[http://example.com/ \nExample]" +output: [Text(text="["), 
ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" \nExample]")] + +--- + +name: brackets_newline_inside +label: bracket-enclosed link with a newline in the title +input: "[http://example.com/ Example \nWeb Page]" +output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example \nWeb Page]")] + +--- + +name: brackets_newline_after +label: bracket-enclosed link with a newline after the title +input: "[http://example.com/ Example\n]" +output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example\n]")] + +--- + +name: brackets_space_before +label: bracket-enclosed link with a space before the URL +input: "[ http://example.com Example]" +output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" Example]")] + +--- + +name: brackets_title_like_url +label: bracket-enclosed link with a title that looks like a URL +input: "[http://example.com http://example.com]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: brackets_recursive +label: bracket-enclosed link with a bracket-enclosed link as the title +input: "[http://example.com [http://example.com]]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="[http://example.com"), ExternalLinkClose(), Text(text="]")] + +--- + +name: period_after +label: a period after a free link that is excluded +input: "http://example.com." 
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=".")] + +--- + +name: colons_after +label: colons after a free link that are excluded +input: "http://example.com/foo:bar.:;baz!?," +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo:bar.:;baz"), ExternalLinkClose(), Text(text="!?,")] + +--- + +name: close_paren_after_excluded +label: a closing parenthesis after a free link that is excluded +input: "http://example.)com)" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.)com"), ExternalLinkClose(), Text(text=")")] + +--- + +name: close_paren_after_included +label: a closing parenthesis after a free link that is included because of an opening parenthesis in the URL +input: "http://example.(com)" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.(com)"), ExternalLinkClose()] + +--- + +name: open_bracket_inside +label: an open bracket inside a free link that causes it to be ended abruptly +input: "http://foobar[baz.com" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foobar"), ExternalLinkClose(), Text(text="[baz.com")] + +--- + +name: brackets_period_after +label: a period after a bracket-enclosed link that is included +input: "[http://example.com. 
Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com."), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_colons_after +label: colons after a bracket-enclosed link that are included +input: "[http://example.com/foo:bar.:;baz!?, Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo:bar.:;baz!?,"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_close_paren_after_included +label: a closing parenthesis after a bracket-enclosed link that is included +input: "[http://example.)com) Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.)com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_close_paren_after_included_2 +label: a closing parenthesis after a bracket-enclosed link that is also included +input: "[http://example.(com) Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_open_bracket_inside +label: an open bracket inside a bracket-enclosed link that is also included +input: "[http://foobar[baz.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: adjacent_space +label: two free links separated by a space +input: "http://example.com http://example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: adjacent_newline +label: two free links separated by a newline +input: "http://example.com\nhttp://example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="\n"), 
ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: adjacent_close_bracket +label: two free links separated by a close bracket +input: "http://example.com]http://example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="]"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: html_entity_in_url +label: a HTML entity parsed correctly inside a free link +input: "http://exa mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="mple.com/"), ExternalLinkClose()] + +--- + +name: template_in_url +label: a template parsed correctly inside a free link +input: "http://exa{{template}}mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), TemplateOpen(), Text(text="template"), TemplateClose(), Text(text="mple.com/"), ExternalLinkClose()] + +--- + +name: argument_in_url +label: an argument parsed correctly inside a free link +input: "http://exa{{{argument}}}mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ArgumentOpen(), Text(text="argument"), ArgumentClose(), Text(text="mple.com/"), ExternalLinkClose()] + +--- + +name: wikilink_in_url +label: a wikilink that destroys a free link +input: "http://exa[[wikilink]]mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ExternalLinkClose(), WikilinkOpen(), Text(text="wikilink"), WikilinkClose(), Text(text="mple.com/")] + +--- + +name: external_link_in_url +label: a bracketed link that destroys a free link +input: "http://exa[http://example.com/]mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose(), Text(text="mple.com/")] + +--- + +name: spaces_padding +label: spaces 
padding a free link +input: " http://example.com " +output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" ")] + +--- + +name: text_and_spaces_padding +label: text and spaces padding a free link +input: "x http://example.com x" +output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" x")] + +--- + +name: template_before +label: a template before a free link +input: "{{foo}}http://example.com" +output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: spaces_padding_no_slashes +label: spaces padding a free link with no slashes after the colon +input: " mailto:example@example.com " +output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" ")] + +--- + +name: text_and_spaces_padding_no_slashes +label: text and spaces padding a free link with no slashes after the colon +input: "x mailto:example@example.com x" +output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" x")] + +--- + +name: template_before_no_slashes +label: a template before a free link with no slashes after the colon +input: "{{foo}}mailto:example@example.com" +output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()] + +--- + +name: no_slashes +label: a free link with no slashes after the colon +input: "mailto:example@example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()] + +--- + +name: slashes_optional +label: a free link using a scheme that doesn't need slashes, but has them anyway +input: "mailto://example@example.com" +output: 
[ExternalLinkOpen(brackets=False), Text(text="mailto://example@example.com"), ExternalLinkClose()] + +--- + +name: short +label: a very short free link +input: "mailto://abc" +output: [ExternalLinkOpen(brackets=False), Text(text="mailto://abc"), ExternalLinkClose()] + +--- + +name: slashes_missing +label: slashes missing from a free link with a scheme that requires them +input: "http:example@example.com" +output: [Text(text="http:example@example.com")] + +--- + +name: no_scheme_but_slashes +label: no scheme in a free link, but slashes (protocol-relative free links are not supported) +input: "//example.com" +output: [Text(text="//example.com")] + +--- + +name: no_scheme_but_colon +label: no scheme in a free link, but a colon +input: " :example.com" +output: [Text(text=" :example.com")] + +--- + +name: no_scheme_but_colon_and_slashes +label: no scheme in a free link, but a colon and slashes +input: " ://example.com" +output: [Text(text=" ://example.com")] + +--- + +name: fake_scheme_no_slashes +label: a nonexistent scheme in a free link, without slashes +input: "fake:example.com" +output: [Text(text="fake:example.com")] + +--- + +name: fake_scheme_slashes +label: a nonexistent scheme in a free link, with slashes +input: "fake://example.com" +output: [Text(text="fake://example.com")] + +--- + +name: fake_scheme_brackets_no_slashes +label: a nonexistent scheme in a bracketed link, without slashes +input: "[fake:example.com]" +output: [Text(text="[fake:example.com]")] + +--- + +name: fake_scheme_brackets_slashes +label: a nonexistent scheme in a bracketed link, with slashes +input: "[fake://example.com]" +output: [Text(text="[fake://example.com]")] + +--- + +name: interrupted_scheme +label: an otherwise valid scheme with something in the middle of it, in a free link +input: "ht?tp://example.com" +output: [Text(text="ht?tp://example.com")] + +--- + +name: interrupted_scheme_brackets +label: an otherwise valid scheme with something in the middle of it, in a bracketed
link +input: "[ht?tp://example.com]" +output: [Text(text="[ht?tp://example.com]")] + +--- + +name: no_slashes_brackets +label: no slashes after the colon in a bracketed link +input: "[mailto:example@example.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="mailto:example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: space_before_no_slashes_brackets +label: a space before a bracketed link with no slashes after the colon +input: "[ mailto:example@example.com Example]" +output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" Example]")] + +--- + +name: slashes_optional_brackets +label: a bracketed link using a scheme that doesn't need slashes, but has them anyway +input: "[mailto://example@example.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="mailto://example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: short_brackets +label: a very short link in brackets +input: "[mailto://abc Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="mailto://abc"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: slashes_missing_brackets +label: slashes missing from a scheme that requires them in a bracketed link +input: "[http:example@example.com Example]" +output: [Text(text="[http:example@example.com Example]")] + +--- + +name: protcol_relative +label: a protocol-relative link (in brackets) +input: "[//example.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="//example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: scheme_missing_but_colon_brackets +label: scheme missing from a bracketed link, but with a colon +input: "[:example.com Example]" +output: [Text(text="[:example.com Example]")] + +--- + +name: scheme_missing_but_colon_slashes_brackets +label: 
scheme missing from a bracketed link, but with a colon and slashes +input: "[://example.com Example]" +output: [Text(text="[://example.com Example]")] + +--- + +name: unclosed_protocol_relative +label: an unclosed protocol-relative bracketed link +input: "[//example.com" +output: [Text(text="[//example.com")] + +--- + +name: space_before_protcol_relative +label: a space before a protocol-relative bracketed link +input: "[ //example.com]" +output: [Text(text="[ //example.com]")] + +--- + +name: unclosed_just_scheme +label: an unclosed bracketed link, ending after the scheme +input: "[http" +output: [Text(text="[http")] + +--- + +name: unclosed_scheme_colon +label: an unclosed bracketed link, ending after the colon +input: "[http:" +output: [Text(text="[http:")] + +--- + +name: unclosed_scheme_colon_slashes +label: an unclosed bracketed link, ending after the slashes +input: "[http://" +output: [Text(text="[http://")] + +--- + +name: incomplete_bracket +label: just an open bracket +input: "[" +output: [Text(text="[")] + +--- + +name: incomplete_scheme_colon +label: a free link with just a scheme and a colon +input: "http:" +output: [Text(text="http:")] + +--- + +name: incomplete_scheme_colon_slashes +label: a free link with just a scheme, colon, and slashes +input: "http://" +output: [Text(text="http://")] + +--- + +name: brackets_scheme_but_no_url +label: brackets around a scheme and a colon +input: "[mailto:]" +output: [Text(text="[mailto:]")] + +--- + +name: brackets_scheme_slashes_but_no_url +label: brackets around a scheme, colon, and slashes +input: "[http://]" +output: [Text(text="[http://]")] + +--- + +name: brackets_scheme_title_but_no_url +label: brackets around a scheme, colon, and slashes, with a title +input: "[http:// Example]" +output: [Text(text="[http:// Example]")] diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index 0277a51..083b12c 100644 --- a/tests/tokenizer/integration.mwtest +++ 
b/tests/tokenizer/integration.mwtest @@ -12,6 +12,13 @@ output: [TemplateOpen(), ArgumentOpen(), ArgumentOpen(), Text(text="foo"), Argum --- +name: link_in_template_name +label: a wikilink inside a template name, which breaks the template +input: "{{foo[[bar]]}}" +output: [Text(text="{{foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="}}")] + +--- + name: rich_heading label: a heading with templates/wikilinks in it input: "== Head{{ing}} [[with]] {{{funky|{{stuf}}}}} ==" @@ -51,3 +58,17 @@ name: wildcard_redux label: an even wilder assortment of various things input: "{{a|b|{{c|[[d]]{{{e}}}}}}}[[f|{{{g}}}]]{{i|j= }}" output: [TemplateOpen(), Text(text="a"), TemplateParamSeparator(), Text(text="b"), TemplateParamSeparator(), TemplateOpen(), Text(text="c"), TemplateParamSeparator(), WikilinkOpen(), Text(text="d"), WikilinkClose(), ArgumentOpen(), Text(text="e"), ArgumentClose(), TemplateClose(), TemplateClose(), WikilinkOpen(), Text(text="f"), WikilinkSeparator(), ArgumentOpen(), Text(text="g"), ArgumentClose(), CommentStart(), Text(text="h"), CommentEnd(), WikilinkClose(), TemplateOpen(), Text(text="i"), TemplateParamSeparator(), Text(text="j"), TemplateParamEquals(), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), TemplateClose()] + +--- + +name: link_inside_dl +label: an external link inside a def list, such that the external link is parsed +input: ";;;mailto:example" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example"), ExternalLinkClose()] + +--- + +name: link_inside_dl_2 +label: an external link inside a def list, such that the external link is not parsed +input: ";;;malito:example" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), 
TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")] diff --git a/tests/tokenizer/wikilinks.mwtest b/tests/tokenizer/wikilinks.mwtest index 0682ef1..8eb381a 100644 --- a/tests/tokenizer/wikilinks.mwtest +++ b/tests/tokenizer/wikilinks.mwtest @@ -40,17 +40,17 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar|b --- -name: nested -label: a wikilink nested within the value of another -input: "[[foo|[[bar]]]]" -output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()] +name: newline_text +label: a newline in the middle of the text +input: "[[foo|foo\nbar]]" +output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="foo\nbar"), WikilinkClose()] --- -name: nested_with_text -label: a wikilink nested within the value of another, separated by other data -input: "[[foo|a[[b]]c]]" -output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()] +name: bracket_text +label: a left bracket in the middle of the text +input: "[[foo|bar[baz]]" +output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[baz"), WikilinkClose()] --- @@ -96,13 +96,34 @@ output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), --- -name: invalid_nested_text +name: invalid_nested_padding label: invalid wikilink: trying to nest in the wrong context, with a text param input: "[[foo[[bar]]|baz]]" output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="|baz]]")] --- +name: invalid_nested_text +label: invalid wikilink: a wikilink nested within the value of another +input: "[[foo|[[bar]]" +output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()] + 
+--- + +name: invalid_nested_text_2 +label: invalid wikilink: a wikilink nested within the value of another, two pairs of closing brackets +input: "[[foo|[[bar]]]]" +output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")] + +--- + +name: invalid_nested_text_padding +label: invalid wikilink: a wikilink nested within the value of another, separated by other data +input: "[[foo|a[[b]]c]]" +output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")] + +--- + name: incomplete_open_only label: incomplete wikilinks: just an open input: "[["