@@ -1,8 +1,10 @@ | |||||
v0.3 (unreleased): | v0.3 (unreleased): | ||||
- Added complete support for HTML Tags, along with appropriate unit tests. This | |||||
includes forms like <ref>foo</ref>, <ref name="bar"/>, and wiki-markup tags | |||||
like bold ('''), italics (''), and lists (*, #, ; and :). | |||||
- Added complete support for HTML Tags, including forms like <ref>foo</ref>, | |||||
<ref name="bar"/>, and wiki-markup tags like bold ('''), italics (''), and | |||||
lists (*, #, ; and :). | |||||
- Added support for ExternalLinks (http://example.com/ and | |||||
[http://example.com/ Example]). | |||||
- Wikicode's filter methods are now passed 'recursive=True' by default instead | - Wikicode's filter methods are now passed 'recursive=True' by default instead | ||||
of False. This is a breaking change if you rely on any filter() methods being | of False. This is a breaking change if you rely on any filter() methods being | ||||
non-recursive by default. | non-recursive by default. | ||||
@@ -14,7 +16,7 @@ v0.3 (unreleased): | |||||
- Renamed Template.has_param() to has() for consistency with Template's other | - Renamed Template.has_param() to has() for consistency with Template's other | ||||
methods; has_param() is now an alias. | methods; has_param() is now an alias. | ||||
- The C tokenizer extension now works on Python 3 in addition to Python 2.7. | - The C tokenizer extension now works on Python 3 in addition to Python 2.7. | ||||
- Various fixes and cleanup. | |||||
- Various bugfixes, internal changes, and cleanup. | |||||
v0.2 (released June 20, 2013): | v0.2 (released June 20, 2013): | ||||
@@ -25,6 +25,14 @@ nodes Package | |||||
:undoc-members: | :undoc-members: | ||||
:show-inheritance: | :show-inheritance: | ||||
:mod:`external_link` Module | |||||
--------------------------- | |||||
.. automodule:: mwparserfromhell.nodes.external_link | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:mod:`heading` Module | :mod:`heading` Module | ||||
--------------------- | --------------------- | ||||
@@ -30,10 +30,10 @@ mwparserfromhell Package | |||||
:members: | :members: | ||||
:undoc-members: | :undoc-members: | ||||
:mod:`tag_defs` Module | |||||
:mod:`definitions` Module | |||||
------------------------- | |||||
.. automodule:: mwparserfromhell.tag_defs | |||||
.. automodule:: mwparserfromhell.definitions | |||||
:members: | :members: | ||||
:mod:`utils` Module | :mod:`utils` Module | ||||
@@ -7,10 +7,11 @@ v0.3 | |||||
Unreleased | Unreleased | ||||
(`changes <https://github.com/earwig/mwparserfromhell/compare/v0.2...develop>`__): | (`changes <https://github.com/earwig/mwparserfromhell/compare/v0.2...develop>`__): | ||||
- Added complete support for HTML :py:class:`Tags <.Tag>`, along with | |||||
appropriate unit tests. This includes forms like ``<ref>foo</ref>``, | |||||
``<ref name="bar"/>``, and wiki-markup tags like bold (``'''``), italics | |||||
(``''``), and lists (``*``, ``#``, ``;`` and ``:``). | |||||
- Added complete support for HTML :py:class:`Tags <.Tag>`, including forms like | |||||
``<ref>foo</ref>``, ``<ref name="bar"/>``, and wiki-markup tags like bold | |||||
(``'''``), italics (``''``), and lists (``*``, ``#``, ``;`` and ``:``). | |||||
- Added support for :py:class:`.ExternalLink`\ s (``http://example.com/`` and | |||||
``[http://example.com/ Example]``). | |||||
- :py:class:`Wikicode's <.Wikicode>` :py:meth:`.filter` methods are now passed | - :py:class:`Wikicode's <.Wikicode>` :py:meth:`.filter` methods are now passed | ||||
*recursive=True* by default instead of *False*. **This is a breaking change | *recursive=True* by default instead of *False*. **This is a breaking change | ||||
if you rely on any filter() methods being non-recursive by default.** | if you rely on any filter() methods being non-recursive by default.** | ||||
@@ -25,7 +26,7 @@ Unreleased | |||||
:py:meth:`~.Template.has` for consistency with :py:class:`~.Template`\ 's | :py:meth:`~.Template.has` for consistency with :py:class:`~.Template`\ 's | ||||
other methods; :py:meth:`~.has_param` is now an alias. | other methods; :py:meth:`~.has_param` is now an alias. | ||||
- The C tokenizer extension now works on Python 3 in addition to Python 2.7. | - The C tokenizer extension now works on Python 3 in addition to Python 2.7. | ||||
- Various fixes and cleanup. | |||||
- Various bugfixes, internal changes, and cleanup. | |||||
v0.2 | v0.2 | ||||
---- | ---- | ||||
@@ -34,6 +34,7 @@ __license__ = "MIT License" | |||||
__version__ = "0.3.dev" | __version__ = "0.3.dev" | ||||
__email__ = "ben.kurtovic@verizon.net" | __email__ = "ben.kurtovic@verizon.net" | ||||
from . import compat, nodes, parser, smart_list, string_mixin, utils, wikicode | |||||
from . import (compat, definitions, nodes, parser, smart_list, string_mixin, | |||||
utils, wikicode) | |||||
parse = utils.parse_anything | parse = utils.parse_anything |
@@ -20,12 +20,22 @@ | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||
# SOFTWARE. | # SOFTWARE. | ||||
"""Contains data regarding certain HTML tags.""" | |||||
"""Contains data about certain markup, like HTML tags and external links.""" | |||||
from __future__ import unicode_literals | from __future__ import unicode_literals | ||||
__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", | __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", | ||||
"is_single_only"] | |||||
"is_single_only", "is_scheme"] | |||||
URI_SCHEMES = { | |||||
# [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943 | |||||
"http": True, "https": True, "ftp": True, "ftps": True, "ssh": True, | |||||
"sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False, | |||||
"sips": False, "gopher": True, "telnet": True, "nntp": True, | |||||
"worldwind": True, "mailto": False, "tel": False, "sms": False, | |||||
"news": False, "svn": True, "git": True, "mms": True, "bitcoin": False, | |||||
"magnet": False, "urn": False, "geo": False | |||||
} | |||||
PARSER_BLACKLIST = [ | PARSER_BLACKLIST = [ | ||||
# enwiki extensions @ 2013-06-28 | # enwiki extensions @ 2013-06-28 | ||||
@@ -70,3 +80,12 @@ def is_single(tag): | |||||
def is_single_only(tag): | def is_single_only(tag): | ||||
"""Return whether or not the given *tag* must exist without a close tag.""" | """Return whether or not the given *tag* must exist without a close tag.""" | ||||
return tag.lower() in SINGLE_ONLY | return tag.lower() in SINGLE_ONLY | ||||
def is_scheme(scheme, slashes=True, reverse=False): | |||||
"""Return whether *scheme* is valid for external links.""" | |||||
if reverse: # Convenience for C | |||||
scheme = scheme[::-1] | |||||
scheme = scheme.lower() | |||||
if slashes: | |||||
return scheme in URI_SCHEMES | |||||
return scheme in URI_SCHEMES and not URI_SCHEMES[scheme] |
@@ -69,6 +69,7 @@ from . import extras | |||||
from .text import Text | from .text import Text | ||||
from .argument import Argument | from .argument import Argument | ||||
from .comment import Comment | from .comment import Comment | ||||
from .external_link import ExternalLink | |||||
from .heading import Heading | from .heading import Heading | ||||
from .html_entity import HTMLEntity | from .html_entity import HTMLEntity | ||||
from .tag import Tag | from .tag import Tag | ||||
@@ -0,0 +1,97 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
from __future__ import unicode_literals | |||||
from . import Node | |||||
from ..compat import str | |||||
from ..utils import parse_anything | |||||
__all__ = ["ExternalLink"] | |||||
class ExternalLink(Node): | |||||
"""Represents an external link, like ``[http://example.com/ Example]``.""" | |||||
def __init__(self, url, title=None, brackets=True): | |||||
super(ExternalLink, self).__init__() | |||||
self._url = url | |||||
self._title = title | |||||
self._brackets = brackets | |||||
def __unicode__(self): | |||||
if self.brackets: | |||||
if self.title is not None: | |||||
return "[" + str(self.url) + " " + str(self.title) + "]" | |||||
return "[" + str(self.url) + "]" | |||||
return str(self.url) | |||||
def __iternodes__(self, getter): | |||||
yield None, self | |||||
for child in getter(self.url): | |||||
yield self.url, child | |||||
if self.title is not None: | |||||
for child in getter(self.title): | |||||
yield self.title, child | |||||
def __strip__(self, normalize, collapse): | |||||
if self.brackets: | |||||
if self.title: | |||||
return self.title.strip_code(normalize, collapse) | |||||
return None | |||||
return self.url.strip_code(normalize, collapse) | |||||
def __showtree__(self, write, get, mark): | |||||
if self.brackets: | |||||
write("[") | |||||
get(self.url) | |||||
if self.title is not None: | |||||
get(self.title) | |||||
if self.brackets: | |||||
write("]") | |||||
@property | |||||
def url(self): | |||||
"""The URL of the link target, as a :py:class:`~.Wikicode` object.""" | |||||
return self._url | |||||
@property | |||||
def title(self): | |||||
"""The link title (if given), as a :py:class:`~.Wikicode` object.""" | |||||
return self._title | |||||
@property | |||||
def brackets(self): | |||||
"""Whether to enclose the URL in brackets or display it straight.""" | |||||
return self._brackets | |||||
@url.setter | |||||
def url(self, value): | |||||
from ..parser import contexts | |||||
self._url = parse_anything(value, contexts.EXT_LINK_URI) | |||||
@title.setter | |||||
def title(self, value): | |||||
self._title = None if value is None else parse_anything(value) | |||||
@brackets.setter | |||||
def brackets(self, value): | |||||
self._brackets = bool(value) |
@@ -25,7 +25,7 @@ from __future__ import unicode_literals | |||||
from . import Node, Text | from . import Node, Text | ||||
from .extras import Attribute | from .extras import Attribute | ||||
from ..compat import str | from ..compat import str | ||||
from ..tag_defs import is_visible | |||||
from ..definitions import is_visible | |||||
from ..utils import parse_anything | from ..utils import parse_anything | ||||
__all__ = ["Tag"] | __all__ = ["Tag"] | ||||
@@ -152,7 +152,7 @@ class Tag(Node): | |||||
This makes the tag look like a lone close tag. It is technically | This makes the tag look like a lone close tag. It is technically | ||||
invalid and is only parsable Wikicode when the tag itself is | invalid and is only parsable Wikicode when the tag itself is | ||||
single-only, like ``<br>`` and ``<img>``. See | single-only, like ``<br>`` and ``<img>``. See | ||||
:py:func:`.tag_defs.is_single_only`. | |||||
:py:func:`.definitions.is_single_only`. | |||||
""" | """ | ||||
return self._invalid | return self._invalid | ||||
@@ -161,7 +161,7 @@ class Tag(Node): | |||||
"""Whether the tag is implicitly self-closing, with no ending slash. | """Whether the tag is implicitly self-closing, with no ending slash. | ||||
This is only possible for specific "single" tags like ``<br>`` and | This is only possible for specific "single" tags like ``<br>`` and | ||||
``<li>``. See :py:func:`.tag_defs.is_single`. This field only has an | |||||
``<li>``. See :py:func:`.definitions.is_single`. This field only has an | |||||
effect if :py:attr:`self_closing` is also ``True``. | effect if :py:attr:`self_closing` is also ``True``. | ||||
""" | """ | ||||
return self._implicit | return self._implicit | ||||
@@ -46,16 +46,15 @@ class Parser(object): | |||||
:py:class:`~.Node`\ s by the :py:class:`~.Builder`. | :py:class:`~.Node`\ s by the :py:class:`~.Builder`. | ||||
""" | """ | ||||
def __init__(self, text): | |||||
self.text = text | |||||
def __init__(self): | |||||
if use_c and CTokenizer: | if use_c and CTokenizer: | ||||
self._tokenizer = CTokenizer() | self._tokenizer = CTokenizer() | ||||
else: | else: | ||||
self._tokenizer = Tokenizer() | self._tokenizer = Tokenizer() | ||||
self._builder = Builder() | self._builder = Builder() | ||||
def parse(self): | |||||
"""Return a string as a parsed :py:class:`~.Wikicode` object tree.""" | |||||
tokens = self._tokenizer.tokenize(self.text) | |||||
def parse(self, text, context=0): | |||||
"""Parse *text*, returning a :py:class:`~.Wikicode` object tree.""" | |||||
tokens = self._tokenizer.tokenize(text, context) | |||||
code = self._builder.build(tokens) | code = self._builder.build(tokens) | ||||
return code | return code |
@@ -24,8 +24,8 @@ from __future__ import unicode_literals | |||||
from . import tokens | from . import tokens | ||||
from ..compat import str | from ..compat import str | ||||
from ..nodes import (Argument, Comment, Heading, HTMLEntity, Tag, Template, | |||||
Text, Wikilink) | |||||
from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag, | |||||
Template, Text, Wikilink) | |||||
from ..nodes.extras import Attribute, Parameter | from ..nodes.extras import Attribute, Parameter | ||||
from ..smart_list import SmartList | from ..smart_list import SmartList | ||||
from ..wikicode import Wikicode | from ..wikicode import Wikicode | ||||
@@ -142,6 +142,22 @@ class Builder(object): | |||||
else: | else: | ||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
def _handle_external_link(self, token): | |||||
"""Handle when an external link is at the head of the tokens.""" | |||||
brackets, url = token.brackets, None | |||||
self._push() | |||||
while self._tokens: | |||||
token = self._tokens.pop() | |||||
if isinstance(token, tokens.ExternalLinkSeparator): | |||||
url = self._pop() | |||||
self._push() | |||||
elif isinstance(token, tokens.ExternalLinkClose): | |||||
if url is not None: | |||||
return ExternalLink(url, self._pop(), brackets) | |||||
return ExternalLink(self._pop(), brackets=brackets) | |||||
else: | |||||
self._write(self._handle_token(token)) | |||||
def _handle_entity(self): | def _handle_entity(self): | ||||
"""Handle a case where an HTML entity is at the head of the tokens.""" | """Handle a case where an HTML entity is at the head of the tokens.""" | ||||
token = self._tokens.pop() | token = self._tokens.pop() | ||||
@@ -244,6 +260,8 @@ class Builder(object): | |||||
return self._handle_argument() | return self._handle_argument() | ||||
elif isinstance(token, tokens.WikilinkOpen): | elif isinstance(token, tokens.WikilinkOpen): | ||||
return self._handle_wikilink() | return self._handle_wikilink() | ||||
elif isinstance(token, tokens.ExternalLinkOpen): | |||||
return self._handle_external_link(token) | |||||
elif isinstance(token, tokens.HTMLEntityStart): | elif isinstance(token, tokens.HTMLEntityStart): | ||||
return self._handle_entity() | return self._handle_entity() | ||||
elif isinstance(token, tokens.HeadingStart): | elif isinstance(token, tokens.HeadingStart): | ||||
@@ -51,6 +51,12 @@ Local (stack-specific) contexts: | |||||
* :py:const:`WIKILINK_TITLE` | * :py:const:`WIKILINK_TITLE` | ||||
* :py:const:`WIKILINK_TEXT` | * :py:const:`WIKILINK_TEXT` | ||||
* :py:const:`EXT_LINK` | |||||
* :py:const:`EXT_LINK_URI` | |||||
* :py:const:`EXT_LINK_TITLE` | |||||
* :py:const:`EXT_LINK_BRACKETS` | |||||
* :py:const:`HEADING` | * :py:const:`HEADING` | ||||
* :py:const:`HEADING_LEVEL_1` | * :py:const:`HEADING_LEVEL_1` | ||||
@@ -94,6 +100,7 @@ Aggregate contexts: | |||||
* :py:const:`FAIL` | * :py:const:`FAIL` | ||||
* :py:const:`UNSAFE` | * :py:const:`UNSAFE` | ||||
* :py:const:`DOUBLE` | * :py:const:`DOUBLE` | ||||
* :py:const:`INVALID_LINK` | |||||
""" | """ | ||||
@@ -112,35 +119,40 @@ WIKILINK_TITLE = 1 << 5 | |||||
WIKILINK_TEXT = 1 << 6 | WIKILINK_TEXT = 1 << 6 | ||||
WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT | WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT | ||||
HEADING_LEVEL_1 = 1 << 7 | |||||
HEADING_LEVEL_2 = 1 << 8 | |||||
HEADING_LEVEL_3 = 1 << 9 | |||||
HEADING_LEVEL_4 = 1 << 10 | |||||
HEADING_LEVEL_5 = 1 << 11 | |||||
HEADING_LEVEL_6 = 1 << 12 | |||||
EXT_LINK_URI = 1 << 7 | |||||
EXT_LINK_TITLE = 1 << 8 | |||||
EXT_LINK_BRACKETS = 1 << 9 | |||||
EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS | |||||
HEADING_LEVEL_1 = 1 << 10 | |||||
HEADING_LEVEL_2 = 1 << 11 | |||||
HEADING_LEVEL_3 = 1 << 12 | |||||
HEADING_LEVEL_4 = 1 << 13 | |||||
HEADING_LEVEL_5 = 1 << 14 | |||||
HEADING_LEVEL_6 = 1 << 15 | |||||
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + | HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + | ||||
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) | HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) | ||||
TAG_OPEN = 1 << 13 | |||||
TAG_ATTR = 1 << 14 | |||||
TAG_BODY = 1 << 15 | |||||
TAG_CLOSE = 1 << 16 | |||||
TAG_OPEN = 1 << 16 | |||||
TAG_ATTR = 1 << 17 | |||||
TAG_BODY = 1 << 18 | |||||
TAG_CLOSE = 1 << 19 | |||||
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | ||||
STYLE_ITALICS = 1 << 17 | |||||
STYLE_BOLD = 1 << 18 | |||||
STYLE_PASS_AGAIN = 1 << 19 | |||||
STYLE_SECOND_PASS = 1 << 20 | |||||
STYLE_ITALICS = 1 << 20 | |||||
STYLE_BOLD = 1 << 21 | |||||
STYLE_PASS_AGAIN = 1 << 22 | |||||
STYLE_SECOND_PASS = 1 << 23 | |||||
STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS | STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS | ||||
DL_TERM = 1 << 21 | |||||
DL_TERM = 1 << 24 | |||||
HAS_TEXT = 1 << 22 | |||||
FAIL_ON_TEXT = 1 << 23 | |||||
FAIL_NEXT = 1 << 24 | |||||
FAIL_ON_LBRACE = 1 << 25 | |||||
FAIL_ON_RBRACE = 1 << 26 | |||||
FAIL_ON_EQUALS = 1 << 27 | |||||
HAS_TEXT = 1 << 25 | |||||
FAIL_ON_TEXT = 1 << 26 | |||||
FAIL_NEXT = 1 << 27 | |||||
FAIL_ON_LBRACE = 1 << 28 | |||||
FAIL_ON_RBRACE = 1 << 29 | |||||
FAIL_ON_EQUALS = 1 << 30 | |||||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | ||||
FAIL_ON_RBRACE + FAIL_ON_EQUALS) | FAIL_ON_RBRACE + FAIL_ON_EQUALS) | ||||
@@ -150,7 +162,8 @@ GL_HEADING = 1 << 0 | |||||
# Aggregate contexts: | # Aggregate contexts: | ||||
FAIL = TEMPLATE + ARGUMENT + WIKILINK + HEADING + TAG + STYLE | |||||
UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + | |||||
TAG_CLOSE) | |||||
FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE | |||||
UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + | |||||
ARGUMENT_NAME + TAG_CLOSE) | |||||
DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE | DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE | ||||
INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK |
@@ -24,6 +24,20 @@ SOFTWARE. | |||||
#include "tokenizer.h" | #include "tokenizer.h" | ||||
/* | /* | ||||
Determine whether the given Py_UNICODE is a marker. | |||||
*/ | |||||
static int is_marker(Py_UNICODE this) | |||||
{ | |||||
int i; | |||||
for (i = 0; i < NUM_MARKERS; i++) { | |||||
if (*MARKERS[i] == this) | |||||
return 1; | |||||
} | |||||
return 0; | |||||
} | |||||
/* | |||||
Given a context, return the heading level encoded within it. | Given a context, return the heading level encoded within it. | ||||
*/ | */ | ||||
static int heading_level_from_context(int n) | static int heading_level_from_context(int n) | ||||
@@ -37,13 +51,14 @@ static int heading_level_from_context(int n) | |||||
} | } | ||||
/* | /* | ||||
Call the given function in tag_defs, using 'tag' as a parameter, and return | |||||
its output as a bool. | |||||
Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as | |||||
parameters, and return its output as a bool. | |||||
*/ | */ | ||||
static int call_tag_def_func(const char* funcname, PyObject* tag) | |||||
static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, | |||||
PyObject* in3) | |||||
{ | { | ||||
PyObject* func = PyObject_GetAttrString(tag_defs, funcname); | |||||
PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL); | |||||
PyObject* func = PyObject_GetAttrString(definitions, funcname); | |||||
PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL); | |||||
int ans = (result == Py_True) ? 1 : 0; | int ans = (result == Py_True) ? 1 : 0; | ||||
Py_DECREF(func); | Py_DECREF(func); | ||||
@@ -65,7 +80,7 @@ static PyObject* strip_tag_name(PyObject* token) | |||||
Py_DECREF(text); | Py_DECREF(text); | ||||
if (!rstripped) | if (!rstripped) | ||||
return NULL; | return NULL; | ||||
lowered = PyObject_CallMethod(rstripped, "rstrip", NULL); | |||||
lowered = PyObject_CallMethod(rstripped, "lower", NULL); | |||||
Py_DECREF(rstripped); | Py_DECREF(rstripped); | ||||
return lowered; | return lowered; | ||||
} | } | ||||
@@ -85,7 +100,7 @@ static Textbuffer* Textbuffer_new(void) | |||||
PyErr_NoMemory(); | PyErr_NoMemory(); | ||||
return NULL; | return NULL; | ||||
} | } | ||||
buffer->next = NULL; | |||||
buffer->prev = buffer->next = NULL; | |||||
return buffer; | return buffer; | ||||
} | } | ||||
@@ -113,10 +128,10 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) | |||||
if (!new) | if (!new) | ||||
return -1; | return -1; | ||||
new->next = self; | new->next = self; | ||||
self->prev = new; | |||||
*this = self = new; | *this = self = new; | ||||
} | } | ||||
self->data[self->size] = code; | |||||
self->size++; | |||||
self->data[self->size++] = code; | |||||
return 0; | return 0; | ||||
} | } | ||||
@@ -345,7 +360,7 @@ static void* Tokenizer_fail_route(Tokenizer* self) | |||||
} | } | ||||
/* | /* | ||||
Write a token to the end of the current token stack. | |||||
Write a token to the current token stack. | |||||
*/ | */ | ||||
static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) | static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) | ||||
{ | { | ||||
@@ -366,7 +381,8 @@ static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) | |||||
} | } | ||||
/* | /* | ||||
Write a token to the end of the current token stack. | |||||
Write a token to the current token stack, with kwargs. Steals a reference | |||||
to kwargs. | |||||
*/ | */ | ||||
static int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token, | static int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token, | ||||
PyObject* kwargs, int first) | PyObject* kwargs, int first) | ||||
@@ -417,6 +433,42 @@ static int Tokenizer_emit_text(Tokenizer* self, const char* text) | |||||
} | } | ||||
/* | /* | ||||
Write the contents of another textbuffer to the current textbuffer, | |||||
deallocating it in the process. | |||||
*/ | |||||
static int | |||||
Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse) | |||||
{ | |||||
Textbuffer *original = buffer; | |||||
int i; | |||||
if (reverse) { | |||||
do { | |||||
for (i = buffer->size - 1; i >= 0; i--) { | |||||
if (Tokenizer_emit_char(self, buffer->data[i])) { | |||||
Textbuffer_dealloc(original); | |||||
return -1; | |||||
} | |||||
} | |||||
} while ((buffer = buffer->next)); | |||||
} | |||||
else { | |||||
while (buffer->next) | |||||
buffer = buffer->next; | |||||
do { | |||||
for (i = 0; i < buffer->size; i++) { | |||||
if (Tokenizer_emit_char(self, buffer->data[i])) { | |||||
Textbuffer_dealloc(original); | |||||
return -1; | |||||
} | |||||
} | |||||
} while ((buffer = buffer->prev)); | |||||
} | |||||
Textbuffer_dealloc(original); | |||||
return 0; | |||||
} | |||||
/* | |||||
Write a series of tokens to the current stack at once. | Write a series of tokens to the current stack at once. | ||||
*/ | */ | ||||
static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) | static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) | ||||
@@ -808,6 +860,353 @@ static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) | |||||
} | } | ||||
/* | /* | ||||
Parse the URI scheme of a bracket-enclosed external link. | |||||
*/ | |||||
static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) | |||||
{ | |||||
static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; | |||||
Textbuffer* buffer; | |||||
PyObject* scheme; | |||||
Py_UNICODE this; | |||||
int slashes, i; | |||||
if (Tokenizer_push(self, LC_EXT_LINK_URI)) | |||||
return -1; | |||||
if (Tokenizer_READ(self, 0) == *"/" && Tokenizer_READ(self, 1) == *"/") { | |||||
if (Tokenizer_emit_text(self, "//")) | |||||
return -1; | |||||
self->head += 2; | |||||
} | |||||
else { | |||||
buffer = Textbuffer_new(); | |||||
if (!buffer) | |||||
return -1; | |||||
while ((this = Tokenizer_READ(self, 0)) != *"") { | |||||
i = 0; | |||||
while (1) { | |||||
if (!valid[i]) | |||||
goto end_of_loop; | |||||
if (this == valid[i]) | |||||
break; | |||||
i++; | |||||
} | |||||
Textbuffer_write(&buffer, this); | |||||
if (Tokenizer_emit_char(self, this)) { | |||||
Textbuffer_dealloc(buffer); | |||||
return -1; | |||||
} | |||||
self->head++; | |||||
} | |||||
end_of_loop: | |||||
if (this != *":") { | |||||
Textbuffer_dealloc(buffer); | |||||
Tokenizer_fail_route(self); | |||||
return 0; | |||||
} | |||||
if (Tokenizer_emit_char(self, *":")) { | |||||
Textbuffer_dealloc(buffer); | |||||
return -1; | |||||
} | |||||
self->head++; | |||||
slashes = (Tokenizer_READ(self, 0) == *"/" && | |||||
Tokenizer_READ(self, 1) == *"/"); | |||||
if (slashes) { | |||||
if (Tokenizer_emit_text(self, "//")) { | |||||
Textbuffer_dealloc(buffer); | |||||
return -1; | |||||
} | |||||
self->head += 2; | |||||
} | |||||
scheme = Textbuffer_render(buffer); | |||||
Textbuffer_dealloc(buffer); | |||||
if (!scheme) | |||||
return -1; | |||||
if (!IS_SCHEME(scheme, slashes, 0)) { | |||||
Py_DECREF(scheme); | |||||
Tokenizer_fail_route(self); | |||||
return 0; | |||||
} | |||||
Py_DECREF(scheme); | |||||
} | |||||
return 0; | |||||
} | |||||
/* | |||||
Parse the URI scheme of a free (no brackets) external link. | |||||
*/ | |||||
static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) | |||||
{ | |||||
static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; | |||||
Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; | |||||
PyObject *scheme; | |||||
Py_UNICODE chunk; | |||||
int slashes, i, j; | |||||
if (!scheme_buffer) | |||||
return -1; | |||||
// We have to backtrack through the textbuffer looking for our scheme since | |||||
// it was just parsed as text: | |||||
temp_buffer = self->topstack->textbuffer; | |||||
while (temp_buffer) { | |||||
for (i = temp_buffer->size - 1; i >= 0; i--) { | |||||
chunk = temp_buffer->data[i]; | |||||
if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) | |||||
goto end_of_loop; | |||||
j = 0; | |||||
while (1) { | |||||
if (!valid[j]) { | |||||
Textbuffer_dealloc(scheme_buffer); | |||||
FAIL_ROUTE(0); | |||||
return 0; | |||||
} | |||||
if (chunk == valid[j]) | |||||
break; | |||||
j++; | |||||
} | |||||
Textbuffer_write(&scheme_buffer, chunk); | |||||
} | |||||
temp_buffer = temp_buffer->next; | |||||
} | |||||
end_of_loop: | |||||
scheme = Textbuffer_render(scheme_buffer); | |||||
if (!scheme) { | |||||
Textbuffer_dealloc(scheme_buffer); | |||||
return -1; | |||||
} | |||||
slashes = (Tokenizer_READ(self, 0) == *"/" && | |||||
Tokenizer_READ(self, 1) == *"/"); | |||||
if (!IS_SCHEME(scheme, slashes, 1)) { | |||||
Py_DECREF(scheme); | |||||
Textbuffer_dealloc(scheme_buffer); | |||||
FAIL_ROUTE(0); | |||||
return 0; | |||||
} | |||||
Py_DECREF(scheme); | |||||
if (Tokenizer_push(self, LC_EXT_LINK_URI)) { | |||||
Textbuffer_dealloc(scheme_buffer); | |||||
return -1; | |||||
} | |||||
if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1)) | |||||
return -1; | |||||
if (Tokenizer_emit_char(self, *":")) | |||||
return -1; | |||||
if (slashes) { | |||||
if (Tokenizer_emit_text(self, "//")) | |||||
return -1; | |||||
self->head += 2; | |||||
} | |||||
return 0; | |||||
} | |||||
/* | |||||
Handle text in a free external link, including trailing punctuation. | |||||
*/ | |||||
static int | |||||
Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, | |||||
Textbuffer** tail, Py_UNICODE this) | |||||
{ | |||||
#define PUSH_TAIL_BUFFER(tail, error) \ | |||||
if ((tail)->size || (tail)->next) { \ | |||||
if (Tokenizer_emit_textbuffer(self, tail, 0)) \ | |||||
return error; \ | |||||
tail = Textbuffer_new(); \ | |||||
if (!(tail)) \ | |||||
return error; \ | |||||
} | |||||
if (this == *"(" && !(*parens)) { | |||||
*parens = 1; | |||||
PUSH_TAIL_BUFFER(*tail, -1) | |||||
} | |||||
else if (this == *"," || this == *";" || this == *"\\" || this == *"." || | |||||
this == *":" || this == *"!" || this == *"?" || | |||||
(!(*parens) && this == *")")) | |||||
return Textbuffer_write(tail, this); | |||||
else | |||||
PUSH_TAIL_BUFFER(*tail, -1) | |||||
return Tokenizer_emit_char(self, this); | |||||
} | |||||
/* | |||||
Really parse an external link. | |||||
*/ | |||||
static PyObject* | |||||
Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, | |||||
Textbuffer** extra) | |||||
{ | |||||
Py_UNICODE this, next; | |||||
int parens = 0; | |||||
if (brackets ? Tokenizer_parse_bracketed_uri_scheme(self) : | |||||
Tokenizer_parse_free_uri_scheme(self)) | |||||
return NULL; | |||||
if (BAD_ROUTE) | |||||
return NULL; | |||||
this = Tokenizer_READ(self, 0); | |||||
if (this == *"" || this == *"\n" || this == *" " || this == *"]") | |||||
return Tokenizer_fail_route(self); | |||||
if (!brackets && this == *"[") | |||||
return Tokenizer_fail_route(self); | |||||
while (1) { | |||||
this = Tokenizer_READ(self, 0); | |||||
next = Tokenizer_READ(self, 1); | |||||
if (this == *"" || this == *"\n") { | |||||
if (brackets) | |||||
return Tokenizer_fail_route(self); | |||||
self->head--; | |||||
return Tokenizer_pop(self); | |||||
} | |||||
if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) { | |||||
PUSH_TAIL_BUFFER(*extra, NULL) | |||||
if (Tokenizer_parse_template_or_argument(self)) | |||||
return NULL; | |||||
} | |||||
else if (this == *"[") { | |||||
if (!brackets) { | |||||
self->head--; | |||||
return Tokenizer_pop(self); | |||||
} | |||||
if (Tokenizer_emit_char(self, *"[")) | |||||
return NULL; | |||||
} | |||||
else if (this == *"]") { | |||||
if (!brackets) | |||||
self->head--; | |||||
return Tokenizer_pop(self); | |||||
} | |||||
else if (this == *"&") { | |||||
PUSH_TAIL_BUFFER(*extra, NULL) | |||||
if (Tokenizer_parse_entity(self)) | |||||
return NULL; | |||||
} | |||||
else if (this == *" ") { | |||||
if (brackets) { | |||||
if (Tokenizer_emit(self, ExternalLinkSeparator)) | |||||
return NULL; | |||||
self->topstack->context ^= LC_EXT_LINK_URI; | |||||
self->topstack->context |= LC_EXT_LINK_TITLE; | |||||
self->head++; | |||||
return Tokenizer_parse(self, 0, 0); | |||||
} | |||||
if (Textbuffer_write(extra, *" ")) | |||||
return NULL; | |||||
return Tokenizer_pop(self); | |||||
} | |||||
else if (!brackets) { | |||||
if (Tokenizer_handle_free_link_text(self, &parens, extra, this)) | |||||
return NULL; | |||||
} | |||||
else { | |||||
if (Tokenizer_emit_char(self, this)) | |||||
return NULL; | |||||
} | |||||
self->head++; | |||||
} | |||||
} | |||||
/*
    Remove the URI scheme of a new external link from the textbuffer.

    For a free (bracketless) link, the scheme was already emitted as plain
    text before the link was recognized, so it must be stripped from the
    tail of the textbuffer chain before the link tokens are emitted (the
    Python tokenizer's _parse_free_uri_scheme documents the same dance).
    Returns 0 on success, -1 on a Python-level error.
*/
static int
Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link)
{
    /* The first token of the link holds the URI text, e.g. "http://..." */
    PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"),
             *split, *scheme;
    Py_ssize_t length;
    Textbuffer* temp;
    if (!text)
        return -1;
    /* "scheme:rest" -> ["scheme", "rest"] (maxsplit=1) */
    split = PyObject_CallMethod(text, "split", "si", ":", 1);
    Py_DECREF(text);
    if (!split)
        return -1;
    scheme = PyList_GET_ITEM(split, 0);  /* borrowed reference into split */
    length = PyUnicode_GET_SIZE(scheme);
    /* Walk the textbuffer chain from the most recent node, removing
       whole nodes (or shrinking the last partial one) until the scheme's
       character count has been consumed. */
    while (length) {
        temp = self->topstack->textbuffer;
        if (length <= temp->size) {
            /* Scheme ends inside this node: shrink it and stop. */
            temp->size -= length;
            break;
        }
        /* Entire node belongs to the scheme: unlink and free it. */
        length -= temp->size;
        self->topstack->textbuffer = temp->next;
        free(temp->data);
        free(temp);
    }
    Py_DECREF(split);
    return 0;
}
/*
    Parse an external link at the head of the wikicode string.

    'brackets' is nonzero for [http://... title] links and zero for free
    links. On a failed route the head is reset and the current character
    is treated as ordinary text (or as a definition-list term).
    Returns 0 on success or fallback, -1 on a Python-level error.
*/
static int Tokenizer_parse_external_link(Tokenizer* self, int brackets)
{
    /* Nonzero when the surrounding context forbids starting a link here. */
    #define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK
    /* Fallback: emit the current character as text instead of a link.
       NOTE: expands to return statements in the enclosing function. */
    #define NOT_A_LINK \
        if (!brackets && self->topstack->context & LC_DLTERM) \
            return Tokenizer_handle_dl_term(self); \
        return Tokenizer_emit_char(self, Tokenizer_READ(self, 0))
    Py_ssize_t reset = self->head;  /* restore point if the route fails */
    PyObject *link, *kwargs;
    Textbuffer *extra = 0;  /* trailing text (e.g. punctuation) after a free link */
    if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) {
        NOT_A_LINK;
    }
    extra = Textbuffer_new();
    if (!extra)
        return -1;
    self->head++;
    link = Tokenizer_really_parse_external_link(self, brackets, &extra);
    if (BAD_ROUTE) {
        /* Not actually a link: rewind and fall back to plain text. */
        RESET_ROUTE();
        self->head = reset;
        Textbuffer_dealloc(extra);
        NOT_A_LINK;
    }
    if (!link) {
        Textbuffer_dealloc(extra);
        return -1;
    }
    if (!brackets) {
        /* Free link: the scheme was emitted as plain text before the link
           was recognized, so strip it from the textbuffer. */
        if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) {
            Textbuffer_dealloc(extra);
            Py_DECREF(link);
            return -1;
        }
    }
    kwargs = PyDict_New();
    if (!kwargs) {
        Textbuffer_dealloc(extra);
        Py_DECREF(link);
        return -1;
    }
    PyDict_SetItemString(kwargs, "brackets", brackets ? Py_True : Py_False);
    if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) {
        Textbuffer_dealloc(extra);
        Py_DECREF(link);
        return -1;
    }
    if (Tokenizer_emit_all(self, link)) {
        Textbuffer_dealloc(extra);
        Py_DECREF(link);
        return -1;
    }
    Py_DECREF(link);
    if (Tokenizer_emit(self, ExternalLinkClose)) {
        Textbuffer_dealloc(extra);
        return -1;
    }
    /* If any trailing text was collected, emit it after the link; the
       emit call consumes 'extra', otherwise free it ourselves. */
    if (extra->size || extra->next)
        return Tokenizer_emit_textbuffer(self, extra, 0);
    Textbuffer_dealloc(extra);
    return 0;
}
/* | |||||
Parse a section heading at the head of the wikicode string. | Parse a section heading at the head of the wikicode string. | ||||
*/ | */ | ||||
static int Tokenizer_parse_heading(Tokenizer* self) | static int Tokenizer_parse_heading(Tokenizer* self) | ||||
@@ -1238,15 +1637,8 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) | |||||
static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text) | static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text) | ||||
{ | { | ||||
Py_UNICODE next = Tokenizer_READ(self, 1); | Py_UNICODE next = Tokenizer_READ(self, 1); | ||||
int i, is_marker = 0; | |||||
for (i = 0; i < NUM_MARKERS; i++) { | |||||
if (*MARKERS[i] == text) { | |||||
is_marker = 1; | |||||
break; | |||||
} | |||||
} | |||||
if (!is_marker || !Tokenizer_CAN_RECURSE(self)) | |||||
if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self)) | |||||
return Tokenizer_emit_char(self, text); | return Tokenizer_emit_char(self, text); | ||||
else if (text == next && next == *"{") | else if (text == next && next == *"{") | ||||
return Tokenizer_parse_template_or_argument(self); | return Tokenizer_parse_template_or_argument(self); | ||||
@@ -1264,17 +1656,11 @@ static int | |||||
Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk) | Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk) | ||||
{ | { | ||||
PyObject *trash; | PyObject *trash; | ||||
int first_time, i, is_marker = 0, escaped; | |||||
int first_time, escaped; | |||||
if (data->context & TAG_NAME) { | if (data->context & TAG_NAME) { | ||||
first_time = !(data->context & TAG_NOTE_SPACE); | first_time = !(data->context & TAG_NOTE_SPACE); | ||||
for (i = 0; i < NUM_MARKERS; i++) { | |||||
if (*MARKERS[i] == chunk) { | |||||
is_marker = 1; | |||||
break; | |||||
} | |||||
} | |||||
if (is_marker || (Py_UNICODE_ISSPACE(chunk) && first_time)) { | |||||
if (is_marker(chunk) || (Py_UNICODE_ISSPACE(chunk) && first_time)) { | |||||
// Tags must start with text, not spaces | // Tags must start with text, not spaces | ||||
Tokenizer_fail_route(self); | Tokenizer_fail_route(self); | ||||
return 0; | return 0; | ||||
@@ -1623,7 +2009,6 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) | |||||
Textbuffer* buf; | Textbuffer* buf; | ||||
PyObject *name, *tag; | PyObject *name, *tag; | ||||
Py_UNICODE this; | Py_UNICODE this; | ||||
int is_marker, i; | |||||
self->head += 2; | self->head += 2; | ||||
buf = Textbuffer_new(); | buf = Textbuffer_new(); | ||||
@@ -1631,14 +2016,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) | |||||
return -1; | return -1; | ||||
while (1) { | while (1) { | ||||
this = Tokenizer_READ(self, pos); | this = Tokenizer_READ(self, pos); | ||||
is_marker = 0; | |||||
for (i = 0; i < NUM_MARKERS; i++) { | |||||
if (*MARKERS[i] == this) { | |||||
is_marker = 1; | |||||
break; | |||||
} | |||||
} | |||||
if (is_marker) { | |||||
if (is_marker(this)) { | |||||
name = Textbuffer_render(buf); | name = Textbuffer_render(buf); | ||||
if (!name) { | if (!name) { | ||||
Textbuffer_dealloc(buf); | Textbuffer_dealloc(buf); | ||||
@@ -1985,9 +2363,9 @@ static int Tokenizer_handle_hr(Tokenizer* self) | |||||
self->head++; | self->head++; | ||||
} | } | ||||
markup = Textbuffer_render(buffer); | markup = Textbuffer_render(buffer); | ||||
Textbuffer_dealloc(buffer); | |||||
if (!markup) | if (!markup) | ||||
return -1; | return -1; | ||||
Textbuffer_dealloc(buffer); | |||||
kwargs = PyDict_New(); | kwargs = PyDict_New(); | ||||
if (!kwargs) | if (!kwargs) | ||||
return -1; | return -1; | ||||
@@ -2047,21 +2425,21 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) | |||||
*/ | */ | ||||
static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | ||||
{ | { | ||||
if (context & LC_FAIL_NEXT) { | |||||
if (context & LC_FAIL_NEXT) | |||||
return -1; | return -1; | ||||
} | |||||
if (context & LC_WIKILINK_TITLE) { | |||||
if (data == *"]" || data == *"{") | |||||
if (context & LC_WIKILINK) { | |||||
if (context & LC_WIKILINK_TEXT) | |||||
return (data == *"[" && Tokenizer_READ(self, 1) == *"[") ? -1 : 0; | |||||
else if (data == *"]" || data == *"{") | |||||
self->topstack->context |= LC_FAIL_NEXT; | self->topstack->context |= LC_FAIL_NEXT; | ||||
else if (data == *"\n" || data == *"[" || data == *"}") | else if (data == *"\n" || data == *"[" || data == *"}") | ||||
return -1; | return -1; | ||||
return 0; | return 0; | ||||
} | } | ||||
if (context & LC_TAG_CLOSE) { | |||||
if (data == *"<") | |||||
return -1; | |||||
return 0; | |||||
} | |||||
if (context & LC_EXT_LINK_TITLE) | |||||
return (data == *"\n") ? -1 : 0; | |||||
if (context & LC_TAG_CLOSE) | |||||
return (data == *"<") ? -1 : 0; | |||||
if (context & LC_TEMPLATE_NAME) { | if (context & LC_TEMPLATE_NAME) { | ||||
if (data == *"{" || data == *"}" || data == *"[") { | if (data == *"{" || data == *"}" || data == *"[") { | ||||
self->topstack->context |= LC_FAIL_NEXT; | self->topstack->context |= LC_FAIL_NEXT; | ||||
@@ -2126,7 +2504,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) | |||||
*/ | */ | ||||
static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | ||||
{ | { | ||||
int this_context, is_marker, i; | |||||
int this_context; | |||||
Py_UNICODE this, next, next_next, last; | Py_UNICODE this, next, next_next, last; | ||||
PyObject* temp; | PyObject* temp; | ||||
@@ -2146,14 +2524,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||||
return Tokenizer_fail_route(self); | return Tokenizer_fail_route(self); | ||||
} | } | ||||
} | } | ||||
is_marker = 0; | |||||
for (i = 0; i < NUM_MARKERS; i++) { | |||||
if (*MARKERS[i] == this) { | |||||
is_marker = 1; | |||||
break; | |||||
} | |||||
} | |||||
if (!is_marker) { | |||||
if (!is_marker(this)) { | |||||
if (Tokenizer_emit_char(self, this)) | if (Tokenizer_emit_char(self, this)) | ||||
return NULL; | return NULL; | ||||
self->head++; | self->head++; | ||||
@@ -2192,9 +2563,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||||
if (Tokenizer_emit_char(self, this)) | if (Tokenizer_emit_char(self, this)) | ||||
return NULL; | return NULL; | ||||
} | } | ||||
else if (this == next && next == *"[") { | |||||
if (!(this_context & LC_WIKILINK_TITLE) && | |||||
Tokenizer_CAN_RECURSE(self)) { | |||||
else if (this == next && next == *"[" && Tokenizer_CAN_RECURSE(self)) { | |||||
if (!(this_context & AGG_INVALID_LINK)) { | |||||
if (Tokenizer_parse_wikilink(self)) | if (Tokenizer_parse_wikilink(self)) | ||||
return NULL; | return NULL; | ||||
} | } | ||||
@@ -2207,6 +2577,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||||
} | } | ||||
else if (this == next && next == *"]" && this_context & LC_WIKILINK) | else if (this == next && next == *"]" && this_context & LC_WIKILINK) | ||||
return Tokenizer_handle_wikilink_end(self); | return Tokenizer_handle_wikilink_end(self); | ||||
else if (this == *"[") { | |||||
if (Tokenizer_parse_external_link(self, 1)) | |||||
return NULL; | |||||
} | |||||
else if (this == *":" && !is_marker(last)) { | |||||
if (Tokenizer_parse_external_link(self, 0)) | |||||
return NULL; | |||||
} | |||||
else if (this == *"]" && this_context & LC_EXT_LINK_TITLE) | |||||
return Tokenizer_pop(self); | |||||
else if (this == *"=" && !(self->global & GL_HEADING)) { | else if (this == *"=" && !(self->global & GL_HEADING)) { | ||||
if (last == *"\n" || last == *"") { | if (last == *"\n" || last == *"") { | ||||
if (Tokenizer_parse_heading(self)) | if (Tokenizer_parse_heading(self)) | ||||
@@ -2243,9 +2623,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||||
return NULL; | return NULL; | ||||
} | } | ||||
} | } | ||||
else if (this == *"<") { | |||||
if (!(this_context & LC_TAG_CLOSE) && | |||||
Tokenizer_CAN_RECURSE(self)) { | |||||
else if (this == *"<" && !(this_context & LC_TAG_CLOSE)) { | |||||
if (Tokenizer_CAN_RECURSE(self)) { | |||||
if (Tokenizer_parse_tag(self)) | if (Tokenizer_parse_tag(self)) | ||||
return NULL; | return NULL; | ||||
} | } | ||||
@@ -2289,8 +2668,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) | |||||
static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | ||||
{ | { | ||||
PyObject *text, *temp; | PyObject *text, *temp; | ||||
int context = 0; | |||||
if (PyArg_ParseTuple(args, "U", &text)) { | |||||
if (PyArg_ParseTuple(args, "U|i", &text, &context)) { | |||||
Py_XDECREF(self->text); | Py_XDECREF(self->text); | ||||
self->text = PySequence_Fast(text, "expected a sequence"); | self->text = PySequence_Fast(text, "expected a sequence"); | ||||
} | } | ||||
@@ -2299,7 +2679,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||||
Py_ssize_t size; | Py_ssize_t size; | ||||
/* Failed to parse a Unicode object; try a string instead. */ | /* Failed to parse a Unicode object; try a string instead. */ | ||||
PyErr_Clear(); | PyErr_Clear(); | ||||
if (!PyArg_ParseTuple(args, "s#", &encoded, &size)) | |||||
if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context)) | |||||
return NULL; | return NULL; | ||||
temp = PyUnicode_FromStringAndSize(encoded, size); | temp = PyUnicode_FromStringAndSize(encoded, size); | ||||
if (!text) | if (!text) | ||||
@@ -2311,7 +2691,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) | |||||
} | } | ||||
self->head = self->global = self->depth = self->cycles = 0; | self->head = self->global = self->depth = self->cycles = 0; | ||||
self->length = PyList_GET_SIZE(self->text); | self->length = PyList_GET_SIZE(self->text); | ||||
return Tokenizer_parse(self, 0, 1); | |||||
return Tokenizer_parse(self, context, 1); | |||||
} | } | ||||
static int load_entitydefs(void) | static int load_entitydefs(void) | ||||
@@ -2389,6 +2769,11 @@ static int load_tokens(void) | |||||
WikilinkSeparator = PyObject_GetAttrString(tokens, "WikilinkSeparator"); | WikilinkSeparator = PyObject_GetAttrString(tokens, "WikilinkSeparator"); | ||||
WikilinkClose = PyObject_GetAttrString(tokens, "WikilinkClose"); | WikilinkClose = PyObject_GetAttrString(tokens, "WikilinkClose"); | ||||
ExternalLinkOpen = PyObject_GetAttrString(tokens, "ExternalLinkOpen"); | |||||
ExternalLinkSeparator = PyObject_GetAttrString(tokens, | |||||
"ExternalLinkSeparator"); | |||||
ExternalLinkClose = PyObject_GetAttrString(tokens, "ExternalLinkClose"); | |||||
HTMLEntityStart = PyObject_GetAttrString(tokens, "HTMLEntityStart"); | HTMLEntityStart = PyObject_GetAttrString(tokens, "HTMLEntityStart"); | ||||
HTMLEntityNumeric = PyObject_GetAttrString(tokens, "HTMLEntityNumeric"); | HTMLEntityNumeric = PyObject_GetAttrString(tokens, "HTMLEntityNumeric"); | ||||
HTMLEntityHex = PyObject_GetAttrString(tokens, "HTMLEntityHex"); | HTMLEntityHex = PyObject_GetAttrString(tokens, "HTMLEntityHex"); | ||||
@@ -2413,13 +2798,13 @@ static int load_tokens(void) | |||||
return 0; | return 0; | ||||
} | } | ||||
static int load_tag_defs(void) | |||||
static int load_definitions(void) | |||||
{ | { | ||||
PyObject *tempmod, | PyObject *tempmod, | ||||
*globals = PyEval_GetGlobals(), | *globals = PyEval_GetGlobals(), | ||||
*locals = PyEval_GetLocals(), | *locals = PyEval_GetLocals(), | ||||
*fromlist = PyList_New(1), | *fromlist = PyList_New(1), | ||||
*modname = IMPORT_NAME_FUNC("tag_defs"); | |||||
*modname = IMPORT_NAME_FUNC("definitions"); | |||||
char *name = "mwparserfromhell"; | char *name = "mwparserfromhell"; | ||||
if (!fromlist || !modname) | if (!fromlist || !modname) | ||||
@@ -2429,7 +2814,7 @@ static int load_tag_defs(void) | |||||
Py_DECREF(fromlist); | Py_DECREF(fromlist); | ||||
if (!tempmod) | if (!tempmod) | ||||
return -1; | return -1; | ||||
tag_defs = PyObject_GetAttrString(tempmod, "tag_defs"); | |||||
definitions = PyObject_GetAttrString(tempmod, "definitions"); | |||||
Py_DECREF(tempmod); | Py_DECREF(tempmod); | ||||
return 0; | return 0; | ||||
} | } | ||||
@@ -2452,7 +2837,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void) | |||||
NOARGS = PyTuple_New(0); | NOARGS = PyTuple_New(0); | ||||
if (!EMPTY || !NOARGS) | if (!EMPTY || !NOARGS) | ||||
INIT_ERROR; | INIT_ERROR; | ||||
if (load_entitydefs() || load_tokens() || load_tag_defs()) | |||||
if (load_entitydefs() || load_tokens() || load_definitions()) | |||||
INIT_ERROR; | INIT_ERROR; | ||||
#ifdef IS_PY3K | #ifdef IS_PY3K | ||||
return module; | return module; | ||||
@@ -62,7 +62,7 @@ static char** entitydefs; | |||||
static PyObject* EMPTY; | static PyObject* EMPTY; | ||||
static PyObject* NOARGS; | static PyObject* NOARGS; | ||||
static PyObject* tag_defs; | |||||
static PyObject* definitions; | |||||
/* Tokens: */ | /* Tokens: */ | ||||
@@ -82,6 +82,10 @@ static PyObject* WikilinkOpen; | |||||
static PyObject* WikilinkSeparator; | static PyObject* WikilinkSeparator; | ||||
static PyObject* WikilinkClose; | static PyObject* WikilinkClose; | ||||
static PyObject* ExternalLinkOpen; | |||||
static PyObject* ExternalLinkSeparator; | |||||
static PyObject* ExternalLinkClose; | |||||
static PyObject* HTMLEntityStart; | static PyObject* HTMLEntityStart; | ||||
static PyObject* HTMLEntityNumeric; | static PyObject* HTMLEntityNumeric; | ||||
static PyObject* HTMLEntityHex; | static PyObject* HTMLEntityHex; | ||||
@@ -104,48 +108,53 @@ static PyObject* TagCloseClose; | |||||
/* Local contexts: */ | /* Local contexts: */ | ||||
#define LC_TEMPLATE 0x0000007 | |||||
#define LC_TEMPLATE_NAME 0x0000001 | |||||
#define LC_TEMPLATE_PARAM_KEY 0x0000002 | |||||
#define LC_TEMPLATE_PARAM_VALUE 0x0000004 | |||||
#define LC_ARGUMENT 0x0000018 | |||||
#define LC_ARGUMENT_NAME 0x0000008 | |||||
#define LC_ARGUMENT_DEFAULT 0x0000010 | |||||
#define LC_WIKILINK 0x0000060 | |||||
#define LC_WIKILINK_TITLE 0x0000020 | |||||
#define LC_WIKILINK_TEXT 0x0000040 | |||||
#define LC_HEADING 0x0001F80 | |||||
#define LC_HEADING_LEVEL_1 0x0000080 | |||||
#define LC_HEADING_LEVEL_2 0x0000100 | |||||
#define LC_HEADING_LEVEL_3 0x0000200 | |||||
#define LC_HEADING_LEVEL_4 0x0000400 | |||||
#define LC_HEADING_LEVEL_5 0x0000800 | |||||
#define LC_HEADING_LEVEL_6 0x0001000 | |||||
#define LC_TAG 0x001E000 | |||||
#define LC_TAG_OPEN 0x0002000 | |||||
#define LC_TAG_ATTR 0x0004000 | |||||
#define LC_TAG_BODY 0x0008000 | |||||
#define LC_TAG_CLOSE 0x0010000 | |||||
#define LC_STYLE 0x01E0000 | |||||
#define LC_STYLE_ITALICS 0x0020000 | |||||
#define LC_STYLE_BOLD 0x0040000 | |||||
#define LC_STYLE_PASS_AGAIN 0x0080000 | |||||
#define LC_STYLE_SECOND_PASS 0x0100000 | |||||
#define LC_DLTERM 0x0200000 | |||||
#define LC_SAFETY_CHECK 0xFC00000 | |||||
#define LC_HAS_TEXT 0x0400000 | |||||
#define LC_FAIL_ON_TEXT 0x0800000 | |||||
#define LC_FAIL_NEXT 0x1000000 | |||||
#define LC_FAIL_ON_LBRACE 0x2000000 | |||||
#define LC_FAIL_ON_RBRACE 0x4000000 | |||||
#define LC_FAIL_ON_EQUALS 0x8000000 | |||||
#define LC_TEMPLATE 0x00000007 | |||||
#define LC_TEMPLATE_NAME 0x00000001 | |||||
#define LC_TEMPLATE_PARAM_KEY 0x00000002 | |||||
#define LC_TEMPLATE_PARAM_VALUE 0x00000004 | |||||
#define LC_ARGUMENT 0x00000018 | |||||
#define LC_ARGUMENT_NAME 0x00000008 | |||||
#define LC_ARGUMENT_DEFAULT 0x00000010 | |||||
#define LC_WIKILINK 0x00000060 | |||||
#define LC_WIKILINK_TITLE 0x00000020 | |||||
#define LC_WIKILINK_TEXT 0x00000040 | |||||
#define LC_EXT_LINK 0x00000380 | |||||
#define LC_EXT_LINK_URI 0x00000080 | |||||
#define LC_EXT_LINK_TITLE 0x00000100 | |||||
#define LC_EXT_LINK_BRACKETS 0x00000200 | |||||
#define LC_HEADING 0x0000FC00 | |||||
#define LC_HEADING_LEVEL_1 0x00000400 | |||||
#define LC_HEADING_LEVEL_2 0x00000800 | |||||
#define LC_HEADING_LEVEL_3 0x00001000 | |||||
#define LC_HEADING_LEVEL_4 0x00002000 | |||||
#define LC_HEADING_LEVEL_5 0x00004000 | |||||
#define LC_HEADING_LEVEL_6 0x00008000 | |||||
#define LC_TAG 0x000F0000 | |||||
#define LC_TAG_OPEN 0x00010000 | |||||
#define LC_TAG_ATTR 0x00020000 | |||||
#define LC_TAG_BODY 0x00040000 | |||||
#define LC_TAG_CLOSE 0x00080000 | |||||
#define LC_STYLE 0x00F00000 | |||||
#define LC_STYLE_ITALICS 0x00100000 | |||||
#define LC_STYLE_BOLD 0x00200000 | |||||
#define LC_STYLE_PASS_AGAIN 0x00400000 | |||||
#define LC_STYLE_SECOND_PASS 0x00800000 | |||||
#define LC_DLTERM 0x01000000 | |||||
#define LC_SAFETY_CHECK 0x7E000000 | |||||
#define LC_HAS_TEXT 0x02000000 | |||||
#define LC_FAIL_ON_TEXT 0x04000000 | |||||
#define LC_FAIL_NEXT 0x08000000 | |||||
#define LC_FAIL_ON_LBRACE 0x10000000 | |||||
#define LC_FAIL_ON_RBRACE 0x20000000 | |||||
#define LC_FAIL_ON_EQUALS 0x40000000 | |||||
/* Global contexts: */ | /* Global contexts: */ | ||||
@@ -153,9 +162,10 @@ static PyObject* TagCloseClose; | |||||
/* Aggregate contexts: */ | /* Aggregate contexts: */ | ||||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) | |||||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | |||||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) | |||||
#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) | |||||
#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) | |||||
#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) | |||||
#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK) | |||||
/* Tag contexts: */ | /* Tag contexts: */ | ||||
@@ -174,6 +184,7 @@ static PyObject* TagCloseClose; | |||||
struct Textbuffer { | struct Textbuffer { | ||||
Py_ssize_t size; | Py_ssize_t size; | ||||
Py_UNICODE* data; | Py_UNICODE* data; | ||||
struct Textbuffer* prev; | |||||
struct Textbuffer* next; | struct Textbuffer* next; | ||||
}; | }; | ||||
@@ -228,12 +239,14 @@ typedef struct { | |||||
#define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1) | #define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1) | ||||
/* Macros for accessing HTML tag definitions: */ | |||||
/* Macros for accessing definitions: */ | |||||
#define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li") | #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li") | ||||
#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag)) | |||||
#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag)) | |||||
#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag)) | |||||
#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL)) | |||||
#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL)) | |||||
#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL)) | |||||
#define IS_SCHEME(scheme, slashes, reverse) \ | |||||
(call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False)) | |||||
/* Function prototypes: */ | /* Function prototypes: */ | ||||
@@ -247,6 +260,8 @@ static void TagData_dealloc(TagData*); | |||||
static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); | static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); | ||||
static void Tokenizer_dealloc(Tokenizer*); | static void Tokenizer_dealloc(Tokenizer*); | ||||
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | ||||
static int Tokenizer_parse_entity(Tokenizer*); | |||||
static int Tokenizer_handle_dl_term(Tokenizer*); | |||||
static int Tokenizer_parse_tag(Tokenizer*); | static int Tokenizer_parse_tag(Tokenizer*); | ||||
static PyObject* Tokenizer_parse(Tokenizer*, int, int); | static PyObject* Tokenizer_parse(Tokenizer*, int, int); | ||||
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | ||||
@@ -26,7 +26,8 @@ import re | |||||
from . import contexts, tokens | from . import contexts, tokens | ||||
from ..compat import htmlentities | from ..compat import htmlentities | ||||
from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only | |||||
from ..definitions import (get_html_tag, is_parsable, is_single, | |||||
is_single_only, is_scheme) | |||||
__all__ = ["Tokenizer"] | __all__ = ["Tokenizer"] | ||||
@@ -60,7 +61,7 @@ class Tokenizer(object): | |||||
START = object() | START = object() | ||||
END = object() | END = object() | ||||
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", | MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", | ||||
":", "/", "-", "\n", END] | |||||
":", "/", "-", "\n", START, END] | |||||
MAX_DEPTH = 40 | MAX_DEPTH = 40 | ||||
MAX_CYCLES = 100000 | MAX_CYCLES = 100000 | ||||
regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) | regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) | ||||
@@ -311,6 +312,168 @@ class Tokenizer(object): | |||||
self._head += 1 | self._head += 1 | ||||
return self._pop() | return self._pop() | ||||
    def _parse_bracketed_uri_scheme(self):
        """Parse the URI scheme of a bracket-enclosed external link.

        Fails the route (raising BadRoute via ``_fail_route``) when the
        text at the head is not a valid scheme.
        """
        self._push(contexts.EXT_LINK_URI)
        # Protocol-relative links ("//example.com") have no scheme at all.
        if self._read() == self._read(1) == "/":
            self._emit_text("//")
            self._head += 2
        else:
            valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
            all_valid = lambda: all(char in valid for char in self._read())
            scheme = ""
            # Consume chunks of valid scheme characters until EOF or an
            # invalid character appears.
            while self._read() is not self.END and all_valid():
                scheme += self._read()
                self._emit_text(self._read())
                self._head += 1
            # A scheme must be terminated by a colon.
            if self._read() != ":":
                self._fail_route()
            self._emit_text(":")
            self._head += 1
            slashes = self._read() == self._read(1) == "/"
            if slashes:
                self._emit_text("//")
                self._head += 2
            # Reject schemes that are unknown, or that require "//" when
            # none is present.
            if not is_scheme(scheme, slashes):
                self._fail_route()
    def _parse_free_uri_scheme(self):
        """Parse the URI scheme of a free (no brackets) external link.

        Raises BadRoute when the text preceding the ":" is not a valid,
        recognized scheme.
        """
        valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"
        scheme = []
        try:
            # We have to backtrack through the textbuffer looking for our
            # scheme since it was just parsed as text:
            for chunk in reversed(self._textbuffer):
                for char in reversed(chunk):
                    # Whitespace or a marker cleanly ends the backward
                    # search; any other invalid character means there is
                    # no scheme here at all.
                    if char.isspace() or char in self.MARKERS:
                        raise StopIteration()
                    if char not in valid:
                        raise BadRoute()
                    scheme.append(char)
        except StopIteration:
            pass
        # Characters were collected back-to-front; restore their order.
        scheme = "".join(reversed(scheme))
        slashes = self._read() == self._read(1) == "/"
        if not is_scheme(scheme, slashes):
            raise BadRoute()
        self._push(contexts.EXT_LINK_URI)
        # Re-emit the scheme on the new stack; the caller later strips the
        # old copy from the textbuffer (_remove_uri_scheme_from_textbuffer).
        self._emit_text(scheme)
        self._emit_text(":")
        if slashes:
            self._emit_text("//")
            self._head += 2
def _handle_free_link_text(self, punct, tail, this): | |||||
"""Handle text in a free ext link, including trailing punctuation.""" | |||||
if "(" in this and ")" in punct: | |||||
punct = punct[:-1] # ')' is not longer valid punctuation | |||||
if this.endswith(punct): | |||||
for i in reversed(range(-len(this), 0)): | |||||
if i == -len(this) or this[i - 1] not in punct: | |||||
break | |||||
stripped = this[:i] | |||||
if stripped and tail: | |||||
self._emit_text(tail) | |||||
tail = "" | |||||
tail += this[i:] | |||||
this = stripped | |||||
elif tail: | |||||
self._emit_text(tail) | |||||
tail = "" | |||||
self._emit_text(this) | |||||
return punct, tail | |||||
    def _really_parse_external_link(self, brackets):
        """Really parse an external link.

        Returns a ``(link stack, trailing text, head delta)`` triple.  The
        trailing text is punctuation held back from the end of a free
        link; the delta is -1 when the terminating character must be
        re-read by the caller.
        """
        if brackets:
            self._parse_bracketed_uri_scheme()
            invalid = ("\n", " ", "]")
        else:
            self._parse_free_uri_scheme()
            invalid = ("\n", " ", "[", "]")
        # NOTE(review): "\." is not a recognized escape, so this literal
        # also contains a backslash character in addition to "." —
        # presumably only , ; . : ! ? ) were intended; confirm.
        punct = tuple(",;\.:!?)")
        if self._read() is self.END or self._read()[0] in invalid:
            self._fail_route()
        tail = ""
        while True:
            this, next = self._read(), self._read(1)
            if this is self.END or this == "\n":
                # Bracketed links may not span lines; free links just end.
                if brackets:
                    self._fail_route()
                return self._pop(), tail, -1
            elif this == next == "{" and self._can_recurse():
                # Flush held-back tail before recursing into a template.
                if tail:
                    self._emit_text(tail)
                    tail = ""
                self._parse_template_or_argument()
            elif this == "[":
                if brackets:
                    self._emit_text("[")
                else:
                    # "[" terminates a free link.
                    return self._pop(), tail, -1
            elif this == "]":
                return self._pop(), tail, 0 if brackets else -1
            elif this == "&":
                if tail:
                    self._emit_text(tail)
                    tail = ""
                self._parse_entity()
            elif " " in this:
                # A space ends the URI; in a bracketed link it separates
                # the URI from the title.
                before, after = this.split(" ", 1)
                if brackets:
                    self._emit_text(before)
                    self._emit(tokens.ExternalLinkSeparator())
                    if after:
                        self._emit_text(after)
                    self._context ^= contexts.EXT_LINK_URI
                    self._context |= contexts.EXT_LINK_TITLE
                    self._head += 1
                    return self._parse(push=False), None, 0
                punct, tail = self._handle_free_link_text(punct, tail, before)
                return self._pop(), tail + " " + after, 0
            elif not brackets:
                punct, tail = self._handle_free_link_text(punct, tail, this)
            else:
                self._emit_text(this)
            self._head += 1
def _remove_uri_scheme_from_textbuffer(self, scheme): | |||||
"""Remove the URI scheme of a new external link from the textbuffer.""" | |||||
length = len(scheme) | |||||
while length: | |||||
if length < len(self._textbuffer[-1]): | |||||
self._textbuffer[-1] = self._textbuffer[-1][:-length] | |||||
break | |||||
length -= len(self._textbuffer[-1]) | |||||
self._textbuffer.pop() | |||||
    def _parse_external_link(self, brackets):
        """Parse an external link at the head of the wikicode string.

        *brackets* is True for ``[http://... title]`` links and False for
        free links.  When no valid link is found, the head is reset and
        the current character is emitted as plain text (or handled as a
        definition-list term).
        """
        reset = self._head
        self._head += 1
        try:
            # Links may not start inside certain contexts (template or
            # argument names, wikilinks, other external links), nor when
            # the recursion budget is exhausted.
            bad_context = self._context & contexts.INVALID_LINK
            if bad_context or not self._can_recurse():
                raise BadRoute()
            link, extra, delta = self._really_parse_external_link(brackets)
        except BadRoute:
            self._head = reset
            if not brackets and self._context & contexts.DL_TERM:
                # The ":" that triggered us is a definition-list term
                # marker instead.
                self._handle_dl_term()
            else:
                self._emit_text(self._read())
        else:
            if not brackets:
                # Strip the scheme from the textbuffer: it was emitted as
                # ordinary text before the link was recognized.
                scheme = link[0].text.split(":", 1)[0]
                self._remove_uri_scheme_from_textbuffer(scheme)
            self._emit(tokens.ExternalLinkOpen(brackets=brackets))
            self._emit_all(link)
            self._emit(tokens.ExternalLinkClose())
            self._head += delta
            if extra:
                self._emit_text(extra)
def _parse_heading(self): | def _parse_heading(self): | ||||
"""Parse a section heading at the head of the wikicode string.""" | """Parse a section heading at the head of the wikicode string.""" | ||||
self._global |= contexts.GL_HEADING | self._global |= contexts.GL_HEADING | ||||
@@ -810,12 +973,16 @@ class Tokenizer(object): | |||||
context = self._context | context = self._context | ||||
if context & contexts.FAIL_NEXT: | if context & contexts.FAIL_NEXT: | ||||
return False | return False | ||||
if context & contexts.WIKILINK_TITLE: | |||||
if this == "]" or this == "{": | |||||
if context & contexts.WIKILINK: | |||||
if context & contexts.WIKILINK_TEXT: | |||||
return not (this == self._read(1) == "[") | |||||
elif this == "]" or this == "{": | |||||
self._context |= contexts.FAIL_NEXT | self._context |= contexts.FAIL_NEXT | ||||
elif this == "\n" or this == "[" or this == "}": | elif this == "\n" or this == "[" or this == "}": | ||||
return False | return False | ||||
return True | return True | ||||
elif context & contexts.EXT_LINK_TITLE: | |||||
return this != "\n" | |||||
elif context & contexts.TEMPLATE_NAME: | elif context & contexts.TEMPLATE_NAME: | ||||
if this == "{" or this == "}" or this == "[": | if this == "{" or this == "}" or this == "[": | ||||
self._context |= contexts.FAIL_NEXT | self._context |= contexts.FAIL_NEXT | ||||
@@ -898,8 +1065,8 @@ class Tokenizer(object): | |||||
return self._handle_argument_end() | return self._handle_argument_end() | ||||
else: | else: | ||||
self._emit_text("}") | self._emit_text("}") | ||||
elif this == next == "[": | |||||
if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): | |||||
elif this == next == "[" and self._can_recurse(): | |||||
if not self._context & contexts.INVALID_LINK: | |||||
self._parse_wikilink() | self._parse_wikilink() | ||||
else: | else: | ||||
self._emit_text("[") | self._emit_text("[") | ||||
@@ -907,6 +1074,12 @@ class Tokenizer(object): | |||||
self._handle_wikilink_separator() | self._handle_wikilink_separator() | ||||
elif this == next == "]" and self._context & contexts.WIKILINK: | elif this == next == "]" and self._context & contexts.WIKILINK: | ||||
return self._handle_wikilink_end() | return self._handle_wikilink_end() | ||||
elif this == "[": | |||||
self._parse_external_link(True) | |||||
elif this == ":" and self._read(-1) not in self.MARKERS: | |||||
self._parse_external_link(False) | |||||
elif this == "]" and self._context & contexts.EXT_LINK_TITLE: | |||||
return self._pop() | |||||
elif this == "=" and not self._global & contexts.GL_HEADING: | elif this == "=" and not self._global & contexts.GL_HEADING: | ||||
if self._read(-1) in ("\n", self.START): | if self._read(-1) in ("\n", self.START): | ||||
self._parse_heading() | self._parse_heading() | ||||
@@ -928,8 +1101,8 @@ class Tokenizer(object): | |||||
self._handle_tag_open_close() | self._handle_tag_open_close() | ||||
else: | else: | ||||
self._handle_invalid_tag_start() | self._handle_invalid_tag_start() | ||||
elif this == "<": | |||||
if not self._context & contexts.TAG_CLOSE and self._can_recurse(): | |||||
elif this == "<" and not self._context & contexts.TAG_CLOSE: | |||||
if self._can_recurse(): | |||||
self._parse_tag() | self._parse_tag() | ||||
else: | else: | ||||
self._emit_text("<") | self._emit_text("<") | ||||
@@ -952,8 +1125,9 @@ class Tokenizer(object): | |||||
self._emit_text(this) | self._emit_text(this) | ||||
self._head += 1 | self._head += 1 | ||||
def tokenize(self, text): | |||||
def tokenize(self, text, context=0): | |||||
"""Build a list of tokens from a string of wikicode and return it.""" | """Build a list of tokens from a string of wikicode and return it.""" | ||||
split = self.regex.split(text) | split = self.regex.split(text) | ||||
self._text = [segment for segment in split if segment] | self._text = [segment for segment in split if segment] | ||||
return self._parse() | |||||
self._head = self._global = self._depth = self._cycles = 0 | |||||
return self._parse(context) |
@@ -84,6 +84,10 @@ WikilinkOpen = make("WikilinkOpen") # [[ | |||||
WikilinkSeparator = make("WikilinkSeparator") # | | WikilinkSeparator = make("WikilinkSeparator") # | | ||||
WikilinkClose = make("WikilinkClose") # ]] | WikilinkClose = make("WikilinkClose") # ]] | ||||
ExternalLinkOpen = make("ExternalLinkOpen") # [ | |||||
ExternalLinkSeparator = make("ExternalLinkSeparator") # | |||||
ExternalLinkClose = make("ExternalLinkClose") # ] | |||||
HTMLEntityStart = make("HTMLEntityStart") # & | HTMLEntityStart = make("HTMLEntityStart") # & | ||||
HTMLEntityNumeric = make("HTMLEntityNumeric") # # | HTMLEntityNumeric = make("HTMLEntityNumeric") # # | ||||
HTMLEntityHex = make("HTMLEntityHex") # x | HTMLEntityHex = make("HTMLEntityHex") # x | ||||
@@ -33,7 +33,7 @@ from .smart_list import SmartList | |||||
__all__ = ["parse_anything"] | __all__ = ["parse_anything"] | ||||
def parse_anything(value): | |||||
def parse_anything(value, context=0): | |||||
"""Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. | """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. | ||||
This differs from :py:meth:`.Parser.parse` in that we accept more than just | This differs from :py:meth:`.Parser.parse` in that we accept more than just | ||||
@@ -44,6 +44,12 @@ def parse_anything(value): | |||||
on-the-fly by various methods of :py:class:`~.Wikicode` and others like | on-the-fly by various methods of :py:class:`~.Wikicode` and others like | ||||
:py:class:`~.Template`, such as :py:meth:`wikicode.insert() | :py:class:`~.Template`, such as :py:meth:`wikicode.insert() | ||||
<.Wikicode.insert>` or setting :py:meth:`template.name <.Template.name>`. | <.Wikicode.insert>` or setting :py:meth:`template.name <.Template.name>`. | ||||
If given, *context* will be passed as a starting context to the parser. | |||||
This is helpful when this function is used inside node attribute setters. | |||||
For example, :py:class:`~.ExternalLink`\ 's :py:attr:`~.ExternalLink.url` | |||||
setter sets *context* to :py:mod:`contexts.EXT_LINK_URI <.contexts>` to | |||||
prevent the URL itself from becoming an :py:class:`~.ExternalLink`. | |||||
""" | """ | ||||
from .parser import Parser | from .parser import Parser | ||||
from .wikicode import Wikicode | from .wikicode import Wikicode | ||||
@@ -53,17 +59,17 @@ def parse_anything(value): | |||||
elif isinstance(value, Node): | elif isinstance(value, Node): | ||||
return Wikicode(SmartList([value])) | return Wikicode(SmartList([value])) | ||||
elif isinstance(value, str): | elif isinstance(value, str): | ||||
return Parser(value).parse() | |||||
return Parser().parse(value, context) | |||||
elif isinstance(value, bytes): | elif isinstance(value, bytes): | ||||
return Parser(value.decode("utf8")).parse() | |||||
return Parser().parse(value.decode("utf8"), context) | |||||
elif isinstance(value, int): | elif isinstance(value, int): | ||||
return Parser(str(value)).parse() | |||||
return Parser().parse(str(value), context) | |||||
elif value is None: | elif value is None: | ||||
return Wikicode(SmartList()) | return Wikicode(SmartList()) | ||||
try: | try: | ||||
nodelist = SmartList() | nodelist = SmartList() | ||||
for item in value: | for item in value: | ||||
nodelist += parse_anything(item).nodes | |||||
nodelist += parse_anything(item, context).nodes | |||||
except TypeError: | except TypeError: | ||||
error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" | error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" | ||||
raise ValueError(error.format(type(value).__name__, value)) | raise ValueError(error.format(type(value).__name__, value)) | ||||
@@ -24,8 +24,8 @@ from __future__ import unicode_literals | |||||
import re | import re | ||||
from .compat import maxsize, py3k, str | from .compat import maxsize, py3k, str | ||||
from .nodes import (Argument, Comment, Heading, HTMLEntity, Node, Tag, | |||||
Template, Text, Wikilink) | |||||
from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, | |||||
Node, Tag, Template, Text, Wikilink) | |||||
from .string_mixin import StringMixIn | from .string_mixin import StringMixIn | ||||
from .utils import parse_anything | from .utils import parse_anything | ||||
@@ -509,6 +509,6 @@ class Wikicode(StringMixIn): | |||||
return "\n".join(self._get_tree(self, [], marker, 0)) | return "\n".join(self._get_tree(self, [], marker, 0)) | ||||
Wikicode._build_filter_methods( | Wikicode._build_filter_methods( | ||||
arguments=Argument, comments=Comment, headings=Heading, | |||||
html_entities=HTMLEntity, tags=Tag, templates=Template, text=Text, | |||||
wikilinks=Wikilink) | |||||
arguments=Argument, comments=Comment, external_links=ExternalLink, | |||||
headings=Heading, html_entities=HTMLEntity, tags=Tag, templates=Template, | |||||
text=Text, wikilinks=Wikilink) |
@@ -23,8 +23,8 @@ | |||||
from __future__ import unicode_literals | from __future__ import unicode_literals | ||||
import unittest | import unittest | ||||
from mwparserfromhell.nodes import (Argument, Comment, Heading, HTMLEntity, | |||||
Tag, Template, Text, Wikilink) | |||||
from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, | |||||
HTMLEntity, Tag, Template, Text, Wikilink) | |||||
from mwparserfromhell.nodes.extras import Attribute, Parameter | from mwparserfromhell.nodes.extras import Attribute, Parameter | ||||
from mwparserfromhell.parser import tokens | from mwparserfromhell.parser import tokens | ||||
from mwparserfromhell.parser.builder import Builder | from mwparserfromhell.parser.builder import Builder | ||||
@@ -150,6 +150,48 @@ class TestBuilder(TreeEqualityTestCase): | |||||
for test, valid in tests: | for test, valid in tests: | ||||
self.assertWikicodeEqual(valid, self.builder.build(test)) | self.assertWikicodeEqual(valid, self.builder.build(test)) | ||||
def test_external_link(self): | |||||
"""tests for building ExternalLink nodes""" | |||||
tests = [ | |||||
([tokens.ExternalLinkOpen(brackets=False), | |||||
tokens.Text(text="http://example.com/"), | |||||
tokens.ExternalLinkClose()], | |||||
wrap([ExternalLink(wraptext("http://example.com/"), | |||||
brackets=False)])), | |||||
([tokens.ExternalLinkOpen(brackets=True), | |||||
tokens.Text(text="http://example.com/"), | |||||
tokens.ExternalLinkClose()], | |||||
wrap([ExternalLink(wraptext("http://example.com/"))])), | |||||
([tokens.ExternalLinkOpen(brackets=True), | |||||
tokens.Text(text="http://example.com/"), | |||||
tokens.ExternalLinkSeparator(), tokens.ExternalLinkClose()], | |||||
wrap([ExternalLink(wraptext("http://example.com/"), wrap([]))])), | |||||
([tokens.ExternalLinkOpen(brackets=True), | |||||
tokens.Text(text="http://example.com/"), | |||||
tokens.ExternalLinkSeparator(), tokens.Text(text="Example"), | |||||
tokens.ExternalLinkClose()], | |||||
wrap([ExternalLink(wraptext("http://example.com/"), | |||||
wraptext("Example"))])), | |||||
([tokens.ExternalLinkOpen(brackets=False), | |||||
tokens.Text(text="http://example"), tokens.Text(text=".com/foo"), | |||||
tokens.ExternalLinkClose()], | |||||
wrap([ExternalLink(wraptext("http://example", ".com/foo"), | |||||
brackets=False)])), | |||||
([tokens.ExternalLinkOpen(brackets=True), | |||||
tokens.Text(text="http://example"), tokens.Text(text=".com/foo"), | |||||
tokens.ExternalLinkSeparator(), tokens.Text(text="Example"), | |||||
tokens.Text(text=" Web Page"), tokens.ExternalLinkClose()], | |||||
wrap([ExternalLink(wraptext("http://example", ".com/foo"), | |||||
wraptext("Example", " Web Page"))])), | |||||
] | |||||
for test, valid in tests: | |||||
self.assertWikicodeEqual(valid, self.builder.build(test)) | |||||
def test_html_entity(self): | def test_html_entity(self): | ||||
"""tests for building HTMLEntity nodes""" | """tests for building HTMLEntity nodes""" | ||||
tests = [ | tests = [ | ||||
@@ -0,0 +1,130 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
from __future__ import unicode_literals | |||||
import unittest | |||||
from mwparserfromhell.compat import str | |||||
from mwparserfromhell.nodes import ExternalLink, Text | |||||
from ._test_tree_equality import TreeEqualityTestCase, getnodes, wrap, wraptext | |||||
class TestExternalLink(TreeEqualityTestCase):
    """Test cases for the ExternalLink node."""

    def test_unicode(self):
        """test ExternalLink.__unicode__()"""
        bare = ExternalLink(wraptext("http://example.com/"), brackets=False)
        self.assertEqual("http://example.com/", str(bare))
        bracketed = ExternalLink(wraptext("http://example.com/"))
        self.assertEqual("[http://example.com/]", str(bracketed))
        empty_title = ExternalLink(wraptext("http://example.com/"), wrap([]))
        self.assertEqual("[http://example.com/ ]", str(empty_title))
        titled = ExternalLink(wraptext("http://example.com/"),
                              wraptext("Example Web Page"))
        self.assertEqual("[http://example.com/ Example Web Page]", str(titled))

    def test_iternodes(self):
        """test ExternalLink.__iternodes__()"""
        url_a = Text("http://example.com/")
        url_b = Text("http://example.com/")
        title_b1, title_b2 = Text("Example"), Text("Page")
        link_a = ExternalLink(wrap([url_a]), brackets=False)
        link_b = ExternalLink(wrap([url_b]), wrap([title_b1, title_b2]))
        walker_a = link_a.__iternodes__(getnodes)
        walker_b = link_b.__iternodes__(getnodes)
        # The node itself is yielded first, keyed by None:
        self.assertEqual((None, link_a), next(walker_a))
        self.assertEqual((None, link_b), next(walker_b))
        # Then each child, keyed by the attribute that owns it:
        self.assertEqual((link_a.url, url_a), next(walker_a))
        self.assertEqual((link_b.url, url_b), next(walker_b))
        self.assertEqual((link_b.title, title_b1), next(walker_b))
        self.assertEqual((link_b.title, title_b2), next(walker_b))
        self.assertRaises(StopIteration, next, walker_a)
        self.assertRaises(StopIteration, next, walker_b)

    def test_strip(self):
        """test ExternalLink.__strip__()"""
        bare = ExternalLink(wraptext("http://example.com"), brackets=False)
        plain = ExternalLink(wraptext("http://example.com"))
        untitled = ExternalLink(wraptext("http://example.com"), wrap([]))
        titled = ExternalLink(wraptext("http://example.com"), wraptext("Link"))
        # Result must not depend on either strip flag:
        for normalize in (True, False):
            for collapse in (True, False):
                self.assertEqual("http://example.com",
                                 bare.__strip__(normalize, collapse))
                self.assertEqual(None, plain.__strip__(normalize, collapse))
                self.assertEqual(None,
                                 untitled.__strip__(normalize, collapse))
                self.assertEqual("Link", titled.__strip__(normalize, collapse))

    def test_showtree(self):
        """test ExternalLink.__showtree__()"""
        log = []
        getter, marker = object(), object()
        get = lambda code: log.append((getter, code))
        mark = lambda: log.append(marker)
        bare = ExternalLink(wraptext("http://example.com"), brackets=False)
        titled = ExternalLink(wraptext("http://example.com"), wraptext("Link"))
        bare.__showtree__(log.append, get, mark)
        titled.__showtree__(log.append, get, mark)
        expected = [
            (getter, bare.url), "[", (getter, titled.url),
            (getter, titled.title), "]"]
        self.assertEqual(expected, log)

    def test_url(self):
        """test getter/setter for the url attribute"""
        url = wraptext("http://example.com/")
        bare = ExternalLink(url, brackets=False)
        titled = ExternalLink(url, wraptext("Example"))
        self.assertIs(url, bare.url)
        self.assertIs(url, titled.url)
        bare.url = "mailto:héhehé@spam.com"
        titled.url = "mailto:héhehé@spam.com"
        self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"), bare.url)
        self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"),
                                 titled.url)

    def test_title(self):
        """test getter/setter for the title attribute"""
        title = wraptext("Example!")
        untitled = ExternalLink(wraptext("http://example.com/"),
                                brackets=False)
        titled = ExternalLink(wraptext("http://example.com/"), title)
        self.assertIs(None, untitled.title)
        self.assertIs(title, titled.title)
        titled.title = None
        self.assertIs(None, titled.title)
        titled.title = "My Website"
        self.assertWikicodeEqual(wraptext("My Website"), titled.title)

    def test_brackets(self):
        """test getter/setter for the brackets attribute"""
        bare = ExternalLink(wraptext("http://example.com/"), brackets=False)
        titled = ExternalLink(wraptext("http://example.com/"),
                              wraptext("Link"))
        self.assertFalse(bare.brackets)
        self.assertTrue(titled.brackets)
        bare.brackets = True
        titled.brackets = False
        self.assertTrue(bare.brackets)
        self.assertFalse(titled.brackets)
        # __unicode__ must honor the updated flag:
        self.assertEqual("[http://example.com/]", str(bare))
        self.assertEqual("http://example.com/", str(titled))
# Allow running this test module directly from the command line.
if __name__ == "__main__":
    unittest.main(verbosity=2)
@@ -36,9 +36,9 @@ class TestParser(TreeEqualityTestCase): | |||||
def test_use_c(self): | def test_use_c(self): | ||||
"""make sure the correct tokenizer is used""" | """make sure the correct tokenizer is used""" | ||||
if parser.use_c: | if parser.use_c: | ||||
self.assertTrue(parser.Parser(None)._tokenizer.USES_C) | |||||
self.assertTrue(parser.Parser()._tokenizer.USES_C) | |||||
parser.use_c = False | parser.use_c = False | ||||
self.assertFalse(parser.Parser(None)._tokenizer.USES_C) | |||||
self.assertFalse(parser.Parser()._tokenizer.USES_C) | |||||
def test_parsing(self): | def test_parsing(self): | ||||
"""integration test for parsing overall""" | """integration test for parsing overall""" | ||||
@@ -59,7 +59,7 @@ class TestParser(TreeEqualityTestCase): | |||||
])) | ])) | ||||
]) | ]) | ||||
]) | ]) | ||||
actual = parser.Parser(text).parse() | |||||
actual = parser.Parser().parse(text) | |||||
self.assertWikicodeEqual(expected, actual) | self.assertWikicodeEqual(expected, actual) | ||||
if __name__ == "__main__": | if __name__ == "__main__": | ||||
@@ -276,6 +276,7 @@ class TestWikicode(TreeEqualityTestCase): | |||||
self.assertEqual(["{{{e}}}"], get_filter("arguments")) | self.assertEqual(["{{{e}}}"], get_filter("arguments")) | ||||
self.assertIs(code.get(4), get_filter("arguments")[0]) | self.assertIs(code.get(4), get_filter("arguments")[0]) | ||||
self.assertEqual([], get_filter("comments")) | self.assertEqual([], get_filter("comments")) | ||||
self.assertEqual([], get_filter("external_links")) | |||||
self.assertEqual([], get_filter("headings")) | self.assertEqual([], get_filter("headings")) | ||||
self.assertEqual([], get_filter("html_entities")) | self.assertEqual([], get_filter("html_entities")) | ||||
self.assertEqual([], get_filter("tags")) | self.assertEqual([], get_filter("tags")) | ||||
@@ -0,0 +1,473 @@ | |||||
name: basic | |||||
label: basic external link | |||||
input: "http://example.com/" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose()] | |||||
--- | |||||
name: basic_brackets | |||||
label: basic external link in brackets | |||||
input: "[http://example.com/]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_space | |||||
label: basic external link in brackets, with a space after | |||||
input: "[http://example.com/ ]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_title | |||||
label: basic external link in brackets, with a title | |||||
input: "[http://example.com/ Example]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_multiword_title | |||||
label: basic external link in brackets, with a multi-word title | |||||
input: "[http://example.com/ Example Web Page]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example Web Page"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_adjacent | |||||
label: three adjacent bracket-enclosed external links | |||||
input: "[http://foo.com/ Foo][http://bar.com/ Bar]\n[http://baz.com/ Baz]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foo.com/"), ExternalLinkSeparator(), Text(text="Foo"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://bar.com/"), ExternalLinkSeparator(), Text(text="Bar"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=True), Text(text="http://baz.com/"), ExternalLinkSeparator(), Text(text="Baz"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_newline_before | |||||
label: bracket-enclosed link with a newline before the title | |||||
input: "[http://example.com/ \nExample]" | |||||
output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" \nExample]")] | |||||
--- | |||||
name: brackets_newline_inside | |||||
label: bracket-enclosed link with a newline in the title | |||||
input: "[http://example.com/ Example \nWeb Page]" | |||||
output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example \nWeb Page]")] | |||||
--- | |||||
name: brackets_newline_after | |||||
label: bracket-enclosed link with a newline after the title | |||||
input: "[http://example.com/ Example\n]" | |||||
output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example\n]")] | |||||
--- | |||||
name: brackets_space_before | |||||
label: bracket-enclosed link with a space before the URL | |||||
input: "[ http://example.com Example]" | |||||
output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" Example]")] | |||||
--- | |||||
name: brackets_title_like_url | |||||
label: bracket-enclosed link with a title that looks like a URL | |||||
input: "[http://example.com http://example.com]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="http://example.com"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_recursive | |||||
label: bracket-enclosed link with a bracket-enclosed link as the title | |||||
input: "[http://example.com [http://example.com]]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="[http://example.com"), ExternalLinkClose(), Text(text="]")] | |||||
--- | |||||
name: period_after | |||||
label: a period after a free link that is excluded | |||||
input: "http://example.com." | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=".")] | |||||
--- | |||||
name: colons_after | |||||
label: colons after a free link that are excluded | |||||
input: "http://example.com/foo:bar.:;baz!?," | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo:bar.:;baz"), ExternalLinkClose(), Text(text="!?,")] | |||||
--- | |||||
name: close_paren_after_excluded | |||||
label: a closing parenthesis after a free link that is excluded | |||||
input: "http://example.)com)" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.)com"), ExternalLinkClose(), Text(text=")")] | |||||
--- | |||||
name: close_paren_after_included | |||||
label: a closing parenthesis after a free link that is included because of an opening parenthesis in the URL | |||||
input: "http://example.(com)" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.(com)"), ExternalLinkClose()] | |||||
--- | |||||
name: open_bracket_inside | |||||
label: an open bracket inside a free link that causes it to be ended abruptly | |||||
input: "http://foobar[baz.com" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://foobar"), ExternalLinkClose(), Text(text="[baz.com")] | |||||
--- | |||||
name: brackets_period_after | |||||
label: a period after a bracket-enclosed link that is included | |||||
input: "[http://example.com. Example]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com."), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_colons_after | |||||
label: colons after a bracket-enclosed link that are included | |||||
input: "[http://example.com/foo:bar.:;baz!?, Example]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo:bar.:;baz!?,"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_close_paren_after_included | |||||
label: a closing parenthesis after a bracket-enclosed link that is included | |||||
input: "[http://example.)com) Example]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.)com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_close_paren_after_included_2 | |||||
label: a closing parenthesis after a bracket-enclosed link that is also included | |||||
input: "[http://example.(com) Example]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
--- | |||||
name: brackets_open_bracket_inside | |||||
label: an open bracket inside a bracket-enclosed link that is also included | |||||
input: "[http://foobar[baz.com Example]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
--- | |||||
name: adjacent_space | |||||
label: two free links separated by a space | |||||
input: "http://example.com http://example.com" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] | |||||
--- | |||||
name: adjacent_newline | |||||
label: two free links separated by a newline | |||||
input: "http://example.com\nhttp://example.com" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] | |||||
--- | |||||
name: adjacent_close_bracket | |||||
label: two free links separated by a close bracket | |||||
input: "http://example.com]http://example.com" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="]"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] | |||||
--- | |||||
name: html_entity_in_url | |||||
label: a HTML entity parsed correctly inside a free link | |||||
input: "http://exa&nbsp;mple.com/"
output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="mple.com/"), ExternalLinkClose()] | |||||
--- | |||||
name: template_in_url | |||||
label: a template parsed correctly inside a free link | |||||
input: "http://exa{{template}}mple.com/" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), TemplateOpen(), Text(text="template"), TemplateClose(), Text(text="mple.com/"), ExternalLinkClose()] | |||||
--- | |||||
name: argument_in_url | |||||
label: an argument parsed correctly inside a free link | |||||
input: "http://exa{{{argument}}}mple.com/" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ArgumentOpen(), Text(text="argument"), ArgumentClose(), Text(text="mple.com/"), ExternalLinkClose()] | |||||
--- | |||||
name: wikilink_in_url | |||||
label: a wikilink that destroys a free link | |||||
input: "http://exa[[wikilink]]mple.com/" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ExternalLinkClose(), WikilinkOpen(), Text(text="wikilink"), WikilinkClose(), Text(text="mple.com/")] | |||||
--- | |||||
name: external_link_in_url | |||||
label: a bracketed link that destroys a free link | |||||
input: "http://exa[http://example.com/]mple.com/" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose(), Text(text="mple.com/")] | |||||
--- | |||||
name: spaces_padding | |||||
label: spaces padding a free link | |||||
input: " http://example.com " | |||||
output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" ")] | |||||
--- | |||||
name: text_and_spaces_padding | |||||
label: text and spaces padding a free link | |||||
input: "x http://example.com x" | |||||
output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" x")] | |||||
--- | |||||
name: template_before | |||||
label: a template before a free link | |||||
input: "{{foo}}http://example.com" | |||||
output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] | |||||
--- | |||||
name: spaces_padding_no_slashes | |||||
label: spaces padding a free link with no slashes after the colon | |||||
input: " mailto:example@example.com " | |||||
output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" ")] | |||||
--- | |||||
name: text_and_spaces_padding_no_slashes | |||||
label: text and spaces padding a free link with no slashes after the colon | |||||
input: "x mailto:example@example.com x" | |||||
output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" x")] | |||||
--- | |||||
name: template_before_no_slashes | |||||
label: a template before a free link with no slashes after the colon | |||||
input: "{{foo}}mailto:example@example.com" | |||||
output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()] | |||||
--- | |||||
name: no_slashes | |||||
label: a free link with no slashes after the colon | |||||
input: "mailto:example@example.com" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()] | |||||
--- | |||||
name: slashes_optional | |||||
label: a free link using a scheme that doesn't need slashes, but has them anyway | |||||
input: "mailto://example@example.com" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="mailto://example@example.com"), ExternalLinkClose()] | |||||
--- | |||||
name: short | |||||
label: a very short free link | |||||
input: "mailto://abc" | |||||
output: [ExternalLinkOpen(brackets=False), Text(text="mailto://abc"), ExternalLinkClose()] | |||||
--- | |||||
name: slashes_missing | |||||
label: slashes missing from a free link with a scheme that requires them | |||||
input: "http:example@example.com" | |||||
output: [Text(text="http:example@example.com")] | |||||
--- | |||||
name: no_scheme_but_slashes | |||||
label: no scheme in a free link, but slashes (protocol-relative free links are not supported) | |||||
input: "//example.com" | |||||
output: [Text(text="//example.com")] | |||||
--- | |||||
name: no_scheme_but_colon | |||||
label: no scheme in a free link, but a colon | |||||
input: " :example.com" | |||||
output: [Text(text=" :example.com")] | |||||
--- | |||||
name: no_scheme_but_colon_and_slashes | |||||
label: no scheme in a free link, but a colon and slashes | |||||
input: " ://example.com" | |||||
output: [Text(text=" ://example.com")] | |||||
--- | |||||
name: fake_scheme_no_slashes | |||||
label: a nonexistent scheme in a free link, without slashes | |||||
input: "fake:example.com" | |||||
output: [Text(text="fake:example.com")] | |||||
--- | |||||
name: fake_scheme_slashes | |||||
label: a nonexistent scheme in a free link, with slashes | |||||
input: "fake://example.com" | |||||
output: [Text(text="fake://example.com")] | |||||
--- | |||||
name: fake_scheme_brackets_no_slashes | |||||
label: a nonexistent scheme in a bracketed link, without slashes | |||||
input: "[fake:example.com]" | |||||
output: [Text(text="[fake:example.com]")] | |||||
--- | |||||
name: fake_scheme_brackets_slashes | |||||
label: a nonexistent scheme in a bracketed link, with slashes | |||||
input: "[fake://example.com]" | |||||
output: [Text(text="[fake://example.com]")] | |||||
--- | |||||
name: interrupted_scheme | |||||
label: an otherwise valid scheme with something in the middle of it, in a free link | |||||
input: "ht?tp://example.com" | |||||
output: [Text(text="ht?tp://example.com")] | |||||
--- | |||||
name: interrupted_scheme_brackets | |||||
label: an otherwise valid scheme with something in the middle of it, in a bracketed link | |||||
input: "[ht?tp://example.com]" | |||||
output: [Text(text="[ht?tp://example.com]")] | |||||
--- | |||||
name: no_slashes_brackets | |||||
label: no slashes after the colon in a bracketed link | |||||
input: "[mailto:example@example.com Example]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="mailto:example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
--- | |||||
name: space_before_no_slashes_brackets | |||||
label: a space before a bracketed link with no slashes after the colon | |||||
input: "[ mailto:example@example.com Example]" | |||||
output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" Example]")] | |||||
--- | |||||
name: slashes_optional_brackets | |||||
label: a bracketed link using a scheme that doesn't need slashes, but has them anyway | |||||
input: "[mailto://example@example.com Example]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="mailto://example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
--- | |||||
name: short_brackets | |||||
label: a very short link in brackets | |||||
input: "[mailto://abc Example]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="mailto://abc"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
--- | |||||
name: slashes_missing_brackets | |||||
label: slashes missing from a scheme that requires them in a bracketed link | |||||
input: "[http:example@example.com Example]" | |||||
output: [Text(text="[http:example@example.com Example]")] | |||||
--- | |||||
name: protocol_relative | |||||
label: a protocol-relative link (in brackets) | |||||
input: "[//example.com Example]" | |||||
output: [ExternalLinkOpen(brackets=True), Text(text="//example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] | |||||
--- | |||||
name: scheme_missing_but_colon_brackets | |||||
label: scheme missing from a bracketed link, but with a colon | |||||
input: "[:example.com Example]" | |||||
output: [Text(text="[:example.com Example]")] | |||||
--- | |||||
name: scheme_missing_but_colon_slashes_brackets | |||||
label: scheme missing from a bracketed link, but with a colon and slashes | |||||
input: "[://example.com Example]" | |||||
output: [Text(text="[://example.com Example]")] | |||||
--- | |||||
name: unclosed_protocol_relative | |||||
label: an unclosed protocol-relative bracketed link | |||||
input: "[//example.com" | |||||
output: [Text(text="[//example.com")] | |||||
--- | |||||
name: space_before_protocol_relative | |||||
label: a space before a protocol-relative bracketed link | |||||
input: "[ //example.com]" | |||||
output: [Text(text="[ //example.com]")] | |||||
--- | |||||
name: unclosed_just_scheme | |||||
label: an unclosed bracketed link, ending after the scheme | |||||
input: "[http" | |||||
output: [Text(text="[http")] | |||||
--- | |||||
name: unclosed_scheme_colon | |||||
label: an unclosed bracketed link, ending after the colon | |||||
input: "[http:" | |||||
output: [Text(text="[http:")] | |||||
--- | |||||
name: unclosed_scheme_colon_slashes | |||||
label: an unclosed bracketed link, ending after the slashes | |||||
input: "[http://" | |||||
output: [Text(text="[http://")] | |||||
--- | |||||
name: incomplete_bracket | |||||
label: just an open bracket | |||||
input: "[" | |||||
output: [Text(text="[")] | |||||
--- | |||||
name: incomplete_scheme_colon | |||||
label: a free link with just a scheme and a colon | |||||
input: "http:" | |||||
output: [Text(text="http:")] | |||||
--- | |||||
name: incomplete_scheme_colon_slashes | |||||
label: a free link with just a scheme, colon, and slashes | |||||
input: "http://" | |||||
output: [Text(text="http://")] | |||||
--- | |||||
name: brackets_scheme_but_no_url | |||||
label: brackets around a scheme and a colon | |||||
input: "[mailto:]" | |||||
output: [Text(text="[mailto:]")] | |||||
--- | |||||
name: brackets_scheme_slashes_but_no_url | |||||
label: brackets around a scheme, colon, and slashes | |||||
input: "[http://]" | |||||
output: [Text(text="[http://]")] | |||||
--- | |||||
name: brackets_scheme_title_but_no_url | |||||
label: brackets around a scheme, colon, and slashes, with a title | |||||
input: "[http:// Example]" | |||||
output: [Text(text="[http:// Example]")] |
@@ -12,6 +12,13 @@ output: [TemplateOpen(), ArgumentOpen(), ArgumentOpen(), Text(text="foo"), Argum | |||||
--- | --- | ||||
name: link_in_template_name | |||||
label: a wikilink inside a template name, which breaks the template | |||||
input: "{{foo[[bar]]}}" | |||||
output: [Text(text="{{foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="}}")] | |||||
--- | |||||
name: rich_heading | name: rich_heading | ||||
label: a heading with templates/wikilinks in it | label: a heading with templates/wikilinks in it | ||||
input: "== Head{{ing}} [[with]] {{{funky|{{stuf}}}}} ==" | input: "== Head{{ing}} [[with]] {{{funky|{{stuf}}}}} ==" | ||||
@@ -51,3 +58,17 @@ name: wildcard_redux | |||||
label: an even wilder assortment of various things | label: an even wilder assortment of various things | ||||
input: "{{a|b|{{c|[[d]]{{{e}}}}}}}[[f|{{{g}}}<!--h-->]]{{i|j= }}" | input: "{{a|b|{{c|[[d]]{{{e}}}}}}}[[f|{{{g}}}<!--h-->]]{{i|j= }}" | ||||
output: [TemplateOpen(), Text(text="a"), TemplateParamSeparator(), Text(text="b"), TemplateParamSeparator(), TemplateOpen(), Text(text="c"), TemplateParamSeparator(), WikilinkOpen(), Text(text="d"), WikilinkClose(), ArgumentOpen(), Text(text="e"), ArgumentClose(), TemplateClose(), TemplateClose(), WikilinkOpen(), Text(text="f"), WikilinkSeparator(), ArgumentOpen(), Text(text="g"), ArgumentClose(), CommentStart(), Text(text="h"), CommentEnd(), WikilinkClose(), TemplateOpen(), Text(text="i"), TemplateParamSeparator(), Text(text="j"), TemplateParamEquals(), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), TemplateClose()] | output: [TemplateOpen(), Text(text="a"), TemplateParamSeparator(), Text(text="b"), TemplateParamSeparator(), TemplateOpen(), Text(text="c"), TemplateParamSeparator(), WikilinkOpen(), Text(text="d"), WikilinkClose(), ArgumentOpen(), Text(text="e"), ArgumentClose(), TemplateClose(), TemplateClose(), WikilinkOpen(), Text(text="f"), WikilinkSeparator(), ArgumentOpen(), Text(text="g"), ArgumentClose(), CommentStart(), Text(text="h"), CommentEnd(), WikilinkClose(), TemplateOpen(), Text(text="i"), TemplateParamSeparator(), Text(text="j"), TemplateParamEquals(), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), TemplateClose()] | ||||
--- | |||||
name: link_inside_dl | |||||
label: an external link inside a def list, such that the external link is parsed | |||||
input: ";;;mailto:example" | |||||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example"), ExternalLinkClose()] | |||||
--- | |||||
name: link_inside_dl_2 | |||||
label: an external link inside a def list, such that the external link is not parsed | |||||
input: ";;;malito:example" | |||||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")] |
@@ -40,17 +40,17 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar|b | |||||
--- | --- | ||||
name: nested | |||||
label: a wikilink nested within the value of another | |||||
input: "[[foo|[[bar]]]]" | |||||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()] | |||||
name: newline_text | |||||
label: a newline in the middle of the text | |||||
input: "[[foo|foo\nbar]]" | |||||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="foo\nbar"), WikilinkClose()] | |||||
--- | --- | ||||
name: nested_with_text | |||||
label: a wikilink nested within the value of another, separated by other data | |||||
input: "[[foo|a[[b]]c]]" | |||||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()] | |||||
name: bracket_text | |||||
label: a left bracket in the middle of the text | |||||
input: "[[foo|bar[baz]]" | |||||
output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[baz"), WikilinkClose()] | |||||
--- | --- | ||||
@@ -96,13 +96,34 @@ output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), | |||||
--- | --- | ||||
name: invalid_nested_text | |||||
name: invalid_nested_padding | |||||
label: invalid wikilink: trying to nest in the wrong context, with a text param | label: invalid wikilink: trying to nest in the wrong context, with a text param | ||||
input: "[[foo[[bar]]|baz]]" | input: "[[foo[[bar]]|baz]]" | ||||
output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="|baz]]")] | output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="|baz]]")] | ||||
--- | --- | ||||
name: invalid_nested_text | |||||
label: invalid wikilink: a wikilink nested within the value of another | |||||
input: "[[foo|[[bar]]" | |||||
output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()] | |||||
--- | |||||
name: invalid_nested_text_2 | |||||
label: invalid wikilink: a wikilink nested within the value of another, two pairs of closing brackets | |||||
input: "[[foo|[[bar]]]]" | |||||
output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")] | |||||
--- | |||||
name: invalid_nested_text_padding | |||||
label: invalid wikilink: a wikilink nested within the value of another, separated by other data | |||||
input: "[[foo|a[[b]]c]]" | |||||
output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")] | |||||
--- | |||||
name: incomplete_open_only | name: incomplete_open_only | ||||
label: incomplete wikilinks: just an open | label: incomplete wikilinks: just an open | ||||
input: "[[" | input: "[[" | ||||