From fff93b77270209e01ff0d482d7e8c0f1824c556d Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Mon, 19 Aug 2013 23:04:44 -0400
Subject: [PATCH 01/35] Add changelog entries for ExternalLinks.

---
 CHANGELOG          | 8 +++++---
 docs/changelog.rst | 9 +++++----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 8922738..84edc60 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,8 +1,10 @@
 v0.3 (unreleased):
 
-- Added complete support for HTML Tags, along with appropriate unit tests. This
-  includes forms like <ref>foo</ref>, <ref name="bar"/>, and wiki-markup tags
-  like bold ('''), italics (''), and lists (*, #, ; and :).
+- Added complete support for HTML Tags, including forms like <ref>foo</ref>,
+  <ref name="bar"/>, and wiki-markup tags like bold ('''), italics (''), and
+  lists (*, #, ; and :).
+- Added support for ExternalLinks (http://example.com/ and
+  [http://example.com/ Example]).
 - Wikicode's filter methods are now passed 'recursive=True' by default instead
   of False. This is a breaking change if you rely on any filter() methods being
   non-recursive by default.
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 86dfd78..810f594 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -7,10 +7,11 @@ v0.3
 Unreleased
 (`changes <https://github.com/earwig/mwparserfromhell/compare/v0.2...develop>`__):
 
-- Added complete support for HTML :py:class:`Tags <.Tag>`, along with
-  appropriate unit tests. This includes forms like ``<ref>foo</ref>``,
-  ``<ref name="bar"/>``, and wiki-markup tags like bold (``'''``), italics
-  (``''``), and lists (``*``, ``#``, ``;`` and ``:``).
+- Added complete support for HTML :py:class:`Tags <.Tag>`, including forms like
+  ``<ref>foo</ref>``, ``<ref name="bar"/>``, and wiki-markup tags like bold
+  (``'''``), italics (``''``), and lists (``*``, ``#``, ``;`` and ``:``).
+- Added support for :py:class:`.ExternalLink`\ s (``http://example.com/`` and
+  ``[http://example.com/ Example]``).
 - :py:class:`Wikicode's <.Wikicode>` :py:meth:`.filter` methods are now passed
   *recursive=True* by default instead of *False*. **This is a breaking change
   if you rely on any filter() methods being non-recursive by default.**
From 0886b6fbf6256f36a062448fda31fcd79da10d89 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Mon, 19 Aug 2013 23:05:13 -0400
Subject: [PATCH 02/35] Add ExternalLink Node type.

---
 mwparserfromhell/nodes/external_link.py | 95 +++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 mwparserfromhell/nodes/external_link.py

diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py
new file mode 100644
index 0000000..a604f9a
--- /dev/null
+++ b/mwparserfromhell/nodes/external_link.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2012-2013 Ben Kurtovic
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +from . import Node +from ..compat import str +from ..utils import parse_anything + +__all__ = ["ExternalLink"] + +class ExternalLink(Node): + """Represents an external link, like ``[http://example.com/ Example]``.""" + + def __init__(self, url, title=None, brackets=True): + super(ExternalLink, self).__init__() + self._url = url + self._title = title + self._brackets = brackets + + def __unicode__(self): + if self.brackets: + if self.title is not None: + return "[" + str(self.url) + " " + str(self.title) + "]" + return "[" + str(self.url) + "]" + return str(self.url) + + def __iternodes__(self, getter): + yield None, self + for child in getter(self.url): + yield self.url, child + if self.title is not None: + for child in getter(self.title): + yield self.title, child + + def __strip__(self, normalize, collapse): + if self.title.strip(): + return self.title.strip_code(normalize, collapse) + return None + + def __showtree__(self, write, get, mark): + write("[") + get(self.url) + if self.title is not None: + get(self.title) + write("]") + + @property + def url(self): + """The url of the link target, as a :py:class:`~.Wikicode` object.""" + return self._url + + @property + def title(self): + """The link title (if given), as a :py:class:`~.Wikicode` object.""" + return self._title + + @property + def brackets(self): + """Whether to enclose the URL in brackets or display it straight.""" + return self._brackets + + @url.setter + def url(self, value): + self._url = parse_anything(value) + + @title.setter + def title(self, value): + if value is None: + self._title = None + else: + self._title = parse_anything(value) + + @brackets.setter + def brackets(self, value): + self._brackets = bool(value) From 8fe8b1fef59446a24d1c66dc6b683dd5a3760a58 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Mon, 19 Aug 2013 23:12:36 -0400 Subject: [PATCH 03/35] Implement ExternalLinks as tokens and in the builder. --- mwparserfromhell/nodes/__init__.py | 1 + mwparserfromhell/parser/builder.py | 22 ++++++++++++++++++++-- mwparserfromhell/parser/tokens.py | 4 ++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index faaa0b2..ba97b3f 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -69,6 +69,7 @@ from . import extras from .text import Text from .argument import Argument from .comment import Comment +from .external_link import ExternalLink from .heading import Heading from .html_entity import HTMLEntity from .tag import Tag diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 196ef14..ee914c3 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -24,8 +24,8 @@ from __future__ import unicode_literals from . 
import tokens
 from ..compat import str
-from ..nodes import (Argument, Comment, Heading, HTMLEntity, Tag, Template,
-                     Text, Wikilink)
+from ..nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, Tag,
+                     Template, Text, Wikilink)
 from ..nodes.extras import Attribute, Parameter
 from ..smart_list import SmartList
 from ..wikicode import Wikicode
@@ -234,6 +234,22 @@ class Builder(object):
         else:
             self._write(self._handle_token(token))
 
+    def _handle_external_link(self, token):
+        """Handle when an external link is at the head of the tokens."""
+        brackets, url = token.brackets, None
+        self._push()
+        while self._tokens:
+            token = self._tokens.pop()
+            if isinstance(token, tokens.ExternalLinkSeparator):
+                url = self._pop()
+                self._push()
+            elif isinstance(token, tokens.ExternalLinkClose):
+                if url is not None:
+                    return ExternalLink(url, self._pop(), brackets)
+                return ExternalLink(self._pop(), brackets=brackets)
+            else:
+                self._write(self._handle_token(token))
+
     def _handle_token(self, token):
         """Handle a single token."""
         if isinstance(token, tokens.Text):
@@ -252,6 +268,8 @@ class Builder(object):
             return self._handle_comment()
         elif isinstance(token, tokens.TagOpenOpen):
             return self._handle_tag(token)
+        elif isinstance(token, tokens.ExternalLinkOpen):
+            return self._handle_external_link(token)
 
     def build(self, tokenlist):
         """Build a Wikicode object from a list tokens and return it."""
diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py
index 0ffac86..ae58ec8 100644
--- a/mwparserfromhell/parser/tokens.py
+++ b/mwparserfromhell/parser/tokens.py
@@ -104,4 +104,8 @@ TagCloseSelfclose = make("TagCloseSelfclose")  # />
 TagOpenClose = make("TagOpenClose")  # </
 TagCloseClose = make("TagCloseClose")  # >
 
+ExternalLinkOpen = make("ExternalLinkOpen")  # [
+ExternalLinkSeparator = make("ExternalLinkSeparator")  #
+ExternalLinkClose = make("ExternalLinkClose")  # ]
+
 del make
From 88f4fa7c37d321858ccb20bc74e3f4e9e9eaa50a Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Mon, 19 Aug 2013 23:21:15 -0400
Subject: [PATCH 04/35] Add external link contexts; reorder stuff for
 consistency.
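
The new EXTERNAL_LINK_* constants follow the existing convention: each local
context is a single bit, and aggregates like EXTERNAL_LINK are sums of
disjoint bits, so membership tests are plain bitwise ANDs. A quick Python
illustration using the values added below (the combining code here is only an
example, not code from the module):

    EXTERNAL_LINK_URL = 1 << 7
    EXTERNAL_LINK_TITLE = 1 << 8
    EXTERNAL_LINK_BRACKETS = 1 << 9
    EXTERNAL_LINK = EXTERNAL_LINK_URL + EXTERNAL_LINK_TITLE

    # A stack parsing the title of "[http://... title]" would carry:
    context = EXTERNAL_LINK_TITLE | EXTERNAL_LINK_BRACKETS
    assert context & EXTERNAL_LINK           # somewhere inside an ext link
    assert not context & EXTERNAL_LINK_URL   # ...but past the URL portion
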
--- mwparserfromhell/parser/builder.py | 36 ++++++++++++------------ mwparserfromhell/parser/contexts.py | 55 ++++++++++++++++++++++--------------- mwparserfromhell/parser/tokens.py | 8 +++--- 3 files changed, 55 insertions(+), 44 deletions(-) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index ee914c3..d31f450 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -142,6 +142,22 @@ class Builder(object): else: self._write(self._handle_token(token)) + def _handle_external_link(self, token): + """Handle when an external link is at the head of the tokens.""" + brackets, url = token.brackets, None + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.ExternalLinkSeparator): + url = self._pop() + self._push() + elif isinstance(token, tokens.ExternalLinkClose): + if url is not None: + return ExternalLink(url, self._pop(), brackets) + return ExternalLink(self._pop(), brackets=brackets) + else: + self._write(self._handle_token(token)) + def _handle_entity(self): """Handle a case where an HTML entity is at the head of the tokens.""" token = self._tokens.pop() @@ -234,22 +250,6 @@ class Builder(object): else: self._write(self._handle_token(token)) - def _handle_external_link(self, token): - """Handle when an external link is at the head of the tokens.""" - brackets, url = token.brackets, None - self._push() - while self._tokens: - token = self._tokens.pop() - if isinstance(token, tokens.ExternalLinkSeparator): - url = self._pop() - self._push() - elif isinstance(token, tokens.ExternalLinkClose): - if url is not None: - return ExternalLink(url, self._pop(), brackets) - return ExternalLink(self._pop(), brackets=brackets) - else: - self._write(self._handle_token(token)) - def _handle_token(self, token): """Handle a single token.""" if isinstance(token, tokens.Text): @@ -260,6 +260,8 @@ class Builder(object): return self._handle_argument() elif isinstance(token, tokens.WikilinkOpen): return self._handle_wikilink() + elif isinstance(token, tokens.ExternalLinkOpen): + return self._handle_external_link(token) elif isinstance(token, tokens.HTMLEntityStart): return self._handle_entity() elif isinstance(token, tokens.HeadingStart): @@ -268,8 +270,6 @@ class Builder(object): return self._handle_comment() elif isinstance(token, tokens.TagOpenOpen): return self._handle_tag(token) - elif isinstance(token, tokens.ExternalLinkOpen): - return self._handle_external_link(token) def build(self, tokenlist): """Build a Wikicode object from a list tokens and return it.""" diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index a1b67be..38154bb 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -51,6 +51,12 @@ Local (stack-specific) contexts: * :py:const:`WIKILINK_TITLE` * :py:const:`WIKILINK_TEXT` +* :py:const:`EXTERNAL_LINK` + + * :py:const:`EXTERNAL_LINK_URL` + * :py:const:`EXTERNAL_LINK_TITLE` + * :py:const:`EXTERNAL_LINK_BRACKETS` + * :py:const:`HEADING` * :py:const:`HEADING_LEVEL_1` @@ -112,35 +118,40 @@ WIKILINK_TITLE = 1 << 5 WIKILINK_TEXT = 1 << 6 WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT -HEADING_LEVEL_1 = 1 << 7 -HEADING_LEVEL_2 = 1 << 8 -HEADING_LEVEL_3 = 1 << 9 -HEADING_LEVEL_4 = 1 << 10 -HEADING_LEVEL_5 = 1 << 11 -HEADING_LEVEL_6 = 1 << 12 +EXTERNAL_LINK_URL = 1 << 7 +EXTERNAL_LINK_TITLE = 1 << 8 +EXTERNAL_LINK_BRACKETS = 1 << 9 +EXTERNAL_LINK = EXTERNAL_LINK_URL + EXTERNAL_LINK_TITLE + +HEADING_LEVEL_1 = 1 << 10 
+HEADING_LEVEL_2 = 1 << 11
+HEADING_LEVEL_3 = 1 << 12
+HEADING_LEVEL_4 = 1 << 13
+HEADING_LEVEL_5 = 1 << 14
+HEADING_LEVEL_6 = 1 << 15
 HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 +
            HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6)
 
-TAG_OPEN = 1 << 13
-TAG_ATTR = 1 << 14
-TAG_BODY = 1 << 15
-TAG_CLOSE = 1 << 16
+TAG_OPEN = 1 << 16
+TAG_ATTR = 1 << 17
+TAG_BODY = 1 << 18
+TAG_CLOSE = 1 << 19
 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE
 
-STYLE_ITALICS = 1 << 17
-STYLE_BOLD = 1 << 18
-STYLE_PASS_AGAIN = 1 << 19
-STYLE_SECOND_PASS = 1 << 20
+STYLE_ITALICS = 1 << 20
+STYLE_BOLD = 1 << 21
+STYLE_PASS_AGAIN = 1 << 22
+STYLE_SECOND_PASS = 1 << 23
 STYLE = STYLE_ITALICS + STYLE_BOLD + STYLE_PASS_AGAIN + STYLE_SECOND_PASS
 
-DL_TERM = 1 << 21
+DL_TERM = 1 << 24
 
-HAS_TEXT = 1 << 22
-FAIL_ON_TEXT = 1 << 23
-FAIL_NEXT = 1 << 24
-FAIL_ON_LBRACE = 1 << 25
-FAIL_ON_RBRACE = 1 << 26
-FAIL_ON_EQUALS = 1 << 27
+HAS_TEXT = 1 << 25
+FAIL_ON_TEXT = 1 << 26
+FAIL_NEXT = 1 << 27
+FAIL_ON_LBRACE = 1 << 28
+FAIL_ON_RBRACE = 1 << 29
+FAIL_ON_EQUALS = 1 << 30
 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
                 FAIL_ON_RBRACE + FAIL_ON_EQUALS)
@@ -150,7 +161,7 @@ GL_HEADING = 1 << 0
 
 # Aggregate contexts:
 
-FAIL = TEMPLATE + ARGUMENT + WIKILINK + HEADING + TAG + STYLE
+FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXTERNAL_LINK + HEADING + TAG + STYLE
 UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME +
           TAG_CLOSE)
 DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE
diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py
index ae58ec8..57308ea 100644
--- a/mwparserfromhell/parser/tokens.py
+++ b/mwparserfromhell/parser/tokens.py
@@ -84,6 +84,10 @@ WikilinkOpen = make("WikilinkOpen")  # [[
 WikilinkSeparator = make("WikilinkSeparator")  # |
 WikilinkClose = make("WikilinkClose")  # ]]
 
+ExternalLinkOpen = make("ExternalLinkOpen")  # [
+ExternalLinkSeparator = make("ExternalLinkSeparator")  #
+ExternalLinkClose = make("ExternalLinkClose")  # ]
+
 HTMLEntityStart = make("HTMLEntityStart")  # &
 HTMLEntityNumeric = make("HTMLEntityNumeric")  # #
 HTMLEntityHex = make("HTMLEntityHex")  # x
@@ -104,8 +108,4 @@ TagCloseSelfclose = make("TagCloseSelfclose")  # />
 TagOpenClose = make("TagOpenClose")  # </
 TagCloseClose = make("TagCloseClose")  # >
 
-ExternalLinkOpen = make("ExternalLinkOpen")  # [
-ExternalLinkSeparator = make("ExternalLinkSeparator")  #
-ExternalLinkClose = make("ExternalLinkClose")  # ]
-
 del make
From cbf67c78424b5de14d0ad4b9023d81c61fcbe17d Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Tue, 20 Aug 2013 02:07:38 -0400
Subject: [PATCH 05/35] Add hooks for some ext link stuff; add an INVALID_LINK
 aggregate context.
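
INVALID_LINK aggregates the contexts in which a new link must not begin: a
template name, an argument name, a wikilink title, or another external link's
URL. The tokenizer hooks below test it before recursing. A runnable sketch of
the idea in Python (bit values match the new contexts.py; the helper function
is illustrative only, not code from the module):

    TEMPLATE_NAME, ARGUMENT_NAME = 1 << 0, 1 << 3
    WIKILINK_TITLE, EXT_LINK_URL = 1 << 5, 1 << 7
    INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URL

    def can_start_link(context):
        # Mirrors the new hook: links may not begin in these contexts.
        return not context & INVALID_LINK

    assert can_start_link(0)                  # top level: "[[" recurses
    assert not can_start_link(TEMPLATE_NAME)  # "{{foo[[bar]]}}" fails here

The new link_in_template_name test exercises exactly this: in
"{{foo[[bar]]}}", the "[[" appears while a template name is being parsed, so
the template route fails and the input is reinterpreted as plain "{{foo"
followed by an ordinary wikilink and "}}".
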
--- mwparserfromhell/parser/contexts.py | 20 +++---- mwparserfromhell/parser/tokenizer.c | 15 +++--- mwparserfromhell/parser/tokenizer.h | 100 +++++++++++++++++++---------------- mwparserfromhell/parser/tokenizer.py | 18 +++++-- tests/tokenizer/integration.mwtest | 7 +++ 5 files changed, 96 insertions(+), 64 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 38154bb..c6d2941 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -51,11 +51,11 @@ Local (stack-specific) contexts: * :py:const:`WIKILINK_TITLE` * :py:const:`WIKILINK_TEXT` -* :py:const:`EXTERNAL_LINK` +* :py:const:`EXT_LINK` - * :py:const:`EXTERNAL_LINK_URL` - * :py:const:`EXTERNAL_LINK_TITLE` - * :py:const:`EXTERNAL_LINK_BRACKETS` + * :py:const:`EXT_LINK_URL` + * :py:const:`EXT_LINK_TITLE` + * :py:const:`EXT_LINK_BRACKETS` * :py:const:`HEADING` @@ -100,6 +100,7 @@ Aggregate contexts: * :py:const:`FAIL` * :py:const:`UNSAFE` * :py:const:`DOUBLE` +* :py:const:`INVALID_LINK` """ @@ -118,10 +119,10 @@ WIKILINK_TITLE = 1 << 5 WIKILINK_TEXT = 1 << 6 WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT -EXTERNAL_LINK_URL = 1 << 7 -EXTERNAL_LINK_TITLE = 1 << 8 -EXTERNAL_LINK_BRACKETS = 1 << 9 -EXTERNAL_LINK = EXTERNAL_LINK_URL + EXTERNAL_LINK_TITLE +EXT_LINK_URL = 1 << 7 +EXT_LINK_TITLE = 1 << 8 +EXT_LINK_BRACKETS = 1 << 9 +EXT_LINK = EXT_LINK_URL + EXT_LINK_TITLE + EXT_LINK_BRACKETS HEADING_LEVEL_1 = 1 << 10 HEADING_LEVEL_2 = 1 << 11 @@ -161,7 +162,8 @@ GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXTERNAL_LINK + HEADING + TAG + STYLE +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK + HEADING + TAG + STYLE UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE +INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URL diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 67a4ae6..267e7c5 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2192,9 +2192,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) if (Tokenizer_emit_char(self, this)) return NULL; } - else if (this == next && next == *"[") { - if (!(this_context & LC_WIKILINK_TITLE) && - Tokenizer_CAN_RECURSE(self)) { + else if (this == next && next == *"[" && Tokenizer_CAN_RECURSE(self)) { + if (!(this_context & AGG_INVALID_LINK)) { if (Tokenizer_parse_wikilink(self)) return NULL; } @@ -2243,9 +2242,8 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) return NULL; } } - else if (this == *"<") { - if (!(this_context & LC_TAG_CLOSE) && - Tokenizer_CAN_RECURSE(self)) { + else if (this == *"<" && !(this_context & LC_TAG_CLOSE)) { + if (Tokenizer_CAN_RECURSE(self)) { if (Tokenizer_parse_tag(self)) return NULL; } @@ -2389,6 +2387,11 @@ static int load_tokens(void) WikilinkSeparator = PyObject_GetAttrString(tokens, "WikilinkSeparator"); WikilinkClose = PyObject_GetAttrString(tokens, "WikilinkClose"); + ExternalLinkOpen = PyObject_GetAttrString(tokens, "ExternalLinkOpen"); + ExternalLinkSeparator = PyObject_GetAttrString(tokens, + "ExternalLinkSeparator"); + ExternalLinkClose = PyObject_GetAttrString(tokens, "ExternalLinkClose"); + HTMLEntityStart = PyObject_GetAttrString(tokens, "HTMLEntityStart"); HTMLEntityNumeric = PyObject_GetAttrString(tokens, "HTMLEntityNumeric"); HTMLEntityHex = PyObject_GetAttrString(tokens, "HTMLEntityHex"); diff --git 
a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 264360e..16c76eb 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -82,6 +82,10 @@ static PyObject* WikilinkOpen; static PyObject* WikilinkSeparator; static PyObject* WikilinkClose; +static PyObject* ExternalLinkOpen; +static PyObject* ExternalLinkSeparator; +static PyObject* ExternalLinkClose; + static PyObject* HTMLEntityStart; static PyObject* HTMLEntityNumeric; static PyObject* HTMLEntityHex; @@ -104,48 +108,53 @@ static PyObject* TagCloseClose; /* Local contexts: */ -#define LC_TEMPLATE 0x0000007 -#define LC_TEMPLATE_NAME 0x0000001 -#define LC_TEMPLATE_PARAM_KEY 0x0000002 -#define LC_TEMPLATE_PARAM_VALUE 0x0000004 - -#define LC_ARGUMENT 0x0000018 -#define LC_ARGUMENT_NAME 0x0000008 -#define LC_ARGUMENT_DEFAULT 0x0000010 - -#define LC_WIKILINK 0x0000060 -#define LC_WIKILINK_TITLE 0x0000020 -#define LC_WIKILINK_TEXT 0x0000040 - -#define LC_HEADING 0x0001F80 -#define LC_HEADING_LEVEL_1 0x0000080 -#define LC_HEADING_LEVEL_2 0x0000100 -#define LC_HEADING_LEVEL_3 0x0000200 -#define LC_HEADING_LEVEL_4 0x0000400 -#define LC_HEADING_LEVEL_5 0x0000800 -#define LC_HEADING_LEVEL_6 0x0001000 - -#define LC_TAG 0x001E000 -#define LC_TAG_OPEN 0x0002000 -#define LC_TAG_ATTR 0x0004000 -#define LC_TAG_BODY 0x0008000 -#define LC_TAG_CLOSE 0x0010000 - -#define LC_STYLE 0x01E0000 -#define LC_STYLE_ITALICS 0x0020000 -#define LC_STYLE_BOLD 0x0040000 -#define LC_STYLE_PASS_AGAIN 0x0080000 -#define LC_STYLE_SECOND_PASS 0x0100000 - -#define LC_DLTERM 0x0200000 - -#define LC_SAFETY_CHECK 0xFC00000 -#define LC_HAS_TEXT 0x0400000 -#define LC_FAIL_ON_TEXT 0x0800000 -#define LC_FAIL_NEXT 0x1000000 -#define LC_FAIL_ON_LBRACE 0x2000000 -#define LC_FAIL_ON_RBRACE 0x4000000 -#define LC_FAIL_ON_EQUALS 0x8000000 +#define LC_TEMPLATE 0x00000007 +#define LC_TEMPLATE_NAME 0x00000001 +#define LC_TEMPLATE_PARAM_KEY 0x00000002 +#define LC_TEMPLATE_PARAM_VALUE 0x00000004 + +#define LC_ARGUMENT 0x00000018 +#define LC_ARGUMENT_NAME 0x00000008 +#define LC_ARGUMENT_DEFAULT 0x00000010 + +#define LC_WIKILINK 0x00000060 +#define LC_WIKILINK_TITLE 0x00000020 +#define LC_WIKILINK_TEXT 0x00000040 + +#define LC_EXT_LINK 0x00000380 +#define LC_EXT_LINK_URL 0x00000080 +#define LC_EXT_LINK_TITLE 0x00000100 +#define LC_EXT_LINK_BRACKETS 0x00000200 + +#define LC_HEADING 0x0000FC00 +#define LC_HEADING_LEVEL_1 0x00000400 +#define LC_HEADING_LEVEL_2 0x00000800 +#define LC_HEADING_LEVEL_3 0x00001000 +#define LC_HEADING_LEVEL_4 0x00002000 +#define LC_HEADING_LEVEL_5 0x00004000 +#define LC_HEADING_LEVEL_6 0x00008000 + +#define LC_TAG 0x000F0000 +#define LC_TAG_OPEN 0x00010000 +#define LC_TAG_ATTR 0x00020000 +#define LC_TAG_BODY 0x00040000 +#define LC_TAG_CLOSE 0x00080000 + +#define LC_STYLE 0x00F00000 +#define LC_STYLE_ITALICS 0x00100000 +#define LC_STYLE_BOLD 0x00200000 +#define LC_STYLE_PASS_AGAIN 0x00400000 +#define LC_STYLE_SECOND_PASS 0x00800000 + +#define LC_DLTERM 0x01000000 + +#define LC_SAFETY_CHECK 0x7E000000 +#define LC_HAS_TEXT 0x02000000 +#define LC_FAIL_ON_TEXT 0x04000000 +#define LC_FAIL_NEXT 0x08000000 +#define LC_FAIL_ON_LBRACE 0x10000000 +#define LC_FAIL_ON_RBRACE 0x20000000 +#define LC_FAIL_ON_EQUALS 0x40000000 /* Global contexts: */ @@ -153,9 +162,10 @@ static PyObject* TagCloseClose; /* Aggregate contexts: */ -#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) -#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) 
-#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) +#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URL) /* Tag contexts: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 583d2f8..9f675ac 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -311,6 +311,11 @@ class Tokenizer(object): self._head += 1 return self._pop() + def _parse_external_link(self, brackets): + """Parse an external link at the head of the wikicode string.""" + self._emit_text(self._read()) + # raise NotImplementedError() + def _parse_heading(self): """Parse a section heading at the head of the wikicode string.""" self._global |= contexts.GL_HEADING @@ -898,8 +903,8 @@ class Tokenizer(object): return self._handle_argument_end() else: self._emit_text("}") - elif this == next == "[": - if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): + elif this == next == "[" and self._can_recurse(): + if not self._context & contexts.INVALID_LINK: self._parse_wikilink() else: self._emit_text("[") @@ -907,6 +912,11 @@ class Tokenizer(object): self._handle_wikilink_separator() elif this == next == "]" and self._context & contexts.WIKILINK: return self._handle_wikilink_end() + elif this == "[" and not self._context & contexts.INVALID_LINK: ## or this == ":" + if self._can_recurse(): + self._parse_external_link(brackets=this == "[") + else: + self._emit_text("[") elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() @@ -928,8 +938,8 @@ class Tokenizer(object): self._handle_tag_open_close() else: self._handle_invalid_tag_start() - elif this == "<": - if not self._context & contexts.TAG_CLOSE and self._can_recurse(): + elif this == "<" and not self._context & contexts.TAG_CLOSE: + if self._can_recurse(): self._parse_tag() else: self._emit_text("<") diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest index 0277a51..e4ff8c4 100644 --- a/tests/tokenizer/integration.mwtest +++ b/tests/tokenizer/integration.mwtest @@ -12,6 +12,13 @@ output: [TemplateOpen(), ArgumentOpen(), ArgumentOpen(), Text(text="foo"), Argum --- +name: link_in_template_name +label: a wikilink inside a template name, which breaks the template +input: "{{foo[[bar]]}}" +output: [Text(text="{{foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="}}")] + +--- + name: rich_heading label: a heading with templates/wikilinks in it input: "== Head{{ing}} [[with]] {{{funky|{{stuf}}}}} ==" From 5e6e5b6301f5f50ca8585a5b73f72af49898cdf2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 Aug 2013 01:07:32 -0400 Subject: [PATCH 06/35] tag_defs.py -> definitions.py; more outline stuff --- mwparserfromhell/{tag_defs.py => definitions.py} | 2 +- mwparserfromhell/nodes/tag.py | 6 ++-- mwparserfromhell/parser/tokenizer.c | 16 +++++----- mwparserfromhell/parser/tokenizer.h | 8 ++--- mwparserfromhell/parser/tokenizer.py | 37 ++++++++++++++++++------ 5 files changed, 44 insertions(+), 25 deletions(-) rename mwparserfromhell/{tag_defs.py => definitions.py} (97%) diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/definitions.py similarity index 97% rename 
from mwparserfromhell/tag_defs.py rename to mwparserfromhell/definitions.py index 2395fc6..2d7ab0c 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/definitions.py @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -"""Contains data regarding certain HTML tags.""" +"""Contains data about certain markup, like HTML tags and external links.""" from __future__ import unicode_literals diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index b4aec3e..80b8a88 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -25,7 +25,7 @@ from __future__ import unicode_literals from . import Node, Text from .extras import Attribute from ..compat import str -from ..tag_defs import is_visible +from ..definitions import is_visible from ..utils import parse_anything __all__ = ["Tag"] @@ -152,7 +152,7 @@ class Tag(Node): This makes the tag look like a lone close tag. It is technically invalid and is only parsable Wikicode when the tag itself is single-only, like ``
<br>`` and ``<img>``. See
-        :py:func:`.tag_defs.is_single_only`.
+        :py:func:`.definitions.is_single_only`.
         """
         return self._invalid
@@ -161,7 +161,7 @@ class Tag(Node):
         """Whether the tag is implicitly self-closing, with no ending slash.
 
         This is only possible for specific "single" tags like ``<br>`` and
-        ``<li>``. See :py:func:`.tag_defs.is_single`. This field only has an
+        ``<li>
``. See :py:func:`.definitions.is_single`. This field only has an
         effect if :py:attr:`self_closing` is also ``True``.
         """
         return self._implicit
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index 267e7c5..2b74f6b 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -37,12 +37,12 @@ static int heading_level_from_context(int n)
 }
 
 /*
-    Call the given function in tag_defs, using 'tag' as a parameter, and return
-    its output as a bool.
+    Call the given function in definitions.py, using 'tag' as a parameter, and
+    return its output as a bool.
 */
-static int call_tag_def_func(const char* funcname, PyObject* tag)
+static int call_def_func(const char* funcname, PyObject* tag)
 {
-    PyObject* func = PyObject_GetAttrString(tag_defs, funcname);
+    PyObject* func = PyObject_GetAttrString(definitions, funcname);
     PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL);
     int ans = (result == Py_True) ? 1 : 0;
 
@@ -2416,13 +2416,13 @@ static int load_tokens(void)
     return 0;
 }
 
-static int load_tag_defs(void)
+static int load_definitions(void)
 {
     PyObject *tempmod,
              *globals = PyEval_GetGlobals(),
              *locals = PyEval_GetLocals(),
              *fromlist = PyList_New(1),
-             *modname = IMPORT_NAME_FUNC("tag_defs");
+             *modname = IMPORT_NAME_FUNC("definitions");
     char *name = "mwparserfromhell";
 
     if (!fromlist || !modname)
@@ -2432,7 +2432,7 @@ static int load_tag_defs(void)
     Py_DECREF(fromlist);
     if (!tempmod)
         return -1;
-    tag_defs = PyObject_GetAttrString(tempmod, "tag_defs");
+    definitions = PyObject_GetAttrString(tempmod, "definitions");
     Py_DECREF(tempmod);
     return 0;
 }
@@ -2455,7 +2455,7 @@ PyMODINIT_FUNC INIT_FUNC_NAME(void)
     NOARGS = PyTuple_New(0);
     if (!EMPTY || !NOARGS)
         INIT_ERROR;
-    if (load_entitydefs() || load_tokens() || load_tag_defs())
+    if (load_entitydefs() || load_tokens() || load_definitions())
         INIT_ERROR;
 #ifdef IS_PY3K
     return module;
diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h
index 16c76eb..41c1c1b 100644
--- a/mwparserfromhell/parser/tokenizer.h
+++ b/mwparserfromhell/parser/tokenizer.h
@@ -62,7 +62,7 @@ static char** entitydefs;
 
 static PyObject* EMPTY;
 static PyObject* NOARGS;
-static PyObject* tag_defs;
+static PyObject* definitions;
 
 /* Tokens: */
 
@@ -241,9 +241,9 @@ typedef struct {
 /* Macros for accessing HTML tag definitions: */
 
 #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li")
-#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag))
-#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag))
-#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag))
+#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag))
+#define IS_SINGLE(tag) (call_def_func("is_single", tag))
+#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag))
 
 /* Function prototypes: */
 
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 9f675ac..07ae0b1 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -26,7 +26,7 @@ import re
 
 from . 
import contexts, tokens from ..compat import htmlentities -from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only +from ..definitions import get_html_tag, is_parsable, is_single, is_single_only __all__ = ["Tokenizer"] @@ -60,7 +60,7 @@ class Tokenizer(object): START = object() END = object() MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "'", "#", "*", ";", - ":", "/", "-", "\n", END] + ":", "/", "-", "\n", START, END] MAX_DEPTH = 40 MAX_CYCLES = 100000 regex = re.compile(r"([{}\[\]<>|=&'#*;:/\\\"\-!\n])", flags=re.IGNORECASE) @@ -311,10 +311,30 @@ class Tokenizer(object): self._head += 1 return self._pop() + def _really_parse_external_link(self, brackets): + """Really parse an external link.""" + # link = self._parse(contexts.EXT_LINK_URL) + raise BadRoute() + def _parse_external_link(self, brackets): """Parse an external link at the head of the wikicode string.""" - self._emit_text(self._read()) - # raise NotImplementedError() + reset = self._head + self._head += 1 + try: + bad_context = self._context & contexts.INVALID_LINK + if bad_context or not self._can_recurse(): + raise BadRoute() + link = self._really_parse_external_link(brackets) + except BadRoute: + self._head = reset + if not brackets and self._context & contexts.DL_TERM: + self._handle_dl_term() + else: + self._emit_text(self._read()) + else: + self._emit(tokens.ExternalLinkOpen(brackets)) + self._emit_all(link) + self._emit(tokens.ExternalLinkClose()) def _parse_heading(self): """Parse a section heading at the head of the wikicode string.""" @@ -912,11 +932,10 @@ class Tokenizer(object): self._handle_wikilink_separator() elif this == next == "]" and self._context & contexts.WIKILINK: return self._handle_wikilink_end() - elif this == "[" and not self._context & contexts.INVALID_LINK: ## or this == ":" - if self._can_recurse(): - self._parse_external_link(brackets=this == "[") - else: - self._emit_text("[") + elif this == "[": + self._parse_external_link(True) + elif this == ":" and self._read(-1) not in self.MARKERS: + self._parse_external_link(False) elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() From 5fc36cea7156fd86c848463fd6db2740462665c6 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Wed, 21 Aug 2013 02:48:13 -0400 Subject: [PATCH 07/35] Add is_protocol(). 
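
In the table added below, the boolean records whether the protocol requires
"//" after the colon. So is_protocol(proto) asks "is this valid written with
slashes?" and is_protocol(proto, slashes=False) asks "is it also valid
without them?". Illustrative calls (not part of the patch; the results
follow from the URL_PROTOCOLS values):

    from mwparserfromhell.definitions import is_protocol

    assert is_protocol("http")                     # http://... is valid
    assert not is_protocol("http", slashes=False)  # bare http: is not
    assert is_protocol("mailto", slashes=False)    # mailto:a@b.com works
    assert not is_protocol("fake")                 # unknown protocol
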
---
 mwparserfromhell/definitions.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py
index 2d7ab0c..7352f23 100644
--- a/mwparserfromhell/definitions.py
+++ b/mwparserfromhell/definitions.py
@@ -25,7 +25,17 @@ from __future__ import unicode_literals
 
 __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
-           "is_single_only"]
+           "is_single_only", "is_protocol"]
+
+URL_PROTOCOLS = {
+    # [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943
+    "http": True, "https": True, "ftp": True, "ftps": True, "ssh": True,
+    "sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False,
+    "sips": False, "gopher": True, "telnet": True, "nntp": True,
+    "worldwind": True, "mailto": False, "tel": False, "sms": False,
+    "news": False, "svn": True, "git": True, "mms": True, "bitcoin": False,
+    "magnet": False, "urn": False, "geo": False
+}
 
 PARSER_BLACKLIST = [
     # enwiki extensions @ 2013-06-28
@@ -70,3 +80,9 @@ def is_single(tag):
 def is_single_only(tag):
     """Return whether or not the given *tag* must exist without a close tag."""
     return tag.lower() in SINGLE_ONLY
+
+def is_protocol(protocol, slashes=True):
+    """Return whether *protocol* is valid for external links."""
+    if slashes:
+        return protocol in URL_PROTOCOLS
+    return protocol in URL_PROTOCOLS and not URL_PROTOCOLS[protocol]
From e2d007cb9f09c617e48d1240bb08de6d3e79895a Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 21 Aug 2013 03:14:13 -0400
Subject: [PATCH 08/35] Actually, they're called schemes, not protocols.

---
 mwparserfromhell/definitions.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py
index 7352f23..1cc1eb5 100644
--- a/mwparserfromhell/definitions.py
+++ b/mwparserfromhell/definitions.py
@@ -25,9 +25,9 @@ from __future__ import unicode_literals
 
 __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
-           "is_single_only", "is_protocol"]
+           "is_single_only", "is_scheme"]
 
-URL_PROTOCOLS = {
+URL_SCHEMES = {
     # [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943
     "http": True, "https": True, "ftp": True, "ftps": True, "ssh": True,
     "sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False,
@@ -81,8 +81,8 @@ def is_single_only(tag):
     """Return whether or not the given *tag* must exist without a close tag."""
     return tag.lower() in SINGLE_ONLY
 
-def is_protocol(protocol, slashes=True):
-    """Return whether *protocol* is valid for external links."""
+def is_scheme(scheme, slashes=True):
+    """Return whether *scheme* is valid for external links."""
     if slashes:
-        return protocol in URL_PROTOCOLS
-    return protocol in URL_PROTOCOLS and not URL_PROTOCOLS[protocol]
+        return scheme in URL_SCHEMES
+    return scheme in URL_SCHEMES and not URL_SCHEMES[scheme]
From 223f3fa6588390dca6c3a71d2e93c9be7d8a0fd9 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Wed, 21 Aug 2013 03:25:18 -0400
Subject: [PATCH 09/35] Actually, they're called URI schemes, not URL schemes.
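
Besides the URL -> URI rename, note the new scheme = scheme.lower() line:
scheme matching becomes case-insensitive, so "HTTP://example.com/" is treated
like "http://example.com/". Continuing the illustrative calls from the
previous patch (not part of the diff):

    from mwparserfromhell.definitions import is_scheme

    assert is_scheme("HTTP")                   # case no longer matters
    assert is_scheme("MailTo", slashes=False)
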
--- mwparserfromhell/definitions.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index 1cc1eb5..ef8255e 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -27,7 +27,7 @@ from __future__ import unicode_literals __all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", "is_single_only", "is_scheme"] -URL_SCHEMES = { +URI_SCHEMES = { # [mediawiki/core.git]/includes/DefaultSettings.php @ 374a0ad943 "http": True, "https": True, "ftp": True, "ftps": True, "ssh": True, "sftp": True, "irc": True, "ircs": True, "xmpp": False, "sip": False, @@ -83,6 +83,7 @@ def is_single_only(tag): def is_scheme(scheme, slashes=True): """Return whether *scheme* is valid for external links.""" + scheme = scheme.lower() if slashes: - return scheme in URL_SCHEMES - return scheme in URL_SCHEMES and not URL_SCHEMES[scheme] + return scheme in URI_SCHEMES + return scheme in URI_SCHEMES and not URI_SCHEMES[scheme] From f3025eaafe7178a0aaedca4a70648410037fc9ec Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 00:56:39 -0400 Subject: [PATCH 10/35] Fix some wikilink-related tests. --- tests/tokenizer/wikilinks.mwtest | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/tests/tokenizer/wikilinks.mwtest b/tests/tokenizer/wikilinks.mwtest index 0682ef1..8eb381a 100644 --- a/tests/tokenizer/wikilinks.mwtest +++ b/tests/tokenizer/wikilinks.mwtest @@ -40,17 +40,17 @@ output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar|b --- -name: nested -label: a wikilink nested within the value of another -input: "[[foo|[[bar]]]]" -output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), WikilinkOpen(), Text(text="bar"), WikilinkClose(), WikilinkClose()] +name: newline_text +label: a newline in the middle of the text +input: "[[foo|foo\nbar]]" +output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="foo\nbar"), WikilinkClose()] --- -name: nested_with_text -label: a wikilink nested within the value of another, separated by other data -input: "[[foo|a[[b]]c]]" -output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c"), WikilinkClose()] +name: bracket_text +label: a left bracket in the middle of the text +input: "[[foo|bar[baz]]" +output: [WikilinkOpen(), Text(text="foo"), WikilinkSeparator(), Text(text="bar[baz"), WikilinkClose()] --- @@ -96,13 +96,34 @@ output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), --- -name: invalid_nested_text +name: invalid_nested_padding label: invalid wikilink: trying to nest in the wrong context, with a text param input: "[[foo[[bar]]|baz]]" output: [Text(text="[[foo"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="|baz]]")] --- +name: invalid_nested_text +label: invalid wikilink: a wikilink nested within the value of another +input: "[[foo|[[bar]]" +output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose()] + +--- + +name: invalid_nested_text_2 +label: invalid wikilink: a wikilink nested within the value of another, two pairs of closing brackets +input: "[[foo|[[bar]]]]" +output: [Text(text="[[foo|"), WikilinkOpen(), Text(text="bar"), WikilinkClose(), Text(text="]]")] + +--- + +name: invalid_nested_text_padding +label: invalid wikilink: a wikilink nested within the value of another, separated by other data 
+input: "[[foo|a[[b]]c]]" +output: [Text(text="[[foo|a"), WikilinkOpen(), Text(text="b"), WikilinkClose(), Text(text="c]]")] + +--- + name: incomplete_open_only label: incomplete wikilinks: just an open input: "[[" From d42e05a554076d43dd53568bf383ec3e265c2fe2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 00:57:34 -0400 Subject: [PATCH 11/35] Implement improved wikilink handling. --- mwparserfromhell/parser/contexts.py | 12 ++++++------ mwparserfromhell/parser/tokenizer.c | 13 ++++++------- mwparserfromhell/parser/tokenizer.h | 4 ++-- mwparserfromhell/parser/tokenizer.py | 10 ++++++++-- 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index c6d2941..0d25400 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -53,7 +53,7 @@ Local (stack-specific) contexts: * :py:const:`EXT_LINK` - * :py:const:`EXT_LINK_URL` + * :py:const:`EXT_LINK_URI` * :py:const:`EXT_LINK_TITLE` * :py:const:`EXT_LINK_BRACKETS` @@ -119,10 +119,10 @@ WIKILINK_TITLE = 1 << 5 WIKILINK_TEXT = 1 << 6 WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT -EXT_LINK_URL = 1 << 7 +EXT_LINK_URI = 1 << 7 EXT_LINK_TITLE = 1 << 8 EXT_LINK_BRACKETS = 1 << 9 -EXT_LINK = EXT_LINK_URL + EXT_LINK_TITLE + EXT_LINK_BRACKETS +EXT_LINK = EXT_LINK_URI + EXT_LINK_TITLE + EXT_LINK_BRACKETS HEADING_LEVEL_1 = 1 << 10 HEADING_LEVEL_2 = 1 << 11 @@ -163,7 +163,7 @@ GL_HEADING = 1 << 0 # Aggregate contexts: FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK + HEADING + TAG + STYLE -UNSAFE = (TEMPLATE_NAME + WIKILINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + - TAG_CLOSE) +UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE -INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK_TITLE + EXT_LINK_URL +INVALID_LINK = TEMPLATE_NAME + ARGUMENT_NAME + WIKILINK + EXT_LINK diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 2b74f6b..46df405 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2050,18 +2050,17 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) if (context & LC_FAIL_NEXT) { return -1; } - if (context & LC_WIKILINK_TITLE) { - if (data == *"]" || data == *"{") + if (context & LC_WIKILINK) { + if (context & LC_WIKILINK_TEXT) + return (data == *"[" && Tokenizer_READ(self, 1) == *"[") ? -1 : 0; + else if (data == *"]" || data == *"{") self->topstack->context |= LC_FAIL_NEXT; else if (data == *"\n" || data == *"[" || data == *"}") return -1; return 0; } - if (context & LC_TAG_CLOSE) { - if (data == *"<") - return -1; - return 0; - } + if (context & LC_TAG_CLOSE) + return (data == *"<") ? 
-1 : 0; if (context & LC_TEMPLATE_NAME) { if (data == *"{" || data == *"}" || data == *"[") { self->topstack->context |= LC_FAIL_NEXT; diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 41c1c1b..5961dcc 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -163,9 +163,9 @@ static PyObject* TagCloseClose; /* Aggregate contexts: */ #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) -#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) +#define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) -#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK_TITLE | LC_EXT_LINK_URL) +#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URL) /* Tag contexts: */ diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 07ae0b1..84de78e 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -835,12 +835,16 @@ class Tokenizer(object): context = self._context if context & contexts.FAIL_NEXT: return False - if context & contexts.WIKILINK_TITLE: - if this == "]" or this == "{": + if context & contexts.WIKILINK: + if context & contexts.WIKILINK_TEXT: + return not (this == self._read(1) == "[") + elif this == "]" or this == "{": self._context |= contexts.FAIL_NEXT elif this == "\n" or this == "[" or this == "}": return False return True + elif context & contexts.EXT_LINK_TITLE: + return this != "\n" elif context & contexts.TEMPLATE_NAME: if this == "{" or this == "}" or this == "[": self._context |= contexts.FAIL_NEXT @@ -936,6 +940,8 @@ class Tokenizer(object): self._parse_external_link(True) elif this == ":" and self._read(-1) not in self.MARKERS: self._parse_external_link(False) + elif this == "]" and self._context & contexts.EXT_LINK_TITLE: + return self._pop() elif this == "=" and not self._global & contexts.GL_HEADING: if self._read(-1) in ("\n", self.START): self._parse_heading() From da272ae10a78c8bd2be633aefab1b827c411d554 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 00:59:46 -0400 Subject: [PATCH 12/35] Start implementation of external links in Python. --- mwparserfromhell/parser/tokenizer.py | 100 +++++++++++++++++++++++++++++++++-- 1 file changed, 95 insertions(+), 5 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 84de78e..9acf32d 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,7 +26,8 @@ import re from . 
import contexts, tokens from ..compat import htmlentities -from ..definitions import get_html_tag, is_parsable, is_single, is_single_only +from ..definitions import (get_html_tag, is_parsable, is_single, + is_single_only, is_scheme) __all__ = ["Tokenizer"] @@ -313,8 +314,95 @@ class Tokenizer(object): def _really_parse_external_link(self, brackets): """Really parse an external link.""" - # link = self._parse(contexts.EXT_LINK_URL) - raise BadRoute() + scheme_valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + if brackets: + self._push(contexts.EXT_LINK_URI) + if self._read() == self._read(1) == "/": + self._emit_text("//") + self._head += 2 + else: + scheme = "" + while all(char in scheme_valid for char in self._read()): + scheme += self._read() + self._emit_text(self._read()) + self._head += 1 + if self._read() != ":": + self._fail_route() + self._emit_text(":") + self._head += 1 + slashes = self._read() == self._read(1) == "/" + if slashes: + self._emit_text("//") + self._head += 2 + if not is_scheme(scheme, slashes): + self._fail_route() + else: + scheme = [] + try: + # Ugly, but we have to backtrack through the textbuffer looking + # for our scheme since it was just parsed as text: + for i in range(-1, -len(self._textbuffer) - 1, -1): + for char in reversed(self._textbuffer[i]): + if char.isspace() or char in self.MARKERS: + raise StopIteration() + if char not in scheme_valid: + raise BadRoute() + scheme.append(char) + except StopIteration: + pass + scheme = "".join(reversed(scheme)) + slashes = self._read() == self._read(1) == "/" + if not is_scheme(scheme, slashes): + raise BadRoute() + # Remove the scheme from the textbuffer, now that it's part of the + # external link: + length = len(scheme) + while length: + if length < len(self._textbuffer[-1]): + self._textbuffer[-1] = self._textbuffer[-1][:-length] + break + length -= len(self._textbuffer[-1]) + self._textbuffer.pop() + self._push(contexts.EXT_LINK_URI) + self._emit_text(scheme) + self._emit_text(":") + if slashes: + self._emit_text("//") + self._head += 2 + parentheses = False + + while True: + this, next = self._read(), self._read(1) + if this is self.END or this == "\n": + if brackets: + self._fail_route() + self._head -= 1 + return self._pop(), None + elif this == next == "{" and self._can_recurse(): + self._parse_template_or_argument() + elif this == "&": + self._parse_entity() + elif this == "]": + if not brackets: + self._head -= 1 + return self._pop(), None + elif this == "(" and not brackets and not parentheses: + parentheses = True + self._emit_text(this) + elif " " in this: ## Should be a more general whitespace check + before, after = this.split(" ", 1) + self._emit_text(before) + if brackets: + self._emit(tokens.ExternalLinkSeparator()) + self._emit_text(after) + self._context ^= contexts.EXT_LINK_URI + self._context |= contexts.EXT_LINK_TITLE + self._head += 1 + return self._parse(push=False), None + return self._pop(), " " + after + else: + self._emit_text(this) + self._head += 1 def _parse_external_link(self, brackets): """Parse an external link at the head of the wikicode string.""" @@ -324,7 +412,7 @@ class Tokenizer(object): bad_context = self._context & contexts.INVALID_LINK if bad_context or not self._can_recurse(): raise BadRoute() - link = self._really_parse_external_link(brackets) + link, extra = self._really_parse_external_link(brackets) except BadRoute: self._head = reset if not brackets and self._context & contexts.DL_TERM: @@ -332,9 +420,11 @@ class Tokenizer(object): else: 
self._emit_text(self._read()) else: - self._emit(tokens.ExternalLinkOpen(brackets)) + self._emit(tokens.ExternalLinkOpen(brackets=brackets)) self._emit_all(link) self._emit(tokens.ExternalLinkClose()) + if extra: + self._emit_text(extra) def _parse_heading(self): """Parse a section heading at the head of the wikicode string.""" From 0ecf2e42310bc36fbf220883f95836d4fe96bc7a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 01:02:27 -0400 Subject: [PATCH 13/35] Add a couple integration tests for ext links vs.
 <dl>.
---
 tests/tokenizer/integration.mwtest | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/tokenizer/integration.mwtest b/tests/tokenizer/integration.mwtest
index e4ff8c4..083b12c 100644
--- a/tests/tokenizer/integration.mwtest
+++ b/tests/tokenizer/integration.mwtest
@@ -58,3 +58,17 @@ name: wildcard_redux
 label: an even wilder assortment of various things
 input: "{{a|b|{{c|[[d]]{{{e}}}}}}}[[f|{{{g}}}<!--h-->]]{{i|j=&nbsp;}}"
 output: [TemplateOpen(), Text(text="a"), TemplateParamSeparator(), Text(text="b"), TemplateParamSeparator(), TemplateOpen(), Text(text="c"), TemplateParamSeparator(), WikilinkOpen(), Text(text="d"), WikilinkClose(), ArgumentOpen(), Text(text="e"), ArgumentClose(), TemplateClose(), TemplateClose(), WikilinkOpen(), Text(text="f"), WikilinkSeparator(), ArgumentOpen(), Text(text="g"), ArgumentClose(), CommentStart(), Text(text="h"), CommentEnd(), WikilinkClose(), TemplateOpen(), Text(text="i"), TemplateParamSeparator(), Text(text="j"), TemplateParamEquals(), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), TemplateClose()]
+
+---
+
+name: link_inside_dl
+label: an external link inside a def list, such that the external link is parsed
+input: ";;;mailto:example"
+output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example"), ExternalLinkClose()]
+
+---
+
+name: link_inside_dl_2
+label: an external link inside a def list, such that the external link is not parsed
+input: ";;;malito:example"
+output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="malito"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="example")]
From 93c51fe57c1711c674c41ea0799be5193ff3bf21 Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Thu, 22 Aug 2013 01:58:27 -0400
Subject: [PATCH 14/35] Tokenizer tests for external links.
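
Each case feeds *input* to the tokenizer and compares the emitted tokens
against *output*. A rough equivalent of what the test harness
(tests/_test_tokenizer.py) does for a single case, runnable by hand against
the brackets_title test below:

    from mwparserfromhell.parser.tokenizer import Tokenizer
    from mwparserfromhell.parser import tokens

    actual = Tokenizer().tokenize("[http://example.com/ Example]")
    expected = [tokens.ExternalLinkOpen(brackets=True),
                tokens.Text(text="http://example.com/"),
                tokens.ExternalLinkSeparator(),
                tokens.Text(text="Example"),
                tokens.ExternalLinkClose()]
    assert actual == expected
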
--- tests/tokenizer/external_links.mwtest | 459 ++++++++++++++++++++++++++++++++++ 1 file changed, 459 insertions(+) create mode 100644 tests/tokenizer/external_links.mwtest diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest new file mode 100644 index 0000000..9e0ebdd --- /dev/null +++ b/tests/tokenizer/external_links.mwtest @@ -0,0 +1,459 @@ +name: basic +label: basic external link +input: "http://example.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose()] + +--- + +name: basic_brackets +label: basic external link in brackets +input: "[http://example.com/]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose()] + +--- + +name: brackets_space +label: basic external link in brackets, with a space after +input: "[http://example.com/ ]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text=""), ExternalLinkClose()] + +--- + +name: brackets_title +label: basic external link in brackets, with a title +input: "[http://example.com/ Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: brackets_multiword_title +label: basic external link in brackets, with a multi-word title +input: "[http://example.com/ Example Web Page]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text="Example Web Page"), ExternalLinkClose()] + +--- + +name: brackets_adjacent +label: three adjacent bracket-enclosed external links +input: "[http://foo.com/ Foo][http://bar.com/ Bar]\n[http://baz.com/ Baz]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://foo.com/"), ExternalLinkSeparator(), Text(text="Foo"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://bar.com/"), ExternalLinkSeparator(), Text(text="Bar"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=True), Text(text="http://baz.com/"), ExternalLinkSeparator(), Text(text="Baz"), ExternalLinkClose()] + +--- + +name: brackets_newline_before +label: bracket-enclosed link with a newline before the title +input: "[http://example.com/ \nExample]" +output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" \nExample]")] + +--- + +name: brackets_newline_inside +label: bracket-enclosed link with a newline in the title +input: "[http://example.com/ Example \nWeb Page]" +output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example \nWeb Page]")] + +--- + +name: brackets_newline_after +label: bracket-enclosed link with a newline after the title +input: "[http://example.com/ Example\n]" +output: [Text(text="["), ExternalLinkOpen(brackets=False), Text(text="http://example.com/"), ExternalLinkClose(), Text(text=" Example\n]")] + +--- + +name: brackets_space_before +label: bracket-enclosed link with a space before the URL +input: "[ http://example.com Example]" +output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" Example]")] + +--- + +name: brackets_title_like_url +label: bracket-enclosed link with a title that looks like a URL +input: "[http://example.com http://example.com]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), 
ExternalLinkSeparator(), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: brackets_recursive +label: bracket-enclosed link with a bracket-enclosed link as the title +input: "[http://example.com [http://example.com]]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com"), ExternalLinkSeparator(), Text(text="[http://example.com"), ExternalLinkClose(), Text(text="]")] + +--- + +name: period_after +label: a period after a free link that is excluded +input: "http://example.com." +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=".")] + +--- + +name: colons_after +label: colons after a free link that are excluded +input: "http://example.com/foo:bar:::baz:::" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo:bar:::baz"), ExternalLinkClose(), Text(text=":::")] + +--- + +name: close_paren_after_excluded +label: a closing parenthesis after a free link that is excluded +input: "http://example.)com)" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.)com"), ExternalLinkClose(), Text(text=")")] + +--- + +name: close_paren_after_included +label: a closing parenthesis after a free link that is included because of an opening parenthesis in the URL +input: "http://example.(com)" +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.(com)"), ExternalLinkClose()] + +--- + +name: open_bracket_inside +label: an open bracket inside a free link that causes it to be ended abruptly +input: "http://foobar[baz.com" +output: [ExternalLinkOpen(brackets=False), Text(text="http://foobar"), ExternalLinkClose(), Text(text="[baz.com")] + +--- + +name: brackets_period_after +label: a period after a bracket-enclosed link that is included +input: "[http://example.com. 
Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com."), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: brackets_colons_after
+label: colons after a bracket-enclosed link that are included
+input: "[http://example.com/foo:bar:::baz::: Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo:bar:::baz:::"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: brackets_close_paren_after_included
+label: a closing parenthesis after a bracket-enclosed link that is included
+input: "[http://example.)com) Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.)com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: brackets_close_paren_after_included_2
+label: a closing parenthesis after a bracket-enclosed link that is also included
+input: "[http://example.(com) Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()]
+
+---
+
+name: brackets_open_bracket_inside
+label: an open bracket inside a bracket-enclosed link that causes it to switch to the title context abruptly
+input: "[http://foobar[baz.com Example]"
+output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(), Text(text="[baz.com Example"), ExternalLinkClose()]
+
+---
+
+name: adjacent_space
+label: two free links separated by a space
+input: "http://example.com http://example.com"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()]
+
+---
+
+name: adjacent_newline
+label: two free links separated by a newline
+input: "http://example.com\nhttp://example.com"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="\n"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()]
+
+---
+
+name: adjacent_close_bracket
+label: two free links separated by a close bracket
+input: "http://example.com]http://example.com"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text="]"), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()]
+
+---
+
+name: html_entity_in_url
+label: an HTML entity parsed correctly inside a free link
+input: "http://exa&nbsp;mple.com/"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="mple.com/"), ExternalLinkClose()]
+
+---
+
+name: template_in_url
+label: a template parsed correctly inside a free link
+input: "http://exa{{template}}mple.com/"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), TemplateOpen(), Text(text="template"), TemplateClose(), Text(text="mple.com/"), ExternalLinkClose()]
+
+---
+
+name: argument_in_url
+label: an argument parsed correctly inside a free link
+input: "http://exa{{{argument}}}mple.com/"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ArgumentOpen(), Text(text="argument"), ArgumentClose(), Text(text="mple.com/"), ExternalLinkClose()]
+
+---
+
+name: wikilink_in_url
+label: a wikilink that destroys a free link
+input: "http://exa[[wikilink]]mple.com/"
+output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), 
ExternalLinkClose(), WikilinkOpen(), Text(text="wikilink"), WikilinkClose(), Text(text="mple.com/")] + +--- + +name: external_link_in_url +label: a bracketed link that destroys a free link +input: "http://exa[http://example.com/]mple.com/" +output: [ExternalLinkOpen(brackets=False), Text(text="http://exa"), ExternalLinkClose(), ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkClose(), Text(text="mple.com/")] + +--- + +name: spaces_padding +label: spaces padding a free link +input: " http://example.com " +output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" ")] + +--- + +name: text_and_spaces_padding +label: text and spaces padding a free link +input: "x http://example.com x" +output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose(), Text(text=" x")] + +--- + +name: template_before +label: a template before a free link +input: "{{foo}}http://example.com" +output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="http://example.com"), ExternalLinkClose()] + +--- + +name: spaces_padding_no_slashes +label: spaces padding a free link with no slashes after the colon +input: " mailto:example@example.com " +output: [Text(text=" "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" ")] + +--- + +name: text_and_spaces_padding_no_slashes +label: text and spaces padding a free link with no slashes after the colon +input: "x mailto:example@example.com x" +output: [Text(text="x "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" x")] + +--- + +name: template_before_no_slashes +label: a template before a free link with no slashes after the colon +input: "{{foo}}mailto:example@example.com" +output: [TemplateOpen(), Text(text="foo"), TemplateClose(), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()] + +--- + +name: no_slashes +label: a free link with no slashes after the colon +input: "mailto:example@example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose()] + +--- + +name: slashes_optional +label: a free link using a scheme that doesn't need slashes, but has them anyway +input: "mailto://example@example.com" +output: [ExternalLinkOpen(brackets=False), Text(text="mailto://example@example.com"), ExternalLinkClose()] + +--- + +name: short +label: a very short free link +input: "mailto://abc" +output: [ExternalLinkOpen(brackets=False), Text(text="mailto://abc"), ExternalLinkClose()] + +--- + +name: slashes_missing +label: slashes missing from a free link with a scheme that requires them +input: "http:example@example.com" +output: [Text(text="http:example@example.com")] + +--- + +name: no_scheme_but_slashes +label: no scheme in a free link, but slashes (protocol-relative free links are not supported) +input: "//example.com" +output: [Text(text="//example.com")] + +--- + +name: no_scheme_but_colon +label: no scheme in a free link, but a colon +input: ":example.com" +output: [Text(text=":example.com")] + +--- + +name: no_scheme_but_colon_and_slashes +label: no scheme in a free link, but a colon and slashes +input: "://example.com" +output: [Text(text="://example.com")] + +--- + +name: fake_scheme_no_slashes +label: a nonexistent scheme in a free link, without slashes +input: 
"fake:example.com" +output: [Text(text="fake:example.com")] + +--- + +name: fake_scheme_slashes +label: a nonexistent scheme in a free link, with slashes +input: "fake://example.com" +output: [Text(text="fake://example.com")] + +--- + +name: fake_scheme_brackets_no_slashes +label: a nonexistent scheme in a bracketed link, without slashes +input: "[fake:example.com]" +output: [Text(text="[fake:example.com]")] + +--- + +name: fake_scheme_brackets_slashes +label: #=a nonexistent scheme in a bracketed link, with slashes +input: "[fake://example.com]" +output: [Text(text="[fake://example.com]")] + +--- + +name: interrupted_scheme +label: an otherwise valid scheme with something in the middle of it, in a free link +input: "ht?tp://example.com" +output: [Text(text="ht?tp://example.com")] + +--- + +name: interrupted_scheme_brackets +label: an otherwise valid scheme with something in the middle of it, in a bracketed link +input: "[ht?tp://example.com]" +output: [Text(text="[ht?tp://example.com]")] + +--- + +name: no_slashes_brackets +label: no slashes after the colon in a bracketed link +input: "[mailto:example@example.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="mailto:example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: space_before_no_slashes_brackets +label: a space before a bracketed link with no slashes after the colon +input: "[ mailto:example@example.com Example]" +output: [Text(text="[ "), ExternalLinkOpen(brackets=False), Text(text="mailto:example@example.com"), ExternalLinkClose(), Text(text=" Example]")] + +--- + +name: slashes_optional_brackets +label: a bracketed link using a scheme that doesn't need slashes, but has them anyway +input: "[mailto://example@example.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="mailto://example@example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: short_brackets +label: a very short link in brackets +input: "[mailto://abc Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="mailto://abc"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: slashes_missing_brackets +label: slashes missing from a scheme that requires them in a bracketed link +input: "[http:example@example.com Example]" +output: [Text(text="[http:example@example.com Example]")] + +--- + +name: protcol_relative +label: a protocol-relative link (in brackets) +input: "[//example.com Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="//example.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] + +--- + +name: scheme_missing_but_colon_brackets +label: scheme missing from a bracketed link, but with a colon +input: "[:example.com Example]" +output: [Text(text="[:example.com Example]")] + +--- + +name: scheme_missing_but_colon_slashes_brackets +label: scheme missing from a bracketed link, but with a colon and slashes +input: "[://example.com Example]" +output: [Text(text="[://example.com Example]")] + +--- + +name: unclosed_protocol_relative +label: an unclosed protocol-relative bracketed link +input: "[//example.com" +output: [Text(text="[//example.com")] + +--- + +name: space_before_protcol_relative +label: a space before a protocol-relative bracketed link +input: "[ //example.com]" +output: [Text(text="[ //example.com]")] + +--- + +name: unclosed_just_scheme +label: an unclosed bracketed link, ending after the scheme +input: "[http" +output: [Text(text="[http")] + +--- + 
+name: unclosed_scheme_colon +label: an unclosed bracketed link, ending after the colon +input: "[http:" +output: [Text(text="[http:")] + +--- + +name: unclosed_scheme_colon_slashes +label: an unclosed bracketed link, ending after the slashes +input: "[http://" +output: [Text(text="[http://")] + +--- + +name: incomplete_scheme_colon +label: a free link with just a scheme and a colon +input: "http:" +output: [Text(text="http:")] + +--- + +name: incomplete_scheme_colon_slashes +label: a free link with just a scheme, colon, and slashes +input: "http://" +output: [Text(text="http://")] + +--- + +name: brackets_scheme_but_no_url +label: brackets around a scheme, colon, and slashes +input: "[http://]" +output: [Text(text="[http://]")] + +--- + +name: brackets_scheme_title_but_no_url +label: brackets around a scheme, colon, and slashes, with a title +input: "[http:// Example]" +output: [Text(text="[http:// Example]")] From 176290d75a22ee44c1d81e9eb688025d4e1f808e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 02:01:47 -0400 Subject: [PATCH 15/35] Add a couple more tests. --- tests/tokenizer/external_links.mwtest | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index 9e0ebdd..7e1b7a5 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -432,6 +432,13 @@ output: [Text(text="[http://")] --- +name: incomplete_bracket +label: just an open bracket +input: "[" +output: [Text(text="[")] + +--- + name: incomplete_scheme_colon label: a free link with just a scheme and a colon input: "http:" @@ -447,6 +454,13 @@ output: [Text(text="http://")] --- name: brackets_scheme_but_no_url +label: brackets around a scheme and a colon +input: "[mailto:]" +output: [Text(text="[mailto:]")] + +--- + +name: brackets_scheme_slashes_but_no_url label: brackets around a scheme, colon, and slashes input: "[http://]" output: [Text(text="[http://]")] From 44ee185377df12f525c91c6712d6c7cf43c86936 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 02:21:06 -0400 Subject: [PATCH 16/35] Fix some tests involving colons starting lines. --- tests/tokenizer/external_links.mwtest | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index 7e1b7a5..ee221e0 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -287,15 +287,15 @@ output: [Text(text="//example.com")] name: no_scheme_but_colon label: no scheme in a free link, but a colon -input: ":example.com" -output: [Text(text=":example.com")] +input: " :example.com" +output: [Text(text=" :example.com")] --- name: no_scheme_but_colon_and_slashes label: no scheme in a free link, but a colon and slashes -input: "://example.com" -output: [Text(text="://example.com")] +input: " ://example.com" +output: [Text(text=" ://example.com")] --- From 432198547af4077687606abd3a21e5458fea6530 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 02:21:19 -0400 Subject: [PATCH 17/35] Fix some external links; refactor into different methods. 
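
The tricky half of this refactor is the free-link case: by the time the ":"
is reached, the scheme has already been emitted as plain text, so the
tokenizer must backtrack through the textbuffer to recover it. In isolation
the idea looks roughly like this (a simplified Python sketch of the
backtracking only; validating the result against definitions.is_scheme()
is a separate step):

    VALID = "abcdefghijklmnopqrstuvwxyz0123456789+.-"

    def recover_scheme(textbuffer):
        # Walk the emitted text chunks backwards until hitting whitespace
        # (the start of the candidate link) or a non-scheme character.
        scheme = []
        for chunk in reversed(textbuffer):
            for char in reversed(chunk):
                if char.isspace():
                    return "".join(reversed(scheme))
                if char not in VALID:
                    return None
                scheme.append(char)
        return "".join(reversed(scheme))

    recover_scheme(["foo ", "mai", "lto"])  # -> "mailto"
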
--- mwparserfromhell/parser/tokenizer.py | 121 ++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 53 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9acf32d..29bec56 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -312,65 +312,67 @@ class Tokenizer(object): self._head += 1 return self._pop() - def _really_parse_external_link(self, brackets): - """Really parse an external link.""" - scheme_valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" - if brackets: - self._push(contexts.EXT_LINK_URI) - if self._read() == self._read(1) == "/": - self._emit_text("//") - self._head += 2 - else: - scheme = "" - while all(char in scheme_valid for char in self._read()): - scheme += self._read() - self._emit_text(self._read()) - self._head += 1 - if self._read() != ":": - self._fail_route() - self._emit_text(":") - self._head += 1 - slashes = self._read() == self._read(1) == "/" - if slashes: - self._emit_text("//") - self._head += 2 - if not is_scheme(scheme, slashes): - self._fail_route() + def _parse_bracketed_uri_scheme(self): + """Parse the URI scheme of a bracket-enclosed external link.""" + self._push(contexts.EXT_LINK_URI) + if self._read() == self._read(1) == "/": + self._emit_text("//") + self._head += 2 else: - scheme = [] - try: - # Ugly, but we have to backtrack through the textbuffer looking - # for our scheme since it was just parsed as text: - for i in range(-1, -len(self._textbuffer) - 1, -1): - for char in reversed(self._textbuffer[i]): - if char.isspace() or char in self.MARKERS: - raise StopIteration() - if char not in scheme_valid: - raise BadRoute() - scheme.append(char) - except StopIteration: - pass - scheme = "".join(reversed(scheme)) - slashes = self._read() == self._read(1) == "/" - if not is_scheme(scheme, slashes): - raise BadRoute() - # Remove the scheme from the textbuffer, now that it's part of the - # external link: - length = len(scheme) - while length: - if length < len(self._textbuffer[-1]): - self._textbuffer[-1] = self._textbuffer[-1][:-length] - break - length -= len(self._textbuffer[-1]) - self._textbuffer.pop() - self._push(contexts.EXT_LINK_URI) - self._emit_text(scheme) + valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + all_valid = lambda: all(char in valid for char in self._read()) + scheme = "" + while self._read() is not self.END and all_valid(): + scheme += self._read() + self._emit_text(self._read()) + self._head += 1 + if self._read() != ":": + self._fail_route() self._emit_text(":") + self._head += 1 + slashes = self._read() == self._read(1) == "/" if slashes: self._emit_text("//") self._head += 2 - parentheses = False + if not is_scheme(scheme, slashes): + self._fail_route() + + def _parse_free_uri_scheme(self): + """Parse the URI scheme of a free (no brackets) external link.""" + valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" + scheme = [] + try: + # Ugly, but we have to backtrack through the textbuffer looking for + # our scheme since it was just parsed as text: + for i in range(-1, -len(self._textbuffer) - 1, -1): + for char in reversed(self._textbuffer[i]): + if char.isspace() or char in self.MARKERS: + raise StopIteration() + if char not in valid: + raise BadRoute() + scheme.append(char) + except StopIteration: + pass + scheme = "".join(reversed(scheme)) + slashes = self._read() == self._read(1) == "/" + if not is_scheme(scheme, slashes): + raise BadRoute() + parentheses = False + self._push(contexts.EXT_LINK_URI) + 
self._emit_text(scheme) + self._emit_text(":") + if slashes: + self._emit_text("//") + self._head += 2 + def _really_parse_external_link(self, brackets): + """Really parse an external link.""" + if brackets: + self._parse_bracketed_uri_scheme() + else: + self._parse_free_uri_scheme() + if self._read() in self.MARKERS or self._read()[0].isspace(): ## Should actually check for valid chars + self._fail_route() while True: this, next = self._read(), self._read(1) if this is self.END or this == "\n": @@ -404,6 +406,16 @@ class Tokenizer(object): self._emit_text(this) self._head += 1 + def _remove_uri_scheme_from_textbuffer(self, scheme): + """Remove the URI scheme of a new external link from the textbuffer.""" + length = len(scheme) + while length: + if length < len(self._textbuffer[-1]): + self._textbuffer[-1] = self._textbuffer[-1][:-length] + break + length -= len(self._textbuffer[-1]) + self._textbuffer.pop() + def _parse_external_link(self, brackets): """Parse an external link at the head of the wikicode string.""" reset = self._head @@ -420,6 +432,9 @@ class Tokenizer(object): else: self._emit_text(self._read()) else: + if not brackets: + scheme = link[0].text.split(":", 1)[0] + self._remove_uri_scheme_from_textbuffer(scheme) self._emit(tokens.ExternalLinkOpen(brackets=brackets)) self._emit_all(link) self._emit(tokens.ExternalLinkClose()) From be505465c06595ff9c7592aeb729d2b69ee952a7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 02:34:55 -0400 Subject: [PATCH 18/35] Alter a test that imitates strange (incorrect?) MediaWiki behavior. --- tests/tokenizer/external_links.mwtest | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index ee221e0..b517ada 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -146,9 +146,9 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.(com)"), Ext --- name: brackets_open_bracket_inside -label: an open bracket inside a bracket-enclosed link that causes it to switch to the title context abruptly +label: an open bracket inside a bracket-enclosed link that is also included input: "[http://foobar[baz.com Example]" -output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar"), ExternalLinkSeparator(), Text(text="[baz.com Example"), ExternalLinkClose()] +output: [ExternalLinkOpen(brackets=True), Text(text="http://foobar[baz.com"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] --- From bd10aab823562f349f433ef80525aee134c5e317 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 05:05:30 -0400 Subject: [PATCH 19/35] Finish external links. 
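
The new piece here is trailing-punctuation handling for free links:
characters such as "." and ")" are excluded from the end of a URL, except
that ")" stays valid when the URL contains a "(". Stripped of tokenizer
state, the rule is roughly this (a simplified sketch; the real punctuation
list also treats a backslash as punctuation):

    PUNCT = set(",;.:!?)")

    def split_trailing_punct(url):
        # Split a candidate free link into (kept, excluded_tail).
        punct = PUNCT - {")"} if "(" in url else PUNCT
        i = len(url)
        while i > 0 and url[i - 1] in punct:
            i -= 1
        return url[:i], url[i:]

    split_trailing_punct("http://example.com.")   # ('http://example.com', '.')
    split_trailing_punct("http://example.(com)")  # ('http://example.(com)', '')
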
--- mwparserfromhell/parser/tokenizer.py | 64 +++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 16 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 29bec56..e9768fa 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -357,7 +357,6 @@ class Tokenizer(object): slashes = self._read() == self._read(1) == "/" if not is_scheme(scheme, slashes): raise BadRoute() - parentheses = False self._push(contexts.EXT_LINK_URI) self._emit_text(scheme) self._emit_text(":") @@ -365,43 +364,75 @@ class Tokenizer(object): self._emit_text("//") self._head += 2 + def _handle_free_link_text(self, punct, tail, this): + """Handle text in a free ext link, including trailing punctuation.""" + if "(" in this and ")" in punct: + punct = punct[:-1] # ')' is not longer valid punctuation + if this.endswith(punct): + for i in range(-1, -len(this) - 1, -1): + if i == -len(this) or this[i - 1] not in punct: + break + stripped = this[:i] + if stripped and tail: + self._emit_text(tail) + tail = "" + tail += this[i:] + this = stripped + elif tail: + self._emit_text(tail) + tail = "" + self._emit_text(this) + return punct, tail + def _really_parse_external_link(self, brackets): """Really parse an external link.""" if brackets: self._parse_bracketed_uri_scheme() + invalid = ("\n", " ", "]") else: self._parse_free_uri_scheme() - if self._read() in self.MARKERS or self._read()[0].isspace(): ## Should actually check for valid chars + invalid = ("\n", " ", "[", "]") + punct = tuple(",;\.:!?)") + if self._read() is self.END or self._read()[0] in invalid: self._fail_route() + tail = "" while True: this, next = self._read(), self._read(1) if this is self.END or this == "\n": if brackets: self._fail_route() - self._head -= 1 - return self._pop(), None + return self._pop(), tail, -1 elif this == next == "{" and self._can_recurse(): + if not brackets and tail: + self._emit_text(tail) + tail = "" self._parse_template_or_argument() + elif this == "[": + if brackets: + self._emit_text("[") + else: + return self._pop(), tail, -1 + elif this == "]": + return self._pop(), tail, 0 if brackets else -1 elif this == "&": + if not brackets and tail: + self._emit_text(tail) + tail = "" self._parse_entity() - elif this == "]": - if not brackets: - self._head -= 1 - return self._pop(), None - elif this == "(" and not brackets and not parentheses: - parentheses = True - self._emit_text(this) - elif " " in this: ## Should be a more general whitespace check + elif " " in this: before, after = this.split(" ", 1) - self._emit_text(before) if brackets: + self._emit_text(before) self._emit(tokens.ExternalLinkSeparator()) self._emit_text(after) self._context ^= contexts.EXT_LINK_URI self._context |= contexts.EXT_LINK_TITLE self._head += 1 - return self._parse(push=False), None - return self._pop(), " " + after + return self._parse(push=False), None, 0 + punct, tail = self._handle_free_link_text(punct, tail, before) + return self._pop(), tail + " " + after, 0 + elif not brackets: + punct, tail = self._handle_free_link_text(punct, tail, this) else: self._emit_text(this) self._head += 1 @@ -424,7 +455,7 @@ class Tokenizer(object): bad_context = self._context & contexts.INVALID_LINK if bad_context or not self._can_recurse(): raise BadRoute() - link, extra = self._really_parse_external_link(brackets) + link, extra, delta = self._really_parse_external_link(brackets) except BadRoute: self._head = reset if not brackets and self._context & 
contexts.DL_TERM: @@ -438,6 +469,7 @@ class Tokenizer(object): self._emit(tokens.ExternalLinkOpen(brackets=brackets)) self._emit_all(link) self._emit(tokens.ExternalLinkClose()) + self._head += delta if extra: self._emit_text(extra) From 6eb72c56eb3219009f4f06b7a63f0b31d971dd9e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 14:34:45 -0400 Subject: [PATCH 20/35] Some cleanup. --- mwparserfromhell/parser/tokenizer.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index e9768fa..2340077 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -342,10 +342,10 @@ class Tokenizer(object): valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-" scheme = [] try: - # Ugly, but we have to backtrack through the textbuffer looking for - # our scheme since it was just parsed as text: - for i in range(-1, -len(self._textbuffer) - 1, -1): - for char in reversed(self._textbuffer[i]): + # We have to backtrack through the textbuffer looking for our + # scheme since it was just parsed as text: + for chunk in reversed(self._textbuffer): + for char in reversed(chunk): if char.isspace() or char in self.MARKERS: raise StopIteration() if char not in valid: @@ -369,7 +369,7 @@ class Tokenizer(object): if "(" in this and ")" in punct: punct = punct[:-1] # ')' is not longer valid punctuation if this.endswith(punct): - for i in range(-1, -len(this) - 1, -1): + for i in reversed(range(-len(this), 0)): if i == -len(this) or this[i - 1] not in punct: break stripped = this[:i] @@ -403,7 +403,7 @@ class Tokenizer(object): self._fail_route() return self._pop(), tail, -1 elif this == next == "{" and self._can_recurse(): - if not brackets and tail: + if tail: self._emit_text(tail) tail = "" self._parse_template_or_argument() @@ -415,7 +415,7 @@ class Tokenizer(object): elif this == "]": return self._pop(), tail, 0 if brackets else -1 elif this == "&": - if not brackets and tail: + if tail: self._emit_text(tail) tail = "" self._parse_entity() From 7b84b3f0df31e634bc9390dae2f3539c3dc04d3c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 15:01:59 -0400 Subject: [PATCH 21/35] Refactor out C's is_marker(); hooks for ext links. --- mwparserfromhell/parser/tokenizer.c | 77 ++++++++++++++++++------------------- mwparserfromhell/parser/tokenizer.h | 3 +- 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 46df405..8a2d9f9 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -24,6 +24,20 @@ SOFTWARE. #include "tokenizer.h" /* + Determine whether the given Py_UNICODE is a marker. +*/ +static int is_marker(Py_UNICODE this) +{ + int i; + + for (i = 0; i < NUM_MARKERS; i++) { + if (*MARKERS[i] == this) + return 1; + } + return 0; +} + +/* Given a context, return the heading level encoded within it. */ static int heading_level_from_context(int n) @@ -37,13 +51,13 @@ static int heading_level_from_context(int n) } /* - Call the given function in definitions.py, using 'tag' as a parameter, and - return its output as a bool. + Call the given function in definitions.py, using 'input' as a parameter, + and return its output as a bool. 
*/ -static int call_def_func(const char* funcname, PyObject* tag) +static int call_def_func(const char* funcname, PyObject* input) { PyObject* func = PyObject_GetAttrString(definitions, funcname); - PyObject* result = PyObject_CallFunctionObjArgs(func, tag, NULL); + PyObject* result = PyObject_CallFunctionObjArgs(func, input, NULL); int ans = (result == Py_True) ? 1 : 0; Py_DECREF(func); @@ -1238,15 +1252,8 @@ Tokenizer_handle_tag_space(Tokenizer* self, TagData* data, Py_UNICODE text) static int Tokenizer_handle_tag_text(Tokenizer* self, Py_UNICODE text) { Py_UNICODE next = Tokenizer_READ(self, 1); - int i, is_marker = 0; - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == text) { - is_marker = 1; - break; - } - } - if (!is_marker || !Tokenizer_CAN_RECURSE(self)) + if (!is_marker(text) || !Tokenizer_CAN_RECURSE(self)) return Tokenizer_emit_char(self, text); else if (text == next && next == *"{") return Tokenizer_parse_template_or_argument(self); @@ -1264,17 +1271,11 @@ static int Tokenizer_handle_tag_data(Tokenizer* self, TagData* data, Py_UNICODE chunk) { PyObject *trash; - int first_time, i, is_marker = 0, escaped; + int first_time, escaped; if (data->context & TAG_NAME) { first_time = !(data->context & TAG_NOTE_SPACE); - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == chunk) { - is_marker = 1; - break; - } - } - if (is_marker || (Py_UNICODE_ISSPACE(chunk) && first_time)) { + if (is_marker(chunk) || (Py_UNICODE_ISSPACE(chunk) && first_time)) { // Tags must start with text, not spaces Tokenizer_fail_route(self); return 0; @@ -1623,7 +1624,6 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) Textbuffer* buf; PyObject *name, *tag; Py_UNICODE this; - int is_marker, i; self->head += 2; buf = Textbuffer_new(); @@ -1631,14 +1631,7 @@ static int Tokenizer_handle_invalid_tag_start(Tokenizer* self) return -1; while (1) { this = Tokenizer_READ(self, pos); - is_marker = 0; - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == this) { - is_marker = 1; - break; - } - } - if (is_marker) { + if (is_marker(this)) { name = Textbuffer_render(buf); if (!name) { Textbuffer_dealloc(buf); @@ -2047,9 +2040,8 @@ static PyObject* Tokenizer_handle_end(Tokenizer* self, int context) */ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) { - if (context & LC_FAIL_NEXT) { + if (context & LC_FAIL_NEXT) return -1; - } if (context & LC_WIKILINK) { if (context & LC_WIKILINK_TEXT) return (data == *"[" && Tokenizer_READ(self, 1) == *"[") ? -1 : 0; @@ -2059,6 +2051,8 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) return -1; return 0; } + if (context & LC_EXT_LINK_TITLE) + return (data == *"\n") ? -1 : 0; if (context & LC_TAG_CLOSE) return (data == *"<") ? 
-1 : 0; if (context & LC_TEMPLATE_NAME) { @@ -2125,7 +2119,7 @@ static int Tokenizer_verify_safe(Tokenizer* self, int context, Py_UNICODE data) */ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) { - int this_context, is_marker, i; + int this_context; Py_UNICODE this, next, next_next, last; PyObject* temp; @@ -2145,14 +2139,7 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) return Tokenizer_fail_route(self); } } - is_marker = 0; - for (i = 0; i < NUM_MARKERS; i++) { - if (*MARKERS[i] == this) { - is_marker = 1; - break; - } - } - if (!is_marker) { + if (!is_marker(this)) { if (Tokenizer_emit_char(self, this)) return NULL; self->head++; @@ -2205,6 +2192,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) } else if (this == next && next == *"]" && this_context & LC_WIKILINK) return Tokenizer_handle_wikilink_end(self); + // else if (this == *"[") { + // if (Tokenizer_parse_external_link(self, 1)) + // return NULL; + // } + // else if (this == *":" && !is_marker(last)) { + // if (Tokenizer_parse_external_link(self, 0)) + // return NULL; + // } + // else if (this == *"]" && this_context & LC_EXT_LINK_TITLE) + // return Tokenizer_pop(self); else if (this == *"=" && !(self->global & GL_HEADING)) { if (last == *"\n" || last == *"") { if (Tokenizer_parse_heading(self)) diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 5961dcc..e125068 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -238,12 +238,13 @@ typedef struct { #define Tokenizer_emit_first_kwargs(self, token, kwargs) Tokenizer_emit_token_kwargs(self, token, kwargs, 1) -/* Macros for accessing HTML tag definitions: */ +/* Macros for accessing definitions: */ #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li") #define IS_PARSABLE(tag) (call_def_func("is_parsable", tag)) #define IS_SINGLE(tag) (call_def_func("is_single", tag)) #define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag)) +#define IS_SCHEME(scheme) (call_def_func("is_scheme", scheme)) /* Function prototypes: */ From a1948b06aaa05dd6585c4af9c254dfb966165e81 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 20:03:34 -0400 Subject: [PATCH 22/35] Tokenizer_parse_bracketed/free_uri_scheme(), other adjustments --- mwparserfromhell/parser/tokenizer.c | 315 ++++++++++++++++++++++++++++++++++-- mwparserfromhell/parser/tokenizer.h | 12 +- 2 files changed, 304 insertions(+), 23 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 8a2d9f9..80f4c56 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -51,13 +51,13 @@ static int heading_level_from_context(int n) } /* - Call the given function in definitions.py, using 'input' as a parameter, - and return its output as a bool. + Call the given function in definitions.py, using 'in1' and 'in2' as + parameters, and return its output as a bool. */ -static int call_def_func(const char* funcname, PyObject* input) +static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2) { PyObject* func = PyObject_GetAttrString(definitions, funcname); - PyObject* result = PyObject_CallFunctionObjArgs(func, input, NULL); + PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL); int ans = (result == Py_True) ? 
1 : 0; Py_DECREF(func); @@ -129,8 +129,7 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) new->next = self; *this = self = new; } - self->data[self->size] = code; - self->size++; + self->data[self->size++] = code; return 0; } @@ -822,6 +821,288 @@ static PyObject* Tokenizer_handle_wikilink_end(Tokenizer* self) } /* + Parse the URI scheme of a bracket-enclosed external link. +*/ +static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer* buffer; + PyObject* scheme; + Py_UNICODE this; + int slashes, i; + + if (Tokenizer_push(self, LC_EXT_LINK_URI)) + return -1; + if (Tokenizer_READ(self, 0) == *"/" && Tokenizer_READ(self, 1) == *"/") { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } + else { + buffer = Textbuffer_new(); + if (!buffer) + return -1; + while ((this = Tokenizer_READ(self, 0)) != *"") { + i = 0; + while (1) { + if (!valid[i]) + goto end_of_loop; + if (this == valid[i]) + break; + i++; + } + Textbuffer_write(&buffer, this); + if (Tokenizer_emit_char(self, this)) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + } + end_of_loop: + if (this != *":") { + Textbuffer_dealloc(buffer); + Tokenizer_fail_route(self); + return 0; + } + if (Tokenizer_emit_char(self, *":")) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head++; + slashes = (Tokenizer_READ(self, 0) == *"/" && + Tokenizer_READ(self, 1) == *"/"); + if (slashes) { + if (Tokenizer_emit_text(self, "//")) { + Textbuffer_dealloc(buffer); + return -1; + } + self->head += 2; + } + scheme = Textbuffer_render(buffer); + Textbuffer_dealloc(buffer); + if (!scheme) + return -1; + if (!IS_SCHEME(scheme, slashes ? Py_True : Py_False)) { + Py_DECREF(scheme); + Tokenizer_fail_route(self); + return 0; + } + Py_DECREF(scheme); + } + return 0; +} + +/* + Parse the URI scheme of a free (no brackets) external link. +*/ +static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) +{ + static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; + Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; + PyObject *temp, *reversed, *scheme; + Py_UNICODE chunk; + int slashes, i, j; + + if (!scheme_buffer) + return -1; + // We have to backtrack through the textbuffer looking for our scheme since + // it was just parsed as text: + temp_buffer = self->topstack->textbuffer; + while (temp_buffer) { + for (i = temp_buffer->size - 1; i >= 0; i++) { + chunk = temp_buffer->data[i]; + if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) + goto end_of_loop; + j = 0; + while (1) { + if (!valid[j]) { + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + if (chunk == valid[j]) + break; + j++; + } + Textbuffer_write(&scheme_buffer, chunk); + } + temp_buffer = temp_buffer->next; + } + end_of_loop: + temp = Textbuffer_render(scheme_buffer); + if (!temp) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + reversed = PyDict_GetItemString(PyEval_GetBuiltins(), "reversed"); + scheme = PyObject_CallFunctionObjArgs(reversed, temp, NULL); + Py_DECREF(temp); + if (!scheme) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + slashes = (Tokenizer_READ(self, 0) == *"/" && + Tokenizer_READ(self, 1) == *"/"); + if (!IS_SCHEME(scheme, slashes ? 
Py_True : Py_False)) { + Py_DECREF(scheme); + Textbuffer_dealloc(scheme_buffer); + FAIL_ROUTE(0); + return 0; + } + Py_DECREF(scheme); + if (Tokenizer_push(self, LC_EXT_LINK_URI)) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + while (temp_buffer) { + for (i = 0; i < temp_buffer->size; i++) { + if (Tokenizer_emit_char(self, temp_buffer->data[i])) { + Textbuffer_dealloc(scheme_buffer); + return -1; + } + } + temp_buffer = temp_buffer->next; + } + Textbuffer_dealloc(scheme_buffer); + if (Tokenizer_emit_char(self, *":")) + return -1; + if (slashes) { + if (Tokenizer_emit_text(self, "//")) + return -1; + self->head += 2; + } +} + +/* + Handle text in a free external link, including trailing punctuation. +*/ +static int +Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct, + Textbuffer** tail, Py_UNICODE this) +{ + // if "(" in this and ")" in punct: + // punct = punct[:-1] # ')' is not longer valid punctuation + // if this.endswith(punct): + // for i in reversed(range(-len(this), 0)): + // if i == -len(this) or this[i - 1] not in punct: + // break + // stripped = this[:i] + // if stripped and tail: + // self._emit_text(tail) + // tail = "" + // tail += this[i:] + // this = stripped + // elif tail: + // self._emit_text(tail) + // tail = "" + // self._emit_text(this) + // return punct, tail +} + +/* + Really parse an external link. +*/ +static PyObject* +Tokenizer_really_parse_external_link(Tokenizer* self, int brackets) +{ + // if brackets: + // self._parse_bracketed_uri_scheme() + // invalid = ("\n", " ", "]") + // else: + // self._parse_free_uri_scheme() + // invalid = ("\n", " ", "[", "]") + // punct = tuple(",;\.:!?)") + // if self._read() is self.END or self._read()[0] in invalid: + // self._fail_route() + // tail = "" + // while True: + // this, next = self._read(), self._read(1) + // if this is self.END or this == "\n": + // if brackets: + // self._fail_route() + // return self._pop(), tail, -1 + // elif this == next == "{" and self._can_recurse(): + // if tail: + // self._emit_text(tail) + // tail = "" + // self._parse_template_or_argument() + // elif this == "[": + // if brackets: + // self._emit_text("[") + // else: + // return self._pop(), tail, -1 + // elif this == "]": + // return self._pop(), tail, 0 if brackets else -1 + // elif this == "&": + // if tail: + // self._emit_text(tail) + // tail = "" + // self._parse_entity() + // elif " " in this: + // before, after = this.split(" ", 1) + // if brackets: + // self._emit_text(before) + // self._emit(tokens.ExternalLinkSeparator()) + // self._emit_text(after) + // self._context ^= contexts.EXT_LINK_URI + // self._context |= contexts.EXT_LINK_TITLE + // self._head += 1 + // return self._parse(push=False), None, 0 + // punct, tail = self._handle_free_link_text(punct, tail, before) + // return self._pop(), tail + " " + after, 0 + // elif not brackets: + // punct, tail = self._handle_free_link_text(punct, tail, this) + // else: + // self._emit_text(this) + // self._head += 1 +} + +/* + Remove the URI scheme of a new external link from the textbuffer. +*/ +static int +Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* scheme) +{ + // length = len(scheme) + // while length: + // if length < len(self._textbuffer[-1]): + // self._textbuffer[-1] = self._textbuffer[-1][:-length] + // break + // length -= len(self._textbuffer[-1]) + // self._textbuffer.pop() +} + +/* + Parse an external link at the head of the wikicode string. 
+*/ +static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) +{ + // reset = self._head + // self._head += 1 + // try: + // bad_context = self._context & contexts.INVALID_LINK + // if bad_context or not self._can_recurse(): + // raise BadRoute() + // link, extra, delta = self._really_parse_external_link(brackets) + // except BadRoute: + // self._head = reset + // if not brackets and self._context & contexts.DL_TERM: + // self._handle_dl_term() + // else: + // self._emit_text(self._read()) + // else: + // if not brackets: + // scheme = link[0].text.split(":", 1)[0] + // self._remove_uri_scheme_from_textbuffer(scheme) + // self._emit(tokens.ExternalLinkOpen(brackets=brackets)) + // self._emit_all(link) + // self._emit(tokens.ExternalLinkClose()) + // self._head += delta + // if extra: + // self._emit_text(extra) +} + +/* Parse a section heading at the head of the wikicode string. */ static int Tokenizer_parse_heading(Tokenizer* self) @@ -1978,9 +2259,9 @@ static int Tokenizer_handle_hr(Tokenizer* self) self->head++; } markup = Textbuffer_render(buffer); + Textbuffer_dealloc(buffer); if (!markup) return -1; - Textbuffer_dealloc(buffer); kwargs = PyDict_New(); if (!kwargs) return -1; @@ -2192,16 +2473,16 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) } else if (this == next && next == *"]" && this_context & LC_WIKILINK) return Tokenizer_handle_wikilink_end(self); - // else if (this == *"[") { - // if (Tokenizer_parse_external_link(self, 1)) - // return NULL; - // } - // else if (this == *":" && !is_marker(last)) { - // if (Tokenizer_parse_external_link(self, 0)) - // return NULL; - // } - // else if (this == *"]" && this_context & LC_EXT_LINK_TITLE) - // return Tokenizer_pop(self); + else if (this == *"[") { + if (Tokenizer_parse_external_link(self, 1)) + return NULL; + } + else if (this == *":" && !is_marker(last)) { + if (Tokenizer_parse_external_link(self, 0)) + return NULL; + } + else if (this == *"]" && this_context & LC_EXT_LINK_TITLE) + return Tokenizer_pop(self); else if (this == *"=" && !(self->global & GL_HEADING)) { if (last == *"\n" || last == *"") { if (Tokenizer_parse_heading(self)) diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index e125068..a49e896 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -122,7 +122,7 @@ static PyObject* TagCloseClose; #define LC_WIKILINK_TEXT 0x00000040 #define LC_EXT_LINK 0x00000380 -#define LC_EXT_LINK_URL 0x00000080 +#define LC_EXT_LINK_URI 0x00000080 #define LC_EXT_LINK_TITLE 0x00000100 #define LC_EXT_LINK_BRACKETS 0x00000200 @@ -165,7 +165,7 @@ static PyObject* TagCloseClose; #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) -#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URL) +#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URI) /* Tag contexts: */ @@ -241,10 +241,10 @@ typedef struct { /* Macros for accessing definitions: */ #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? 
"dt" : "li") -#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag)) -#define IS_SINGLE(tag) (call_def_func("is_single", tag)) -#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag)) -#define IS_SCHEME(scheme) (call_def_func("is_scheme", scheme)) +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL)) +#define IS_SCHEME(scheme, slashes) (call_def_func("is_scheme", scheme, slashes)) /* Function prototypes: */ From 6ecf15cad4c2d0f271e0de67d54869cb35f2d3b2 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 20:41:45 -0400 Subject: [PATCH 23/35] Tokenizer_parse_external_link() --- mwparserfromhell/parser/tokenizer.c | 93 ++++++++++++++++++++++++------------- mwparserfromhell/parser/tokenizer.h | 1 + 2 files changed, 62 insertions(+), 32 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 80f4c56..1681398 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -358,7 +358,7 @@ static void* Tokenizer_fail_route(Tokenizer* self) } /* - Write a token to the end of the current token stack. + Write a token to the current token stack. */ static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) { @@ -379,7 +379,8 @@ static int Tokenizer_emit_token(Tokenizer* self, PyObject* token, int first) } /* - Write a token to the end of the current token stack. + Write a token to the current token stack, with kwargs. Steals a reference + to kwargs. */ static int Tokenizer_emit_token_kwargs(Tokenizer* self, PyObject* token, PyObject* kwargs, int first) @@ -997,13 +998,15 @@ Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct, // tail = "" // self._emit_text(this) // return punct, tail + return 0; } /* Really parse an external link. 
*/ static PyObject* -Tokenizer_really_parse_external_link(Tokenizer* self, int brackets) +Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, + char** extra) { // if brackets: // self._parse_bracketed_uri_scheme() @@ -1020,7 +1023,8 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets) // if this is self.END or this == "\n": // if brackets: // self._fail_route() - // return self._pop(), tail, -1 + // self.head -= 1 + // return self._pop(), tail // elif this == next == "{" and self._can_recurse(): // if tail: // self._emit_text(tail) @@ -1030,9 +1034,12 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets) // if brackets: // self._emit_text("[") // else: - // return self._pop(), tail, -1 + // self._head -= 1 + // return self._pop(), tail // elif this == "]": - // return self._pop(), tail, 0 if brackets else -1 + // if not brackets: + // self._head -= 1 + // return self._pop(), tail // elif this == "&": // if tail: // self._emit_text(tail) @@ -1047,22 +1054,24 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets) // self._context ^= contexts.EXT_LINK_URI // self._context |= contexts.EXT_LINK_TITLE // self._head += 1 - // return self._parse(push=False), None, 0 + // return self._parse(push=False), None // punct, tail = self._handle_free_link_text(punct, tail, before) - // return self._pop(), tail + " " + after, 0 + // return self._pop(), tail + " " + after // elif not brackets: // punct, tail = self._handle_free_link_text(punct, tail, this) // else: // self._emit_text(this) // self._head += 1 + return NULL; } /* Remove the URI scheme of a new external link from the textbuffer. */ static int -Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* scheme) +Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) { + // scheme = link[0].text.split(":", 1)[0] // length = len(scheme) // while length: // if length < len(self._textbuffer[-1]): @@ -1070,6 +1079,7 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* scheme) // break // length -= len(self._textbuffer[-1]) // self._textbuffer.pop() + return 0; } /* @@ -1077,29 +1087,48 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* scheme) */ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) { - // reset = self._head - // self._head += 1 - // try: - // bad_context = self._context & contexts.INVALID_LINK - // if bad_context or not self._can_recurse(): - // raise BadRoute() - // link, extra, delta = self._really_parse_external_link(brackets) - // except BadRoute: - // self._head = reset - // if not brackets and self._context & contexts.DL_TERM: - // self._handle_dl_term() - // else: - // self._emit_text(self._read()) - // else: - // if not brackets: - // scheme = link[0].text.split(":", 1)[0] - // self._remove_uri_scheme_from_textbuffer(scheme) - // self._emit(tokens.ExternalLinkOpen(brackets=brackets)) - // self._emit_all(link) - // self._emit(tokens.ExternalLinkClose()) - // self._head += delta - // if extra: - // self._emit_text(extra) + Py_ssize_t reset = self->head; + PyObject *link, *kwargs; + char* extra; + + self->head++; + if (self->topstack->context & AGG_INVALID_LINK || !(Tokenizer_CAN_RECURSE(self))) { + FAIL_ROUTE(0); + } + else + link = Tokenizer_really_parse_external_link(self, brackets, &extra); + if (BAD_ROUTE) { + self->head = reset; + if (!brackets && self->topstack->context & LC_DLTERM) + return Tokenizer_handle_dl_term(self); + return Tokenizer_emit_char(self, 
Tokenizer_READ(self, 0)); + } + if (!link) + return -1; + if (!brackets) { + if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) + return -1; + } + kwargs = PyDict_New(); + if (!kwargs) { + Py_DECREF(link); + return -1; + } + PyDict_SetItemString(kwargs, "brackets", brackets ? Py_True : Py_False); + if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) { + Py_DECREF(link); + return -1; + } + if (Tokenizer_emit_all(self, link)) { + Py_DECREF(link); + return -1; + } + Py_DECREF(link); + if (Tokenizer_emit(self, ExternalLinkClose)) + return -1; + if (extra) + return Tokenizer_emit_text(self, extra); + return 0; } /* diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index a49e896..0e41a85 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -258,6 +258,7 @@ static void TagData_dealloc(TagData*); static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); static void Tokenizer_dealloc(Tokenizer*); static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); +static int Tokenizer_handle_dl_term(Tokenizer*); static int Tokenizer_parse_tag(Tokenizer*); static PyObject* Tokenizer_parse(Tokenizer*, int, int); static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); From 7dcfa3fe929c3344994517fb28e3002d844a834d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Thu, 22 Aug 2013 22:15:49 -0400 Subject: [PATCH 24/35] Implement Tokenizer_really_parse_external_link(), some other fixes --- mwparserfromhell/definitions.py | 4 +- mwparserfromhell/parser/tokenizer.c | 228 ++++++++++++++++++++++-------------- mwparserfromhell/parser/tokenizer.h | 10 +- 3 files changed, 146 insertions(+), 96 deletions(-) diff --git a/mwparserfromhell/definitions.py b/mwparserfromhell/definitions.py index ef8255e..9449bcb 100644 --- a/mwparserfromhell/definitions.py +++ b/mwparserfromhell/definitions.py @@ -81,8 +81,10 @@ def is_single_only(tag): """Return whether or not the given *tag* must exist without a close tag.""" return tag.lower() in SINGLE_ONLY -def is_scheme(scheme, slashes=True): +def is_scheme(scheme, slashes=True, reverse=False): """Return whether *scheme* is valid for external links.""" + if reverse: # Convenience for C + scheme = scheme[::-1] scheme = scheme.lower() if slashes: return scheme in URI_SCHEMES diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 1681398..6f0c1a6 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -51,13 +51,14 @@ static int heading_level_from_context(int n) } /* - Call the given function in definitions.py, using 'in1' and 'in2' as + Call the given function in definitions.py, using 'in1', 'in2', and 'in3' as parameters, and return its output as a bool. */ -static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2) +static int call_def_func(const char* funcname, PyObject* in1, PyObject* in2, + PyObject* in3) { PyObject* func = PyObject_GetAttrString(definitions, funcname); - PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, NULL); + PyObject* result = PyObject_CallFunctionObjArgs(func, in1, in2, in3, NULL); int ans = (result == Py_True) ? 1 : 0; Py_DECREF(func); @@ -431,6 +432,28 @@ static int Tokenizer_emit_text(Tokenizer* self, const char* text) } /* + Write the contents of another textbuffer to the current textbuffer, + deallocating it in the process. 
+*/ +static int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer) +{ + Textbuffer *original = buffer; + int i; + + while (buffer) { + for (i = 0; i < buffer->size; i++) { + if (Tokenizer_emit_char(self, buffer->data[i])) { + Textbuffer_dealloc(original); + return -1; + } + } + buffer = buffer->next; + } + Textbuffer_dealloc(original); + return 0; +} + +/* Write a series of tokens to the current stack at once. */ static int Tokenizer_emit_all(Tokenizer* self, PyObject* tokenlist) @@ -883,7 +906,7 @@ static int Tokenizer_parse_bracketed_uri_scheme(Tokenizer* self) Textbuffer_dealloc(buffer); if (!scheme) return -1; - if (!IS_SCHEME(scheme, slashes ? Py_True : Py_False)) { + if (!IS_SCHEME(scheme, slashes, 0)) { Py_DECREF(scheme); Tokenizer_fail_route(self); return 0; @@ -900,7 +923,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) { static const char* valid = "abcdefghijklmnopqrstuvwxyz0123456789+.-"; Textbuffer *scheme_buffer = Textbuffer_new(), *temp_buffer; - PyObject *temp, *reversed, *scheme; + PyObject *scheme; Py_UNICODE chunk; int slashes, i, j; @@ -930,21 +953,14 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) temp_buffer = temp_buffer->next; } end_of_loop: - temp = Textbuffer_render(scheme_buffer); - if (!temp) { - Textbuffer_dealloc(scheme_buffer); - return -1; - } - reversed = PyDict_GetItemString(PyEval_GetBuiltins(), "reversed"); - scheme = PyObject_CallFunctionObjArgs(reversed, temp, NULL); - Py_DECREF(temp); + scheme = Textbuffer_render(scheme_buffer); if (!scheme) { Textbuffer_dealloc(scheme_buffer); return -1; } slashes = (Tokenizer_READ(self, 0) == *"/" && Tokenizer_READ(self, 1) == *"/"); - if (!IS_SCHEME(scheme, slashes ? Py_True : Py_False)) { + if (!IS_SCHEME(scheme, slashes, 1)) { Py_DECREF(scheme); Textbuffer_dealloc(scheme_buffer); FAIL_ROUTE(0); @@ -955,16 +971,8 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Textbuffer_dealloc(scheme_buffer); return -1; } - while (temp_buffer) { - for (i = 0; i < temp_buffer->size; i++) { - if (Tokenizer_emit_char(self, temp_buffer->data[i])) { - Textbuffer_dealloc(scheme_buffer); - return -1; - } - } - temp_buffer = temp_buffer->next; - } - Textbuffer_dealloc(scheme_buffer); + if (Tokenizer_emit_textbuffer(self, scheme_buffer)) + return -1; if (Tokenizer_emit_char(self, *":")) return -1; if (slashes) { @@ -972,15 +980,25 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) return -1; self->head += 2; } + return 0; } /* Handle text in a free external link, including trailing punctuation. 
*/ static int -Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct, +Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, Textbuffer** tail, Py_UNICODE this) { + #define PUSH_TAIL_BUFFER(tail, error) \ + if ((tail)->size || (tail)->next) { \ + Tokenizer_emit_textbuffer(self, tail); \ + tail = Textbuffer_new(); \ + if (!(tail)) \ + return error; \ + } + + // punct = tuple(",;\.:!?)") // if "(" in this and ")" in punct: // punct = punct[:-1] # ')' is not longer valid punctuation // if this.endswith(punct): @@ -998,7 +1016,7 @@ Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct, // tail = "" // self._emit_text(this) // return punct, tail - return 0; + return Tokenizer_emit_char(self, this); } /* @@ -1006,63 +1024,76 @@ Tokenizer_handle_free_link_text(Tokenizer* self, PyObject** punct, */ static PyObject* Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, - char** extra) -{ - // if brackets: - // self._parse_bracketed_uri_scheme() - // invalid = ("\n", " ", "]") - // else: - // self._parse_free_uri_scheme() - // invalid = ("\n", " ", "[", "]") - // punct = tuple(",;\.:!?)") - // if self._read() is self.END or self._read()[0] in invalid: - // self._fail_route() - // tail = "" - // while True: - // this, next = self._read(), self._read(1) - // if this is self.END or this == "\n": - // if brackets: - // self._fail_route() - // self.head -= 1 - // return self._pop(), tail - // elif this == next == "{" and self._can_recurse(): - // if tail: - // self._emit_text(tail) - // tail = "" - // self._parse_template_or_argument() - // elif this == "[": - // if brackets: - // self._emit_text("[") - // else: - // self._head -= 1 - // return self._pop(), tail - // elif this == "]": - // if not brackets: - // self._head -= 1 - // return self._pop(), tail - // elif this == "&": - // if tail: - // self._emit_text(tail) - // tail = "" - // self._parse_entity() - // elif " " in this: - // before, after = this.split(" ", 1) - // if brackets: - // self._emit_text(before) - // self._emit(tokens.ExternalLinkSeparator()) - // self._emit_text(after) - // self._context ^= contexts.EXT_LINK_URI - // self._context |= contexts.EXT_LINK_TITLE - // self._head += 1 - // return self._parse(push=False), None - // punct, tail = self._handle_free_link_text(punct, tail, before) - // return self._pop(), tail + " " + after - // elif not brackets: - // punct, tail = self._handle_free_link_text(punct, tail, this) - // else: - // self._emit_text(this) - // self._head += 1 - return NULL; + Textbuffer** extra) +{ + Py_UNICODE this, next; + int parens = 0; + + if (brackets ? 
Tokenizer_parse_bracketed_uri_scheme(self) : + Tokenizer_parse_free_uri_scheme(self)) + return NULL; + if (BAD_ROUTE) + return NULL; + this = Tokenizer_READ(self, 0); + if (this == *"" || this == *"\n" || this == *" " || this == *"]") + return Tokenizer_fail_route(self); + if (!brackets && this == *"[") + return Tokenizer_fail_route(self); + while (1) { + this = Tokenizer_READ(self, 0); + next = Tokenizer_READ(self, 1); + if (this == *"" || this == *"\n") { + if (brackets) + return Tokenizer_fail_route(self); + self->head--; + return Tokenizer_pop(self); + } + if (this == *"{" && next == *"{" && Tokenizer_CAN_RECURSE(self)) { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_template_or_argument(self)) + return NULL; + } + else if (this == *"[") { + if (!brackets) { + self->head--; + return Tokenizer_pop(self); + } + if (Tokenizer_emit_char(self, *"[")) + return NULL; + } + else if (this == *"]") { + if (!brackets) + self->head--; + return Tokenizer_pop(self); + } + else if (this == *"&") { + PUSH_TAIL_BUFFER(*extra, NULL) + if (Tokenizer_parse_entity(self)) + return NULL; + } + else if (this == *" ") { + if (brackets) { + if (Tokenizer_emit(self, ExternalLinkSeparator)) + return NULL; + self->topstack->context ^= LC_EXT_LINK_URI; + self->topstack->context |= LC_EXT_LINK_TITLE; + self->head++; + return Tokenizer_parse(self, 0, 0); + } + if (Textbuffer_write(extra, *" ")) + return NULL; + return Tokenizer_pop(self); + } + else if (!brackets) { + if (Tokenizer_handle_free_link_text(self, &parens, extra, this)) + return NULL; + } + else { + if (Tokenizer_emit_char(self, this)) + return NULL; + } + self->head++; + } } /* @@ -1089,45 +1120,60 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) { Py_ssize_t reset = self->head; PyObject *link, *kwargs; - char* extra; + Textbuffer *extra; self->head++; - if (self->topstack->context & AGG_INVALID_LINK || !(Tokenizer_CAN_RECURSE(self))) { + #define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK + if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { FAIL_ROUTE(0); } - else + else { + extra = Textbuffer_new(); + if (!extra) + return -1; link = Tokenizer_really_parse_external_link(self, brackets, &extra); + } if (BAD_ROUTE) { self->head = reset; + Textbuffer_dealloc(extra); if (!brackets && self->topstack->context & LC_DLTERM) return Tokenizer_handle_dl_term(self); return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)); } - if (!link) + if (!link) { + Textbuffer_dealloc(extra); return -1; + } if (!brackets) { - if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) + if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) { + Textbuffer_dealloc(extra); return -1; + } } kwargs = PyDict_New(); if (!kwargs) { + Textbuffer_dealloc(extra); Py_DECREF(link); return -1; } PyDict_SetItemString(kwargs, "brackets", brackets ? 
Py_True : Py_False); if (Tokenizer_emit_kwargs(self, ExternalLinkOpen, kwargs)) { + Textbuffer_dealloc(extra); Py_DECREF(link); return -1; } if (Tokenizer_emit_all(self, link)) { + Textbuffer_dealloc(extra); Py_DECREF(link); return -1; } Py_DECREF(link); - if (Tokenizer_emit(self, ExternalLinkClose)) + if (Tokenizer_emit(self, ExternalLinkClose)) { + Textbuffer_dealloc(extra); return -1; - if (extra) - return Tokenizer_emit_text(self, extra); + } + if (extra->size || extra->next) + return Tokenizer_emit_textbuffer(self, extra); return 0; } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index 0e41a85..e437814 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -241,10 +241,11 @@ typedef struct { /* Macros for accessing definitions: */ #define GET_HTML_TAG(markup) (markup == *":" ? "dd" : markup == *";" ? "dt" : "li") -#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL)) -#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL)) -#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL)) -#define IS_SCHEME(scheme, slashes) (call_def_func("is_scheme", scheme, slashes)) +#define IS_PARSABLE(tag) (call_def_func("is_parsable", tag, NULL, NULL)) +#define IS_SINGLE(tag) (call_def_func("is_single", tag, NULL, NULL)) +#define IS_SINGLE_ONLY(tag) (call_def_func("is_single_only", tag, NULL, NULL)) +#define IS_SCHEME(scheme, slashes, reverse) \ + (call_def_func("is_scheme", scheme, slashes ? Py_True : Py_False, reverse ? Py_True : Py_False)) /* Function prototypes: */ @@ -258,6 +259,7 @@ static void TagData_dealloc(TagData*); static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); static void Tokenizer_dealloc(Tokenizer*); static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); +static int Tokenizer_parse_entity(Tokenizer*); static int Tokenizer_handle_dl_term(Tokenizer*); static int Tokenizer_parse_tag(Tokenizer*); static PyObject* Tokenizer_parse(Tokenizer*, int, int); From c1b502bbe6405f408d4d98bc85154fd338443ce8 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 01:24:35 -0400 Subject: [PATCH 25/35] Finish external links implementation. 
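
With the port feature-complete, the C tokenizer's output can be checked
against the pure-Python one (a quick sanity check, assuming the extension
is built and importable; the proper comparison is the shared test suite):

    from mwparserfromhell.parser.tokenizer import Tokenizer      # Python
    from mwparserfromhell.parser._tokenizer import CTokenizer    # C

    for text in ("http://example.com.", "[http://example.com/ Example]"):
        assert Tokenizer().tokenize(text) == CTokenizer().tokenize(text)
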
--- mwparserfromhell/parser/tokenizer.c | 61 +++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 6f0c1a6..6310523 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -80,7 +80,7 @@ static PyObject* strip_tag_name(PyObject* token) Py_DECREF(text); if (!rstripped) return NULL; - lowered = PyObject_CallMethod(rstripped, "rstrip", NULL); + lowered = PyObject_CallMethod(rstripped, "lower", NULL); Py_DECREF(rstripped); return lowered; } @@ -998,24 +998,14 @@ Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, return error; \ } - // punct = tuple(",;\.:!?)") - // if "(" in this and ")" in punct: - // punct = punct[:-1] # ')' is not longer valid punctuation - // if this.endswith(punct): - // for i in reversed(range(-len(this), 0)): - // if i == -len(this) or this[i - 1] not in punct: - // break - // stripped = this[:i] - // if stripped and tail: - // self._emit_text(tail) - // tail = "" - // tail += this[i:] - // this = stripped - // elif tail: - // self._emit_text(tail) - // tail = "" - // self._emit_text(this) - // return punct, tail + if (this == *"(" && !(*parens)) + *parens = 1; + else if (this == *"," || this == *";" || this == *"\\" || this == *"." || + this == *":" || this == *"!" || this == *"?" || + (!(*parens) && this == *")")) + return Textbuffer_write(tail, this); + else + PUSH_TAIL_BUFFER(*tail, -1) return Tokenizer_emit_char(self, this); } @@ -1102,14 +1092,31 @@ Tokenizer_really_parse_external_link(Tokenizer* self, int brackets, static int Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) { - // scheme = link[0].text.split(":", 1)[0] - // length = len(scheme) - // while length: - // if length < len(self._textbuffer[-1]): - // self._textbuffer[-1] = self._textbuffer[-1][:-length] - // break - // length -= len(self._textbuffer[-1]) - // self._textbuffer.pop() + PyObject *text = PyObject_GetAttrString(PyList_GET_ITEM(link, 0), "text"), + *split, *scheme; + Py_ssize_t length; + Textbuffer* temp; + + if (!text) + return -1; + split = PyObject_CallMethod(text, "split", "si", ":", 1); + Py_DECREF(text); + if (!split) + return -1; + scheme = PyList_GET_ITEM(split, 0); + length = PyUnicode_GET_SIZE(scheme); + while (length) { + temp = self->topstack->textbuffer; + if (length <= temp->size) { + temp->size -= length; + break; + } + length -= temp->size; + self->topstack->textbuffer = temp->next; + free(temp->data); + free(temp); + } + Py_DECREF(split); return 0; } From b9324eb658eda01a874d18cf193b6647ba3d0e5e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 02:29:00 -0400 Subject: [PATCH 26/35] Fix Python tokenizer to not generate empty Text tokens. 
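The empty token showed up whenever a bracketed link had a separator but no title text, as in "[http://example.com/ ]". A quick check of the fixed behaviour (interpreter session; token reprs follow the notation of the test suite):

    >>> from mwparserfromhell.parser.tokenizer import Tokenizer
    >>> Tokenizer().tokenize("[http://example.com/ ]")
    [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"),
     ExternalLinkSeparator(), ExternalLinkClose()]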
--- mwparserfromhell/parser/tokenizer.py | 3 ++- tests/tokenizer/external_links.mwtest | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2340077..2c8d6d7 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -424,7 +424,8 @@ class Tokenizer(object): if brackets: self._emit_text(before) self._emit(tokens.ExternalLinkSeparator()) - self._emit_text(after) + if after: + self._emit_text(after) self._context ^= contexts.EXT_LINK_URI self._context |= contexts.EXT_LINK_TITLE self._head += 1 diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index b517ada..6666c05 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -15,7 +15,7 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), Exte name: brackets_space label: basic external link in brackets, with a space after input: "[http://example.com/ ]" -output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), Text(text=""), ExternalLinkClose()] +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/"), ExternalLinkSeparator(), ExternalLinkClose()] --- From 072b956a073e15ec164edd971e156cd256a37d8a Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 02:29:51 -0400 Subject: [PATCH 27/35] Make a couple tests harder. --- tests/tokenizer/external_links.mwtest | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/tokenizer/external_links.mwtest b/tests/tokenizer/external_links.mwtest index 6666c05..af7a570 100644 --- a/tests/tokenizer/external_links.mwtest +++ b/tests/tokenizer/external_links.mwtest @@ -91,8 +91,8 @@ output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com"), Exte name: colons_after label: colons after a free link that are excluded -input: "http://example.com/foo:bar:::baz:::" -output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo:bar:::baz"), ExternalLinkClose(), Text(text=":::")] +input: "http://example.com/foo:bar.:;baz!?," +output: [ExternalLinkOpen(brackets=False), Text(text="http://example.com/foo:bar.:;baz"), ExternalLinkClose(), Text(text="!?,")] --- @@ -126,8 +126,8 @@ output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com."), Exte name: brackets_colons_after label: colons after a bracket-enclosed link that are included -input: "[http://example.com/foo:bar:::baz::: Example]" -output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo:bar:::baz:::"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] +input: "[http://example.com/foo:bar.:;baz!?, Example]" +output: [ExternalLinkOpen(brackets=True), Text(text="http://example.com/foo:bar.:;baz!?,"), ExternalLinkSeparator(), Text(text="Example"), ExternalLinkClose()] --- From 2561cf5b5e94ee7df7878b879bcf2354e074b255 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 02:30:27 -0400 Subject: [PATCH 28/35] Fix all bugs in C implementation of external links. 
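The headline change is the doubly-linked Textbuffer: buffers chain newest-first, so Tokenizer_emit_textbuffer() now takes a direction. Forward emission walks out to the oldest chunk and comes back through the new prev pointers; reverse emission is used by the free-link code, which collects a URI scheme backwards and needs it re-reversed on output. A toy Python model of the two traversals (plain lists standing in for the C structs):

    chunks = [list("p://"), list("htt")]   # chunks[0] is the newest buffer

    def emit(chunks, reverse=False):
        if reverse:  # newest chunk first, characters back-to-front
            return "".join(c for chunk in chunks for c in reversed(chunk))
        # forward: oldest chunk first, characters front-to-back
        return "".join(c for chunk in reversed(chunks) for c in chunk)

    assert emit(chunks) == "http://"                # normal emission
    assert emit(chunks, reverse=True) == "//:ptth"  # un-reverses a
                                                    # backwards-collected buffer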
--- mwparserfromhell/parser/tokenizer.c | 56 ++++++++++++++++++++++++------------- mwparserfromhell/parser/tokenizer.h | 3 +- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 6310523..c2ac12f 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -100,7 +100,7 @@ static Textbuffer* Textbuffer_new(void) PyErr_NoMemory(); return NULL; } - buffer->next = NULL; + buffer->prev = buffer->next = NULL; return buffer; } @@ -128,6 +128,7 @@ static int Textbuffer_write(Textbuffer** this, Py_UNICODE code) if (!new) return -1; new->next = self; + self->prev = new; *this = self = new; } self->data[self->size++] = code; @@ -435,19 +436,33 @@ static int Tokenizer_emit_text(Tokenizer* self, const char* text) Write the contents of another textbuffer to the current textbuffer, deallocating it in the process. */ -static int Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer) +static int +Tokenizer_emit_textbuffer(Tokenizer* self, Textbuffer* buffer, int reverse) { Textbuffer *original = buffer; int i; - while (buffer) { - for (i = 0; i < buffer->size; i++) { - if (Tokenizer_emit_char(self, buffer->data[i])) { - Textbuffer_dealloc(original); - return -1; + if (reverse) { + do { + for (i = buffer->size - 1; i >= 0; i--) { + if (Tokenizer_emit_char(self, buffer->data[i])) { + Textbuffer_dealloc(original); + return -1; + } } - } - buffer = buffer->next; + } while ((buffer = buffer->next)); + } + else { + while (buffer->next) + buffer = buffer->next; + do { + for (i = 0; i < buffer->size; i++) { + if (Tokenizer_emit_char(self, buffer->data[i])) { + Textbuffer_dealloc(original); + return -1; + } + } + } while ((buffer = buffer->prev)); } Textbuffer_dealloc(original); return 0; @@ -933,7 +948,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) // it was just parsed as text: temp_buffer = self->topstack->textbuffer; while (temp_buffer) { - for (i = temp_buffer->size - 1; i >= 0; i++) { + for (i = temp_buffer->size - 1; i >= 0; i--) { chunk = temp_buffer->data[i]; if (Py_UNICODE_ISSPACE(chunk) || is_marker(chunk)) goto end_of_loop; @@ -971,7 +986,7 @@ static int Tokenizer_parse_free_uri_scheme(Tokenizer* self) Textbuffer_dealloc(scheme_buffer); return -1; } - if (Tokenizer_emit_textbuffer(self, scheme_buffer)) + if (Tokenizer_emit_textbuffer(self, scheme_buffer, 1)) return -1; if (Tokenizer_emit_char(self, *":")) return -1; @@ -990,16 +1005,18 @@ static int Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, Textbuffer** tail, Py_UNICODE this) { - #define PUSH_TAIL_BUFFER(tail, error) \ - if ((tail)->size || (tail)->next) { \ - Tokenizer_emit_textbuffer(self, tail); \ - tail = Textbuffer_new(); \ - if (!(tail)) \ - return error; \ + #define PUSH_TAIL_BUFFER(tail, error) \ + if ((tail)->size || (tail)->next) { \ + Tokenizer_emit_textbuffer(self, tail, 0); \ + tail = Textbuffer_new(); \ + if (!(tail)) \ + return error; \ } - if (this == *"(" && !(*parens)) + if (this == *"(" && !(*parens)) { *parens = 1; + PUSH_TAIL_BUFFER(*tail, -1) + } else if (this == *"," || this == *";" || this == *"\\" || this == *"." || this == *":" || this == *"!" || this == *"?" 
|| (!(*parens) && this == *")")) @@ -1141,6 +1158,7 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) link = Tokenizer_really_parse_external_link(self, brackets, &extra); } if (BAD_ROUTE) { + RESET_ROUTE(); self->head = reset; Textbuffer_dealloc(extra); if (!brackets && self->topstack->context & LC_DLTERM) @@ -1180,7 +1198,7 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) return -1; } if (extra->size || extra->next) - return Tokenizer_emit_textbuffer(self, extra); + return Tokenizer_emit_textbuffer(self, extra, 0); return 0; } diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index e437814..c23fe4a 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -165,7 +165,7 @@ static PyObject* TagCloseClose; #define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) -#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK_URI) +#define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK) /* Tag contexts: */ @@ -184,6 +184,7 @@ static PyObject* TagCloseClose; struct Textbuffer { Py_ssize_t size; Py_UNICODE* data; + struct Textbuffer* prev; struct Textbuffer* next; }; From f1b95758d659c9352db9a7d1c4ca4ad85f82c400 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 03:22:18 -0400 Subject: [PATCH 29/35] Squash a memory leak. --- mwparserfromhell/parser/tokenizer.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index c2ac12f..3dca5c2 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1005,12 +1005,13 @@ static int Tokenizer_handle_free_link_text(Tokenizer* self, int* parens, Textbuffer** tail, Py_UNICODE this) { - #define PUSH_TAIL_BUFFER(tail, error) \ - if ((tail)->size || (tail)->next) { \ - Tokenizer_emit_textbuffer(self, tail, 0); \ - tail = Textbuffer_new(); \ - if (!(tail)) \ - return error; \ + #define PUSH_TAIL_BUFFER(tail, error) \ + if ((tail)->size || (tail)->next) { \ + if (Tokenizer_emit_textbuffer(self, tail, 0)) \ + return error; \ + tail = Textbuffer_new(); \ + if (!(tail)) \ + return error; \ } if (this == *"(" && !(*parens)) { @@ -1172,6 +1173,7 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) if (!brackets) { if (Tokenizer_remove_uri_scheme_from_textbuffer(self, link)) { Textbuffer_dealloc(extra); + Py_DECREF(link); return -1; } } @@ -1199,6 +1201,7 @@ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) } if (extra->size || extra->next) return Tokenizer_emit_textbuffer(self, extra, 0); + Textbuffer_dealloc(extra); return 0; } From 655cdc0dab2280ad4023ab78c6421448170b188d Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 19:36:53 -0400 Subject: [PATCH 30/35] TestBuilder.test_external_link() --- tests/test_builder.py | 46 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/tests/test_builder.py b/tests/test_builder.py index 29ae65a..152ab53 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -23,8 +23,8 @@ from __future__ import unicode_literals import unittest -from mwparserfromhell.nodes import 
(Argument, Comment, Heading, HTMLEntity, - Tag, Template, Text, Wikilink) +from mwparserfromhell.nodes import (Argument, Comment, ExternalLink, Heading, + HTMLEntity, Tag, Template, Text, Wikilink) from mwparserfromhell.nodes.extras import Attribute, Parameter from mwparserfromhell.parser import tokens from mwparserfromhell.parser.builder import Builder @@ -150,6 +150,48 @@ class TestBuilder(TreeEqualityTestCase): for test, valid in tests: self.assertWikicodeEqual(valid, self.builder.build(test)) + def test_external_link(self): + """tests for building ExternalLink nodes""" + tests = [ + ([tokens.ExternalLinkOpen(brackets=False), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"), + brackets=False)])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"))])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkSeparator(), tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"), wrap([]))])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example.com/"), + tokens.ExternalLinkSeparator(), tokens.Text(text="Example"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example.com/"), + wraptext("Example"))])), + + ([tokens.ExternalLinkOpen(brackets=False), + tokens.Text(text="http://example"), tokens.Text(text=".com/foo"), + tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example", ".com/foo"), + brackets=False)])), + + ([tokens.ExternalLinkOpen(brackets=True), + tokens.Text(text="http://example"), tokens.Text(text=".com/foo"), + tokens.ExternalLinkSeparator(), tokens.Text(text="Example"), + tokens.Text(text=" Web Page"), tokens.ExternalLinkClose()], + wrap([ExternalLink(wraptext("http://example", ".com/foo"), + wraptext("Example", " Web Page"))])), + ] + for test, valid in tests: + self.assertWikicodeEqual(valid, self.builder.build(test)) + def test_html_entity(self): """tests for building HTMLEntity nodes""" tests = [ From d91c65dc1f59347cc727296b3df29d21ade01f9e Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 20:05:32 -0400 Subject: [PATCH 31/35] TestExternalLink; some fixes in ExternalLink. 
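Besides the new tests, the __strip__() fix changes user-visible behaviour: a bracketed link now strips to its title (or to nothing if there is none), while a bare URL survives strip_code(). A quick session with the parser from earlier in this series (Python 3 output shown):

    >>> import mwparserfromhell
    >>> code = mwparserfromhell.parse(
    ...     "[http://example.com/ Example] vs. http://example.com/")
    >>> code.strip_code()
    'Example vs. http://example.com/'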
--- mwparserfromhell/nodes/external_link.py | 19 ++--- tests/test_external_link.py | 130 ++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 9 deletions(-) create mode 100644 tests/test_external_link.py diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index a604f9a..2ee37f3 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -53,16 +53,20 @@ class ExternalLink(Node): yield self.title, child def __strip__(self, normalize, collapse): - if self.title.strip(): - return self.title.strip_code(normalize, collapse) - return None + if self.brackets: + if self.title: + return self.title.strip_code(normalize, collapse) + return None + return self.url.strip_code(normalize, collapse) def __showtree__(self, write, get, mark): - write("[") + if self.brackets: + write("[") get(self.url) if self.title is not None: get(self.title) - write("]") + if self.brackets: + write("]") @property def url(self): @@ -85,10 +89,7 @@ class ExternalLink(Node): @title.setter def title(self, value): - if value is None: - self._title = None - else: - self._title = parse_anything(value) + self._title = None if value is None else parse_anything(value) @brackets.setter def brackets(self, value): diff --git a/tests/test_external_link.py b/tests/test_external_link.py new file mode 100644 index 0000000..13a82bf --- /dev/null +++ b/tests/test_external_link.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012-2013 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from __future__ import unicode_literals +import unittest + +from mwparserfromhell.compat import str +from mwparserfromhell.nodes import ExternalLink, Text + +from ._test_tree_equality import TreeEqualityTestCase, getnodes, wrap, wraptext + +class TestExternalLink(TreeEqualityTestCase): + """Test cases for the ExternalLink node.""" + + def test_unicode(self): + """test ExternalLink.__unicode__()""" + node = ExternalLink(wraptext("http://example.com/"), brackets=False) + self.assertEqual("http://example.com/", str(node)) + node2 = ExternalLink(wraptext("http://example.com/")) + self.assertEqual("[http://example.com/]", str(node2)) + node3 = ExternalLink(wraptext("http://example.com/"), wrap([])) + self.assertEqual("[http://example.com/ ]", str(node3)) + node4 = ExternalLink(wraptext("http://example.com/"), + wraptext("Example Web Page")) + self.assertEqual("[http://example.com/ Example Web Page]", str(node4)) + + def test_iternodes(self): + """test ExternalLink.__iternodes__()""" + node1n1 = Text("http://example.com/") + node2n1 = Text("http://example.com/") + node2n2, node2n3 = Text("Example"), Text("Page") + node1 = ExternalLink(wrap([node1n1]), brackets=False) + node2 = ExternalLink(wrap([node2n1]), wrap([node2n2, node2n3])) + gen1 = node1.__iternodes__(getnodes) + gen2 = node2.__iternodes__(getnodes) + self.assertEqual((None, node1), next(gen1)) + self.assertEqual((None, node2), next(gen2)) + self.assertEqual((node1.url, node1n1), next(gen1)) + self.assertEqual((node2.url, node2n1), next(gen2)) + self.assertEqual((node2.title, node2n2), next(gen2)) + self.assertEqual((node2.title, node2n3), next(gen2)) + self.assertRaises(StopIteration, next, gen1) + self.assertRaises(StopIteration, next, gen2) + + def test_strip(self): + """test ExternalLink.__strip__()""" + node1 = ExternalLink(wraptext("http://example.com"), brackets=False) + node2 = ExternalLink(wraptext("http://example.com")) + node3 = ExternalLink(wraptext("http://example.com"), wrap([])) + node4 = ExternalLink(wraptext("http://example.com"), wraptext("Link")) + for a in (True, False): + for b in (True, False): + self.assertEqual("http://example.com", node1.__strip__(a, b)) + self.assertEqual(None, node2.__strip__(a, b)) + self.assertEqual(None, node3.__strip__(a, b)) + self.assertEqual("Link", node4.__strip__(a, b)) + + def test_showtree(self): + """test ExternalLink.__showtree__()""" + output = [] + getter, marker = object(), object() + get = lambda code: output.append((getter, code)) + mark = lambda: output.append(marker) + node1 = ExternalLink(wraptext("http://example.com"), brackets=False) + node2 = ExternalLink(wraptext("http://example.com"), wraptext("Link")) + node1.__showtree__(output.append, get, mark) + node2.__showtree__(output.append, get, mark) + valid = [ + (getter, node1.url), "[", (getter, node2.url), + (getter, node2.title), "]"] + self.assertEqual(valid, output) + + def test_url(self): + """test getter/setter for the url attribute""" + url = wraptext("http://example.com/") + node1 = ExternalLink(url, brackets=False) + node2 = ExternalLink(url, wraptext("Example")) + self.assertIs(url, node1.url) + self.assertIs(url, node2.url) + node1.url = "mailto:héhehé@spam.com" + node2.url = "mailto:héhehé@spam.com" + self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"), node1.url) + self.assertWikicodeEqual(wraptext("mailto:héhehé@spam.com"), node2.url) + + def test_title(self): + """test getter/setter for the title attribute""" + title = wraptext("Example!") + node1 = ExternalLink(wraptext("http://example.com/"), 
brackets=False) + node2 = ExternalLink(wraptext("http://example.com/"), title) + self.assertIs(None, node1.title) + self.assertIs(title, node2.title) + node2.title = None + self.assertIs(None, node2.title) + node2.title = "My Website" + self.assertWikicodeEqual(wraptext("My Website"), node2.title) + + def test_brackets(self): + """test getter/setter for the brackets attribute""" + node1 = ExternalLink(wraptext("http://example.com/"), brackets=False) + node2 = ExternalLink(wraptext("http://example.com/"), wraptext("Link")) + self.assertFalse(node1.brackets) + self.assertTrue(node2.brackets) + node1.brackets = True + node2.brackets = False + self.assertTrue(node1.brackets) + self.assertFalse(node2.brackets) + self.assertEqual("[http://example.com/]", str(node1)) + self.assertEqual("http://example.com/", str(node2)) + +if __name__ == "__main__": + unittest.main(verbosity=2) From 67f1762aa402a7dee1b96f80e8d9d2521fe8b069 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 23:23:22 -0400 Subject: [PATCH 32/35] Doc updates, and allow passing a starting context to tokenize(). --- CHANGELOG | 2 +- docs/api/mwparserfromhell.nodes.rst | 8 ++++++++ docs/api/mwparserfromhell.rst | 6 +++--- docs/changelog.rst | 2 +- mwparserfromhell/__init__.py | 3 ++- mwparserfromhell/nodes/external_link.py | 2 +- mwparserfromhell/parser/__init__.py | 9 ++++----- mwparserfromhell/parser/tokenizer.c | 7 ++++--- mwparserfromhell/parser/tokenizer.py | 5 +++-- mwparserfromhell/utils.py | 16 +++++++++++----- tests/test_parser.py | 6 +++--- 11 files changed, 41 insertions(+), 25 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 84edc60..122247f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -16,7 +16,7 @@ v0.3 (unreleased): - Renamed Template.has_param() to has() for consistency with Template's other methods; has_param() is now an alias. - The C tokenizer extension now works on Python 3 in addition to Python 2.7. -- Various fixes and cleanup. +- Various bugfixes, internal changes, and cleanup. v0.2 (released June 20, 2013): diff --git a/docs/api/mwparserfromhell.nodes.rst b/docs/api/mwparserfromhell.nodes.rst index a093c17..7043070 100644 --- a/docs/api/mwparserfromhell.nodes.rst +++ b/docs/api/mwparserfromhell.nodes.rst @@ -25,6 +25,14 @@ nodes Package :undoc-members: :show-inheritance: +:mod:`external_link` Module +--------------------------- + +.. automodule:: mwparserfromhell.nodes.external_link + :members: + :undoc-members: + :show-inheritance: + :mod:`heading` Module --------------------- diff --git a/docs/api/mwparserfromhell.rst b/docs/api/mwparserfromhell.rst index b682139..0da522e 100644 --- a/docs/api/mwparserfromhell.rst +++ b/docs/api/mwparserfromhell.rst @@ -30,10 +30,10 @@ mwparserfromhell Package :members: :undoc-members: -:mod:`tag_defs` Module ----------------------- +:mod:`definitions` Module +------------------------- -.. automodule:: mwparserfromhell.tag_defs +.. automodule:: mwparserfromhell.definitions :members: :mod:`utils` Module diff --git a/docs/changelog.rst b/docs/changelog.rst index 810f594..f43a3c9 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -26,7 +26,7 @@ Unreleased :py:meth:`~.Template.has` for consistency with :py:class:`~.Template`\ 's other methods; :py:meth:`~.has_param` is now an alias. - The C tokenizer extension now works on Python 3 in addition to Python 2.7. -- Various fixes and cleanup. +- Various bugfixes, internal changes, and cleanup. 
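The motivation for the new *context* argument is spelled out in the parse_anything() docstring below: attribute setters need to parse fragments as though they were already inside a node. A sketch of the difference it makes (EXT_LINK_URI is the flag the next patch wires into the ExternalLink.url setter):

    >>> from mwparserfromhell.nodes import ExternalLink, Text
    >>> from mwparserfromhell.parser import contexts
    >>> from mwparserfromhell.utils import parse_anything
    >>> type(parse_anything("http://example.com/").nodes[0]) is ExternalLink
    True
    >>> type(parse_anything("http://example.com/",
    ...                     contexts.EXT_LINK_URI).nodes[0]) is Text
    True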
v0.2 ---- diff --git a/mwparserfromhell/__init__.py b/mwparserfromhell/__init__.py index 738d4c2..74e1616 100644 --- a/mwparserfromhell/__init__.py +++ b/mwparserfromhell/__init__.py @@ -34,6 +34,7 @@ __license__ = "MIT License" __version__ = "0.3.dev" __email__ = "ben.kurtovic@verizon.net" -from . import compat, nodes, parser, smart_list, string_mixin, utils, wikicode +from . import (compat, definitions, nodes, parser, smart_list, string_mixin, + utils, wikicode) parse = utils.parse_anything diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index 2ee37f3..bf1c9b1 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -70,7 +70,7 @@ class ExternalLink(Node): @property def url(self): - """The url of the link target, as a :py:class:`~.Wikicode` object.""" + """The URL of the link target, as a :py:class:`~.Wikicode` object.""" return self._url @property diff --git a/mwparserfromhell/parser/__init__.py b/mwparserfromhell/parser/__init__.py index 1fb95b5..22c3dc2 100644 --- a/mwparserfromhell/parser/__init__.py +++ b/mwparserfromhell/parser/__init__.py @@ -46,16 +46,15 @@ class Parser(object): :py:class:`~.Node`\ s by the :py:class:`~.Builder`. """ - def __init__(self, text): - self.text = text + def __init__(self): if use_c and CTokenizer: self._tokenizer = CTokenizer() else: self._tokenizer = Tokenizer() self._builder = Builder() - def parse(self): - """Return a string as a parsed :py:class:`~.Wikicode` object tree.""" - tokens = self._tokenizer.tokenize(self.text) + def parse(self, text, context=0): + """Parse *text*, returning a :py:class:`~.Wikicode` object tree.""" + tokens = self._tokenizer.tokenize(text, context) code = self._builder.build(tokens) return code diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index 3dca5c2..af6bf3b 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -2667,8 +2667,9 @@ static PyObject* Tokenizer_parse(Tokenizer* self, int context, int push) static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) { PyObject *text, *temp; + int context = 0; - if (PyArg_ParseTuple(args, "U", &text)) { + if (PyArg_ParseTuple(args, "U|i", &text, &context)) { Py_XDECREF(self->text); self->text = PySequence_Fast(text, "expected a sequence"); } @@ -2677,7 +2678,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) Py_ssize_t size; /* Failed to parse a Unicode object; try a string instead. 
*/ PyErr_Clear(); - if (!PyArg_ParseTuple(args, "s#", &encoded, &size)) + if (!PyArg_ParseTuple(args, "s#|i", &encoded, &size, &context)) return NULL; temp = PyUnicode_FromStringAndSize(encoded, size); if (!text) @@ -2689,7 +2690,7 @@ static PyObject* Tokenizer_tokenize(Tokenizer* self, PyObject* args) } self->head = self->global = self->depth = self->cycles = 0; self->length = PyList_GET_SIZE(self->text); - return Tokenizer_parse(self, 0, 1); + return Tokenizer_parse(self, context, 1); } static int load_entitydefs(void) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 2c8d6d7..6ab549a 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -1125,8 +1125,9 @@ class Tokenizer(object): self._emit_text(this) self._head += 1 - def tokenize(self, text): + def tokenize(self, text, context=0): """Build a list of tokens from a string of wikicode and return it.""" split = self.regex.split(text) self._text = [segment for segment in split if segment] - return self._parse() + self._head = self._global = self._depth = self._cycles = 0 + return self._parse(context) diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index 31e5ba0..758e751 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -33,7 +33,7 @@ from .smart_list import SmartList __all__ = ["parse_anything"] -def parse_anything(value): +def parse_anything(value, context=0): """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. This differs from :py:meth:`.Parser.parse` in that we accept more than just @@ -44,6 +44,12 @@ def parse_anything(value): on-the-fly by various methods of :py:class:`~.Wikicode` and others like :py:class:`~.Template`, such as :py:meth:`wikicode.insert() <.Wikicode.insert>` or setting :py:meth:`template.name <.Template.name>`. + + If given, *context* will be passed as a starting context to the parser. + This is helpful when this function is used inside node attribute setters. + For example, :py:class:`~.ExternalLink`\ 's :py:attr:`~.ExternalLink.url` + setter sets *context* to :py:mod:`contexts.EXT_LINK_URI <.contexts>` to + prevent the URL itself from becoming an :py:class:`~.ExternalLink`. 
""" from .parser import Parser from .wikicode import Wikicode @@ -53,17 +59,17 @@ def parse_anything(value): elif isinstance(value, Node): return Wikicode(SmartList([value])) elif isinstance(value, str): - return Parser(value).parse() + return Parser().parse(value, context) elif isinstance(value, bytes): - return Parser(value.decode("utf8")).parse() + return Parser().parse(value.decode("utf8"), context) elif isinstance(value, int): - return Parser(str(value)).parse() + return Parser().parse(str(value), context) elif value is None: return Wikicode(SmartList()) try: nodelist = SmartList() for item in value: - nodelist += parse_anything(item).nodes + nodelist += parse_anything(item, context).nodes except TypeError: error = "Needs string, Node, Wikicode, int, None, or iterable of these, but got {0}: {1}" raise ValueError(error.format(type(value).__name__, value)) diff --git a/tests/test_parser.py b/tests/test_parser.py index ec5f065..8760c0e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -36,9 +36,9 @@ class TestParser(TreeEqualityTestCase): def test_use_c(self): """make sure the correct tokenizer is used""" if parser.use_c: - self.assertTrue(parser.Parser(None)._tokenizer.USES_C) + self.assertTrue(parser.Parser()._tokenizer.USES_C) parser.use_c = False - self.assertFalse(parser.Parser(None)._tokenizer.USES_C) + self.assertFalse(parser.Parser()._tokenizer.USES_C) def test_parsing(self): """integration test for parsing overall""" @@ -59,7 +59,7 @@ class TestParser(TreeEqualityTestCase): ])) ]) ]) - actual = parser.Parser(text).parse() + actual = parser.Parser().parse(text) self.assertWikicodeEqual(expected, actual) if __name__ == "__main__": From d1a7d25220b0acf9bc8a43fb49d8b711431156b7 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 23 Aug 2013 23:36:34 -0400 Subject: [PATCH 33/35] Set right context for ExternalLink.url; Wikicode.filter_external_links() --- mwparserfromhell/nodes/external_link.py | 3 ++- mwparserfromhell/wikicode.py | 10 +++++----- tests/test_wikicode.py | 1 + 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mwparserfromhell/nodes/external_link.py b/mwparserfromhell/nodes/external_link.py index bf1c9b1..d74f6b3 100644 --- a/mwparserfromhell/nodes/external_link.py +++ b/mwparserfromhell/nodes/external_link.py @@ -85,7 +85,8 @@ class ExternalLink(Node): @url.setter def url(self, value): - self._url = parse_anything(value) + from ..parser import contexts + self._url = parse_anything(value, contexts.EXT_LINK_URI) @title.setter def title(self, value): diff --git a/mwparserfromhell/wikicode.py b/mwparserfromhell/wikicode.py index b5e854d..c3249d9 100644 --- a/mwparserfromhell/wikicode.py +++ b/mwparserfromhell/wikicode.py @@ -24,8 +24,8 @@ from __future__ import unicode_literals import re from .compat import maxsize, py3k, str -from .nodes import (Argument, Comment, Heading, HTMLEntity, Node, Tag, - Template, Text, Wikilink) +from .nodes import (Argument, Comment, ExternalLink, Heading, HTMLEntity, + Node, Tag, Template, Text, Wikilink) from .string_mixin import StringMixIn from .utils import parse_anything @@ -509,6 +509,6 @@ class Wikicode(StringMixIn): return "\n".join(self._get_tree(self, [], marker, 0)) Wikicode._build_filter_methods( - arguments=Argument, comments=Comment, headings=Heading, - html_entities=HTMLEntity, tags=Tag, templates=Template, text=Text, - wikilinks=Wikilink) + arguments=Argument, comments=Comment, external_links=ExternalLink, + headings=Heading, html_entities=HTMLEntity, tags=Tag, templates=Template, + text=Text, 
wikilinks=Wikilink) diff --git a/tests/test_wikicode.py b/tests/test_wikicode.py index 08cf93c..14d801c 100644 --- a/tests/test_wikicode.py +++ b/tests/test_wikicode.py @@ -276,6 +276,7 @@ class TestWikicode(TreeEqualityTestCase): self.assertEqual(["{{{e}}}"], get_filter("arguments")) self.assertIs(code.get(4), get_filter("arguments")[0]) self.assertEqual([], get_filter("comments")) + self.assertEqual([], get_filter("external_links")) self.assertEqual([], get_filter("headings")) self.assertEqual([], get_filter("html_entities")) self.assertEqual([], get_filter("tags")) From fcdc0abd22259b4aa6213a088989bbd1f9c922bd Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Aug 2013 00:05:13 -0400 Subject: [PATCH 34/35] Fix autofail contexts. --- mwparserfromhell/parser/contexts.py | 2 +- mwparserfromhell/parser/tokenizer.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 0d25400..33da8f7 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -162,7 +162,7 @@ GL_HEADING = 1 << 0 # Aggregate contexts: -FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK + HEADING + TAG + STYLE +FAIL = TEMPLATE + ARGUMENT + WIKILINK + EXT_LINK_TITLE + HEADING + TAG + STYLE UNSAFE = (TEMPLATE_NAME + WIKILINK + EXT_LINK_TITLE + TEMPLATE_PARAM_KEY + ARGUMENT_NAME + TAG_CLOSE) DOUBLE = TEMPLATE_PARAM_KEY + TAG_CLOSE diff --git a/mwparserfromhell/parser/tokenizer.h b/mwparserfromhell/parser/tokenizer.h index c23fe4a..da3c57a 100644 --- a/mwparserfromhell/parser/tokenizer.h +++ b/mwparserfromhell/parser/tokenizer.h @@ -162,7 +162,7 @@ static PyObject* TagCloseClose; /* Aggregate contexts: */ -#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_HEADING | LC_TAG | LC_STYLE) +#define AGG_FAIL (LC_TEMPLATE | LC_ARGUMENT | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_HEADING | LC_TAG | LC_STYLE) #define AGG_UNSAFE (LC_TEMPLATE_NAME | LC_WIKILINK | LC_EXT_LINK_TITLE | LC_TEMPLATE_PARAM_KEY | LC_ARGUMENT_NAME) #define AGG_DOUBLE (LC_TEMPLATE_PARAM_KEY | LC_TAG_CLOSE) #define AGG_INVALID_LINK (LC_TEMPLATE_NAME | LC_ARGUMENT_NAME | LC_WIKILINK | LC_EXT_LINK) From 4d04cae7802e7a1775016e8a599d2555fe32b763 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sat, 24 Aug 2013 00:27:05 -0400 Subject: [PATCH 35/35] Fix a segfault with GCC. 
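The segfault was in the error path, not the parse: with an invalid context, the old code set FAIL_ROUTE(0) without ever calling Textbuffer_new(), and the BAD_ROUTE branch then ran Textbuffer_dealloc() on an uninitialized pointer. Hoisting the bailout above the allocation (the NOT_A_LINK macro) makes every cleanup path safe. The same shape in a runnable Python sketch (hypothetical names; a list stands in for the Textbuffer):

    class BadRoute(Exception):
        """Stands in for the C tokenizer's BAD_ROUTE flag."""

    def parse_external_link(text, invalid_context, really_parse):
        if invalid_context:       # bail out before anything is allocated
            return text[:1]       # NOT_A_LINK: treat the bracket as plain text
        extra = []                # stands in for Textbuffer_new()
        try:
            return really_parse(text, extra)
        except BadRoute:
            return text[:1]       # "extra" is guaranteed valid here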
--- mwparserfromhell/parser/tokenizer.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c index af6bf3b..07d3988 100644 --- a/mwparserfromhell/parser/tokenizer.c +++ b/mwparserfromhell/parser/tokenizer.c @@ -1143,28 +1143,29 @@ Tokenizer_remove_uri_scheme_from_textbuffer(Tokenizer* self, PyObject* link) */ static int Tokenizer_parse_external_link(Tokenizer* self, int brackets) { + #define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK + #define NOT_A_LINK \ + if (!brackets && self->topstack->context & LC_DLTERM) \ + return Tokenizer_handle_dl_term(self); \ + return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)) + Py_ssize_t reset = self->head; PyObject *link, *kwargs; - Textbuffer *extra; + Textbuffer *extra = 0; - self->head++; - #define INVALID_CONTEXT self->topstack->context & AGG_INVALID_LINK if (INVALID_CONTEXT || !(Tokenizer_CAN_RECURSE(self))) { - FAIL_ROUTE(0); - } - else { - extra = Textbuffer_new(); - if (!extra) - return -1; - link = Tokenizer_really_parse_external_link(self, brackets, &extra); + NOT_A_LINK; } + extra = Textbuffer_new(); + if (!extra) + return -1; + self->head++; + link = Tokenizer_really_parse_external_link(self, brackets, &extra); if (BAD_ROUTE) { RESET_ROUTE(); self->head = reset; Textbuffer_dealloc(extra); - if (!brackets && self->topstack->context & LC_DLTERM) - return Tokenizer_handle_dl_term(self); - return Tokenizer_emit_char(self, Tokenizer_READ(self, 0)); + NOT_A_LINK; } if (!link) { Textbuffer_dealloc(extra);