From 2cfb0973421bbed89b21d929fb4f2db468f0e741 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 16 Sep 2012 22:13:51 -0400 Subject: [PATCH 1/3] Comment class for ; implement in builder. --- mwparserfromhell/nodes/__init__.py | 1 + mwparserfromhell/nodes/comment.py | 46 ++++++++++++++++++++++++++++++++++++++ mwparserfromhell/parser/builder.py | 15 ++++++++++++- mwparserfromhell/parser/tokens.py | 3 +++ 4 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 mwparserfromhell/nodes/comment.py diff --git a/mwparserfromhell/nodes/__init__.py b/mwparserfromhell/nodes/__init__.py index c04f718..a56e916 100644 --- a/mwparserfromhell/nodes/__init__.py +++ b/mwparserfromhell/nodes/__init__.py @@ -68,6 +68,7 @@ class Node(StringMixIn): from . import extras from .text import Text from .argument import Argument +from .comment import Comment from .heading import Heading from .html_entity import HTMLEntity from .tag import Tag diff --git a/mwparserfromhell/nodes/comment.py b/mwparserfromhell/nodes/comment.py new file mode 100644 index 0000000..dad0214 --- /dev/null +++ b/mwparserfromhell/nodes/comment.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# +# Copyright (C) 2012 Ben Kurtovic +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from __future__ import unicode_literals + +from . import Node +from ..compat import str + +__all__ = ["Comment"] + +class Comment(Node): + """Represents a hidden HTML comment, like ````.""" + def __init__(self, contents): + super(Text, self).__init__() + self._contents = contents + + def __unicode__(self): + return "" + + @property + def contents(self): + """The hidden text contained between ````.""" + return self._contents + + @value.setter + def contents(self, value): + self._contents = str(value) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 94fd57d..e597507 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -24,7 +24,7 @@ from __future__ import unicode_literals from . import tokens from ..compat import str -from ..nodes import Argument, Heading, HTMLEntity, Tag, Template, Text +from ..nodes import Argument, Comment, Heading, HTMLEntity, Tag, Template, Text from ..nodes.extras import Attribute, Parameter from ..smart_list import SmartList from ..wikicode import Wikicode @@ -152,6 +152,17 @@ class Builder(object): else: self._write(self._handle_token(token)) + def _handle_comment(self): + """Handle a case where a hidden comment is at the head of the tokens.""" + self._push() + while self._tokens: + token = self._tokens.pop() + if isinstance(token, tokens.CommentEnd): + contents = self._pop() + return Comment(contents) + else: + self._write(self._handle_token(token)) + def _handle_attribute(self): """Handle a case where a tag attribute is at the head of the tokens.""" name, quoted = None, False @@ -209,6 +220,8 @@ class Builder(object): return self._handle_entity() elif isinstance(token, tokens.HeadingStart): return self._handle_heading(token) + elif isinstance(token, tokens.CommentStart): + return self._handle_comment() elif isinstance(token, tokens.TagOpenOpen): return self._handle_tag(token) diff --git a/mwparserfromhell/parser/tokens.py b/mwparserfromhell/parser/tokens.py index 0e91d48..ab6f356 100644 --- a/mwparserfromhell/parser/tokens.py +++ b/mwparserfromhell/parser/tokens.py @@ -87,6 +87,9 @@ HTMLEntityEnd = make("HTMLEntityEnd") # ; HeadingStart = make("HeadingStart") # =... HeadingEnd = make("HeadingEnd") # =... +CommentStart = make("CommentStart") # + TagOpenOpen = make("TagOpenOpen") # < TagAttrStart = make("TagAttrStart") TagAttrEquals = make("TagAttrEquals") # = From 65862befdc065a0fbbadad14b31c21195693a6e8 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Sun, 16 Sep 2012 22:52:06 -0400 Subject: [PATCH 2/3] Support comments in tokenizer. --- mwparserfromhell/nodes/comment.py | 2 +- mwparserfromhell/parser/builder.py | 2 +- mwparserfromhell/parser/contexts.py | 66 +++++++++++++++++++----------------- mwparserfromhell/parser/tokenizer.py | 38 +++++++++++++++++---- 4 files changed, 69 insertions(+), 39 deletions(-) diff --git a/mwparserfromhell/nodes/comment.py b/mwparserfromhell/nodes/comment.py index dad0214..ff77b18 100644 --- a/mwparserfromhell/nodes/comment.py +++ b/mwparserfromhell/nodes/comment.py @@ -41,6 +41,6 @@ class Comment(Node): """The hidden text contained between ````.""" return self._contents - @value.setter + @contents.setter def contents(self, value): self._contents = str(value) diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index e597507..e03d94f 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -126,7 +126,7 @@ class Builder(object): self._write(self._handle_token(token)) def _handle_entity(self): - """Handle a case where a HTML entity is at the head of the tokens.""" + """Handle a case where an HTML entity is at the head of the tokens.""" token = self._tokens.pop() if isinstance(token, tokens.HTMLEntityNumeric): token = self._tokens.pop() diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 5969239..e1e96e1 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -35,49 +35,53 @@ will cover ``BAR == 0b10`` and ``BAZ == 0b01``). Local (stack-specific) contexts: -* :py:const:`TEMPLATE` (``0b00000000111``) +* :py:const:`TEMPLATE` - * :py:const:`TEMPLATE_NAME` (``0b00000000001``) - * :py:const:`TEMPLATE_PARAM_KEY` (``0b00000000010``) - * :py:const:`TEMPLATE_PARAM_VALUE` (``0b00000000100``) + * :py:const:`TEMPLATE_NAME` + * :py:const:`TEMPLATE_PARAM_KEY` + * :py:const:`TEMPLATE_PARAM_VALUE` -* :py:const:`ARGUMENT` (``0b00000011000``) +* :py:const:`ARGUMENT` - * :py:const:`ARGUMENT_NAME` (``0b00000001000``) - * :py:const:`ARGUMENT_DEFAULT` (``0b00000010000``) + * :py:const:`ARGUMENT_NAME` + * :py:const:`ARGUMENT_DEFAULT` -* :py:const:`HEADING` (``0b111111000``) +* :py:const:`HEADING` - * :py:const:`HEADING_LEVEL_1` (``0b00000100000``) - * :py:const:`HEADING_LEVEL_2` (``0b00001000000``) - * :py:const:`HEADING_LEVEL_3` (``0b00010000000``) - * :py:const:`HEADING_LEVEL_4` (``0b00100000000``) - * :py:const:`HEADING_LEVEL_5` (``0b01000000000``) - * :py:const:`HEADING_LEVEL_6` (``0b10000000000``) + * :py:const:`HEADING_LEVEL_1` + * :py:const:`HEADING_LEVEL_2` + * :py:const:`HEADING_LEVEL_3` + * :py:const:`HEADING_LEVEL_4` + * :py:const:`HEADING_LEVEL_5` + * :py:const:`HEADING_LEVEL_6` + +* :py:const:`COMMENT` Global contexts: -* :py:const:`GL_HEADING` (``0b1``) +* :py:const:`GL_HEADING` """ # Local contexts: -TEMPLATE = 0b00000000111 -TEMPLATE_NAME = 0b00000000001 -TEMPLATE_PARAM_KEY = 0b00000000010 -TEMPLATE_PARAM_VALUE = 0b00000000100 - -ARGUMENT = 0b00000011000 -ARGUMENT_NAME = 0b00000001000 -ARGUMENT_DEFAULT = 0b00000010000 - -HEADING = 0b11111100000 -HEADING_LEVEL_1 = 0b00000100000 -HEADING_LEVEL_2 = 0b00001000000 -HEADING_LEVEL_3 = 0b00010000000 -HEADING_LEVEL_4 = 0b00100000000 -HEADING_LEVEL_5 = 0b01000000000 -HEADING_LEVEL_6 = 0b10000000000 +TEMPLATE = 0b000000000111 +TEMPLATE_NAME = 0b000000000001 +TEMPLATE_PARAM_KEY = 0b000000000010 +TEMPLATE_PARAM_VALUE = 0b000000000100 + +ARGUMENT = 0b000000011000 +ARGUMENT_NAME = 0b000000001000 +ARGUMENT_DEFAULT = 0b000000010000 + +HEADING = 0b011111100000 +HEADING_LEVEL_1 = 0b000000100000 +HEADING_LEVEL_2 = 0b000001000000 +HEADING_LEVEL_3 = 0b000010000000 +HEADING_LEVEL_4 = 0b000100000000 +HEADING_LEVEL_5 = 0b001000000000 +HEADING_LEVEL_6 = 0b010000000000 + +COMMENT = 0b100000000000 # Global contexts: diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 9e6ae11..e51a081 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -41,8 +41,8 @@ class Tokenizer(object): START = object() END = object() MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", - "/", "-", "\n", END] - regex = re.compile(r"([{}\[\]<>|=&#*;:/\-\n])", flags=re.IGNORECASE) + "/", "-", "!", "\n", END] + regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE) def __init__(self): self._text = None @@ -327,7 +327,7 @@ class Tokenizer(object): return self._pop(), after_level def _really_parse_entity(self): - """Actually parse a HTML entity and ensure that it is valid.""" + """Actually parse an HTML entity and ensure that it is valid.""" self._write(tokens.HTMLEntityStart()) self._head += 1 @@ -369,7 +369,7 @@ class Tokenizer(object): self._write(tokens.HTMLEntityEnd()) def _parse_entity(self): - """Parse a HTML entity at the head of the wikicode string.""" + """Parse an HTML entity at the head of the wikicode string.""" reset = self._head self._push() try: @@ -380,6 +380,21 @@ class Tokenizer(object): else: self._write_all(self._pop()) + def _parse_comment(self): + """Parse an HTML comment at the head of the wikicode string.""" + self._head += 4 + reset = self._head - 1 + try: + comment = self._parse(contexts.COMMENT) + except BadRoute: + self._head = reset + self._write_text("``.""" def __init__(self, contents): - super(Text, self).__init__() + super(Comment, self).__init__() self._contents = contents def __unicode__(self):