From 6450814729c4725760386ae9e8a24a30c46b7033 Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 28 Jun 2013 23:34:24 -0400 Subject: [PATCH] Remove 'type' attribute from tags; rework tag definitions. --- mwparserfromhell/nodes/tag.py | 30 ++------- mwparserfromhell/parser/builder.py | 8 +-- mwparserfromhell/parser/tokenizer.py | 21 ++---- mwparserfromhell/tag_defs.py | 123 ++++++++++------------------------- mwparserfromhell/utils.py | 2 + tests/test_builder.py | 9 ++- tests/tokenizer/tags.mwtest | 28 ++++---- 7 files changed, 72 insertions(+), 149 deletions(-) diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py index d301d85..cd5d0a2 100644 --- a/mwparserfromhell/nodes/tag.py +++ b/mwparserfromhell/nodes/tag.py @@ -24,18 +24,17 @@ from __future__ import unicode_literals from . import Node, Text from ..compat import str -from ..tag_defs import TagDefinitions +from ..tag_defs import get_wikicode, is_visible from ..utils import parse_anything __all__ = ["Tag"] -class Tag(TagDefinitions, Node): +class Tag(Node): """Represents an HTML-style tag in wikicode, like ````.""" - def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, + def __init__(self, tag, contents=None, attrs=None, showtag=True, self_closing=False, padding="", closing_tag=None): super(Tag, self).__init__() - self._type = type_ self._tag = tag self._contents = contents if attrs: @@ -52,7 +51,7 @@ class Tag(TagDefinitions, Node): def __unicode__(self): if not self.showtag: - open_, close = self.WIKICODE[self.type] + open_, close = get_wikicode[self.tag] if self.self_closing: return open_ else: @@ -84,7 +83,7 @@ class Tag(TagDefinitions, Node): yield self.contents, child def __strip__(self, normalize, collapse): - if self.type in self.TAGS_VISIBLE: + if is_visible(self.tag): return self.contents.strip_code(normalize, collapse) return None @@ -113,11 +112,6 @@ class Tag(TagDefinitions, Node): write(">") @property - def type(self): - """The tag type.""" - return self._type - - @property def tag(self): """The tag itself, as a :py:class:`~.Wikicode` object.""" return self._tag @@ -159,23 +153,9 @@ class Tag(TagDefinitions, Node): """ return self._closing_tag - @type.setter - def type(self, value): - value = int(value) - if value not in self.TAGS_ALL: - raise ValueError(value) - self._type = value - for key in self.TRANSLATIONS: - if self.TRANSLATIONS[key] == value: - self._tag = self._closing_tag = parse_anything(key) - @tag.setter def tag(self, value): self._tag = self._closing_tag = parse_anything(value) - try: - self._type = self.TRANSLATIONS[text] - except KeyError: - self._type = self.TAG_UNKNOWN @contents.setter def contents(self, value): diff --git a/mwparserfromhell/parser/builder.py b/mwparserfromhell/parser/builder.py index 5ec0780..53abe91 100644 --- a/mwparserfromhell/parser/builder.py +++ b/mwparserfromhell/parser/builder.py @@ -202,7 +202,7 @@ class Builder(object): def _handle_tag(self, token): """Handle a case where a tag is at the head of the tokens.""" - type_, showtag = token.type, token.showtag + showtag = token.showtag attrs = [] self._push() while self._tokens: @@ -215,14 +215,14 @@ class Builder(object): self._push() elif isinstance(token, tokens.TagCloseSelfclose): tag = self._pop() - return Tag(type_, tag, attrs=attrs, showtag=showtag, + return Tag(tag, attrs=attrs, showtag=showtag, self_closing=True, padding=token.padding) elif isinstance(token, tokens.TagOpenClose): contents = self._pop() self._push() elif isinstance(token, tokens.TagCloseClose): - return Tag(type_, tag, contents, attrs, showtag, False, - padding, self._pop()) + return Tag(tag, contents, attrs, showtag, False, padding, + self._pop()) else: self._write(self._handle_token(token)) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index 67a652a..e7fdb0e 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -27,7 +27,7 @@ import re from . import contexts from . import tokens from ..compat import htmlentities -from ..nodes.tag import Tag +from ..tag_defs import is_parsable __all__ = ["Tokenizer"] @@ -416,8 +416,8 @@ class Tokenizer(object): else: self._write_all(tokens) - def _get_tag_type_from_stack(self, stack=None): - """Return the tag type based on the text in *stack*. + def _get_tag_from_stack(self, stack=None): + """Return the tag based on the text in *stack*. If *stack* is ``None``, we will use the current, topmost one. """ @@ -427,11 +427,7 @@ class Tokenizer(object): if not stack: self._fail_route() # Tag has an empty name? text = [tok for tok in stack if isinstance(tok, tokens.Text)] - text = "".join([token.text for token in text]).rstrip().lower() - try: - return Tag.TRANSLATIONS[text] - except KeyError: - return Tag.TAG_UNKNOWN + return "".join([token.text for token in text]).rstrip().lower() def _actually_close_tag_opening(self): """Handle cleanup at the end of a opening tag. @@ -447,8 +443,7 @@ class Tokenizer(object): if self._context & contexts.TAG_OPEN_ATTR_BODY: self._context ^= contexts.TAG_OPEN_ATTR_BODY else: - tag = self._get_tag_type_from_stack() - self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._write_first(tokens.TagOpenOpen(showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_BODY @@ -509,8 +504,7 @@ class Tokenizer(object): is_quoted = False if self._context & contexts.TAG_OPEN_NAME: self._write_text(chunks.pop(0)) - tag = self._get_tag_type_from_stack() - self._write_first(tokens.TagOpenOpen(type=tag, showtag=True)) + self._write_first(tokens.TagOpenOpen(showtag=True)) self._context ^= contexts.TAG_OPEN_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME self._actually_handle_chunk(chunks, True) @@ -584,8 +578,7 @@ class Tokenizer(object): def _handle_tag_close_close(self): """Handle the ending of a closing tag (````).""" closing = self._pop() - tag = self._get_tag_type_from_stack(closing) - if tag != self._stack[0].type: + if self._get_tag_from_stack(closing) != self._get_tag_from_stack(): # Closing and opening tags are not the same, so fail this route: self._fail_route() self._write_all(closing) diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py index b2ee90d..369692b 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/tag_defs.py @@ -20,99 +20,48 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -from __future__ import unicode_literals +"""Contains data regarding certain HTML tags.""" -class TagDefinitions(object): - """Contains numerical definitions for valid HTML (and wikicode) tags. +from __future__ import unicode_literals - Base class for :py:class:`~.Tag` objects. - """ +__all__ = ["get_wikicode", "is_parsable", "is_visible"] - TAG_UNKNOWN = 0 +PARSER_BLACKLIST = [ + # enwiki extensions @ 2013-06-28 + "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math", + "nowiki", "pre", "score", "section", "source", "syntaxhighlight", + "templatedata", "timeline" +] - # Basic HTML: - TAG_ITALIC = 1 - TAG_BOLD = 2 - TAG_UNDERLINE = 3 - TAG_STRIKETHROUGH = 4 - TAG_UNORDERED_LIST = 5 - TAG_ORDERED_LIST = 6 - TAG_DEF_TERM = 7 - TAG_DEF_ITEM = 8 - TAG_BLOCKQUOTE = 9 - TAG_RULE = 10 - TAG_BREAK = 11 - TAG_ABBR = 12 - TAG_PRE = 13 - TAG_MONOSPACE = 14 - TAG_CODE = 15 - TAG_SPAN = 16 - TAG_DIV = 17 - TAG_FONT = 18 - TAG_SMALL = 19 - TAG_BIG = 20 - TAG_CENTER = 21 +INVISIBLE_TAGS = [ + # enwiki extensions @ 2013-06-28 + "categorytree", "gallery", "imagemap", "inputbox", "math", "score", + "section", "templatedata", "timeline" +] - # MediaWiki parser hooks: - TAG_REF = 101 - TAG_GALLERY = 102 - TAG_MATH = 103 - TAG_NOWIKI = 104 - TAG_NOINCLUDE = 105 - TAG_INCLUDEONLY = 106 - TAG_ONLYINCLUDE = 107 +# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762 +SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] +SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] - # Additional parser hooks: - TAG_SYNTAXHIGHLIGHT = 201 - TAG_POEM = 202 +WIKICODE = { + "i": {"open": "''", "close": "''"}, + "b": {"open": "'''", "close": "'''"}, + "ul": {"open": "*"}, + "ol": {"open": "#"}, + "dt": {"open": ";"}, + "dd": {"open": ":"}, + "hr": {"open": "----"}, +} - # Lists of tags: - TAGS_ALL = set(range(300)) - TAGS_INVISIBLE = {TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE} - TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE +def get_wikicode(tag): + """Return the appropriate wikicode before and after the given *tag*.""" + data = WIKICODE[tag.lower()] + return (data.get("open"), data.get("close")) - TRANSLATIONS = { - "i": TAG_ITALIC, - "em": TAG_ITALIC, - "b": TAG_BOLD, - "strong": TAG_BOLD, - "u": TAG_UNDERLINE, - "s": TAG_STRIKETHROUGH, - "ul": TAG_UNORDERED_LIST, - "ol": TAG_ORDERED_LIST, - "dt": TAG_DEF_TERM, - "dd": TAG_DEF_ITEM, - "blockquote": TAG_BLOCKQUOTE, - "hl": TAG_RULE, - "br": TAG_BREAK, - "abbr": TAG_ABBR, - "pre": TAG_PRE, - "tt": TAG_MONOSPACE, - "code": TAG_CODE, - "span": TAG_SPAN, - "div": TAG_DIV, - "font": TAG_FONT, - "small": TAG_SMALL, - "big": TAG_BIG, - "center": TAG_CENTER, - "ref": TAG_REF, - "gallery": TAG_GALLERY, - "math": TAG_MATH, - "nowiki": TAG_NOWIKI, - "noinclude": TAG_NOINCLUDE, - "includeonly": TAG_INCLUDEONLY, - "onlyinclude": TAG_ONLYINCLUDE, - "syntaxhighlight": TAG_SYNTAXHIGHLIGHT, - "source": TAG_SYNTAXHIGHLIGHT, - "poem": TAG_POEM, - } +def is_parsable(tag): + """Return if the given *tag*'s contents should be passed to the parser.""" + return tag.lower() not in PARSER_BLACKLIST - WIKICODE = { - TAG_ITALIC: ("''", "''"), - TAG_BOLD: ("'''", "'''"), - TAG_UNORDERED_LIST: ("*", ""), - TAG_ORDERED_LIST: ("#", ""), - TAG_DEF_TERM: (";", ""), - TAG_DEF_ITEM: (":", ""), - TAG_RULE: ("----", ""), - } +def is_visible(tag): + """Return whether or not the given *tag* contains visible text.""" + return tag.lower() not in INVISIBLE_TAGS diff --git a/mwparserfromhell/utils.py b/mwparserfromhell/utils.py index b797419..31e5ba0 100644 --- a/mwparserfromhell/utils.py +++ b/mwparserfromhell/utils.py @@ -31,6 +31,8 @@ from .compat import bytes, str from .nodes import Node from .smart_list import SmartList +__all__ = ["parse_anything"] + def parse_anything(value): """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. diff --git a/tests/test_builder.py b/tests/test_builder.py index 85a8c60..0c635ce 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -193,11 +193,10 @@ class TestBuilder(TreeEqualityTestCase): def test_tag(self): """tests for building Tag nodes""" tests = [ - ([tokens.TagOpenOpen(showtag=True, type=101), - tokens.Text(text="ref"), tokens.TagCloseOpen(padding=""), - tokens.TagOpenClose(), tokens.Text(text="ref"), - tokens.TagCloseClose()], - wrap([Tag(101, wraptext("ref"), wrap([]), [], True, False, "", + ([tokens.TagOpenOpen(showtag=True), tokens.Text(text="ref"), + tokens.TagCloseOpen(padding=""), tokens.TagOpenClose(), + tokens.Text(text="ref"), tokens.TagCloseClose()], + wrap([Tag(wraptext("ref"), wrap([]), [], True, False, "", wraptext("ref"))])), ] for test, valid in tests: diff --git a/tests/tokenizer/tags.mwtest b/tests/tokenizer/tags.mwtest index 5af2074..a76d6b6 100644 --- a/tests/tokenizer/tags.mwtest +++ b/tests/tokenizer/tags.mwtest @@ -1,98 +1,98 @@ name: basic label: a basic tag with an open and close input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: basic_selfclosing label: a basic self-closing tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding="")] --- name: content label: a tag with some content in the middle input: "this is a reference" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: padded_open label: a tag with some padding in the open tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: padded_close label: a tag with some padding in the close tag input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] --- name: padded_selfclosing label: a self-closing tag with padding input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding=" ")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding=" ")] --- name: attribute label: a tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_value label: a tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted label: a tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_hyphen label: a tag with a single attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_quoted_hyphen label: a tag with a single quoted attribute, containing a hyphen input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] --- name: attribute_selfclosing label: a self-closing tag with a single attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value label: a self-closing tag with a single attribute with a value input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] --- name: attribute_selfclosing_value_quoted label: a self-closing tag with a single quoted attribute input: "" -output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] +output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] ---