Selaa lähdekoodia

Remove 'type' attribute from tags; rework tag definitions.

tags/v0.3
Ben Kurtovic 11 vuotta sitten
vanhempi
commit
6450814729
7 muutettua tiedostoa jossa 72 lisäystä ja 149 poistoa
  1. +5
    -25
      mwparserfromhell/nodes/tag.py
  2. +4
    -4
      mwparserfromhell/parser/builder.py
  3. +7
    -14
      mwparserfromhell/parser/tokenizer.py
  4. +36
    -87
      mwparserfromhell/tag_defs.py
  5. +2
    -0
      mwparserfromhell/utils.py
  6. +4
    -5
      tests/test_builder.py
  7. +14
    -14
      tests/tokenizer/tags.mwtest

+ 5
- 25
mwparserfromhell/nodes/tag.py Näytä tiedosto

@@ -24,18 +24,17 @@ from __future__ import unicode_literals


from . import Node, Text from . import Node, Text
from ..compat import str from ..compat import str
from ..tag_defs import TagDefinitions
from ..tag_defs import get_wikicode, is_visible
from ..utils import parse_anything from ..utils import parse_anything


__all__ = ["Tag"] __all__ = ["Tag"]


class Tag(TagDefinitions, Node):
class Tag(Node):
"""Represents an HTML-style tag in wikicode, like ``<ref>``.""" """Represents an HTML-style tag in wikicode, like ``<ref>``."""


def __init__(self, type_, tag, contents=None, attrs=None, showtag=True,
def __init__(self, tag, contents=None, attrs=None, showtag=True,
self_closing=False, padding="", closing_tag=None): self_closing=False, padding="", closing_tag=None):
super(Tag, self).__init__() super(Tag, self).__init__()
self._type = type_
self._tag = tag self._tag = tag
self._contents = contents self._contents = contents
if attrs: if attrs:
@@ -52,7 +51,7 @@ class Tag(TagDefinitions, Node):


def __unicode__(self): def __unicode__(self):
if not self.showtag: if not self.showtag:
open_, close = self.WIKICODE[self.type]
open_, close = get_wikicode[self.tag]
if self.self_closing: if self.self_closing:
return open_ return open_
else: else:
@@ -84,7 +83,7 @@ class Tag(TagDefinitions, Node):
yield self.contents, child yield self.contents, child


def __strip__(self, normalize, collapse): def __strip__(self, normalize, collapse):
if self.type in self.TAGS_VISIBLE:
if is_visible(self.tag):
return self.contents.strip_code(normalize, collapse) return self.contents.strip_code(normalize, collapse)
return None return None


@@ -113,11 +112,6 @@ class Tag(TagDefinitions, Node):
write(">") write(">")


@property @property
def type(self):
"""The tag type."""
return self._type

@property
def tag(self): def tag(self):
"""The tag itself, as a :py:class:`~.Wikicode` object.""" """The tag itself, as a :py:class:`~.Wikicode` object."""
return self._tag return self._tag
@@ -159,23 +153,9 @@ class Tag(TagDefinitions, Node):
""" """
return self._closing_tag return self._closing_tag


@type.setter
def type(self, value):
value = int(value)
if value not in self.TAGS_ALL:
raise ValueError(value)
self._type = value
for key in self.TRANSLATIONS:
if self.TRANSLATIONS[key] == value:
self._tag = self._closing_tag = parse_anything(key)

@tag.setter @tag.setter
def tag(self, value): def tag(self, value):
self._tag = self._closing_tag = parse_anything(value) self._tag = self._closing_tag = parse_anything(value)
try:
self._type = self.TRANSLATIONS[text]
except KeyError:
self._type = self.TAG_UNKNOWN


@contents.setter @contents.setter
def contents(self, value): def contents(self, value):


+ 4
- 4
mwparserfromhell/parser/builder.py Näytä tiedosto

@@ -202,7 +202,7 @@ class Builder(object):


def _handle_tag(self, token): def _handle_tag(self, token):
"""Handle a case where a tag is at the head of the tokens.""" """Handle a case where a tag is at the head of the tokens."""
type_, showtag = token.type, token.showtag
showtag = token.showtag
attrs = [] attrs = []
self._push() self._push()
while self._tokens: while self._tokens:
@@ -215,14 +215,14 @@ class Builder(object):
self._push() self._push()
elif isinstance(token, tokens.TagCloseSelfclose): elif isinstance(token, tokens.TagCloseSelfclose):
tag = self._pop() tag = self._pop()
return Tag(type_, tag, attrs=attrs, showtag=showtag,
return Tag(tag, attrs=attrs, showtag=showtag,
self_closing=True, padding=token.padding) self_closing=True, padding=token.padding)
elif isinstance(token, tokens.TagOpenClose): elif isinstance(token, tokens.TagOpenClose):
contents = self._pop() contents = self._pop()
self._push() self._push()
elif isinstance(token, tokens.TagCloseClose): elif isinstance(token, tokens.TagCloseClose):
return Tag(type_, tag, contents, attrs, showtag, False,
padding, self._pop())
return Tag(tag, contents, attrs, showtag, False, padding,
self._pop())
else: else:
self._write(self._handle_token(token)) self._write(self._handle_token(token))




+ 7
- 14
mwparserfromhell/parser/tokenizer.py Näytä tiedosto

@@ -27,7 +27,7 @@ import re
from . import contexts from . import contexts
from . import tokens from . import tokens
from ..compat import htmlentities from ..compat import htmlentities
from ..nodes.tag import Tag
from ..tag_defs import is_parsable


__all__ = ["Tokenizer"] __all__ = ["Tokenizer"]


@@ -416,8 +416,8 @@ class Tokenizer(object):
else: else:
self._write_all(tokens) self._write_all(tokens)


def _get_tag_type_from_stack(self, stack=None):
"""Return the tag type based on the text in *stack*.
def _get_tag_from_stack(self, stack=None):
"""Return the tag based on the text in *stack*.


If *stack* is ``None``, we will use the current, topmost one. If *stack* is ``None``, we will use the current, topmost one.
""" """
@@ -427,11 +427,7 @@ class Tokenizer(object):
if not stack: if not stack:
self._fail_route() # Tag has an empty name? self._fail_route() # Tag has an empty name?
text = [tok for tok in stack if isinstance(tok, tokens.Text)] text = [tok for tok in stack if isinstance(tok, tokens.Text)]
text = "".join([token.text for token in text]).rstrip().lower()
try:
return Tag.TRANSLATIONS[text]
except KeyError:
return Tag.TAG_UNKNOWN
return "".join([token.text for token in text]).rstrip().lower()


def _actually_close_tag_opening(self): def _actually_close_tag_opening(self):
"""Handle cleanup at the end of a opening tag. """Handle cleanup at the end of a opening tag.
@@ -447,8 +443,7 @@ class Tokenizer(object):
if self._context & contexts.TAG_OPEN_ATTR_BODY: if self._context & contexts.TAG_OPEN_ATTR_BODY:
self._context ^= contexts.TAG_OPEN_ATTR_BODY self._context ^= contexts.TAG_OPEN_ATTR_BODY
else: else:
tag = self._get_tag_type_from_stack()
self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))
self._write_first(tokens.TagOpenOpen(showtag=True))
self._context ^= contexts.TAG_OPEN_NAME self._context ^= contexts.TAG_OPEN_NAME
self._context |= contexts.TAG_BODY self._context |= contexts.TAG_BODY


@@ -509,8 +504,7 @@ class Tokenizer(object):
is_quoted = False is_quoted = False
if self._context & contexts.TAG_OPEN_NAME: if self._context & contexts.TAG_OPEN_NAME:
self._write_text(chunks.pop(0)) self._write_text(chunks.pop(0))
tag = self._get_tag_type_from_stack()
self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))
self._write_first(tokens.TagOpenOpen(showtag=True))
self._context ^= contexts.TAG_OPEN_NAME self._context ^= contexts.TAG_OPEN_NAME
self._context |= contexts.TAG_OPEN_ATTR_NAME self._context |= contexts.TAG_OPEN_ATTR_NAME
self._actually_handle_chunk(chunks, True) self._actually_handle_chunk(chunks, True)
@@ -584,8 +578,7 @@ class Tokenizer(object):
def _handle_tag_close_close(self): def _handle_tag_close_close(self):
"""Handle the ending of a closing tag (``</foo>``).""" """Handle the ending of a closing tag (``</foo>``)."""
closing = self._pop() closing = self._pop()
tag = self._get_tag_type_from_stack(closing)
if tag != self._stack[0].type:
if self._get_tag_from_stack(closing) != self._get_tag_from_stack():
# Closing and opening tags are not the same, so fail this route: # Closing and opening tags are not the same, so fail this route:
self._fail_route() self._fail_route()
self._write_all(closing) self._write_all(closing)


+ 36
- 87
mwparserfromhell/tag_defs.py Näytä tiedosto

@@ -20,99 +20,48 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE. # SOFTWARE.


from __future__ import unicode_literals
"""Contains data regarding certain HTML tags."""


class TagDefinitions(object):
"""Contains numerical definitions for valid HTML (and wikicode) tags.
from __future__ import unicode_literals


Base class for :py:class:`~.Tag` objects.
"""
__all__ = ["get_wikicode", "is_parsable", "is_visible"]


TAG_UNKNOWN = 0
# Lowercase names of tags whose contents should not be passed to the parser;
# consulted by is_parsable().
PARSER_BLACKLIST = [
# enwiki extensions @ 2013-06-28
"categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
"nowiki", "pre", "score", "section", "source", "syntaxhighlight",
"templatedata", "timeline"
]


# Basic HTML:
TAG_ITALIC = 1
TAG_BOLD = 2
TAG_UNDERLINE = 3
TAG_STRIKETHROUGH = 4
TAG_UNORDERED_LIST = 5
TAG_ORDERED_LIST = 6
TAG_DEF_TERM = 7
TAG_DEF_ITEM = 8
TAG_BLOCKQUOTE = 9
TAG_RULE = 10
TAG_BREAK = 11
TAG_ABBR = 12
TAG_PRE = 13
TAG_MONOSPACE = 14
TAG_CODE = 15
TAG_SPAN = 16
TAG_DIV = 17
TAG_FONT = 18
TAG_SMALL = 19
TAG_BIG = 20
TAG_CENTER = 21
# Lowercase names of tags that are treated as containing no visible text;
# consulted by is_visible().
INVISIBLE_TAGS = [
# enwiki extensions @ 2013-06-28
"categorytree", "gallery", "imagemap", "inputbox", "math", "score",
"section", "templatedata", "timeline"
]


# MediaWiki parser hooks:
TAG_REF = 101
TAG_GALLERY = 102
TAG_MATH = 103
TAG_NOWIKI = 104
TAG_NOINCLUDE = 105
TAG_INCLUDEONLY = 106
TAG_ONLYINCLUDE = 107
# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762
# NOTE(review): presumably SINGLE_ONLY lists tags that never take a closing
# tag and SINGLE additionally lists tags that may omit one -- confirm against
# the referenced Sanitizer.php revision; no use of either list is visible here.
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]


# Additional parser hooks:
TAG_SYNTAXHIGHLIGHT = 201
TAG_POEM = 202
# Maps a lowercase HTML tag name to its wikicode markup: "open" is the markup
# placed before the tag's contents and "close" (absent for some tags) the
# markup placed after. Read by get_wikicode().
WIKICODE = {
"i": {"open": "''", "close": "''"},
"b": {"open": "'''", "close": "'''"},
"ul": {"open": "*"},
"ol": {"open": "#"},
"dt": {"open": ";"},
"dd": {"open": ":"},
"hr": {"open": "----"},
}


# Lists of tags:
TAGS_ALL = set(range(300))
TAGS_INVISIBLE = {TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE}
TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE
def get_wikicode(tag):
    """Look up the wikicode markup that surrounds the given *tag*.

    The lookup is case-insensitive. Returns a ``(open, close)`` tuple taken
    from the tag's ``WIKICODE`` entry; either element is ``None`` when the
    entry has no such key. Raises ``KeyError`` if the lowercased *tag* has no
    entry in ``WIKICODE``.
    """
    markup = WIKICODE[tag.lower()]
    opening = markup.get("open")
    closing = markup.get("close")
    return (opening, closing)


TRANSLATIONS = {
"i": TAG_ITALIC,
"em": TAG_ITALIC,
"b": TAG_BOLD,
"strong": TAG_BOLD,
"u": TAG_UNDERLINE,
"s": TAG_STRIKETHROUGH,
"ul": TAG_UNORDERED_LIST,
"ol": TAG_ORDERED_LIST,
"dt": TAG_DEF_TERM,
"dd": TAG_DEF_ITEM,
"blockquote": TAG_BLOCKQUOTE,
"hl": TAG_RULE,
"br": TAG_BREAK,
"abbr": TAG_ABBR,
"pre": TAG_PRE,
"tt": TAG_MONOSPACE,
"code": TAG_CODE,
"span": TAG_SPAN,
"div": TAG_DIV,
"font": TAG_FONT,
"small": TAG_SMALL,
"big": TAG_BIG,
"center": TAG_CENTER,
"ref": TAG_REF,
"gallery": TAG_GALLERY,
"math": TAG_MATH,
"nowiki": TAG_NOWIKI,
"noinclude": TAG_NOINCLUDE,
"includeonly": TAG_INCLUDEONLY,
"onlyinclude": TAG_ONLYINCLUDE,
"syntaxhighlight": TAG_SYNTAXHIGHLIGHT,
"source": TAG_SYNTAXHIGHLIGHT,
"poem": TAG_POEM,
}
def is_parsable(tag):
    """Tell whether the contents of the given *tag* should go to the parser.

    The comparison is case-insensitive: the result is ``False`` exactly when
    the lowercased *tag* is listed in ``PARSER_BLACKLIST``.
    """
    name = tag.lower()
    blacklisted = name in PARSER_BLACKLIST
    return not blacklisted


WIKICODE = {
TAG_ITALIC: ("''", "''"),
TAG_BOLD: ("'''", "'''"),
TAG_UNORDERED_LIST: ("*", ""),
TAG_ORDERED_LIST: ("#", ""),
TAG_DEF_TERM: (";", ""),
TAG_DEF_ITEM: (":", ""),
TAG_RULE: ("----", ""),
}
def is_visible(tag):
    """Tell whether the given *tag* is considered to contain visible text.

    The comparison is case-insensitive: the result is ``False`` exactly when
    the lowercased *tag* is listed in ``INVISIBLE_TAGS``.
    """
    name = tag.lower()
    hidden = name in INVISIBLE_TAGS
    return not hidden

+ 2
- 0
mwparserfromhell/utils.py Näytä tiedosto

@@ -31,6 +31,8 @@ from .compat import bytes, str
from .nodes import Node from .nodes import Node
from .smart_list import SmartList from .smart_list import SmartList


__all__ = ["parse_anything"]

def parse_anything(value): def parse_anything(value):
"""Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types.




+ 4
- 5
tests/test_builder.py Näytä tiedosto

@@ -193,11 +193,10 @@ class TestBuilder(TreeEqualityTestCase):
def test_tag(self): def test_tag(self):
"""tests for building Tag nodes""" """tests for building Tag nodes"""
tests = [ tests = [
([tokens.TagOpenOpen(showtag=True, type=101),
tokens.Text(text="ref"), tokens.TagCloseOpen(padding=""),
tokens.TagOpenClose(), tokens.Text(text="ref"),
tokens.TagCloseClose()],
wrap([Tag(101, wraptext("ref"), wrap([]), [], True, False, "",
([tokens.TagOpenOpen(showtag=True), tokens.Text(text="ref"),
tokens.TagCloseOpen(padding=""), tokens.TagOpenClose(),
tokens.Text(text="ref"), tokens.TagCloseClose()],
wrap([Tag(wraptext("ref"), wrap([]), [], True, False, "",
wraptext("ref"))])), wraptext("ref"))])),
] ]
for test, valid in tests: for test, valid in tests:


+ 14
- 14
tests/tokenizer/tags.mwtest Näytä tiedosto

@@ -1,98 +1,98 @@
name: basic name: basic
label: a basic tag with an open and close label: a basic tag with an open and close
input: "<ref></ref>" input: "<ref></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]


--- ---


name: basic_selfclosing name: basic_selfclosing
label: a basic self-closing tag label: a basic self-closing tag
input: "<ref/>" input: "<ref/>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding="")]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding="")]


--- ---


name: content name: content
label: a tag with some content in the middle label: a tag with some content in the middle
input: "<ref>this is a reference</ref>" input: "<ref>this is a reference</ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()]


--- ---


name: padded_open name: padded_open
label: a tag with some padding in the open tag label: a tag with some padding in the open tag
input: "<ref ></ref>" input: "<ref ></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()]


--- ---


name: padded_close name: padded_close
label: a tag with some padding in the close tag label: a tag with some padding in the close tag
input: "<ref></ref >" input: "<ref></ref >"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()]


--- ---


name: padded_selfclosing name: padded_selfclosing
label: a self-closing tag with padding label: a self-closing tag with padding
input: "<ref />" input: "<ref />"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding=" ")]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding=" ")]


--- ---


name: attribute name: attribute
label: a tag with a single attribute label: a tag with a single attribute
input: "<ref name></ref>" input: "<ref name></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]


--- ---


name: attribute_value name: attribute_value
label: a tag with a single attribute with a value label: a tag with a single attribute with a value
input: "<ref name=foo></ref>" input: "<ref name=foo></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]


--- ---


name: attribute_quoted name: attribute_quoted
label: a tag with a single quoted attribute label: a tag with a single quoted attribute
input: "<ref name="foo"></ref>" input: "<ref name="foo"></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]


--- ---


name: attribute_hyphen name: attribute_hyphen
label: a tag with a single attribute, containing a hyphen label: a tag with a single attribute, containing a hyphen
input: "<ref name=foo-bar></ref>" input: "<ref name=foo-bar></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]


--- ---


name: attribute_quoted_hyphen name: attribute_quoted_hyphen
label: a tag with a single quoted attribute, containing a hyphen label: a tag with a single quoted attribute, containing a hyphen
input: "<ref name="foo-bar"></ref>" input: "<ref name="foo-bar"></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]


--- ---


name: attribute_selfclosing name: attribute_selfclosing
label: a self-closing tag with a single attribute label: a self-closing tag with a single attribute
input: "<ref name/>" input: "<ref name/>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")]


--- ---


name: attribute_selfclosing_value name: attribute_selfclosing_value
label: a self-closing tag with a single attribute with a value label: a self-closing tag with a single attribute with a value
input: "<ref name=foo/>" input: "<ref name=foo/>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")]


--- ---


name: attribute_selfclosing_value_quoted name: attribute_selfclosing_value_quoted
label: a self-closing tag with a single quoted attribute label: a self-closing tag with a single quoted attribute
input: "<ref name="foo"/>" input: "<ref name="foo"/>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")]


--- ---




Ladataan…
Peruuta
Tallenna