Ver a proveniência

Remove 'type' attribute from tags; rework tag definitions.

tags/v0.3
Ben Kurtovic há 11 anos
ascendente
cometimento
6450814729
7 ficheiros alterados com 72 adições e 149 eliminações
  1. +5
    -25
      mwparserfromhell/nodes/tag.py
  2. +4
    -4
      mwparserfromhell/parser/builder.py
  3. +7
    -14
      mwparserfromhell/parser/tokenizer.py
  4. +36
    -87
      mwparserfromhell/tag_defs.py
  5. +2
    -0
      mwparserfromhell/utils.py
  6. +4
    -5
      tests/test_builder.py
  7. +14
    -14
      tests/tokenizer/tags.mwtest

+ 5
- 25
mwparserfromhell/nodes/tag.py Ver ficheiro

@@ -24,18 +24,17 @@ from __future__ import unicode_literals

from . import Node, Text
from ..compat import str
from ..tag_defs import TagDefinitions
from ..tag_defs import get_wikicode, is_visible
from ..utils import parse_anything

__all__ = ["Tag"]

class Tag(TagDefinitions, Node):
class Tag(Node):
"""Represents an HTML-style tag in wikicode, like ``<ref>``."""

def __init__(self, type_, tag, contents=None, attrs=None, showtag=True,
def __init__(self, tag, contents=None, attrs=None, showtag=True,
self_closing=False, padding="", closing_tag=None):
super(Tag, self).__init__()
self._type = type_
self._tag = tag
self._contents = contents
if attrs:
@@ -52,7 +51,7 @@ class Tag(TagDefinitions, Node):

def __unicode__(self):
if not self.showtag:
open_, close = self.WIKICODE[self.type]
open_, close = get_wikicode[self.tag]
if self.self_closing:
return open_
else:
@@ -84,7 +83,7 @@ class Tag(TagDefinitions, Node):
yield self.contents, child

def __strip__(self, normalize, collapse):
if self.type in self.TAGS_VISIBLE:
if is_visible(self.tag):
return self.contents.strip_code(normalize, collapse)
return None

@@ -113,11 +112,6 @@ class Tag(TagDefinitions, Node):
write(">")

@property
def type(self):
"""The tag type."""
return self._type

@property
def tag(self):
"""The tag itself, as a :py:class:`~.Wikicode` object."""
return self._tag
@@ -159,23 +153,9 @@ class Tag(TagDefinitions, Node):
"""
return self._closing_tag

@type.setter
def type(self, value):
value = int(value)
if value not in self.TAGS_ALL:
raise ValueError(value)
self._type = value
for key in self.TRANSLATIONS:
if self.TRANSLATIONS[key] == value:
self._tag = self._closing_tag = parse_anything(key)

@tag.setter
def tag(self, value):
self._tag = self._closing_tag = parse_anything(value)
try:
self._type = self.TRANSLATIONS[text]
except KeyError:
self._type = self.TAG_UNKNOWN

@contents.setter
def contents(self, value):


+ 4
- 4
mwparserfromhell/parser/builder.py Ver ficheiro

@@ -202,7 +202,7 @@ class Builder(object):

def _handle_tag(self, token):
"""Handle a case where a tag is at the head of the tokens."""
type_, showtag = token.type, token.showtag
showtag = token.showtag
attrs = []
self._push()
while self._tokens:
@@ -215,14 +215,14 @@ class Builder(object):
self._push()
elif isinstance(token, tokens.TagCloseSelfclose):
tag = self._pop()
return Tag(type_, tag, attrs=attrs, showtag=showtag,
return Tag(tag, attrs=attrs, showtag=showtag,
self_closing=True, padding=token.padding)
elif isinstance(token, tokens.TagOpenClose):
contents = self._pop()
self._push()
elif isinstance(token, tokens.TagCloseClose):
return Tag(type_, tag, contents, attrs, showtag, False,
padding, self._pop())
return Tag(tag, contents, attrs, showtag, False, padding,
self._pop())
else:
self._write(self._handle_token(token))



+ 7
- 14
mwparserfromhell/parser/tokenizer.py Ver ficheiro

@@ -27,7 +27,7 @@ import re
from . import contexts
from . import tokens
from ..compat import htmlentities
from ..nodes.tag import Tag
from ..tag_defs import is_parsable

__all__ = ["Tokenizer"]

@@ -416,8 +416,8 @@ class Tokenizer(object):
else:
self._write_all(tokens)

def _get_tag_type_from_stack(self, stack=None):
"""Return the tag type based on the text in *stack*.
def _get_tag_from_stack(self, stack=None):
"""Return the tag based on the text in *stack*.

If *stack* is ``None``, we will use the current, topmost one.
"""
@@ -427,11 +427,7 @@ class Tokenizer(object):
if not stack:
self._fail_route() # Tag has an empty name?
text = [tok for tok in stack if isinstance(tok, tokens.Text)]
text = "".join([token.text for token in text]).rstrip().lower()
try:
return Tag.TRANSLATIONS[text]
except KeyError:
return Tag.TAG_UNKNOWN
return "".join([token.text for token in text]).rstrip().lower()

def _actually_close_tag_opening(self):
"""Handle cleanup at the end of an opening tag.
@@ -447,8 +443,7 @@ class Tokenizer(object):
if self._context & contexts.TAG_OPEN_ATTR_BODY:
self._context ^= contexts.TAG_OPEN_ATTR_BODY
else:
tag = self._get_tag_type_from_stack()
self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))
self._write_first(tokens.TagOpenOpen(showtag=True))
self._context ^= contexts.TAG_OPEN_NAME
self._context |= contexts.TAG_BODY

@@ -509,8 +504,7 @@ class Tokenizer(object):
is_quoted = False
if self._context & contexts.TAG_OPEN_NAME:
self._write_text(chunks.pop(0))
tag = self._get_tag_type_from_stack()
self._write_first(tokens.TagOpenOpen(type=tag, showtag=True))
self._write_first(tokens.TagOpenOpen(showtag=True))
self._context ^= contexts.TAG_OPEN_NAME
self._context |= contexts.TAG_OPEN_ATTR_NAME
self._actually_handle_chunk(chunks, True)
@@ -584,8 +578,7 @@ class Tokenizer(object):
def _handle_tag_close_close(self):
"""Handle the ending of a closing tag (``</foo>``)."""
closing = self._pop()
tag = self._get_tag_type_from_stack(closing)
if tag != self._stack[0].type:
if self._get_tag_from_stack(closing) != self._get_tag_from_stack():
# Closing and opening tags are not the same, so fail this route:
self._fail_route()
self._write_all(closing)


+ 36
- 87
mwparserfromhell/tag_defs.py Ver ficheiro

@@ -20,99 +20,48 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import unicode_literals
"""Contains data regarding certain HTML tags."""

class TagDefinitions(object):
"""Contains numerical definitions for valid HTML (and wikicode) tags.
from __future__ import unicode_literals

Base class for :py:class:`~.Tag` objects.
"""
__all__ = ["get_wikicode", "is_parsable", "is_visible"]

TAG_UNKNOWN = 0
# Lowercase tag names for which is_parsable() returns False, i.e. tags whose
# contents should not be passed to the parser.
PARSER_BLACKLIST = [
# enwiki extensions @ 2013-06-28
"categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
"nowiki", "pre", "score", "section", "source", "syntaxhighlight",
"templatedata", "timeline"
]

# Basic HTML:
TAG_ITALIC = 1
TAG_BOLD = 2
TAG_UNDERLINE = 3
TAG_STRIKETHROUGH = 4
TAG_UNORDERED_LIST = 5
TAG_ORDERED_LIST = 6
TAG_DEF_TERM = 7
TAG_DEF_ITEM = 8
TAG_BLOCKQUOTE = 9
TAG_RULE = 10
TAG_BREAK = 11
TAG_ABBR = 12
TAG_PRE = 13
TAG_MONOSPACE = 14
TAG_CODE = 15
TAG_SPAN = 16
TAG_DIV = 17
TAG_FONT = 18
TAG_SMALL = 19
TAG_BIG = 20
TAG_CENTER = 21
# Lowercase tag names for which is_visible() returns False, i.e. tags that are
# treated as containing no visible text.
INVISIBLE_TAGS = [
# enwiki extensions @ 2013-06-28
"categorytree", "gallery", "imagemap", "inputbox", "math", "score",
"section", "templatedata", "timeline"
]

# MediaWiki parser hooks:
TAG_REF = 101
TAG_GALLERY = 102
TAG_MATH = 103
TAG_NOWIKI = 104
TAG_NOINCLUDE = 105
TAG_INCLUDEONLY = 106
TAG_ONLYINCLUDE = 107
# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762
# NOTE(review): presumably SINGLE_ONLY lists tags that never take a closing
# tag, and SINGLE extends it with tags whose closing tag is optional -- confirm
# against Sanitizer.php. Neither list is referenced by the code visible here.
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]

# Additional parser hooks:
TAG_SYNTAXHIGHLIGHT = 201
TAG_POEM = 202
# Maps a lowercase tag name to its wikicode equivalent. Each entry is a dict
# whose "open" value is the markup placed before the tag and whose optional
# "close" value is the markup placed after it. Read by get_wikicode(), which
# yields None for whichever key an entry leaves out.
WIKICODE = {
"i": {"open": "''", "close": "''"},
"b": {"open": "'''", "close": "'''"},
"ul": {"open": "*"},
"ol": {"open": "#"},
"dt": {"open": ";"},
"dd": {"open": ":"},
"hr": {"open": "----"},
}

# Lists of tags:
TAGS_ALL = set(range(300))
TAGS_INVISIBLE = {TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE}
TAGS_VISIBLE = TAGS_ALL - TAGS_INVISIBLE
def get_wikicode(tag):
    """Look up the wikicode markup that goes before and after *tag*.

    The lookup is case-insensitive: *tag* is lowercased before it is used as
    a key into ``WIKICODE``.

    Returns an ``(open, close)`` tuple. Either element is ``None`` when the
    matching ``WIKICODE`` entry does not define that key.

    Raises ``KeyError`` if the lowercased *tag* has no ``WIKICODE`` entry.
    """
    markup = WIKICODE[tag.lower()]
    opening = markup.get("open")
    closing = markup.get("close")
    return (opening, closing)

TRANSLATIONS = {
"i": TAG_ITALIC,
"em": TAG_ITALIC,
"b": TAG_BOLD,
"strong": TAG_BOLD,
"u": TAG_UNDERLINE,
"s": TAG_STRIKETHROUGH,
"ul": TAG_UNORDERED_LIST,
"ol": TAG_ORDERED_LIST,
"dt": TAG_DEF_TERM,
"dd": TAG_DEF_ITEM,
"blockquote": TAG_BLOCKQUOTE,
"hl": TAG_RULE,
"br": TAG_BREAK,
"abbr": TAG_ABBR,
"pre": TAG_PRE,
"tt": TAG_MONOSPACE,
"code": TAG_CODE,
"span": TAG_SPAN,
"div": TAG_DIV,
"font": TAG_FONT,
"small": TAG_SMALL,
"big": TAG_BIG,
"center": TAG_CENTER,
"ref": TAG_REF,
"gallery": TAG_GALLERY,
"math": TAG_MATH,
"nowiki": TAG_NOWIKI,
"noinclude": TAG_NOINCLUDE,
"includeonly": TAG_INCLUDEONLY,
"onlyinclude": TAG_ONLYINCLUDE,
"syntaxhighlight": TAG_SYNTAXHIGHLIGHT,
"source": TAG_SYNTAXHIGHLIGHT,
"poem": TAG_POEM,
}
def is_parsable(tag):
    """Tell whether the contents of *tag* should be passed to the parser.

    The check is case-insensitive. A tag is parsable unless its lowercased
    name is listed in ``PARSER_BLACKLIST``.
    """
    blacklisted = tag.lower() in PARSER_BLACKLIST
    return not blacklisted

WIKICODE = {
TAG_ITALIC: ("''", "''"),
TAG_BOLD: ("'''", "'''"),
TAG_UNORDERED_LIST: ("*", ""),
TAG_ORDERED_LIST: ("#", ""),
TAG_DEF_TERM: (";", ""),
TAG_DEF_ITEM: (":", ""),
TAG_RULE: ("----", ""),
}
def is_visible(tag):
    """Tell whether the given *tag* contains visible text.

    The check is case-insensitive. A tag is visible unless its lowercased
    name is listed in ``INVISIBLE_TAGS``.
    """
    hidden = tag.lower() in INVISIBLE_TAGS
    return not hidden

+ 2
- 0
mwparserfromhell/utils.py Ver ficheiro

@@ -31,6 +31,8 @@ from .compat import bytes, str
from .nodes import Node
from .smart_list import SmartList

__all__ = ["parse_anything"]

def parse_anything(value):
"""Return a :py:class:`~.Wikicode` for *value*, allowing multiple types.



+ 4
- 5
tests/test_builder.py Ver ficheiro

@@ -193,11 +193,10 @@ class TestBuilder(TreeEqualityTestCase):
def test_tag(self):
"""tests for building Tag nodes"""
tests = [
([tokens.TagOpenOpen(showtag=True, type=101),
tokens.Text(text="ref"), tokens.TagCloseOpen(padding=""),
tokens.TagOpenClose(), tokens.Text(text="ref"),
tokens.TagCloseClose()],
wrap([Tag(101, wraptext("ref"), wrap([]), [], True, False, "",
([tokens.TagOpenOpen(showtag=True), tokens.Text(text="ref"),
tokens.TagCloseOpen(padding=""), tokens.TagOpenClose(),
tokens.Text(text="ref"), tokens.TagCloseClose()],
wrap([Tag(wraptext("ref"), wrap([]), [], True, False, "",
wraptext("ref"))])),
]
for test, valid in tests:


+ 14
- 14
tests/tokenizer/tags.mwtest Ver ficheiro

@@ -1,98 +1,98 @@
name: basic
label: a basic tag with an open and close
input: "<ref></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: basic_selfclosing
label: a basic self-closing tag
input: "<ref/>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding="")]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding="")]

---

name: content
label: a tag with some content in the middle
input: "<ref>this is a reference</ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: padded_open
label: a tag with some padding in the open tag
input: "<ref ></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: padded_close
label: a tag with some padding in the close tag
input: "<ref></ref >"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()]

---

name: padded_selfclosing
label: a self-closing tag with padding
input: "<ref />"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagCloseSelfclose(padding=" ")]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagCloseSelfclose(padding=" ")]

---

name: attribute
label: a tag with a single attribute
input: "<ref name></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_value
label: a tag with a single attribute with a value
input: "<ref name=foo></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_quoted
label: a tag with a single quoted attribute
input: "<ref name="foo"></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_hyphen
label: a tag with a single attribute, containing a hyphen
input: "<ref name=foo-bar></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_quoted_hyphen
label: a tag with a single quoted attribute, containing a hyphen
input: "<ref name="foo-bar"></ref>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_selfclosing
label: a self-closing tag with a single attribute
input: "<ref name/>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagCloseSelfclose(padding="")]

---

name: attribute_selfclosing_value
label: a self-closing tag with a single attribute with a value
input: "<ref name=foo/>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")]

---

name: attribute_selfclosing_value_quoted
label: a self-closing tag with a single quoted attribute
input: "<ref name="foo"/>"
output: [TagOpenOpen(showtag=True, type=101), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")]
output: [TagOpenOpen(showtag=True), Text(text="ref"), TagAttrStart(padding=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")]

---



Carregando…
Cancelar
Guardar