@@ -46,6 +46,7 @@ nodes Package | |||||
.. automodule:: mwparserfromhell.nodes.tag | .. automodule:: mwparserfromhell.nodes.tag | ||||
:members: | :members: | ||||
:undoc-members: | |||||
:show-inheritance: | :show-inheritance: | ||||
:mod:`template` Module | :mod:`template` Module | ||||
@@ -30,6 +30,12 @@ mwparserfromhell Package | |||||
:members: | :members: | ||||
:undoc-members: | :undoc-members: | ||||
:mod:`tag_defs` Module | |||||
---------------------- | |||||
.. automodule:: mwparserfromhell.tag_defs | |||||
:members: | |||||
:mod:`utils` Module | :mod:`utils` Module | ||||
------------------- | ------------------- | ||||
@@ -36,18 +36,23 @@ class Attribute(StringMixIn): | |||||
whose value is ``"foo"``. | whose value is ``"foo"``. | ||||
""" | """ | ||||
def __init__(self, name, value=None, quoted=True): | |||||
def __init__(self, name, value=None, quoted=True, pad_first="", | |||||
pad_before_eq="", pad_after_eq=""): | |||||
super(Attribute, self).__init__() | super(Attribute, self).__init__() | ||||
self._name = name | self._name = name | ||||
self._value = value | self._value = value | ||||
self._quoted = quoted | self._quoted = quoted | ||||
self._pad_first = pad_first | |||||
self._pad_before_eq = pad_before_eq | |||||
self._pad_after_eq = pad_after_eq | |||||
def __unicode__(self): | def __unicode__(self): | ||||
base = self.pad_first + str(self.name) + self.pad_before_eq | |||||
if self.value: | if self.value: | ||||
if self.quoted: | if self.quoted: | ||||
return str(self.name) + '="' + str(self.value) + '"' | |||||
return str(self.name) + "=" + str(self.value) | |||||
return str(self.name) | |||||
return base + '="' + self.pad_after_eq + str(self.value) + '"' | |||||
return base + "=" + self.pad_after_eq + str(self.value) | |||||
return base | |||||
@property | @property | ||||
def name(self): | def name(self): | ||||
@@ -64,14 +69,41 @@ class Attribute(StringMixIn): | |||||
"""Whether the attribute's value is quoted with double quotes.""" | """Whether the attribute's value is quoted with double quotes.""" | ||||
return self._quoted | return self._quoted | ||||
@property | |||||
def pad_first(self): | |||||
"""Spacing to insert right before the attribute.""" | |||||
return self._pad_first | |||||
@property | |||||
def pad_before_eq(self): | |||||
"""Spacing to insert right before the equal sign.""" | |||||
return self._pad_before_eq | |||||
@property | |||||
def pad_after_eq(self): | |||||
"""Spacing to insert right after the equal sign.""" | |||||
return self._pad_after_eq | |||||
@name.setter | @name.setter | ||||
def name(self, newval): | |||||
self._name = parse_anything(newval) | |||||
def name(self, value): | |||||
self._name = parse_anything(value) | |||||
@value.setter | @value.setter | ||||
def value(self, newval): | def value(self, newval): | ||||
self._value = parse_anything(newval) | self._value = parse_anything(newval) | ||||
@quoted.setter | @quoted.setter | ||||
def quoted(self, newval): | |||||
self._quoted = bool(newval) | |||||
def quoted(self, value): | |||||
self._quoted = bool(value) | |||||
@pad_first.setter | |||||
def pad_first(self, value): | |||||
self._pad_first = str(value) | |||||
@pad_before_eq.setter | |||||
def pad_before_eq(self, value): | |||||
self._pad_before_eq = str(value) | |||||
@pad_after_eq.setter | |||||
def pad_after_eq(self, value): | |||||
self._pad_after_eq = str(value) |
@@ -24,6 +24,7 @@ from __future__ import unicode_literals | |||||
from . import Node, Text | from . import Node, Text | ||||
from ..compat import str | from ..compat import str | ||||
from ..tag_defs import get_wikicode, is_visible | |||||
from ..utils import parse_anything | from ..utils import parse_anything | ||||
__all__ = ["Tag"] | __all__ = ["Tag"] | ||||
@@ -31,79 +32,39 @@ __all__ = ["Tag"] | |||||
class Tag(Node): | class Tag(Node): | ||||
"""Represents an HTML-style tag in wikicode, like ``<ref>``.""" | """Represents an HTML-style tag in wikicode, like ``<ref>``.""" | ||||
TAG_UNKNOWN = 0 | |||||
# Basic HTML: | |||||
TAG_ITALIC = 1 | |||||
TAG_BOLD = 2 | |||||
TAG_UNDERLINE = 3 | |||||
TAG_STRIKETHROUGH = 4 | |||||
TAG_UNORDERED_LIST = 5 | |||||
TAG_ORDERED_LIST = 6 | |||||
TAG_DEF_TERM = 7 | |||||
TAG_DEF_ITEM = 8 | |||||
TAG_BLOCKQUOTE = 9 | |||||
TAG_RULE = 10 | |||||
TAG_BREAK = 11 | |||||
TAG_ABBR = 12 | |||||
TAG_PRE = 13 | |||||
TAG_MONOSPACE = 14 | |||||
TAG_CODE = 15 | |||||
TAG_SPAN = 16 | |||||
TAG_DIV = 17 | |||||
TAG_FONT = 18 | |||||
TAG_SMALL = 19 | |||||
TAG_BIG = 20 | |||||
TAG_CENTER = 21 | |||||
# MediaWiki parser hooks: | |||||
TAG_REF = 101 | |||||
TAG_GALLERY = 102 | |||||
TAG_MATH = 103 | |||||
TAG_NOWIKI = 104 | |||||
TAG_NOINCLUDE = 105 | |||||
TAG_INCLUDEONLY = 106 | |||||
TAG_ONLYINCLUDE = 107 | |||||
# Additional parser hooks: | |||||
TAG_SYNTAXHIGHLIGHT = 201 | |||||
TAG_POEM = 202 | |||||
# Lists of tags: | |||||
TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE)) | |||||
TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE | |||||
def __init__(self, type_, tag, contents=None, attrs=None, showtag=True, | |||||
self_closing=False, open_padding=0, close_padding=0): | |||||
def __init__(self, tag, contents=None, attrs=None, showtag=True, | |||||
self_closing=False, invalid=False, implicit=False, padding="", | |||||
closing_tag=None): | |||||
super(Tag, self).__init__() | super(Tag, self).__init__() | ||||
self._type = type_ | |||||
self._tag = tag | self._tag = tag | ||||
self._contents = contents | self._contents = contents | ||||
if attrs: | |||||
self._attrs = attrs | |||||
else: | |||||
self._attrs = [] | |||||
self._attrs = attrs if attrs else [] | |||||
self._showtag = showtag | self._showtag = showtag | ||||
self._self_closing = self_closing | self._self_closing = self_closing | ||||
self._open_padding = open_padding | |||||
self._close_padding = close_padding | |||||
self._invalid = invalid | |||||
self._implicit = implicit | |||||
self._padding = padding | |||||
if closing_tag: | |||||
self._closing_tag = closing_tag | |||||
elif not self_closing: | |||||
self._closing_tag = tag | |||||
def __unicode__(self): | def __unicode__(self): | ||||
if not self.showtag: | if not self.showtag: | ||||
open_, close = self._translate() | |||||
open_, close = get_wikicode[self.tag] | |||||
if self.self_closing: | if self.self_closing: | ||||
return open_ | return open_ | ||||
else: | else: | ||||
return open_ + str(self.contents) + close | return open_ + str(self.contents) + close | ||||
result = "<" + str(self.tag) | |||||
if self.attrs: | |||||
result += " " + " ".join([str(attr) for attr in self.attrs]) | |||||
result = ("</" if self.invalid else "<") + str(self.tag) | |||||
if self.attributes: | |||||
result += "".join([str(attr) for attr in self.attributes]) | |||||
if self.self_closing: | if self.self_closing: | ||||
result += " " * self.open_padding + "/>" | |||||
result += self.padding + (">" if self.implicit else "/>") | |||||
else: | else: | ||||
result += " " * self.open_padding + ">" + str(self.contents) | |||||
result += "</" + str(self.tag) + " " * self.close_padding + ">" | |||||
result += self.padding + ">" + str(self.contents) | |||||
result += "</" + str(self.closing_tag) + ">" | |||||
return result | return result | ||||
def __iternodes__(self, getter): | def __iternodes__(self, getter): | ||||
@@ -111,66 +72,43 @@ class Tag(Node): | |||||
if self.showtag: | if self.showtag: | ||||
for child in getter(self.tag): | for child in getter(self.tag): | ||||
yield self.tag, child | yield self.tag, child | ||||
for attr in self.attrs: | |||||
for attr in self.attributes: | |||||
for child in getter(attr.name): | for child in getter(attr.name): | ||||
yield attr.name, child | yield attr.name, child | ||||
if attr.value: | if attr.value: | ||||
for child in getter(attr.value): | for child in getter(attr.value): | ||||
yield attr.value, child | yield attr.value, child | ||||
for child in getter(self.contents): | |||||
yield self.contents, child | |||||
if self.contents: | |||||
for child in getter(self.contents): | |||||
yield self.contents, child | |||||
if not self.self_closing and self.closing_tag: | |||||
for child in getter(self.closing_tag): | |||||
yield self.closing_tag, child | |||||
def __strip__(self, normalize, collapse): | def __strip__(self, normalize, collapse): | ||||
if self.type in self.TAGS_VISIBLE: | |||||
if is_visible(self.tag): | |||||
return self.contents.strip_code(normalize, collapse) | return self.contents.strip_code(normalize, collapse) | ||||
return None | return None | ||||
def __showtree__(self, write, get, mark): | def __showtree__(self, write, get, mark): | ||||
tagnodes = self.tag.nodes | |||||
if (not self.attrs and len(tagnodes) == 1 and isinstance(tagnodes[0], Text)): | |||||
write("<" + str(tagnodes[0]) + ">") | |||||
write("</" if self.invalid else "<") | |||||
get(self.tag) | |||||
for attr in self.attributes: | |||||
get(attr.name) | |||||
if not attr.value: | |||||
continue | |||||
write(" = ") | |||||
mark() | |||||
get(attr.value) | |||||
if self.self_closing: | |||||
write(">" if self.implicit else "/>") | |||||
else: | else: | ||||
write("<") | |||||
get(self.tag) | |||||
for attr in self.attrs: | |||||
get(attr.name) | |||||
if not attr.value: | |||||
continue | |||||
write(" = ") | |||||
mark() | |||||
get(attr.value) | |||||
write(">") | write(">") | ||||
get(self.contents) | |||||
if len(tagnodes) == 1 and isinstance(tagnodes[0], Text): | |||||
write("</" + str(tagnodes[0]) + ">") | |||||
else: | |||||
get(self.contents) | |||||
write("</") | write("</") | ||||
get(self.tag) | |||||
get(self.closing_tag) | |||||
write(">") | write(">") | ||||
def _translate(self): | |||||
"""If the HTML-style tag has a wikicode representation, return that. | |||||
For example, ``<b>Foo</b>`` can be represented as ``'''Foo'''``. This | |||||
returns a tuple of the character starting the sequence and the | |||||
character ending it. | |||||
""" | |||||
translations = { | |||||
self.TAG_ITALIC: ("''", "''"), | |||||
self.TAG_BOLD: ("'''", "'''"), | |||||
self.TAG_UNORDERED_LIST: ("*", ""), | |||||
self.TAG_ORDERED_LIST: ("#", ""), | |||||
self.TAG_DEF_TERM: (";", ""), | |||||
self.TAG_DEF_ITEM: (":", ""), | |||||
self.TAG_RULE: ("----", ""), | |||||
} | |||||
return translations[self.type] | |||||
@property | |||||
def type(self): | |||||
"""The tag type.""" | |||||
return self._type | |||||
@property | @property | ||||
def tag(self): | def tag(self): | ||||
"""The tag itself, as a :py:class:`~.Wikicode` object.""" | """The tag itself, as a :py:class:`~.Wikicode` object.""" | ||||
@@ -182,7 +120,7 @@ class Tag(Node): | |||||
return self._contents | return self._contents | ||||
@property | @property | ||||
def attrs(self): | |||||
def attributes(self): | |||||
"""The list of attributes affecting the tag. | """The list of attributes affecting the tag. | ||||
Each attribute is an instance of :py:class:`~.Attribute`. | Each attribute is an instance of :py:class:`~.Attribute`. | ||||
@@ -196,29 +134,47 @@ class Tag(Node): | |||||
@property | @property | ||||
def self_closing(self): | def self_closing(self): | ||||
"""Whether the tag is self-closing with no content.""" | |||||
"""Whether the tag is self-closing with no content (like ``<br/>``).""" | |||||
return self._self_closing | return self._self_closing | ||||
@property | @property | ||||
def open_padding(self): | |||||
"""How much spacing to insert before the first closing >.""" | |||||
return self._open_padding | |||||
def invalid(self): | |||||
"""Whether the tag starts with a backslash after the opening bracket. | |||||
This makes the tag look like a lone close tag. It is technically | |||||
invalid and is only parsable Wikicode when the tag itself is | |||||
single-only, like ``<br>`` and ``<img>``. See | |||||
:py:func:`tag_defs.is_single_only`. | |||||
""" | |||||
return self._invalid | |||||
@property | @property | ||||
def close_padding(self): | |||||
"""How much spacing to insert before the last closing >.""" | |||||
return self._close_padding | |||||
def implicit(self): | |||||
"""Whether the tag is implicitly self-closing, with no ending slash. | |||||
@type.setter | |||||
def type(self, value): | |||||
value = int(value) | |||||
if value not in self.TAGS_INVISIBLE | self.TAGS_VISIBLE: | |||||
raise ValueError(value) | |||||
self._type = value | |||||
This is only possible for specific "single" tags like ``<br>`` and | |||||
``<li>``. See :py:func:`tag_defs.is_single`. This field only has an | |||||
effect if :py:attr:`self_closing` is also ``True``. | |||||
""" | |||||
return self._implicit | |||||
@property | |||||
def padding(self): | |||||
"""Spacing to insert before the first closing ``>``.""" | |||||
return self._padding | |||||
@property | |||||
def closing_tag(self): | |||||
"""The closing tag, as a :py:class:`~.Wikicode` object. | |||||
This will usually equal :py:attr:`tag`, unless there is additional | |||||
spacing, comments, or the like. | |||||
""" | |||||
return self._closing_tag | |||||
@tag.setter | @tag.setter | ||||
def tag(self, value): | def tag(self, value): | ||||
self._tag = parse_anything(value) | |||||
self._tag = self._closing_tag = parse_anything(value) | |||||
@contents.setter | @contents.setter | ||||
def contents(self, value): | def contents(self, value): | ||||
@@ -232,10 +188,18 @@ class Tag(Node): | |||||
def self_closing(self, value): | def self_closing(self, value): | ||||
self._self_closing = bool(value) | self._self_closing = bool(value) | ||||
@open_padding.setter | |||||
def open_padding(self, value): | |||||
self._open_padding = int(value) | |||||
@invalid.setter | |||||
def invalid(self, value): | |||||
self._invalid = bool(value) | |||||
@implicit.setter | |||||
def implicit(self, value): | |||||
self._implicit = bool(value) | |||||
@padding.setter | |||||
def padding(self, value): | |||||
self._padding = str(value) | |||||
@close_padding.setter | |||||
def close_padding(self, value): | |||||
self._close_padding = int(value) | |||||
@closing_tag.setter | |||||
def closing_tag(self, value): | |||||
self._closing_tag = parse_anything(value) |
@@ -170,7 +170,7 @@ class Builder(object): | |||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
def _handle_comment(self): | def _handle_comment(self): | ||||
"""Handle a case where a hidden comment is at the head of the tokens.""" | |||||
"""Handle a case where an HTML comment is at the head of the tokens.""" | |||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
token = self._tokens.pop() | token = self._tokens.pop() | ||||
@@ -180,7 +180,7 @@ class Builder(object): | |||||
else: | else: | ||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
def _handle_attribute(self): | |||||
def _handle_attribute(self, start): | |||||
"""Handle a case where a tag attribute is at the head of the tokens.""" | """Handle a case where a tag attribute is at the head of the tokens.""" | ||||
name, quoted = None, False | name, quoted = None, False | ||||
self._push() | self._push() | ||||
@@ -191,37 +191,47 @@ class Builder(object): | |||||
self._push() | self._push() | ||||
elif isinstance(token, tokens.TagAttrQuote): | elif isinstance(token, tokens.TagAttrQuote): | ||||
quoted = True | quoted = True | ||||
elif isinstance(token, (tokens.TagAttrStart, | |||||
tokens.TagCloseOpen)): | |||||
elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen, | |||||
tokens.TagCloseSelfclose)): | |||||
self._tokens.append(token) | self._tokens.append(token) | ||||
if name is not None: | |||||
return Attribute(name, self._pop(), quoted) | |||||
return Attribute(self._pop(), quoted=quoted) | |||||
if name: | |||||
value = self._pop() | |||||
else: | |||||
name, value = self._pop(), None | |||||
return Attribute(name, value, quoted, start.pad_first, | |||||
start.pad_before_eq, start.pad_after_eq) | |||||
else: | else: | ||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
def _handle_tag(self, token): | def _handle_tag(self, token): | ||||
"""Handle a case where a tag is at the head of the tokens.""" | """Handle a case where a tag is at the head of the tokens.""" | ||||
type_, showtag = token.type, token.showtag | |||||
attrs = [] | |||||
close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose) | |||||
implicit, attrs, contents, closing_tag = False, [], None, None | |||||
showtag = token.get("showtag", True) | |||||
invalid = token.get("invalid", False) | |||||
self._push() | self._push() | ||||
while self._tokens: | while self._tokens: | ||||
token = self._tokens.pop() | token = self._tokens.pop() | ||||
if isinstance(token, tokens.TagAttrStart): | if isinstance(token, tokens.TagAttrStart): | ||||
attrs.append(self._handle_attribute()) | |||||
attrs.append(self._handle_attribute(token)) | |||||
elif isinstance(token, tokens.TagCloseOpen): | elif isinstance(token, tokens.TagCloseOpen): | ||||
open_pad = token.padding | |||||
padding = token.padding | |||||
tag = self._pop() | tag = self._pop() | ||||
self._push() | self._push() | ||||
elif isinstance(token, tokens.TagCloseSelfclose): | |||||
tag = self._pop() | |||||
return Tag(type_, tag, attrs=attrs, showtag=showtag, | |||||
self_closing=True, open_padding=token.padding) | |||||
elif isinstance(token, tokens.TagOpenClose): | elif isinstance(token, tokens.TagOpenClose): | ||||
contents = self._pop() | contents = self._pop() | ||||
elif isinstance(token, tokens.TagCloseClose): | |||||
return Tag(type_, tag, contents, attrs, showtag, False, | |||||
open_pad, token.padding) | |||||
self._push() | |||||
elif isinstance(token, close_tokens): | |||||
if isinstance(token, tokens.TagCloseSelfclose): | |||||
tag = self._pop() | |||||
self_closing = True | |||||
padding = token.padding | |||||
implicit = token.get("implicit", False) | |||||
else: | |||||
self_closing = False | |||||
closing_tag = self._pop() | |||||
return Tag(tag, contents, attrs, showtag, self_closing, | |||||
invalid, implicit, padding, closing_tag) | |||||
else: | else: | ||||
self._write(self._handle_token(token)) | self._write(self._handle_token(token)) | ||||
@@ -62,6 +62,13 @@ Local (stack-specific) contexts: | |||||
* :py:const:`COMMENT` | * :py:const:`COMMENT` | ||||
* :py:const:`TAG` | |||||
* :py:const:`TAG_OPEN` | |||||
* :py:const:`TAG_ATTR` | |||||
* :py:const:`TAG_BODY` | |||||
* :py:const:`TAG_CLOSE` | |||||
* :py:const:`SAFETY_CHECK` | * :py:const:`SAFETY_CHECK` | ||||
* :py:const:`HAS_TEXT` | * :py:const:`HAS_TEXT` | ||||
@@ -78,37 +85,45 @@ Global contexts: | |||||
# Local contexts: | # Local contexts: | ||||
TEMPLATE = 0b00000000000000000111 | |||||
TEMPLATE_NAME = 0b00000000000000000001 | |||||
TEMPLATE_PARAM_KEY = 0b00000000000000000010 | |||||
TEMPLATE_PARAM_VALUE = 0b00000000000000000100 | |||||
ARGUMENT = 0b00000000000000011000 | |||||
ARGUMENT_NAME = 0b00000000000000001000 | |||||
ARGUMENT_DEFAULT = 0b00000000000000010000 | |||||
WIKILINK = 0b00000000000001100000 | |||||
WIKILINK_TITLE = 0b00000000000000100000 | |||||
WIKILINK_TEXT = 0b00000000000001000000 | |||||
HEADING = 0b00000001111110000000 | |||||
HEADING_LEVEL_1 = 0b00000000000010000000 | |||||
HEADING_LEVEL_2 = 0b00000000000100000000 | |||||
HEADING_LEVEL_3 = 0b00000000001000000000 | |||||
HEADING_LEVEL_4 = 0b00000000010000000000 | |||||
HEADING_LEVEL_5 = 0b00000000100000000000 | |||||
HEADING_LEVEL_6 = 0b00000001000000000000 | |||||
COMMENT = 0b00000010000000000000 | |||||
SAFETY_CHECK = 0b11111100000000000000 | |||||
HAS_TEXT = 0b00000100000000000000 | |||||
FAIL_ON_TEXT = 0b00001000000000000000 | |||||
FAIL_NEXT = 0b00010000000000000000 | |||||
FAIL_ON_LBRACE = 0b00100000000000000000 | |||||
FAIL_ON_RBRACE = 0b01000000000000000000 | |||||
FAIL_ON_EQUALS = 0b10000000000000000000 | |||||
TEMPLATE_NAME = 1 << 0 | |||||
TEMPLATE_PARAM_KEY = 1 << 1 | |||||
TEMPLATE_PARAM_VALUE = 1 << 2 | |||||
TEMPLATE = TEMPLATE_NAME + TEMPLATE_PARAM_KEY + TEMPLATE_PARAM_VALUE | |||||
ARGUMENT_NAME = 1 << 3 | |||||
ARGUMENT_DEFAULT = 1 << 4 | |||||
ARGUMENT = ARGUMENT_NAME + ARGUMENT_DEFAULT | |||||
WIKILINK_TITLE = 1 << 5 | |||||
WIKILINK_TEXT = 1 << 6 | |||||
WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT | |||||
HEADING_LEVEL_1 = 1 << 7 | |||||
HEADING_LEVEL_2 = 1 << 8 | |||||
HEADING_LEVEL_3 = 1 << 9 | |||||
HEADING_LEVEL_4 = 1 << 10 | |||||
HEADING_LEVEL_5 = 1 << 11 | |||||
HEADING_LEVEL_6 = 1 << 12 | |||||
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 + | |||||
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6) | |||||
COMMENT = 1 << 13 | |||||
TAG_OPEN = 1 << 14 | |||||
TAG_ATTR = 1 << 15 | |||||
TAG_BODY = 1 << 16 | |||||
TAG_CLOSE = 1 << 17 | |||||
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | |||||
HAS_TEXT = 1 << 18 | |||||
FAIL_ON_TEXT = 1 << 19 | |||||
FAIL_NEXT = 1 << 20 | |||||
FAIL_ON_LBRACE = 1 << 21 | |||||
FAIL_ON_RBRACE = 1 << 22 | |||||
FAIL_ON_EQUALS = 1 << 23 | |||||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | |||||
FAIL_ON_RBRACE + FAIL_ON_EQUALS) | |||||
# Global contexts: | # Global contexts: | ||||
GL_HEADING = 0b1 | |||||
GL_HEADING = 1 << 0 |
@@ -41,10 +41,10 @@ SOFTWARE. | |||||
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" | #define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" | ||||
static const char* MARKERS[] = { | static const char* MARKERS[] = { | ||||
"{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-", | |||||
"!", "\n", ""}; | |||||
"{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-", | |||||
"\n", ""}; | |||||
#define NUM_MARKERS 18 | |||||
#define NUM_MARKERS 17 | |||||
#define TEXTBUFFER_BLOCKSIZE 1024 | #define TEXTBUFFER_BLOCKSIZE 1024 | ||||
#define MAX_DEPTH 40 | #define MAX_DEPTH 40 | ||||
#define MAX_CYCLES 100000 | #define MAX_CYCLES 100000 | ||||
@@ -60,10 +60,10 @@ static char** entitydefs; | |||||
static PyObject* EMPTY; | static PyObject* EMPTY; | ||||
static PyObject* NOARGS; | static PyObject* NOARGS; | ||||
static PyObject* tokens; | |||||
static PyObject* tag_defs; | |||||
/* Tokens */ | |||||
/* Tokens: */ | |||||
static PyObject* Text; | static PyObject* Text; | ||||
@@ -102,41 +102,58 @@ static PyObject* TagCloseClose; | |||||
/* Local contexts: */ | /* Local contexts: */ | ||||
#define LC_TEMPLATE 0x00007 | |||||
#define LC_TEMPLATE_NAME 0x00001 | |||||
#define LC_TEMPLATE_PARAM_KEY 0x00002 | |||||
#define LC_TEMPLATE_PARAM_VALUE 0x00004 | |||||
#define LC_ARGUMENT 0x00018 | |||||
#define LC_ARGUMENT_NAME 0x00008 | |||||
#define LC_ARGUMENT_DEFAULT 0x00010 | |||||
#define LC_WIKILINK 0x00060 | |||||
#define LC_WIKILINK_TITLE 0x00020 | |||||
#define LC_WIKILINK_TEXT 0x00040 | |||||
#define LC_HEADING 0x01F80 | |||||
#define LC_HEADING_LEVEL_1 0x00080 | |||||
#define LC_HEADING_LEVEL_2 0x00100 | |||||
#define LC_HEADING_LEVEL_3 0x00200 | |||||
#define LC_HEADING_LEVEL_4 0x00400 | |||||
#define LC_HEADING_LEVEL_5 0x00800 | |||||
#define LC_HEADING_LEVEL_6 0x01000 | |||||
#define LC_COMMENT 0x02000 | |||||
#define LC_SAFETY_CHECK 0xFC000 | |||||
#define LC_HAS_TEXT 0x04000 | |||||
#define LC_FAIL_ON_TEXT 0x08000 | |||||
#define LC_FAIL_NEXT 0x10000 | |||||
#define LC_FAIL_ON_LBRACE 0x20000 | |||||
#define LC_FAIL_ON_RBRACE 0x40000 | |||||
#define LC_FAIL_ON_EQUALS 0x80000 | |||||
#define LC_TEMPLATE 0x000007 | |||||
#define LC_TEMPLATE_NAME 0x000001 | |||||
#define LC_TEMPLATE_PARAM_KEY 0x000002 | |||||
#define LC_TEMPLATE_PARAM_VALUE 0x000004 | |||||
#define LC_ARGUMENT 0x000018 | |||||
#define LC_ARGUMENT_NAME 0x000008 | |||||
#define LC_ARGUMENT_DEFAULT 0x000010 | |||||
#define LC_WIKILINK 0x000060 | |||||
#define LC_WIKILINK_TITLE 0x000020 | |||||
#define LC_WIKILINK_TEXT 0x000040 | |||||
#define LC_HEADING 0x001F80 | |||||
#define LC_HEADING_LEVEL_1 0x000080 | |||||
#define LC_HEADING_LEVEL_2 0x000100 | |||||
#define LC_HEADING_LEVEL_3 0x000200 | |||||
#define LC_HEADING_LEVEL_4 0x000400 | |||||
#define LC_HEADING_LEVEL_5 0x000800 | |||||
#define LC_HEADING_LEVEL_6 0x001000 | |||||
#define LC_COMMENT 0x002000 | |||||
#define LC_TAG 0x03C000 | |||||
#define LC_TAG_OPEN 0x004000 | |||||
#define LC_TAG_ATTR 0x008000 | |||||
#define LC_TAG_BODY 0x010000 | |||||
#define LC_TAG_CLOSE 0x020000 | |||||
#define LC_SAFETY_CHECK 0xFC0000 | |||||
#define LC_HAS_TEXT 0x040000 | |||||
#define LC_FAIL_ON_TEXT 0x080000 | |||||
#define LC_FAIL_NEXT 0x100000 | |||||
#define LC_FAIL_ON_LBRACE 0x200000 | |||||
#define LC_FAIL_ON_RBRACE 0x400000 | |||||
#define LC_FAIL_ON_EQUALS 0x800000 | |||||
/* Global contexts: */ | /* Global contexts: */ | ||||
#define GL_HEADING 0x1 | #define GL_HEADING 0x1 | ||||
/* Tag contexts: */ | |||||
#define TAG_NAME 0x01 | |||||
#define TAG_ATTR_READY 0x02 | |||||
#define TAG_ATTR_NAME 0x04 | |||||
#define TAG_ATTR_VALUE 0x08 | |||||
#define TAG_QUOTED 0x10 | |||||
#define TAG_NOTE_SPACE 0x20 | |||||
#define TAG_NOTE_EQUALS 0x40 | |||||
#define TAG_NOTE_QUOTE 0x80 | |||||
/* Miscellaneous structs: */ | /* Miscellaneous structs: */ | ||||
@@ -158,13 +175,24 @@ typedef struct { | |||||
int level; | int level; | ||||
} HeadingData; | } HeadingData; | ||||
typedef struct { | |||||
int context; | |||||
struct Textbuffer* pad_first; | |||||
struct Textbuffer* pad_before_eq; | |||||
struct Textbuffer* pad_after_eq; | |||||
Py_ssize_t reset; | |||||
} TagData; | |||||
typedef struct Textbuffer Textbuffer; | |||||
typedef struct Stack Stack; | |||||
/* Tokenizer object definition: */ | /* Tokenizer object definition: */ | ||||
typedef struct { | typedef struct { | ||||
PyObject_HEAD | PyObject_HEAD | ||||
PyObject* text; /* text to tokenize */ | PyObject* text; /* text to tokenize */ | ||||
struct Stack* topstack; /* topmost stack */ | |||||
Stack* topstack; /* topmost stack */ | |||||
Py_ssize_t head; /* current position in text */ | Py_ssize_t head; /* current position in text */ | ||||
Py_ssize_t length; /* length of text */ | Py_ssize_t length; /* length of text */ | ||||
int global; /* global context */ | int global; /* global context */ | ||||
@@ -176,49 +204,31 @@ typedef struct { | |||||
/* Macros for accessing Tokenizer data: */ | /* Macros for accessing Tokenizer data: */ | ||||
#define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta))) | #define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta))) | ||||
#define Tokenizer_READ_BACKWARDS(self, delta) \ | |||||
(*PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, delta))) | |||||
#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES) | #define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES) | ||||
/* Macros for accessing HTML tag definitions: */ | |||||
#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag)) | |||||
#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag)) | |||||
#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag)) | |||||
/* Function prototypes: */ | /* Function prototypes: */ | ||||
static int heading_level_from_context(int); | |||||
static Textbuffer* Textbuffer_new(void); | |||||
static void Textbuffer_dealloc(Textbuffer*); | |||||
static TagData* TagData_new(void); | |||||
static void TagData_dealloc(TagData*); | |||||
static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); | static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*); | ||||
static struct Textbuffer* Textbuffer_new(void); | |||||
static void Tokenizer_dealloc(Tokenizer*); | static void Tokenizer_dealloc(Tokenizer*); | ||||
static void Textbuffer_dealloc(struct Textbuffer*); | |||||
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*); | ||||
static int Tokenizer_push(Tokenizer*, int); | |||||
static PyObject* Textbuffer_render(struct Textbuffer*); | |||||
static int Tokenizer_push_textbuffer(Tokenizer*); | |||||
static void Tokenizer_delete_top_of_stack(Tokenizer*); | |||||
static PyObject* Tokenizer_pop(Tokenizer*); | |||||
static PyObject* Tokenizer_pop_keeping_context(Tokenizer*); | |||||
static void* Tokenizer_fail_route(Tokenizer*); | |||||
static int Tokenizer_write(Tokenizer*, PyObject*); | |||||
static int Tokenizer_write_first(Tokenizer*, PyObject*); | |||||
static int Tokenizer_write_text(Tokenizer*, Py_UNICODE); | |||||
static int Tokenizer_write_all(Tokenizer*, PyObject*); | |||||
static int Tokenizer_write_text_then_stack(Tokenizer*, const char*); | |||||
static PyObject* Tokenizer_read(Tokenizer*, Py_ssize_t); | |||||
static PyObject* Tokenizer_read_backwards(Tokenizer*, Py_ssize_t); | |||||
static int Tokenizer_parse_template_or_argument(Tokenizer*); | |||||
static int Tokenizer_parse_template(Tokenizer*); | |||||
static int Tokenizer_parse_argument(Tokenizer*); | |||||
static int Tokenizer_handle_template_param(Tokenizer*); | |||||
static int Tokenizer_handle_template_param_value(Tokenizer*); | |||||
static PyObject* Tokenizer_handle_template_end(Tokenizer*); | |||||
static int Tokenizer_handle_argument_separator(Tokenizer*); | |||||
static PyObject* Tokenizer_handle_argument_end(Tokenizer*); | |||||
static int Tokenizer_parse_wikilink(Tokenizer*); | |||||
static int Tokenizer_handle_wikilink_separator(Tokenizer*); | |||||
static PyObject* Tokenizer_handle_wikilink_end(Tokenizer*); | |||||
static int Tokenizer_parse_heading(Tokenizer*); | |||||
static HeadingData* Tokenizer_handle_heading_end(Tokenizer*); | |||||
static int Tokenizer_really_parse_entity(Tokenizer*); | |||||
static int Tokenizer_parse_entity(Tokenizer*); | |||||
static int Tokenizer_parse_comment(Tokenizer*); | |||||
static int Tokenizer_verify_safe(Tokenizer*, int, Py_UNICODE); | |||||
static PyObject* Tokenizer_parse(Tokenizer*, int); | |||||
static int Tokenizer_parse_tag(Tokenizer*); | |||||
static PyObject* Tokenizer_parse(Tokenizer*, int, int); | |||||
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*); | ||||
@@ -24,9 +24,9 @@ from __future__ import unicode_literals | |||||
from math import log | from math import log | ||||
import re | import re | ||||
from . import contexts | |||||
from . import tokens | |||||
from . import contexts, tokens | |||||
from ..compat import htmlentities | from ..compat import htmlentities | ||||
from ..tag_defs import is_parsable, is_single, is_single_only | |||||
__all__ = ["Tokenizer"] | __all__ = ["Tokenizer"] | ||||
@@ -35,16 +35,34 @@ class BadRoute(Exception): | |||||
pass | pass | ||||
class _TagOpenData(object): | |||||
"""Stores data about an HTML open tag, like ``<ref name="foo">``.""" | |||||
CX_NAME = 1 << 0 | |||||
CX_ATTR_READY = 1 << 1 | |||||
CX_ATTR_NAME = 1 << 2 | |||||
CX_ATTR_VALUE = 1 << 3 | |||||
CX_QUOTED = 1 << 4 | |||||
CX_NOTE_SPACE = 1 << 5 | |||||
CX_NOTE_EQUALS = 1 << 6 | |||||
CX_NOTE_QUOTE = 1 << 7 | |||||
def __init__(self): | |||||
self.context = self.CX_NAME | |||||
self.padding_buffer = {"first": "", "before_eq": "", "after_eq": ""} | |||||
self.reset = 0 | |||||
class Tokenizer(object): | class Tokenizer(object): | ||||
"""Creates a list of tokens from a string of wikicode.""" | """Creates a list of tokens from a string of wikicode.""" | ||||
USES_C = False | USES_C = False | ||||
START = object() | START = object() | ||||
END = object() | END = object() | ||||
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", | MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", | ||||
"/", "-", "!", "\n", END] | |||||
"/", "-", "\n", END] | |||||
MAX_DEPTH = 40 | MAX_DEPTH = 40 | ||||
MAX_CYCLES = 100000 | MAX_CYCLES = 100000 | ||||
regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE) | |||||
regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE) | |||||
tag_splitter = re.compile(r"([\s\"\\]+)") | |||||
def __init__(self): | def __init__(self): | ||||
self._text = None | self._text = None | ||||
@@ -117,33 +135,33 @@ class Tokenizer(object): | |||||
self._pop() | self._pop() | ||||
raise BadRoute() | raise BadRoute() | ||||
def _write(self, token): | |||||
def _emit(self, token): | |||||
"""Write a token to the end of the current token stack.""" | """Write a token to the end of the current token stack.""" | ||||
self._push_textbuffer() | self._push_textbuffer() | ||||
self._stack.append(token) | self._stack.append(token) | ||||
def _write_first(self, token): | |||||
def _emit_first(self, token): | |||||
"""Write a token to the beginning of the current token stack.""" | """Write a token to the beginning of the current token stack.""" | ||||
self._push_textbuffer() | self._push_textbuffer() | ||||
self._stack.insert(0, token) | self._stack.insert(0, token) | ||||
def _write_text(self, text): | |||||
def _emit_text(self, text): | |||||
"""Write text to the current textbuffer.""" | """Write text to the current textbuffer.""" | ||||
self._textbuffer.append(text) | self._textbuffer.append(text) | ||||
def _write_all(self, tokenlist): | |||||
def _emit_all(self, tokenlist): | |||||
"""Write a series of tokens to the current stack at once.""" | """Write a series of tokens to the current stack at once.""" | ||||
if tokenlist and isinstance(tokenlist[0], tokens.Text): | if tokenlist and isinstance(tokenlist[0], tokens.Text): | ||||
self._write_text(tokenlist.pop(0).text) | |||||
self._emit_text(tokenlist.pop(0).text) | |||||
self._push_textbuffer() | self._push_textbuffer() | ||||
self._stack.extend(tokenlist) | self._stack.extend(tokenlist) | ||||
def _write_text_then_stack(self, text): | |||||
def _emit_text_then_stack(self, text): | |||||
"""Pop the current stack, write *text*, and then write the stack.""" | """Pop the current stack, write *text*, and then write the stack.""" | ||||
stack = self._pop() | stack = self._pop() | ||||
self._write_text(text) | |||||
self._emit_text(text) | |||||
if stack: | if stack: | ||||
self._write_all(stack) | |||||
self._emit_all(stack) | |||||
self._head -= 1 | self._head -= 1 | ||||
def _read(self, delta=0, wrap=False, strict=False): | def _read(self, delta=0, wrap=False, strict=False): | ||||
@@ -168,6 +186,30 @@ class Tokenizer(object): | |||||
self._fail_route() | self._fail_route() | ||||
return self.END | return self.END | ||||
def _parse_template(self): | |||||
"""Parse a template at the head of the wikicode string.""" | |||||
reset = self._head | |||||
try: | |||||
template = self._parse(contexts.TEMPLATE_NAME) | |||||
except BadRoute: | |||||
self._head = reset | |||||
raise | |||||
self._emit_first(tokens.TemplateOpen()) | |||||
self._emit_all(template) | |||||
self._emit(tokens.TemplateClose()) | |||||
def _parse_argument(self): | |||||
"""Parse an argument at the head of the wikicode string.""" | |||||
reset = self._head | |||||
try: | |||||
argument = self._parse(contexts.ARGUMENT_NAME) | |||||
except BadRoute: | |||||
self._head = reset | |||||
raise | |||||
self._emit_first(tokens.ArgumentOpen()) | |||||
self._emit_all(argument) | |||||
self._emit(tokens.ArgumentClose()) | |||||
def _parse_template_or_argument(self): | def _parse_template_or_argument(self): | ||||
"""Parse a template or argument at the head of the wikicode string.""" | """Parse a template or argument at the head of the wikicode string.""" | ||||
self._head += 2 | self._head += 2 | ||||
@@ -179,12 +221,12 @@ class Tokenizer(object): | |||||
while braces: | while braces: | ||||
if braces == 1: | if braces == 1: | ||||
return self._write_text_then_stack("{") | |||||
return self._emit_text_then_stack("{") | |||||
if braces == 2: | if braces == 2: | ||||
try: | try: | ||||
self._parse_template() | self._parse_template() | ||||
except BadRoute: | except BadRoute: | ||||
return self._write_text_then_stack("{{") | |||||
return self._emit_text_then_stack("{{") | |||||
break | break | ||||
try: | try: | ||||
self._parse_argument() | self._parse_argument() | ||||
@@ -194,35 +236,13 @@ class Tokenizer(object): | |||||
self._parse_template() | self._parse_template() | ||||
braces -= 2 | braces -= 2 | ||||
except BadRoute: | except BadRoute: | ||||
return self._write_text_then_stack("{" * braces) | |||||
return self._emit_text_then_stack("{" * braces) | |||||
if braces: | if braces: | ||||
self._head += 1 | self._head += 1 | ||||
self._write_all(self._pop()) | |||||
def _parse_template(self): | |||||
"""Parse a template at the head of the wikicode string.""" | |||||
reset = self._head | |||||
try: | |||||
template = self._parse(contexts.TEMPLATE_NAME) | |||||
except BadRoute: | |||||
self._head = reset | |||||
raise | |||||
self._write_first(tokens.TemplateOpen()) | |||||
self._write_all(template) | |||||
self._write(tokens.TemplateClose()) | |||||
def _parse_argument(self): | |||||
"""Parse an argument at the head of the wikicode string.""" | |||||
reset = self._head | |||||
try: | |||||
argument = self._parse(contexts.ARGUMENT_NAME) | |||||
except BadRoute: | |||||
self._head = reset | |||||
raise | |||||
self._write_first(tokens.ArgumentOpen()) | |||||
self._write_all(argument) | |||||
self._write(tokens.ArgumentClose()) | |||||
self._emit_all(self._pop()) | |||||
if self._context & contexts.FAIL_NEXT: | |||||
self._context ^= contexts.FAIL_NEXT | |||||
def _handle_template_param(self): | def _handle_template_param(self): | ||||
"""Handle a template parameter at the head of the string.""" | """Handle a template parameter at the head of the string.""" | ||||
@@ -231,22 +251,22 @@ class Tokenizer(object): | |||||
elif self._context & contexts.TEMPLATE_PARAM_VALUE: | elif self._context & contexts.TEMPLATE_PARAM_VALUE: | ||||
self._context ^= contexts.TEMPLATE_PARAM_VALUE | self._context ^= contexts.TEMPLATE_PARAM_VALUE | ||||
elif self._context & contexts.TEMPLATE_PARAM_KEY: | elif self._context & contexts.TEMPLATE_PARAM_KEY: | ||||
self._write_all(self._pop(keep_context=True)) | |||||
self._emit_all(self._pop(keep_context=True)) | |||||
self._context |= contexts.TEMPLATE_PARAM_KEY | self._context |= contexts.TEMPLATE_PARAM_KEY | ||||
self._write(tokens.TemplateParamSeparator()) | |||||
self._emit(tokens.TemplateParamSeparator()) | |||||
self._push(self._context) | self._push(self._context) | ||||
def _handle_template_param_value(self): | def _handle_template_param_value(self): | ||||
"""Handle a template parameter's value at the head of the string.""" | """Handle a template parameter's value at the head of the string.""" | ||||
self._write_all(self._pop(keep_context=True)) | |||||
self._emit_all(self._pop(keep_context=True)) | |||||
self._context ^= contexts.TEMPLATE_PARAM_KEY | self._context ^= contexts.TEMPLATE_PARAM_KEY | ||||
self._context |= contexts.TEMPLATE_PARAM_VALUE | self._context |= contexts.TEMPLATE_PARAM_VALUE | ||||
self._write(tokens.TemplateParamEquals()) | |||||
self._emit(tokens.TemplateParamEquals()) | |||||
def _handle_template_end(self): | def _handle_template_end(self): | ||||
"""Handle the end of a template at the head of the string.""" | """Handle the end of a template at the head of the string.""" | ||||
if self._context & contexts.TEMPLATE_PARAM_KEY: | if self._context & contexts.TEMPLATE_PARAM_KEY: | ||||
self._write_all(self._pop(keep_context=True)) | |||||
self._emit_all(self._pop(keep_context=True)) | |||||
self._head += 1 | self._head += 1 | ||||
return self._pop() | return self._pop() | ||||
@@ -254,7 +274,7 @@ class Tokenizer(object): | |||||
"""Handle the separator between an argument's name and default.""" | """Handle the separator between an argument's name and default.""" | ||||
self._context ^= contexts.ARGUMENT_NAME | self._context ^= contexts.ARGUMENT_NAME | ||||
self._context |= contexts.ARGUMENT_DEFAULT | self._context |= contexts.ARGUMENT_DEFAULT | ||||
self._write(tokens.ArgumentSeparator()) | |||||
self._emit(tokens.ArgumentSeparator()) | |||||
def _handle_argument_end(self): | def _handle_argument_end(self): | ||||
"""Handle the end of an argument at the head of the string.""" | """Handle the end of an argument at the head of the string.""" | ||||
@@ -269,17 +289,19 @@ class Tokenizer(object): | |||||
wikilink = self._parse(contexts.WIKILINK_TITLE) | wikilink = self._parse(contexts.WIKILINK_TITLE) | ||||
except BadRoute: | except BadRoute: | ||||
self._head = reset | self._head = reset | ||||
self._write_text("[[") | |||||
self._emit_text("[[") | |||||
else: | else: | ||||
self._write(tokens.WikilinkOpen()) | |||||
self._write_all(wikilink) | |||||
self._write(tokens.WikilinkClose()) | |||||
if self._context & contexts.FAIL_NEXT: | |||||
self._context ^= contexts.FAIL_NEXT | |||||
self._emit(tokens.WikilinkOpen()) | |||||
self._emit_all(wikilink) | |||||
self._emit(tokens.WikilinkClose()) | |||||
def _handle_wikilink_separator(self): | def _handle_wikilink_separator(self): | ||||
"""Handle the separator between a wikilink's title and its text.""" | """Handle the separator between a wikilink's title and its text.""" | ||||
self._context ^= contexts.WIKILINK_TITLE | self._context ^= contexts.WIKILINK_TITLE | ||||
self._context |= contexts.WIKILINK_TEXT | self._context |= contexts.WIKILINK_TEXT | ||||
self._write(tokens.WikilinkSeparator()) | |||||
self._emit(tokens.WikilinkSeparator()) | |||||
def _handle_wikilink_end(self): | def _handle_wikilink_end(self): | ||||
"""Handle the end of a wikilink at the head of the string.""" | """Handle the end of a wikilink at the head of the string.""" | ||||
@@ -301,13 +323,13 @@ class Tokenizer(object): | |||||
title, level = self._parse(context) | title, level = self._parse(context) | ||||
except BadRoute: | except BadRoute: | ||||
self._head = reset + best - 1 | self._head = reset + best - 1 | ||||
self._write_text("=" * best) | |||||
self._emit_text("=" * best) | |||||
else: | else: | ||||
self._write(tokens.HeadingStart(level=level)) | |||||
self._emit(tokens.HeadingStart(level=level)) | |||||
if level < best: | if level < best: | ||||
self._write_text("=" * (best - level)) | |||||
self._write_all(title) | |||||
self._write(tokens.HeadingEnd()) | |||||
self._emit_text("=" * (best - level)) | |||||
self._emit_all(title) | |||||
self._emit(tokens.HeadingEnd()) | |||||
finally: | finally: | ||||
self._global ^= contexts.GL_HEADING | self._global ^= contexts.GL_HEADING | ||||
@@ -326,28 +348,28 @@ class Tokenizer(object): | |||||
after, after_level = self._parse(self._context) | after, after_level = self._parse(self._context) | ||||
except BadRoute: | except BadRoute: | ||||
if level < best: | if level < best: | ||||
self._write_text("=" * (best - level)) | |||||
self._emit_text("=" * (best - level)) | |||||
self._head = reset + best - 1 | self._head = reset + best - 1 | ||||
return self._pop(), level | return self._pop(), level | ||||
else: # Found another closure | else: # Found another closure | ||||
self._write_text("=" * best) | |||||
self._write_all(after) | |||||
self._emit_text("=" * best) | |||||
self._emit_all(after) | |||||
return self._pop(), after_level | return self._pop(), after_level | ||||
def _really_parse_entity(self): | def _really_parse_entity(self): | ||||
"""Actually parse an HTML entity and ensure that it is valid.""" | """Actually parse an HTML entity and ensure that it is valid.""" | ||||
self._write(tokens.HTMLEntityStart()) | |||||
self._emit(tokens.HTMLEntityStart()) | |||||
self._head += 1 | self._head += 1 | ||||
this = self._read(strict=True) | this = self._read(strict=True) | ||||
if this == "#": | if this == "#": | ||||
numeric = True | numeric = True | ||||
self._write(tokens.HTMLEntityNumeric()) | |||||
self._emit(tokens.HTMLEntityNumeric()) | |||||
self._head += 1 | self._head += 1 | ||||
this = self._read(strict=True) | this = self._read(strict=True) | ||||
if this[0].lower() == "x": | if this[0].lower() == "x": | ||||
hexadecimal = True | hexadecimal = True | ||||
self._write(tokens.HTMLEntityHex(char=this[0])) | |||||
self._emit(tokens.HTMLEntityHex(char=this[0])) | |||||
this = this[1:] | this = this[1:] | ||||
if not this: | if not this: | ||||
self._fail_route() | self._fail_route() | ||||
@@ -373,8 +395,8 @@ class Tokenizer(object): | |||||
if this not in htmlentities.entitydefs: | if this not in htmlentities.entitydefs: | ||||
self._fail_route() | self._fail_route() | ||||
self._write(tokens.Text(text=this)) | |||||
self._write(tokens.HTMLEntityEnd()) | |||||
self._emit(tokens.Text(text=this)) | |||||
self._emit(tokens.HTMLEntityEnd()) | |||||
def _parse_entity(self): | def _parse_entity(self): | ||||
"""Parse an HTML entity at the head of the wikicode string.""" | """Parse an HTML entity at the head of the wikicode string.""" | ||||
@@ -384,9 +406,9 @@ class Tokenizer(object): | |||||
self._really_parse_entity() | self._really_parse_entity() | ||||
except BadRoute: | except BadRoute: | ||||
self._head = reset | self._head = reset | ||||
self._write_text(self._read()) | |||||
self._emit_text(self._read()) | |||||
else: | else: | ||||
self._write_all(self._pop()) | |||||
self._emit_all(self._pop()) | |||||
def _parse_comment(self): | def _parse_comment(self): | ||||
"""Parse an HTML comment at the head of the wikicode string.""" | """Parse an HTML comment at the head of the wikicode string.""" | ||||
@@ -396,13 +418,231 @@ class Tokenizer(object): | |||||
comment = self._parse(contexts.COMMENT) | comment = self._parse(contexts.COMMENT) | ||||
except BadRoute: | except BadRoute: | ||||
self._head = reset | self._head = reset | ||||
self._write_text("<!--") | |||||
self._emit_text("<!--") | |||||
else: | else: | ||||
self._write(tokens.CommentStart()) | |||||
self._write_all(comment) | |||||
self._write(tokens.CommentEnd()) | |||||
self._emit(tokens.CommentStart()) | |||||
self._emit_all(comment) | |||||
self._emit(tokens.CommentEnd()) | |||||
self._head += 2 | self._head += 2 | ||||
def _push_tag_buffer(self, data): | |||||
"""Write a pending tag attribute from *data* to the stack.""" | |||||
if data.context & data.CX_QUOTED: | |||||
self._emit_first(tokens.TagAttrQuote()) | |||||
self._emit_all(self._pop()) | |||||
buf = data.padding_buffer | |||||
self._emit_first(tokens.TagAttrStart(pad_first=buf["first"], | |||||
pad_before_eq=buf["before_eq"], pad_after_eq=buf["after_eq"])) | |||||
self._emit_all(self._pop()) | |||||
data.padding_buffer = {key: "" for key in data.padding_buffer} | |||||
def _handle_tag_space(self, data, text): | |||||
"""Handle whitespace (*text*) inside of an HTML open tag.""" | |||||
ctx = data.context | |||||
end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (data.CX_QUOTED | data.CX_NOTE_QUOTE) | |||||
if end_of_value or (ctx & data.CX_QUOTED and ctx & data.CX_NOTE_SPACE): | |||||
self._push_tag_buffer(data) | |||||
data.context = data.CX_ATTR_READY | |||||
elif ctx & data.CX_NOTE_SPACE: | |||||
data.context = data.CX_ATTR_READY | |||||
elif ctx & data.CX_ATTR_NAME: | |||||
data.context |= data.CX_NOTE_EQUALS | |||||
data.padding_buffer["before_eq"] += text | |||||
if ctx & data.CX_QUOTED and not ctx & data.CX_NOTE_SPACE: | |||||
self._emit_text(text) | |||||
elif data.context & data.CX_ATTR_READY: | |||||
data.padding_buffer["first"] += text | |||||
elif data.context & data.CX_ATTR_VALUE: | |||||
data.padding_buffer["after_eq"] += text | |||||
def _handle_tag_text(self, text): | |||||
"""Handle regular *text* inside of an HTML open tag.""" | |||||
next = self._read(1) | |||||
if not self._can_recurse() or text not in self.MARKERS: | |||||
self._emit_text(text) | |||||
elif text == next == "{": | |||||
self._parse_template_or_argument() | |||||
elif text == next == "[": | |||||
self._parse_wikilink() | |||||
elif text == "<": | |||||
self._parse_tag() | |||||
else: | |||||
self._emit_text(text) | |||||
def _handle_tag_data(self, data, text): | |||||
"""Handle all sorts of *text* data inside of an HTML open tag.""" | |||||
for chunk in self.tag_splitter.split(text): | |||||
if not chunk: | |||||
continue | |||||
if data.context & data.CX_NAME: | |||||
if chunk in self.MARKERS or chunk.isspace(): | |||||
self._fail_route() # Tags must start with text, not spaces | |||||
data.context = data.CX_NOTE_SPACE | |||||
elif chunk.isspace(): | |||||
self._handle_tag_space(data, chunk) | |||||
continue | |||||
elif data.context & data.CX_NOTE_SPACE: | |||||
if data.context & data.CX_QUOTED: | |||||
data.context = data.CX_ATTR_VALUE | |||||
self._pop() | |||||
self._head = data.reset - 1 # Will be auto-incremented | |||||
return # Break early | |||||
self._fail_route() | |||||
elif data.context & data.CX_ATTR_READY: | |||||
data.context = data.CX_ATTR_NAME | |||||
self._push(contexts.TAG_ATTR) | |||||
elif data.context & data.CX_ATTR_NAME: | |||||
if chunk == "=": | |||||
data.context = data.CX_ATTR_VALUE | data.CX_NOTE_QUOTE | |||||
self._emit(tokens.TagAttrEquals()) | |||||
continue | |||||
if data.context & data.CX_NOTE_EQUALS: | |||||
self._push_tag_buffer(data) | |||||
data.context = data.CX_ATTR_NAME | |||||
self._push(contexts.TAG_ATTR) | |||||
elif data.context & data.CX_ATTR_VALUE: | |||||
escaped = self._read(-1) == "\\" and self._read(-2) != "\\" | |||||
if data.context & data.CX_NOTE_QUOTE: | |||||
data.context ^= data.CX_NOTE_QUOTE | |||||
if chunk == '"' and not escaped: | |||||
data.context |= data.CX_QUOTED | |||||
self._push(self._context) | |||||
data.reset = self._head | |||||
continue | |||||
elif data.context & data.CX_QUOTED: | |||||
if chunk == '"' and not escaped: | |||||
data.context |= data.CX_NOTE_SPACE | |||||
continue | |||||
self._handle_tag_text(chunk) | |||||
def _handle_tag_close_open(self, data, token): | |||||
"""Handle the closing of a open tag (``<foo>``).""" | |||||
if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE): | |||||
self._push_tag_buffer(data) | |||||
self._emit(token(padding=data.padding_buffer["first"])) | |||||
self._head += 1 | |||||
def _handle_tag_open_close(self): | |||||
"""Handle the opening of a closing tag (``</foo>``).""" | |||||
self._emit(tokens.TagOpenClose()) | |||||
self._push(contexts.TAG_CLOSE) | |||||
self._head += 1 | |||||
def _handle_tag_close_close(self): | |||||
"""Handle the ending of a closing tag (``</foo>``).""" | |||||
strip = lambda tok: tok.text.rstrip().lower() | |||||
closing = self._pop() | |||||
if len(closing) != 1 or (not isinstance(closing[0], tokens.Text) or | |||||
strip(closing[0]) != strip(self._stack[1])): | |||||
self._fail_route() | |||||
self._emit_all(closing) | |||||
self._emit(tokens.TagCloseClose()) | |||||
return self._pop() | |||||
def _handle_blacklisted_tag(self): | |||||
"""Handle the body of an HTML tag that is parser-blacklisted.""" | |||||
while True: | |||||
this, next = self._read(), self._read(1) | |||||
self._head += 1 | |||||
if this is self.END: | |||||
self._fail_route() | |||||
elif this == "<" and next == "/": | |||||
self._handle_tag_open_close() | |||||
return self._parse(push=False) | |||||
else: | |||||
self._emit_text(this) | |||||
def _handle_single_only_tag_end(self): | |||||
"""Handle the end of an implicitly closing single-only HTML tag.""" | |||||
padding = self._stack.pop().padding | |||||
self._emit(tokens.TagCloseSelfclose(padding=padding, implicit=True)) | |||||
self._head -= 1 # Offset displacement done by _handle_tag_close_open | |||||
return self._pop() | |||||
def _handle_single_tag_end(self): | |||||
"""Handle the stream end when inside a single-supporting HTML tag.""" | |||||
gen = enumerate(self._stack) | |||||
index = next(i for i, t in gen if isinstance(t, tokens.TagCloseOpen)) | |||||
padding = self._stack[index].padding | |||||
token = tokens.TagCloseSelfclose(padding=padding, implicit=True) | |||||
self._stack[index] = token | |||||
return self._pop() | |||||
def _really_parse_tag(self): | |||||
"""Actually parse an HTML tag, starting with the open (``<foo>``).""" | |||||
data = _TagOpenData() | |||||
self._push(contexts.TAG_OPEN) | |||||
self._emit(tokens.TagOpenOpen()) | |||||
while True: | |||||
this, next = self._read(), self._read(1) | |||||
can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or | |||||
data.context & data.CX_NOTE_SPACE) | |||||
if this is self.END: | |||||
if self._context & contexts.TAG_ATTR: | |||||
if data.context & data.CX_QUOTED: | |||||
# Unclosed attribute quote: reset, don't die | |||||
data.context = data.CX_ATTR_VALUE | |||||
self._pop() | |||||
self._head = data.reset | |||||
continue | |||||
self._pop() | |||||
self._fail_route() | |||||
elif this == ">" and can_exit: | |||||
self._handle_tag_close_open(data, tokens.TagCloseOpen) | |||||
self._context = contexts.TAG_BODY | |||||
if is_single_only(self._stack[1].text): | |||||
return self._handle_single_only_tag_end() | |||||
if is_parsable(self._stack[1].text): | |||||
return self._parse(push=False) | |||||
return self._handle_blacklisted_tag() | |||||
elif this == "/" and next == ">" and can_exit: | |||||
self._handle_tag_close_open(data, tokens.TagCloseSelfclose) | |||||
return self._pop() | |||||
else: | |||||
self._handle_tag_data(data, this) | |||||
self._head += 1 | |||||
def _handle_invalid_tag_start(self): | |||||
"""Handle the (possible) start of an implicitly closing single tag.""" | |||||
reset = self._head + 1 | |||||
self._head += 2 | |||||
try: | |||||
if not is_single_only(self.tag_splitter.split(self._read())[0]): | |||||
raise BadRoute() | |||||
tag = self._really_parse_tag() | |||||
except BadRoute: | |||||
self._head = reset | |||||
self._emit_text("</") | |||||
else: | |||||
tag[0].invalid = True # Set flag of TagOpenOpen | |||||
self._emit_all(tag) | |||||
def _parse_tag(self): | |||||
"""Parse an HTML tag at the head of the wikicode string.""" | |||||
reset = self._head | |||||
self._head += 1 | |||||
try: | |||||
tag = self._really_parse_tag() | |||||
except BadRoute: | |||||
self._head = reset | |||||
self._emit_text("<") | |||||
else: | |||||
self._emit_all(tag) | |||||
def _handle_end(self): | |||||
"""Handle the end of the stream of wikitext.""" | |||||
fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | | |||||
contexts.HEADING | contexts.COMMENT | contexts.TAG) | |||||
double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) | |||||
if self._context & fail: | |||||
if self._context & contexts.TAG_BODY: | |||||
if is_single(self._stack[1].text): | |||||
return self._handle_single_tag_end() | |||||
if self._context & double_fail: | |||||
self._pop() | |||||
self._fail_route() | |||||
return self._pop() | |||||
def _verify_safe(self, this): | def _verify_safe(self, this): | ||||
"""Make sure we are not trying to write an invalid character.""" | """Make sure we are not trying to write an invalid character.""" | ||||
context = self._context | context = self._context | ||||
@@ -414,7 +654,7 @@ class Tokenizer(object): | |||||
elif this == "\n" or this == "[" or this == "}": | elif this == "\n" or this == "[" or this == "}": | ||||
return False | return False | ||||
return True | return True | ||||
if context & contexts.TEMPLATE_NAME: | |||||
elif context & contexts.TEMPLATE_NAME: | |||||
if this == "{" or this == "}" or this == "[": | if this == "{" or this == "}" or this == "[": | ||||
self._context |= contexts.FAIL_NEXT | self._context |= contexts.FAIL_NEXT | ||||
return True | return True | ||||
@@ -432,6 +672,8 @@ class Tokenizer(object): | |||||
elif this is self.END or not this.isspace(): | elif this is self.END or not this.isspace(): | ||||
self._context |= contexts.HAS_TEXT | self._context |= contexts.HAS_TEXT | ||||
return True | return True | ||||
elif context & contexts.TAG_CLOSE: | |||||
return this != "<" | |||||
else: | else: | ||||
if context & contexts.FAIL_ON_EQUALS: | if context & contexts.FAIL_ON_EQUALS: | ||||
if this == "=": | if this == "=": | ||||
@@ -458,44 +700,38 @@ class Tokenizer(object): | |||||
self._context |= contexts.FAIL_ON_RBRACE | self._context |= contexts.FAIL_ON_RBRACE | ||||
return True | return True | ||||
def _parse(self, context=0): | |||||
def _parse(self, context=0, push=True): | |||||
"""Parse the wikicode string, using *context* for when to stop.""" | """Parse the wikicode string, using *context* for when to stop.""" | ||||
self._push(context) | |||||
unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | | |||||
contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME | | |||||
contexts.TAG_CLOSE) | |||||
double_unsafe = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE) | |||||
if push: | |||||
self._push(context) | |||||
while True: | while True: | ||||
this = self._read() | this = self._read() | ||||
unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE | | |||||
contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME) | |||||
if self._context & unsafe: | if self._context & unsafe: | ||||
if not self._verify_safe(this): | if not self._verify_safe(this): | ||||
if self._context & contexts.TEMPLATE_PARAM_KEY: | |||||
if self._context & double_unsafe: | |||||
self._pop() | self._pop() | ||||
self._fail_route() | self._fail_route() | ||||
if this not in self.MARKERS: | if this not in self.MARKERS: | ||||
self._write_text(this) | |||||
self._emit_text(this) | |||||
self._head += 1 | self._head += 1 | ||||
continue | continue | ||||
if this is self.END: | if this is self.END: | ||||
fail = (contexts.TEMPLATE | contexts.ARGUMENT | | |||||
contexts.WIKILINK | contexts.HEADING | | |||||
contexts.COMMENT) | |||||
if self._context & contexts.TEMPLATE_PARAM_KEY: | |||||
self._pop() | |||||
if self._context & fail: | |||||
self._fail_route() | |||||
return self._pop() | |||||
return self._handle_end() | |||||
next = self._read(1) | next = self._read(1) | ||||
if self._context & contexts.COMMENT: | if self._context & contexts.COMMENT: | ||||
if this == next == "-" and self._read(2) == ">": | if this == next == "-" and self._read(2) == ">": | ||||
return self._pop() | return self._pop() | ||||
else: | else: | ||||
self._write_text(this) | |||||
self._emit_text(this) | |||||
elif this == next == "{": | elif this == next == "{": | ||||
if self._can_recurse(): | if self._can_recurse(): | ||||
self._parse_template_or_argument() | self._parse_template_or_argument() | ||||
if self._context & contexts.FAIL_NEXT: | |||||
self._context ^= contexts.FAIL_NEXT | |||||
else: | else: | ||||
self._write_text("{") | |||||
self._emit_text("{") | |||||
elif this == "|" and self._context & contexts.TEMPLATE: | elif this == "|" and self._context & contexts.TEMPLATE: | ||||
self._handle_template_param() | self._handle_template_param() | ||||
elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY: | elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY: | ||||
@@ -508,14 +744,12 @@ class Tokenizer(object): | |||||
if self._read(2) == "}": | if self._read(2) == "}": | ||||
return self._handle_argument_end() | return self._handle_argument_end() | ||||
else: | else: | ||||
self._write_text("}") | |||||
self._emit_text("}") | |||||
elif this == next == "[": | elif this == next == "[": | ||||
if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): | if not self._context & contexts.WIKILINK_TITLE and self._can_recurse(): | ||||
self._parse_wikilink() | self._parse_wikilink() | ||||
if self._context & contexts.FAIL_NEXT: | |||||
self._context ^= contexts.FAIL_NEXT | |||||
else: | else: | ||||
self._write_text("[") | |||||
self._emit_text("[") | |||||
elif this == "|" and self._context & contexts.WIKILINK_TITLE: | elif this == "|" and self._context & contexts.WIKILINK_TITLE: | ||||
self._handle_wikilink_separator() | self._handle_wikilink_separator() | ||||
elif this == next == "]" and self._context & contexts.WIKILINK: | elif this == next == "]" and self._context & contexts.WIKILINK: | ||||
@@ -524,7 +758,7 @@ class Tokenizer(object): | |||||
if self._read(-1) in ("\n", self.START): | if self._read(-1) in ("\n", self.START): | ||||
self._parse_heading() | self._parse_heading() | ||||
else: | else: | ||||
self._write_text("=") | |||||
self._emit_text("=") | |||||
elif this == "=" and self._context & contexts.HEADING: | elif this == "=" and self._context & contexts.HEADING: | ||||
return self._handle_heading_end() | return self._handle_heading_end() | ||||
elif this == "\n" and self._context & contexts.HEADING: | elif this == "\n" and self._context & contexts.HEADING: | ||||
@@ -535,9 +769,21 @@ class Tokenizer(object): | |||||
if self._read(2) == self._read(3) == "-": | if self._read(2) == self._read(3) == "-": | ||||
self._parse_comment() | self._parse_comment() | ||||
else: | else: | ||||
self._write_text(this) | |||||
self._emit_text(this) | |||||
elif this == "<" and next == "/" and self._read(2) is not self.END: | |||||
if self._context & contexts.TAG_BODY: | |||||
self._handle_tag_open_close() | |||||
else: | |||||
self._handle_invalid_tag_start() | |||||
elif this == "<": | |||||
if not self._context & contexts.TAG_CLOSE and self._can_recurse(): | |||||
self._parse_tag() | |||||
else: | |||||
self._emit_text("<") | |||||
elif this == ">" and self._context & contexts.TAG_CLOSE: | |||||
return self._handle_tag_close_close() | |||||
else: | else: | ||||
self._write_text(this) | |||||
self._emit_text(this) | |||||
self._head += 1 | self._head += 1 | ||||
def tokenize(self, text): | def tokenize(self, text): | ||||
@@ -63,6 +63,10 @@ class Token(object): | |||||
def __delattr__(self, key): | def __delattr__(self, key): | ||||
del self._kwargs[key] | del self._kwargs[key] | ||||
def get(self, key, default=None): | |||||
"""Same as :py:meth:`__getattr__`, but has a *default* if missing.""" | |||||
return self._kwargs.get(key, default) | |||||
def make(name): | def make(name): | ||||
"""Create a new Token class using ``type()`` and add it to ``__all__``.""" | """Create a new Token class using ``type()`` and add it to ``__all__``.""" | ||||
@@ -0,0 +1,76 @@ | |||||
# -*- coding: utf-8 -*- | |||||
# | |||||
# Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net> | |||||
# | |||||
# Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
# of this software and associated documentation files (the "Software"), to deal | |||||
# in the Software without restriction, including without limitation the rights | |||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
# copies of the Software, and to permit persons to whom the Software is | |||||
# furnished to do so, subject to the following conditions: | |||||
# | |||||
# The above copyright notice and this permission notice shall be included in | |||||
# all copies or substantial portions of the Software. | |||||
# | |||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
# SOFTWARE. | |||||
"""Contains data regarding certain HTML tags."""

from __future__ import unicode_literals

__all__ = ["get_wikicode", "is_parsable", "is_visible", "is_single",
           "is_single_only"]

# Tags whose contents the wikicode parser should leave untouched.
# enwiki extensions @ 2013-06-28
PARSER_BLACKLIST = [
    "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
    "nowiki", "pre", "score", "section", "source", "syntaxhighlight",
    "templatedata", "timeline"
]

# Tags whose contents are not rendered as visible text.
# enwiki extensions @ 2013-06-28
INVISIBLE_TAGS = [
    "categorytree", "gallery", "imagemap", "inputbox", "math", "score",
    "section", "templatedata", "timeline"
]

# Tags that must (SINGLE_ONLY) or merely may (SINGLE) appear without a
# closing tag.
# [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]

# Wiki-markup equivalents for certain HTML tags; a tag may define "open"
# markup, "close" markup, or both.
WIKICODE = {
    "i": {"open": "''", "close": "''"},
    "b": {"open": "'''", "close": "'''"},
    "ul": {"open": "*"},
    "ol": {"open": "#"},
    "dt": {"open": ";"},
    "dd": {"open": ":"},
    "hr": {"open": "----"},
}

def get_wikicode(tag):
    """Return the appropriate wikicode before and after the given *tag*.

    The result is an ``(open, close)`` pair; either element is ``None``
    when the tag defines no markup on that side. Raises ``KeyError`` if
    *tag* has no entry in ``WIKICODE``.
    """
    markup = WIKICODE[tag.lower()]
    return markup.get("open"), markup.get("close")

def is_parsable(tag):
    """Return if the given *tag*'s contents should be passed to the parser."""
    name = tag.lower()
    return name not in PARSER_BLACKLIST

def is_visible(tag):
    """Return whether or not the given *tag* contains visible text."""
    hidden = tag.lower() in INVISIBLE_TAGS
    return not hidden

def is_single(tag):
    """Return whether or not the given *tag* can exist without a close tag."""
    name = tag.lower()
    return name in SINGLE

def is_single_only(tag):
    """Return whether or not the given *tag* must exist without a close tag."""
    name = tag.lower()
    return name in SINGLE_ONLY
@@ -31,6 +31,8 @@ from .compat import bytes, str | |||||
from .nodes import Node | from .nodes import Node | ||||
from .smart_list import SmartList | from .smart_list import SmartList | ||||
__all__ = ["parse_anything"] | |||||
def parse_anything(value): | def parse_anything(value): | ||||
"""Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. | """Return a :py:class:`~.Wikicode` for *value*, allowing multiple types. | ||||
@@ -91,7 +91,23 @@ class TreeEqualityTestCase(TestCase): | |||||
def assertTagNodeEqual(self, expected, actual): | def assertTagNodeEqual(self, expected, actual): | ||||
"""Assert that two Tag nodes have the same data.""" | """Assert that two Tag nodes have the same data.""" | ||||
self.fail("Holding this until feature/html_tags is ready.") | |||||
self.assertWikicodeEqual(expected.tag, actual.tag) | |||||
if expected.contents is not None: | |||||
self.assertWikicodeEqual(expected.contents, actual.contents) | |||||
length = len(expected.attributes) | |||||
self.assertEqual(length, len(actual.attributes)) | |||||
for i in range(length): | |||||
exp_attr = expected.attributes[i] | |||||
act_attr = actual.attributes[i] | |||||
self.assertWikicodeEqual(exp_attr.name, act_attr.name) | |||||
if exp_attr.value is not None: | |||||
self.assertWikicodeEqual(exp_attr.value, act_attr.value) | |||||
self.assertIs(exp_attr.quoted, act_attr.quoted) | |||||
            self.assertEqual(exp_attr.padding, act_attr.padding)
self.assertIs(expected.showtag, actual.showtag) | |||||
self.assertIs(expected.self_closing, actual.self_closing) | |||||
self.assertEqual(expected.padding, actual.padding) | |||||
self.assertWikicodeEqual(expected.closing_tag, actual.closing_tag) | |||||
def assertTemplateNodeEqual(self, expected, actual): | def assertTemplateNodeEqual(self, expected, actual): | ||||
"""Assert that two Template nodes have the same data.""" | """Assert that two Template nodes have the same data.""" | ||||
@@ -198,6 +198,18 @@ class TestBuilder(TreeEqualityTestCase): | |||||
for test, valid in tests: | for test, valid in tests: | ||||
self.assertWikicodeEqual(valid, self.builder.build(test)) | self.assertWikicodeEqual(valid, self.builder.build(test)) | ||||
def test_tag(self): | |||||
"""tests for building Tag nodes""" | |||||
tests = [ | |||||
([tokens.TagOpenOpen(), tokens.Text(text="ref"), | |||||
tokens.TagCloseOpen(padding=""), tokens.TagOpenClose(), | |||||
tokens.Text(text="ref"), tokens.TagCloseClose()], | |||||
wrap([Tag(wraptext("ref"), wrap([]), [], True, False, "", | |||||
wraptext("ref"))])), | |||||
] | |||||
for test, valid in tests: | |||||
self.assertWikicodeEqual(valid, self.builder.build(test)) | |||||
def test_integration(self): | def test_integration(self): | ||||
"""a test for building a combination of templates together""" | """a test for building a combination of templates together""" | ||||
# {{{{{{{{foo}}bar|baz=biz}}buzz}}usr|{{bin}}}} | # {{{{{{{{foo}}bar|baz=biz}}buzz}}usr|{{bin}}}} | ||||
@@ -33,6 +33,13 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t | |||||
--- | --- | ||||
name: rich_tags | |||||
label: a HTML tag with tons of other things in it | |||||
input: "{{dubious claim}}<ref name={{abc}} foo="bar {{baz}}" abc={{de}}f ghi=j{{k}}{{l}} \n mno = "{{p}} [[q]] {{r}}">[[Source]]</ref>" | |||||
output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" \n ", pad_before_eq=" ", pad_after_eq=" "), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: wildcard | name: wildcard | ||||
label: a wildcard assortment of various things | label: a wildcard assortment of various things | ||||
input: "{{{{{{{{foo}}bar|baz=biz}}buzz}}usr|{{bin}}}}" | input: "{{{{{{{{foo}}bar|baz=biz}}buzz}}usr|{{bin}}}}" | ||||
@@ -0,0 +1,529 @@ | |||||
name: basic | |||||
label: a basic tag with an open and close | |||||
input: "<ref></ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: basic_selfclosing | |||||
label: a basic self-closing tag | |||||
input: "<ref/>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagCloseSelfclose(padding="")] | |||||
--- | |||||
name: content | |||||
label: a tag with some content in the middle | |||||
input: "<ref>this is a reference</ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: padded_open | |||||
label: a tag with some padding in the open tag | |||||
input: "<ref ></ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: padded_close | |||||
label: a tag with some padding in the close tag | |||||
input: "<ref></ref >" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()] | |||||
--- | |||||
name: padded_selfclosing | |||||
label: a self-closing tag with padding | |||||
input: "<ref />" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagCloseSelfclose(padding=" ")] | |||||
--- | |||||
name: attribute | |||||
label: a tag with a single attribute | |||||
input: "<ref name></ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: attribute_value | |||||
label: a tag with a single attribute with a value | |||||
input: "<ref name=foo></ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: attribute_quoted | |||||
label: a tag with a single quoted attribute | |||||
input: "<ref name="foo bar"></ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: attribute_hyphen | |||||
label: a tag with a single attribute, containing a hyphen | |||||
input: "<ref name=foo-bar></ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: attribute_quoted_hyphen | |||||
label: a tag with a single quoted attribute, containing a hyphen | |||||
input: "<ref name="foo-bar"></ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: attribute_selfclosing | |||||
label: a self-closing tag with a single attribute | |||||
input: "<ref name/>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(padding="")] | |||||
--- | |||||
name: attribute_selfclosing_value | |||||
label: a self-closing tag with a single attribute with a value | |||||
input: "<ref name=foo/>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")] | |||||
--- | |||||
name: attribute_selfclosing_value_quoted | |||||
label: a self-closing tag with a single quoted attribute | |||||
input: "<ref name="foo"/>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")] | |||||
--- | |||||
name: nested_tag | |||||
label: a tag nested within the attributes of another | |||||
input: "<ref name=<span style="color: red;">foo</span>>citation</ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: nested_tag_quoted | |||||
label: a tag nested within the attributes of another, quoted | |||||
input: "<ref name="<span style="color: red;">foo</span>">citation</ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: nested_troll_tag | |||||
label: a bogus tag that appears to be nested within the attributes of another | |||||
input: "<ref name=</ ><//>>citation</ref>" | |||||
output: [Text(text="<ref name=</ ><//>>citation</ref>")] | |||||
--- | |||||
name: nested_troll_tag_quoted | |||||
label: a bogus tag that appears to be nested within the attributes of another, quoted | |||||
input: "<ref name="</ ><//>">citation</ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="</ ><//>"), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: invalid_space_begin_open | |||||
label: invalid tag: a space at the beginning of the open tag | |||||
input: "< ref>test</ref>" | |||||
output: [Text(text="< ref>test</ref>")] | |||||
--- | |||||
name: invalid_space_begin_close | |||||
label: invalid tag: a space at the beginning of the close tag | |||||
input: "<ref>test</ ref>" | |||||
output: [Text(text="<ref>test</ ref>")] | |||||
--- | |||||
name: valid_space_end | |||||
label: valid tag: spaces at the ends of both the open and close tags | |||||
input: "<ref >test</ref >" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=" "), Text(text="test"), TagOpenClose(), Text(text="ref "), TagCloseClose()] | |||||
--- | |||||
name: invalid_template_ends | |||||
label: invalid tag: a template at the ends of both the open and close tags | |||||
input: "<ref {{foo}}>test</ref {{foo}}>" | |||||
output: [Text(text="<ref "), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text=">test</ref "), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text=">")] | |||||
--- | |||||
name: invalid_template_ends_nospace | |||||
label: invalid tag: a template at the ends of both the open and close tags, without spacing | |||||
input: "<ref {{foo}}>test</ref{{foo}}>" | |||||
output: [Text(text="<ref "), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text=">test</ref"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text=">")] | |||||
--- | |||||
name: valid_template_end_open | |||||
label: valid tag: a template at the end of the open tag | |||||
input: "<ref {{foo}}>test</ref>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding=""), Text(text="test"), TagOpenClose(), Text(text="ref"), TagCloseClose()] | |||||
--- | |||||
name: valid_template_end_open_space_end_close | |||||
label: valid tag: a template at the end of the open tag; whitespace at the end of the close tag | |||||
input: "<ref {{foo}}>test</ref\n>" | |||||
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding=""), Text(text="test"), TagOpenClose(), Text(text="ref\n"), TagCloseClose()] | |||||
--- | |||||
name: invalid_template_end_open_nospace | |||||
label: invalid tag: a template at the end of the open tag, without spacing | |||||
input: "<ref{{foo}}>test</ref>" | |||||
output: [Text(text="<ref"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text=">test</ref>")] | |||||
--- | |||||
name: invalid_template_start_close | |||||
label: invalid tag: a template at the beginning of the close tag | |||||
input: "<ref>test</{{foo}}ref>" | |||||
output: [Text(text="<ref>test</"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text="ref>")] | |||||
--- | |||||
name: invalid_template_start_open | |||||
label: invalid tag: a template at the beginning of the open tag | |||||
input: "<{{foo}}ref>test</ref>" | |||||
output: [Text(text="<"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text="ref>test</ref>")] | |||||
--- | |||||
name: unclosed_quote | |||||
label: a quoted attribute that is never closed | |||||
input: "<span style="foobar>stuff</span>" | |||||
output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foobar"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()] | |||||
--- | |||||
name: fake_quote | |||||
label: a fake quoted attribute | |||||
input: "<span style="foo"bar>stuff</span>" | |||||
output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo\"bar"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()] | |||||
--- | |||||
name: fake_quote_complex | |||||
label: a fake quoted attribute, with spaces and templates and links | |||||
input: "<span style="foo {{bar}}\n[[baz]]"buzz >stuff</span>" | |||||
output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo"), TagAttrStart(pad_first=" ", pad_before_eq="\n", pad_after_eq=""), TemplateOpen(), Text(text="bar"), TemplateClose(), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), WikilinkOpen(), Text(text="baz"), WikilinkClose(), Text(text="\"buzz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()] | |||||
--- | |||||
name: incomplete_lbracket | |||||
label: incomplete tags: just a left bracket | |||||
input: "<" | |||||
output: [Text(text="<")] | |||||
--- | |||||
name: incomplete_lbracket_junk | |||||
label: incomplete tags: just a left bracket, surrounded by stuff | |||||
input: "foo<bar" | |||||
output: [Text(text="foo<bar")] | |||||
--- | |||||
name: incomplete_unclosed_open | |||||
label: incomplete tags: an unclosed open tag | |||||
input: "junk <ref" | |||||
output: [Text(text="junk <ref")] | |||||
--- | |||||
name: incomplete_unclosed_open_space | |||||
label: incomplete tags: an unclosed open tag, space | |||||
input: "junk <ref " | |||||
output: [Text(text="junk <ref ")] | |||||
--- | |||||
name: incomplete_unclosed_open_unnamed_attr | |||||
label: incomplete tags: an unclosed open tag, unnamed attribute | |||||
input: "junk <ref name" | |||||
output: [Text(text="junk <ref name")] | |||||
--- | |||||
name: incomplete_unclosed_open_attr_equals | |||||
label: incomplete tags: an unclosed open tag, attribute, equal sign | |||||
input: "junk <ref name=" | |||||
output: [Text(text="junk <ref name=")] | |||||
--- | |||||
name: incomplete_unclosed_open_attr_equals_quoted | |||||
label: incomplete tags: an unclosed open tag, attribute, equal sign, quote | |||||
input: "junk <ref name="" | |||||
output: [Text(text="junk <ref name=\"")] | |||||
--- | |||||
name: incomplete_unclosed_open_attr | |||||
label: incomplete tags: an unclosed open tag, attribute with a key/value | |||||
input: "junk <ref name=foo" | |||||
output: [Text(text="junk <ref name=foo")] | |||||
--- | |||||
name: incomplete_unclosed_open_attr_quoted | |||||
label: incomplete tags: an unclosed open tag, attribute with a key/value, quoted | |||||
input: "junk <ref name="foo"" | |||||
output: [Text(text="junk <ref name=\"foo\"")] | |||||
--- | |||||
name: incomplete_open | |||||
label: incomplete tags: an open tag | |||||
input: "junk <ref>" | |||||
output: [Text(text="junk <ref>")] | |||||
--- | |||||
name: incomplete_open_unnamed_attr | |||||
label: incomplete tags: an open tag, unnamed attribute | |||||
input: "junk <ref name>" | |||||
output: [Text(text="junk <ref name>")] | |||||
--- | |||||
name: incomplete_open_attr_equals | |||||
label: incomplete tags: an open tag, attribute, equal sign | |||||
input: "junk <ref name=>" | |||||
output: [Text(text="junk <ref name=>")] | |||||
--- | |||||
name: incomplete_open_attr | |||||
label: incomplete tags: an open tag, attribute with a key/value | |||||
input: "junk <ref name=foo>" | |||||
output: [Text(text="junk <ref name=foo>")] | |||||
--- | |||||
name: incomplete_open_attr_quoted | |||||
label: incomplete tags: an open tag, attribute with a key/value, quoted | |||||
input: "junk <ref name="foo">" | |||||
output: [Text(text="junk <ref name=\"foo\">")] | |||||
--- | |||||
name: incomplete_open_text | |||||
label: incomplete tags: an open tag, text | |||||
input: "junk <ref>foo" | |||||
output: [Text(text="junk <ref>foo")] | |||||
--- | |||||
name: incomplete_open_attr_text | |||||
label: incomplete tags: an open tag, attribute with a key/value, text | |||||
input: "junk <ref name=foo>bar" | |||||
output: [Text(text="junk <ref name=foo>bar")] | |||||
--- | |||||
name: incomplete_open_text_lbracket | |||||
label: incomplete tags: an open tag, text, left open bracket | |||||
input: "junk <ref>bar<" | |||||
output: [Text(text="junk <ref>bar<")] | |||||
--- | |||||
name: incomplete_open_text_lbracket_slash | |||||
label: incomplete tags: an open tag, text, left bracket, slash | |||||
input: "junk <ref>bar</" | |||||
output: [Text(text="junk <ref>bar</")] | |||||
--- | |||||
name: incomplete_open_text_unclosed_close | |||||
label: incomplete tags: an open tag, text, unclosed close | |||||
input: "junk <ref>bar</ref" | |||||
output: [Text(text="junk <ref>bar</ref")] | |||||
--- | |||||
name: incomplete_open_text_wrong_close | |||||
label: incomplete tags: an open tag, text, wrong close | |||||
input: "junk <ref>bar</span>" | |||||
output: [Text(text="junk <ref>bar</span>")] | |||||
--- | |||||
name: incomplete_close | |||||
label: incomplete tags: a close tag | |||||
input: "junk </ref>" | |||||
output: [Text(text="junk </ref>")] | |||||
--- | |||||
name: incomplete_no_tag_name_open | |||||
label: incomplete tags: no tag name within brackets; just an open | |||||
input: "junk <>" | |||||
output: [Text(text="junk <>")] | |||||
--- | |||||
name: incomplete_no_tag_name_selfclosing | |||||
label: incomplete tags: no tag name within brackets; self-closing | |||||
input: "junk < />" | |||||
output: [Text(text="junk < />")] | |||||
--- | |||||
name: incomplete_no_tag_name_open_close | |||||
label: incomplete tags: no tag name within brackets; open and close | |||||
input: "junk <></>" | |||||
output: [Text(text="junk <></>")] | |||||
--- | |||||
name: backslash_premature_before | |||||
label: a backslash before a quote before a space | |||||
input: "<foo attribute="this is\\" quoted">blah</foo>" | |||||
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\" quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] | |||||
--- | |||||
name: backslash_premature_after | |||||
label: a backslash before a quote after a space | |||||
input: "<foo attribute="this is \\"quoted">blah</foo>" | |||||
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is \\\"quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] | |||||
--- | |||||
name: backslash_premature_middle | |||||
label: a backslash before a quote in the middle of a word | |||||
input: "<foo attribute="this i\\"s quoted">blah</foo>" | |||||
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this i\\\"s quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] | |||||
--- | |||||
name: backslash_adjacent | |||||
label: escaped quotes next to unescaped quotes | |||||
input: "<foo attribute="\\"this is quoted\\"">blah</foo>" | |||||
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="\\\"this is quoted\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] | |||||
--- | |||||
name: backslash_endquote | |||||
label: backslashes before the end quote, causing the attribute to become unquoted | |||||
input: "<foo attribute="this_is quoted\\">blah</foo>" | |||||
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), Text(text="\"this_is"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] | |||||
--- | |||||
name: backslash_double | |||||
label: two adjacent backslashes, which do *not* affect the quote | |||||
input: "<foo attribute="this is\\\\" quoted">blah</foo>" | |||||
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] | |||||
--- | |||||
name: backslash_triple | |||||
label: three adjacent backslashes, which do *not* affect the quote | |||||
input: "<foo attribute="this is\\\\\\" quoted">blah</foo>" | |||||
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] | |||||
--- | |||||
name: backslash_unaffecting | |||||
label: backslashes near quotes, but not immediately adjacent, thus having no effect | |||||
input: "<foo attribute="\\quote\\d" also="quote\\d\\">blah</foo>" | |||||
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="\\quote\\d"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="also"), TagAttrEquals(), Text(text="\"quote\\d\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()] | |||||
--- | |||||
name: unparsable | |||||
label: a tag that should not be put through the normal parser | |||||
input: "{{t1}}<nowiki>{{t2}}</nowiki>{{t3}}" | |||||
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="{{t2}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t3"), TemplateClose()] | |||||
--- | |||||
name: unparsable_complex | |||||
label: a tag that should not be put through the normal parser; lots of stuff inside | |||||
input: "{{t1}}<pre>{{t2}}\n==Heading==\nThis is some text with a [[page|link]].</pre>{{t3}}" | |||||
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="pre"), TagCloseOpen(padding=""), Text(text="{{t2}}\n==Heading==\nThis is some text with a [[page|link]]."), TagOpenClose(), Text(text="pre"), TagCloseClose(), TemplateOpen(), Text(text="t3"), TemplateClose()] | |||||
--- | |||||
name: unparsable_attributed | |||||
label: a tag that should not be put through the normal parser; parsed attributes | |||||
input: "{{t1}}<nowiki attr=val attr2="{{val2}}">{{t2}}</nowiki>{{t3}}" | |||||
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr"), TagAttrEquals(), Text(text="val"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr2"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="val2"), TemplateClose(), TagCloseOpen(padding=""), Text(text="{{t2}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t3"), TemplateClose()]
--- | |||||
name: unparsable_incomplete | |||||
label: a tag that should not be put through the normal parser; incomplete | |||||
input: "{{t1}}<nowiki>{{t2}}{{t3}}" | |||||
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), Text(text="<nowiki>"), TemplateOpen(), Text(text="t2"), TemplateClose(), TemplateOpen(), Text(text="t3"), TemplateClose()] | |||||
--- | |||||
name: single_open_close | |||||
label: a tag that supports being single; both an open and a close tag | |||||
input: "foo<li>bar{{baz}}</li>" | |||||
output: [Text(text="foo"), TagOpenOpen(), Text(text="li"), TagCloseOpen(padding=""), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose(), TagOpenClose(), Text(text="li"), TagCloseClose()] | |||||
--- | |||||
name: single_open | |||||
label: a tag that supports being single; just an open tag | |||||
input: "foo<li>bar{{baz}}" | |||||
output: [Text(text="foo"), TagOpenOpen(), Text(text="li"), TagCloseSelfclose(padding="", implicit=True), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()] | |||||
--- | |||||
name: single_selfclose | |||||
label: a tag that supports being single; a self-closing tag | |||||
input: "foo<li/>bar{{baz}}" | |||||
output: [Text(text="foo"), TagOpenOpen(), Text(text="li"), TagCloseSelfclose(padding=""), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()] | |||||
--- | |||||
name: single_close | |||||
label: a tag that supports being single; just a close tag | |||||
input: "foo</li>bar{{baz}}" | |||||
output: [Text(text="foo</li>bar"), TemplateOpen(), Text(text="baz"), TemplateClose()] | |||||
--- | |||||
name: single_only_open_close | |||||
label: a tag that can only be single; both an open and a close tag | |||||
input: "foo<br>bar{{baz}}</br>" | |||||
output: [Text(text="foo"), TagOpenOpen(), Text(text="br"), TagCloseSelfclose(padding="", implicit=True), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose(), TagOpenOpen(invalid=True), Text(text="br"), TagCloseSelfclose(padding="", implicit=True)] | |||||
--- | |||||
name: single_only_open | |||||
label: a tag that can only be single; just an open tag | |||||
input: "foo<br>bar{{baz}}" | |||||
output: [Text(text="foo"), TagOpenOpen(), Text(text="br"), TagCloseSelfclose(padding="", implicit=True), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()] | |||||
--- | |||||
name: single_only_selfclose | |||||
label: a tag that can only be single; a self-closing tag | |||||
input: "foo<br/>bar{{baz}}" | |||||
output: [Text(text="foo"), TagOpenOpen(), Text(text="br"), TagCloseSelfclose(padding=""), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()] | |||||
--- | |||||
name: single_only_close | |||||
label: a tag that can only be single; just a close tag | |||||
input: "foo</br>bar{{baz}}" | |||||
output: [Text(text="foo"), TagOpenOpen(invalid=True), Text(text="br"), TagCloseSelfclose(padding="", implicit=True), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()] | |||||
--- | |||||
name: single_only_double | |||||
label: a tag that can only be single; a tag with slashes at the beginning and end
input: "foo</br/>bar{{baz}}" | |||||
output: [Text(text="foo"), TagOpenOpen(invalid=True), Text(text="br"), TagCloseSelfclose(padding=""), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()] |
---
name: unicode2
label: additional unicode check for non-BMP codepoints
input: "𐌲𐌿𐍄𐌰𐍂𐌰𐌶𐌳𐌰"
output: [Text(text="𐌲𐌿𐍄𐌰𐍂𐌰𐌶𐌳𐌰")]
--- | |||||
name: large | |||||
label: a lot of text, requiring multiple textbuffer blocks in the C tokenizer | |||||
input: "ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN"
output: [Text(text="ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN")]