Browse Source

Merge branch 'feature/html_tags' into develop (#9)

tags/v0.3
Ben Kurtovic 10 years ago
parent
commit
c568bcbaf4
19 changed files with 2313 additions and 650 deletions
  1. +1
    -0
      docs/api/mwparserfromhell.nodes.rst
  2. +6
    -0
      docs/api/mwparserfromhell.rst
  3. +40
    -8
      mwparserfromhell/nodes/extras/attribute.py
  4. +87
    -123
      mwparserfromhell/nodes/tag.py
  5. +28
    -18
      mwparserfromhell/parser/builder.py
  6. +46
    -31
      mwparserfromhell/parser/contexts.py
  7. +1028
    -302
      mwparserfromhell/parser/tokenizer.c
  8. +81
    -71
      mwparserfromhell/parser/tokenizer.h
  9. +342
    -96
      mwparserfromhell/parser/tokenizer.py
  10. +4
    -0
      mwparserfromhell/parser/tokens.py
  11. +76
    -0
      mwparserfromhell/tag_defs.py
  12. +2
    -0
      mwparserfromhell/utils.py
  13. +17
    -1
      tests/_test_tree_equality.py
  14. +0
    -0
     
  15. +12
    -0
      tests/test_builder.py
  16. +0
    -0
     
  17. +7
    -0
      tests/tokenizer/integration.mwtest
  18. +529
    -0
      tests/tokenizer/tags.mwtest
  19. +7
    -0
      tests/tokenizer/text.mwtest

+ 1
- 0
docs/api/mwparserfromhell.nodes.rst View File

@@ -46,6 +46,7 @@ nodes Package

.. automodule:: mwparserfromhell.nodes.tag
:members:
:undoc-members:
:show-inheritance:

:mod:`template` Module


+ 6
- 0
docs/api/mwparserfromhell.rst View File

@@ -30,6 +30,12 @@ mwparserfromhell Package
:members:
:undoc-members:

:mod:`tag_defs` Module
----------------------

.. automodule:: mwparserfromhell.tag_defs
:members:

:mod:`utils` Module
-------------------



+ 40
- 8
mwparserfromhell/nodes/extras/attribute.py View File

@@ -36,18 +36,23 @@ class Attribute(StringMixIn):
whose value is ``"foo"``.
"""

def __init__(self, name, value=None, quoted=True):
def __init__(self, name, value=None, quoted=True, pad_first="",
pad_before_eq="", pad_after_eq=""):
super(Attribute, self).__init__()
self._name = name
self._value = value
self._quoted = quoted
self._pad_first = pad_first
self._pad_before_eq = pad_before_eq
self._pad_after_eq = pad_after_eq

def __unicode__(self):
base = self.pad_first + str(self.name) + self.pad_before_eq
if self.value:
if self.quoted:
return str(self.name) + '="' + str(self.value) + '"'
return str(self.name) + "=" + str(self.value)
return str(self.name)
return base + '="' + self.pad_after_eq + str(self.value) + '"'
return base + "=" + self.pad_after_eq + str(self.value)
return base

@property
def name(self):
@@ -64,14 +69,41 @@ class Attribute(StringMixIn):
"""Whether the attribute's value is quoted with double quotes."""
return self._quoted

@property
def pad_first(self):
"""Spacing to insert right before the attribute."""
return self._pad_first

@property
def pad_before_eq(self):
"""Spacing to insert right before the equal sign."""
return self._pad_before_eq

@property
def pad_after_eq(self):
"""Spacing to insert right after the equal sign."""
return self._pad_after_eq

@name.setter
def name(self, newval):
self._name = parse_anything(newval)
def name(self, value):
self._name = parse_anything(value)

@value.setter
def value(self, newval):
self._value = parse_anything(newval)

@quoted.setter
def quoted(self, newval):
self._quoted = bool(newval)
def quoted(self, value):
self._quoted = bool(value)

@pad_first.setter
def pad_first(self, value):
self._pad_first = str(value)

@pad_before_eq.setter
def pad_before_eq(self, value):
self._pad_before_eq = str(value)

@pad_after_eq.setter
def pad_after_eq(self, value):
self._pad_after_eq = str(value)

+ 87
- 123
mwparserfromhell/nodes/tag.py View File

@@ -24,6 +24,7 @@ from __future__ import unicode_literals

from . import Node, Text
from ..compat import str
from ..tag_defs import get_wikicode, is_visible
from ..utils import parse_anything

__all__ = ["Tag"]
@@ -31,79 +32,39 @@ __all__ = ["Tag"]
class Tag(Node):
"""Represents an HTML-style tag in wikicode, like ``<ref>``."""

TAG_UNKNOWN = 0

# Basic HTML:
TAG_ITALIC = 1
TAG_BOLD = 2
TAG_UNDERLINE = 3
TAG_STRIKETHROUGH = 4
TAG_UNORDERED_LIST = 5
TAG_ORDERED_LIST = 6
TAG_DEF_TERM = 7
TAG_DEF_ITEM = 8
TAG_BLOCKQUOTE = 9
TAG_RULE = 10
TAG_BREAK = 11
TAG_ABBR = 12
TAG_PRE = 13
TAG_MONOSPACE = 14
TAG_CODE = 15
TAG_SPAN = 16
TAG_DIV = 17
TAG_FONT = 18
TAG_SMALL = 19
TAG_BIG = 20
TAG_CENTER = 21

# MediaWiki parser hooks:
TAG_REF = 101
TAG_GALLERY = 102
TAG_MATH = 103
TAG_NOWIKI = 104
TAG_NOINCLUDE = 105
TAG_INCLUDEONLY = 106
TAG_ONLYINCLUDE = 107

# Additional parser hooks:
TAG_SYNTAXHIGHLIGHT = 201
TAG_POEM = 202

# Lists of tags:
TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE))
TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE

def __init__(self, type_, tag, contents=None, attrs=None, showtag=True,
self_closing=False, open_padding=0, close_padding=0):
def __init__(self, tag, contents=None, attrs=None, showtag=True,
self_closing=False, invalid=False, implicit=False, padding="",
closing_tag=None):
super(Tag, self).__init__()
self._type = type_
self._tag = tag
self._contents = contents
if attrs:
self._attrs = attrs
else:
self._attrs = []
self._attrs = attrs if attrs else []
self._showtag = showtag
self._self_closing = self_closing
self._open_padding = open_padding
self._close_padding = close_padding
self._invalid = invalid
self._implicit = implicit
self._padding = padding
if closing_tag:
self._closing_tag = closing_tag
elif not self_closing:
self._closing_tag = tag

def __unicode__(self):
if not self.showtag:
open_, close = self._translate()
open_, close = get_wikicode[self.tag]
if self.self_closing:
return open_
else:
return open_ + str(self.contents) + close

result = "<" + str(self.tag)
if self.attrs:
result += " " + " ".join([str(attr) for attr in self.attrs])
result = ("</" if self.invalid else "<") + str(self.tag)
if self.attributes:
result += "".join([str(attr) for attr in self.attributes])
if self.self_closing:
result += " " * self.open_padding + "/>"
result += self.padding + (">" if self.implicit else "/>")
else:
result += " " * self.open_padding + ">" + str(self.contents)
result += "</" + str(self.tag) + " " * self.close_padding + ">"
result += self.padding + ">" + str(self.contents)
result += "</" + str(self.closing_tag) + ">"
return result

def __iternodes__(self, getter):
@@ -111,66 +72,43 @@ class Tag(Node):
if self.showtag:
for child in getter(self.tag):
yield self.tag, child
for attr in self.attrs:
for attr in self.attributes:
for child in getter(attr.name):
yield attr.name, child
if attr.value:
for child in getter(attr.value):
yield attr.value, child
for child in getter(self.contents):
yield self.contents, child
if self.contents:
for child in getter(self.contents):
yield self.contents, child
if not self.self_closing and self.closing_tag:
for child in getter(self.closing_tag):
yield self.closing_tag, child

def __strip__(self, normalize, collapse):
if self.type in self.TAGS_VISIBLE:
if is_visible(self.tag):
return self.contents.strip_code(normalize, collapse)
return None

def __showtree__(self, write, get, mark):
tagnodes = self.tag.nodes
if (not self.attrs and len(tagnodes) == 1 and isinstance(tagnodes[0], Text)):
write("<" + str(tagnodes[0]) + ">")
write("</" if self.invalid else "<")
get(self.tag)
for attr in self.attributes:
get(attr.name)
if not attr.value:
continue
write(" = ")
mark()
get(attr.value)
if self.self_closing:
write(">" if self.implicit else "/>")
else:
write("<")
get(self.tag)
for attr in self.attrs:
get(attr.name)
if not attr.value:
continue
write(" = ")
mark()
get(attr.value)
write(">")
get(self.contents)
if len(tagnodes) == 1 and isinstance(tagnodes[0], Text):
write("</" + str(tagnodes[0]) + ">")
else:
get(self.contents)
write("</")
get(self.tag)
get(self.closing_tag)
write(">")

def _translate(self):
"""If the HTML-style tag has a wikicode representation, return that.

For example, ``<b>Foo</b>`` can be represented as ``'''Foo'''``. This
returns a tuple of the character starting the sequence and the
character ending it.
"""
translations = {
self.TAG_ITALIC: ("''", "''"),
self.TAG_BOLD: ("'''", "'''"),
self.TAG_UNORDERED_LIST: ("*", ""),
self.TAG_ORDERED_LIST: ("#", ""),
self.TAG_DEF_TERM: (";", ""),
self.TAG_DEF_ITEM: (":", ""),
self.TAG_RULE: ("----", ""),
}
return translations[self.type]

@property
def type(self):
"""The tag type."""
return self._type

@property
def tag(self):
"""The tag itself, as a :py:class:`~.Wikicode` object."""
@@ -182,7 +120,7 @@ class Tag(Node):
return self._contents

@property
def attrs(self):
def attributes(self):
"""The list of attributes affecting the tag.

Each attribute is an instance of :py:class:`~.Attribute`.
@@ -196,29 +134,47 @@ class Tag(Node):

@property
def self_closing(self):
"""Whether the tag is self-closing with no content."""
"""Whether the tag is self-closing with no content (like ``<br/>``)."""
return self._self_closing

@property
def open_padding(self):
"""How much spacing to insert before the first closing >."""
return self._open_padding
def invalid(self):
"""Whether the tag starts with a slash after the opening bracket.

This makes the tag look like a lone close tag. It is technically
invalid and is only parsable Wikicode when the tag itself is
single-only, like ``<br>`` and ``<img>``. See
:py:func:`tag_defs.is_single_only`.
"""
return self._invalid

@property
def close_padding(self):
"""How much spacing to insert before the last closing >."""
return self._close_padding
def implicit(self):
"""Whether the tag is implicitly self-closing, with no ending slash.

@type.setter
def type(self, value):
value = int(value)
if value not in self.TAGS_INVISIBLE | self.TAGS_VISIBLE:
raise ValueError(value)
self._type = value
This is only possible for specific "single" tags like ``<br>`` and
``<li>``. See :py:func:`tag_defs.is_single`. This field only has an
effect if :py:attr:`self_closing` is also ``True``.
"""
return self._implicit

@property
def padding(self):
"""Spacing to insert before the first closing ``>``."""
return self._padding

@property
def closing_tag(self):
"""The closing tag, as a :py:class:`~.Wikicode` object.

This will usually equal :py:attr:`tag`, unless there is additional
spacing, comments, or the like.
"""
return self._closing_tag

@tag.setter
def tag(self, value):
self._tag = parse_anything(value)
self._tag = self._closing_tag = parse_anything(value)

@contents.setter
def contents(self, value):
@@ -232,10 +188,18 @@ class Tag(Node):
def self_closing(self, value):
self._self_closing = bool(value)

@open_padding.setter
def open_padding(self, value):
self._open_padding = int(value)
@invalid.setter
def invalid(self, value):
self._invalid = bool(value)

@implicit.setter
def implicit(self, value):
self._implicit = bool(value)

@padding.setter
def padding(self, value):
self._padding = str(value)

@close_padding.setter
def close_padding(self, value):
self._close_padding = int(value)
@closing_tag.setter
def closing_tag(self, value):
self._closing_tag = parse_anything(value)

+ 28
- 18
mwparserfromhell/parser/builder.py View File

@@ -170,7 +170,7 @@ class Builder(object):
self._write(self._handle_token(token))

def _handle_comment(self):
"""Handle a case where a hidden comment is at the head of the tokens."""
"""Handle a case where an HTML comment is at the head of the tokens."""
self._push()
while self._tokens:
token = self._tokens.pop()
@@ -180,7 +180,7 @@ class Builder(object):
else:
self._write(self._handle_token(token))

def _handle_attribute(self):
def _handle_attribute(self, start):
"""Handle a case where a tag attribute is at the head of the tokens."""
name, quoted = None, False
self._push()
@@ -191,37 +191,47 @@ class Builder(object):
self._push()
elif isinstance(token, tokens.TagAttrQuote):
quoted = True
elif isinstance(token, (tokens.TagAttrStart,
tokens.TagCloseOpen)):
elif isinstance(token, (tokens.TagAttrStart, tokens.TagCloseOpen,
tokens.TagCloseSelfclose)):
self._tokens.append(token)
if name is not None:
return Attribute(name, self._pop(), quoted)
return Attribute(self._pop(), quoted=quoted)
if name:
value = self._pop()
else:
name, value = self._pop(), None
return Attribute(name, value, quoted, start.pad_first,
start.pad_before_eq, start.pad_after_eq)
else:
self._write(self._handle_token(token))

def _handle_tag(self, token):
"""Handle a case where a tag is at the head of the tokens."""
type_, showtag = token.type, token.showtag
attrs = []
close_tokens = (tokens.TagCloseSelfclose, tokens.TagCloseClose)
implicit, attrs, contents, closing_tag = False, [], None, None
showtag = token.get("showtag", True)
invalid = token.get("invalid", False)
self._push()
while self._tokens:
token = self._tokens.pop()
if isinstance(token, tokens.TagAttrStart):
attrs.append(self._handle_attribute())
attrs.append(self._handle_attribute(token))
elif isinstance(token, tokens.TagCloseOpen):
open_pad = token.padding
padding = token.padding
tag = self._pop()
self._push()
elif isinstance(token, tokens.TagCloseSelfclose):
tag = self._pop()
return Tag(type_, tag, attrs=attrs, showtag=showtag,
self_closing=True, open_padding=token.padding)
elif isinstance(token, tokens.TagOpenClose):
contents = self._pop()
elif isinstance(token, tokens.TagCloseClose):
return Tag(type_, tag, contents, attrs, showtag, False,
open_pad, token.padding)
self._push()
elif isinstance(token, close_tokens):
if isinstance(token, tokens.TagCloseSelfclose):
tag = self._pop()
self_closing = True
padding = token.padding
implicit = token.get("implicit", False)
else:
self_closing = False
closing_tag = self._pop()
return Tag(tag, contents, attrs, showtag, self_closing,
invalid, implicit, padding, closing_tag)
else:
self._write(self._handle_token(token))



+ 46
- 31
mwparserfromhell/parser/contexts.py View File

@@ -62,6 +62,13 @@ Local (stack-specific) contexts:

* :py:const:`COMMENT`

* :py:const:`TAG`

* :py:const:`TAG_OPEN`
* :py:const:`TAG_ATTR`
* :py:const:`TAG_BODY`
* :py:const:`TAG_CLOSE`

* :py:const:`SAFETY_CHECK`

* :py:const:`HAS_TEXT`
@@ -78,37 +85,45 @@ Global contexts:

# Local contexts:

TEMPLATE = 0b00000000000000000111
TEMPLATE_NAME = 0b00000000000000000001
TEMPLATE_PARAM_KEY = 0b00000000000000000010
TEMPLATE_PARAM_VALUE = 0b00000000000000000100

ARGUMENT = 0b00000000000000011000
ARGUMENT_NAME = 0b00000000000000001000
ARGUMENT_DEFAULT = 0b00000000000000010000

WIKILINK = 0b00000000000001100000
WIKILINK_TITLE = 0b00000000000000100000
WIKILINK_TEXT = 0b00000000000001000000

HEADING = 0b00000001111110000000
HEADING_LEVEL_1 = 0b00000000000010000000
HEADING_LEVEL_2 = 0b00000000000100000000
HEADING_LEVEL_3 = 0b00000000001000000000
HEADING_LEVEL_4 = 0b00000000010000000000
HEADING_LEVEL_5 = 0b00000000100000000000
HEADING_LEVEL_6 = 0b00000001000000000000

COMMENT = 0b00000010000000000000

SAFETY_CHECK = 0b11111100000000000000
HAS_TEXT = 0b00000100000000000000
FAIL_ON_TEXT = 0b00001000000000000000
FAIL_NEXT = 0b00010000000000000000
FAIL_ON_LBRACE = 0b00100000000000000000
FAIL_ON_RBRACE = 0b01000000000000000000
FAIL_ON_EQUALS = 0b10000000000000000000
TEMPLATE_NAME = 1 << 0
TEMPLATE_PARAM_KEY = 1 << 1
TEMPLATE_PARAM_VALUE = 1 << 2
TEMPLATE = TEMPLATE_NAME + TEMPLATE_PARAM_KEY + TEMPLATE_PARAM_VALUE

ARGUMENT_NAME = 1 << 3
ARGUMENT_DEFAULT = 1 << 4
ARGUMENT = ARGUMENT_NAME + ARGUMENT_DEFAULT

WIKILINK_TITLE = 1 << 5
WIKILINK_TEXT = 1 << 6
WIKILINK = WIKILINK_TITLE + WIKILINK_TEXT

HEADING_LEVEL_1 = 1 << 7
HEADING_LEVEL_2 = 1 << 8
HEADING_LEVEL_3 = 1 << 9
HEADING_LEVEL_4 = 1 << 10
HEADING_LEVEL_5 = 1 << 11
HEADING_LEVEL_6 = 1 << 12
HEADING = (HEADING_LEVEL_1 + HEADING_LEVEL_2 + HEADING_LEVEL_3 +
HEADING_LEVEL_4 + HEADING_LEVEL_5 + HEADING_LEVEL_6)

COMMENT = 1 << 13

TAG_OPEN = 1 << 14
TAG_ATTR = 1 << 15
TAG_BODY = 1 << 16
TAG_CLOSE = 1 << 17
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE

HAS_TEXT = 1 << 18
FAIL_ON_TEXT = 1 << 19
FAIL_NEXT = 1 << 20
FAIL_ON_LBRACE = 1 << 21
FAIL_ON_RBRACE = 1 << 22
FAIL_ON_EQUALS = 1 << 23
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS)

# Global contexts:

GL_HEADING = 0b1
GL_HEADING = 1 << 0

+ 1028
- 302
mwparserfromhell/parser/tokenizer.c
File diff suppressed because it is too large
View File


+ 81
- 71
mwparserfromhell/parser/tokenizer.h View File

@@ -41,10 +41,10 @@ SOFTWARE.
#define ALPHANUM "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

static const char* MARKERS[] = {
"{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-",
"!", "\n", ""};
"{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":", "/", "-",
"\n", ""};

#define NUM_MARKERS 18
#define NUM_MARKERS 17
#define TEXTBUFFER_BLOCKSIZE 1024
#define MAX_DEPTH 40
#define MAX_CYCLES 100000
@@ -60,10 +60,10 @@ static char** entitydefs;

static PyObject* EMPTY;
static PyObject* NOARGS;
static PyObject* tokens;
static PyObject* tag_defs;


/* Tokens */
/* Tokens: */

static PyObject* Text;

@@ -102,41 +102,58 @@ static PyObject* TagCloseClose;

/* Local contexts: */

#define LC_TEMPLATE 0x00007
#define LC_TEMPLATE_NAME 0x00001
#define LC_TEMPLATE_PARAM_KEY 0x00002
#define LC_TEMPLATE_PARAM_VALUE 0x00004

#define LC_ARGUMENT 0x00018
#define LC_ARGUMENT_NAME 0x00008
#define LC_ARGUMENT_DEFAULT 0x00010

#define LC_WIKILINK 0x00060
#define LC_WIKILINK_TITLE 0x00020
#define LC_WIKILINK_TEXT 0x00040

#define LC_HEADING 0x01F80
#define LC_HEADING_LEVEL_1 0x00080
#define LC_HEADING_LEVEL_2 0x00100
#define LC_HEADING_LEVEL_3 0x00200
#define LC_HEADING_LEVEL_4 0x00400
#define LC_HEADING_LEVEL_5 0x00800
#define LC_HEADING_LEVEL_6 0x01000

#define LC_COMMENT 0x02000

#define LC_SAFETY_CHECK 0xFC000
#define LC_HAS_TEXT 0x04000
#define LC_FAIL_ON_TEXT 0x08000
#define LC_FAIL_NEXT 0x10000
#define LC_FAIL_ON_LBRACE 0x20000
#define LC_FAIL_ON_RBRACE 0x40000
#define LC_FAIL_ON_EQUALS 0x80000
#define LC_TEMPLATE 0x000007
#define LC_TEMPLATE_NAME 0x000001
#define LC_TEMPLATE_PARAM_KEY 0x000002
#define LC_TEMPLATE_PARAM_VALUE 0x000004

#define LC_ARGUMENT 0x000018
#define LC_ARGUMENT_NAME 0x000008
#define LC_ARGUMENT_DEFAULT 0x000010

#define LC_WIKILINK 0x000060
#define LC_WIKILINK_TITLE 0x000020
#define LC_WIKILINK_TEXT 0x000040

#define LC_HEADING 0x001F80
#define LC_HEADING_LEVEL_1 0x000080
#define LC_HEADING_LEVEL_2 0x000100
#define LC_HEADING_LEVEL_3 0x000200
#define LC_HEADING_LEVEL_4 0x000400
#define LC_HEADING_LEVEL_5 0x000800
#define LC_HEADING_LEVEL_6 0x001000

#define LC_COMMENT 0x002000

#define LC_TAG 0x03C000
#define LC_TAG_OPEN 0x004000
#define LC_TAG_ATTR 0x008000
#define LC_TAG_BODY 0x010000
#define LC_TAG_CLOSE 0x020000

#define LC_SAFETY_CHECK 0xFC0000
#define LC_HAS_TEXT 0x040000
#define LC_FAIL_ON_TEXT 0x080000
#define LC_FAIL_NEXT 0x100000
#define LC_FAIL_ON_LBRACE 0x200000
#define LC_FAIL_ON_RBRACE 0x400000
#define LC_FAIL_ON_EQUALS 0x800000

/* Global contexts: */

#define GL_HEADING 0x1

/* Tag contexts: */

#define TAG_NAME 0x01
#define TAG_ATTR_READY 0x02
#define TAG_ATTR_NAME 0x04
#define TAG_ATTR_VALUE 0x08
#define TAG_QUOTED 0x10
#define TAG_NOTE_SPACE 0x20
#define TAG_NOTE_EQUALS 0x40
#define TAG_NOTE_QUOTE 0x80


/* Miscellaneous structs: */

@@ -158,13 +175,24 @@ typedef struct {
int level;
} HeadingData;

typedef struct {
int context;
struct Textbuffer* pad_first;
struct Textbuffer* pad_before_eq;
struct Textbuffer* pad_after_eq;
Py_ssize_t reset;
} TagData;

typedef struct Textbuffer Textbuffer;
typedef struct Stack Stack;


/* Tokenizer object definition: */

typedef struct {
PyObject_HEAD
PyObject* text; /* text to tokenize */
struct Stack* topstack; /* topmost stack */
Stack* topstack; /* topmost stack */
Py_ssize_t head; /* current position in text */
Py_ssize_t length; /* length of text */
int global; /* global context */
@@ -176,49 +204,31 @@ typedef struct {
/* Macros for accessing Tokenizer data: */

#define Tokenizer_READ(self, delta) (*PyUnicode_AS_UNICODE(Tokenizer_read(self, delta)))
#define Tokenizer_READ_BACKWARDS(self, delta) \
(*PyUnicode_AS_UNICODE(Tokenizer_read_backwards(self, delta)))
#define Tokenizer_CAN_RECURSE(self) (self->depth < MAX_DEPTH && self->cycles < MAX_CYCLES)


/* Macros for accessing HTML tag definitions: */

#define IS_PARSABLE(tag) (call_tag_def_func("is_parsable", tag))
#define IS_SINGLE(tag) (call_tag_def_func("is_single", tag))
#define IS_SINGLE_ONLY(tag) (call_tag_def_func("is_single_only", tag))


/* Function prototypes: */

static int heading_level_from_context(int);
static Textbuffer* Textbuffer_new(void);
static void Textbuffer_dealloc(Textbuffer*);

static TagData* TagData_new(void);
static void TagData_dealloc(TagData*);

static PyObject* Tokenizer_new(PyTypeObject*, PyObject*, PyObject*);
static struct Textbuffer* Textbuffer_new(void);
static void Tokenizer_dealloc(Tokenizer*);
static void Textbuffer_dealloc(struct Textbuffer*);
static int Tokenizer_init(Tokenizer*, PyObject*, PyObject*);
static int Tokenizer_push(Tokenizer*, int);
static PyObject* Textbuffer_render(struct Textbuffer*);
static int Tokenizer_push_textbuffer(Tokenizer*);
static void Tokenizer_delete_top_of_stack(Tokenizer*);
static PyObject* Tokenizer_pop(Tokenizer*);
static PyObject* Tokenizer_pop_keeping_context(Tokenizer*);
static void* Tokenizer_fail_route(Tokenizer*);
static int Tokenizer_write(Tokenizer*, PyObject*);
static int Tokenizer_write_first(Tokenizer*, PyObject*);
static int Tokenizer_write_text(Tokenizer*, Py_UNICODE);
static int Tokenizer_write_all(Tokenizer*, PyObject*);
static int Tokenizer_write_text_then_stack(Tokenizer*, const char*);
static PyObject* Tokenizer_read(Tokenizer*, Py_ssize_t);
static PyObject* Tokenizer_read_backwards(Tokenizer*, Py_ssize_t);
static int Tokenizer_parse_template_or_argument(Tokenizer*);
static int Tokenizer_parse_template(Tokenizer*);
static int Tokenizer_parse_argument(Tokenizer*);
static int Tokenizer_handle_template_param(Tokenizer*);
static int Tokenizer_handle_template_param_value(Tokenizer*);
static PyObject* Tokenizer_handle_template_end(Tokenizer*);
static int Tokenizer_handle_argument_separator(Tokenizer*);
static PyObject* Tokenizer_handle_argument_end(Tokenizer*);
static int Tokenizer_parse_wikilink(Tokenizer*);
static int Tokenizer_handle_wikilink_separator(Tokenizer*);
static PyObject* Tokenizer_handle_wikilink_end(Tokenizer*);
static int Tokenizer_parse_heading(Tokenizer*);
static HeadingData* Tokenizer_handle_heading_end(Tokenizer*);
static int Tokenizer_really_parse_entity(Tokenizer*);
static int Tokenizer_parse_entity(Tokenizer*);
static int Tokenizer_parse_comment(Tokenizer*);
static int Tokenizer_verify_safe(Tokenizer*, int, Py_UNICODE);
static PyObject* Tokenizer_parse(Tokenizer*, int);
static int Tokenizer_parse_tag(Tokenizer*);
static PyObject* Tokenizer_parse(Tokenizer*, int, int);
static PyObject* Tokenizer_tokenize(Tokenizer*, PyObject*);




+ 342
- 96
mwparserfromhell/parser/tokenizer.py View File

@@ -24,9 +24,9 @@ from __future__ import unicode_literals
from math import log
import re

from . import contexts
from . import tokens
from . import contexts, tokens
from ..compat import htmlentities
from ..tag_defs import is_parsable, is_single, is_single_only

__all__ = ["Tokenizer"]

@@ -35,16 +35,34 @@ class BadRoute(Exception):
pass


class _TagOpenData(object):
"""Stores data about an HTML open tag, like ``<ref name="foo">``."""
CX_NAME = 1 << 0
CX_ATTR_READY = 1 << 1
CX_ATTR_NAME = 1 << 2
CX_ATTR_VALUE = 1 << 3
CX_QUOTED = 1 << 4
CX_NOTE_SPACE = 1 << 5
CX_NOTE_EQUALS = 1 << 6
CX_NOTE_QUOTE = 1 << 7

def __init__(self):
self.context = self.CX_NAME
self.padding_buffer = {"first": "", "before_eq": "", "after_eq": ""}
self.reset = 0


class Tokenizer(object):
"""Creates a list of tokens from a string of wikicode."""
USES_C = False
START = object()
END = object()
MARKERS = ["{", "}", "[", "]", "<", ">", "|", "=", "&", "#", "*", ";", ":",
"/", "-", "!", "\n", END]
"/", "-", "\n", END]
MAX_DEPTH = 40
MAX_CYCLES = 100000
regex = re.compile(r"([{}\[\]<>|=&#*;:/\-!\n])", flags=re.IGNORECASE)
regex = re.compile(r"([{}\[\]<>|=&#*;:/\\\"\-!\n])", flags=re.IGNORECASE)
tag_splitter = re.compile(r"([\s\"\\]+)")

def __init__(self):
self._text = None
@@ -117,33 +135,33 @@ class Tokenizer(object):
self._pop()
raise BadRoute()

def _write(self, token):
def _emit(self, token):
"""Write a token to the end of the current token stack."""
self._push_textbuffer()
self._stack.append(token)

def _write_first(self, token):
def _emit_first(self, token):
"""Write a token to the beginning of the current token stack."""
self._push_textbuffer()
self._stack.insert(0, token)

def _write_text(self, text):
def _emit_text(self, text):
"""Write text to the current textbuffer."""
self._textbuffer.append(text)

def _write_all(self, tokenlist):
def _emit_all(self, tokenlist):
"""Write a series of tokens to the current stack at once."""
if tokenlist and isinstance(tokenlist[0], tokens.Text):
self._write_text(tokenlist.pop(0).text)
self._emit_text(tokenlist.pop(0).text)
self._push_textbuffer()
self._stack.extend(tokenlist)

def _write_text_then_stack(self, text):
def _emit_text_then_stack(self, text):
"""Pop the current stack, write *text*, and then write the stack."""
stack = self._pop()
self._write_text(text)
self._emit_text(text)
if stack:
self._write_all(stack)
self._emit_all(stack)
self._head -= 1

def _read(self, delta=0, wrap=False, strict=False):
@@ -168,6 +186,30 @@ class Tokenizer(object):
self._fail_route()
return self.END

def _parse_template(self):
"""Parse a template at the head of the wikicode string."""
reset = self._head
try:
template = self._parse(contexts.TEMPLATE_NAME)
except BadRoute:
self._head = reset
raise
self._emit_first(tokens.TemplateOpen())
self._emit_all(template)
self._emit(tokens.TemplateClose())

def _parse_argument(self):
"""Parse an argument at the head of the wikicode string."""
reset = self._head
try:
argument = self._parse(contexts.ARGUMENT_NAME)
except BadRoute:
self._head = reset
raise
self._emit_first(tokens.ArgumentOpen())
self._emit_all(argument)
self._emit(tokens.ArgumentClose())

def _parse_template_or_argument(self):
"""Parse a template or argument at the head of the wikicode string."""
self._head += 2
@@ -179,12 +221,12 @@ class Tokenizer(object):

while braces:
if braces == 1:
return self._write_text_then_stack("{")
return self._emit_text_then_stack("{")
if braces == 2:
try:
self._parse_template()
except BadRoute:
return self._write_text_then_stack("{{")
return self._emit_text_then_stack("{{")
break
try:
self._parse_argument()
@@ -194,35 +236,13 @@ class Tokenizer(object):
self._parse_template()
braces -= 2
except BadRoute:
return self._write_text_then_stack("{" * braces)
return self._emit_text_then_stack("{" * braces)
if braces:
self._head += 1

self._write_all(self._pop())

def _parse_template(self):
"""Parse a template at the head of the wikicode string."""
reset = self._head
try:
template = self._parse(contexts.TEMPLATE_NAME)
except BadRoute:
self._head = reset
raise
self._write_first(tokens.TemplateOpen())
self._write_all(template)
self._write(tokens.TemplateClose())

def _parse_argument(self):
"""Parse an argument at the head of the wikicode string."""
reset = self._head
try:
argument = self._parse(contexts.ARGUMENT_NAME)
except BadRoute:
self._head = reset
raise
self._write_first(tokens.ArgumentOpen())
self._write_all(argument)
self._write(tokens.ArgumentClose())
self._emit_all(self._pop())
if self._context & contexts.FAIL_NEXT:
self._context ^= contexts.FAIL_NEXT

def _handle_template_param(self):
"""Handle a template parameter at the head of the string."""
@@ -231,22 +251,22 @@ class Tokenizer(object):
elif self._context & contexts.TEMPLATE_PARAM_VALUE:
self._context ^= contexts.TEMPLATE_PARAM_VALUE
elif self._context & contexts.TEMPLATE_PARAM_KEY:
self._write_all(self._pop(keep_context=True))
self._emit_all(self._pop(keep_context=True))
self._context |= contexts.TEMPLATE_PARAM_KEY
self._write(tokens.TemplateParamSeparator())
self._emit(tokens.TemplateParamSeparator())
self._push(self._context)

def _handle_template_param_value(self):
"""Handle a template parameter's value at the head of the string."""
self._write_all(self._pop(keep_context=True))
self._emit_all(self._pop(keep_context=True))
self._context ^= contexts.TEMPLATE_PARAM_KEY
self._context |= contexts.TEMPLATE_PARAM_VALUE
self._write(tokens.TemplateParamEquals())
self._emit(tokens.TemplateParamEquals())

def _handle_template_end(self):
"""Handle the end of a template at the head of the string."""
if self._context & contexts.TEMPLATE_PARAM_KEY:
self._write_all(self._pop(keep_context=True))
self._emit_all(self._pop(keep_context=True))
self._head += 1
return self._pop()

@@ -254,7 +274,7 @@ class Tokenizer(object):
"""Handle the separator between an argument's name and default."""
self._context ^= contexts.ARGUMENT_NAME
self._context |= contexts.ARGUMENT_DEFAULT
self._write(tokens.ArgumentSeparator())
self._emit(tokens.ArgumentSeparator())

def _handle_argument_end(self):
"""Handle the end of an argument at the head of the string."""
@@ -269,17 +289,19 @@ class Tokenizer(object):
wikilink = self._parse(contexts.WIKILINK_TITLE)
except BadRoute:
self._head = reset
self._write_text("[[")
self._emit_text("[[")
else:
self._write(tokens.WikilinkOpen())
self._write_all(wikilink)
self._write(tokens.WikilinkClose())
if self._context & contexts.FAIL_NEXT:
self._context ^= contexts.FAIL_NEXT
self._emit(tokens.WikilinkOpen())
self._emit_all(wikilink)
self._emit(tokens.WikilinkClose())

def _handle_wikilink_separator(self):
"""Handle the separator between a wikilink's title and its text."""
self._context ^= contexts.WIKILINK_TITLE
self._context |= contexts.WIKILINK_TEXT
self._write(tokens.WikilinkSeparator())
self._emit(tokens.WikilinkSeparator())

def _handle_wikilink_end(self):
"""Handle the end of a wikilink at the head of the string."""
@@ -301,13 +323,13 @@ class Tokenizer(object):
title, level = self._parse(context)
except BadRoute:
self._head = reset + best - 1
self._write_text("=" * best)
self._emit_text("=" * best)
else:
self._write(tokens.HeadingStart(level=level))
self._emit(tokens.HeadingStart(level=level))
if level < best:
self._write_text("=" * (best - level))
self._write_all(title)
self._write(tokens.HeadingEnd())
self._emit_text("=" * (best - level))
self._emit_all(title)
self._emit(tokens.HeadingEnd())
finally:
self._global ^= contexts.GL_HEADING

@@ -326,28 +348,28 @@ class Tokenizer(object):
after, after_level = self._parse(self._context)
except BadRoute:
if level < best:
self._write_text("=" * (best - level))
self._emit_text("=" * (best - level))
self._head = reset + best - 1
return self._pop(), level
else: # Found another closure
self._write_text("=" * best)
self._write_all(after)
self._emit_text("=" * best)
self._emit_all(after)
return self._pop(), after_level

def _really_parse_entity(self):
"""Actually parse an HTML entity and ensure that it is valid."""
self._write(tokens.HTMLEntityStart())
self._emit(tokens.HTMLEntityStart())
self._head += 1

this = self._read(strict=True)
if this == "#":
numeric = True
self._write(tokens.HTMLEntityNumeric())
self._emit(tokens.HTMLEntityNumeric())
self._head += 1
this = self._read(strict=True)
if this[0].lower() == "x":
hexadecimal = True
self._write(tokens.HTMLEntityHex(char=this[0]))
self._emit(tokens.HTMLEntityHex(char=this[0]))
this = this[1:]
if not this:
self._fail_route()
@@ -373,8 +395,8 @@ class Tokenizer(object):
if this not in htmlentities.entitydefs:
self._fail_route()

self._write(tokens.Text(text=this))
self._write(tokens.HTMLEntityEnd())
self._emit(tokens.Text(text=this))
self._emit(tokens.HTMLEntityEnd())

def _parse_entity(self):
"""Parse an HTML entity at the head of the wikicode string."""
@@ -384,9 +406,9 @@ class Tokenizer(object):
self._really_parse_entity()
except BadRoute:
self._head = reset
self._write_text(self._read())
self._emit_text(self._read())
else:
self._write_all(self._pop())
self._emit_all(self._pop())

def _parse_comment(self):
"""Parse an HTML comment at the head of the wikicode string."""
@@ -396,13 +418,231 @@ class Tokenizer(object):
comment = self._parse(contexts.COMMENT)
except BadRoute:
self._head = reset
self._write_text("<!--")
self._emit_text("<!--")
else:
self._write(tokens.CommentStart())
self._write_all(comment)
self._write(tokens.CommentEnd())
self._emit(tokens.CommentStart())
self._emit_all(comment)
self._emit(tokens.CommentEnd())
self._head += 2

def _push_tag_buffer(self, data):
    """Write a pending tag attribute from *data* to the stack.

    *data* is the tag-parsing state object; its ``padding_buffer`` holds
    the whitespace collected around the attribute (before the name and on
    both sides of the ``=``).
    """
    if data.context & data.CX_QUOTED:
        # The value was quoted: put the quote marker in front of the value
        # tokens gathered on the topmost stack, then merge that stack down.
        self._emit_first(tokens.TagAttrQuote())
        self._emit_all(self._pop())
    buf = data.padding_buffer
    self._emit_first(tokens.TagAttrStart(pad_first=buf["first"],
        pad_before_eq=buf["before_eq"], pad_after_eq=buf["after_eq"]))
    self._emit_all(self._pop())
    # Clear the padding buffer for the next attribute.
    data.padding_buffer = {key: "" for key in data.padding_buffer}

def _handle_tag_space(self, data, text):
    """Handle whitespace (*text*) inside of an HTML open tag."""
    ctx = data.context
    # Whitespace terminates an unquoted attribute value, or a quoted one
    # whose closing quote was already seen (CX_NOTE_SPACE with CX_QUOTED).
    end_of_value = ctx & data.CX_ATTR_VALUE and not ctx & (data.CX_QUOTED | data.CX_NOTE_QUOTE)
    if end_of_value or (ctx & data.CX_QUOTED and ctx & data.CX_NOTE_SPACE):
        self._push_tag_buffer(data)
        data.context = data.CX_ATTR_READY
    elif ctx & data.CX_NOTE_SPACE:
        data.context = data.CX_ATTR_READY
    elif ctx & data.CX_ATTR_NAME:
        # Space after an attribute name: remember it in case an "=" follows.
        data.context |= data.CX_NOTE_EQUALS
        data.padding_buffer["before_eq"] += text
    # Inside quotes the space is literal text; otherwise it is padding and
    # goes into the buffer slot matching the current position in the tag.
    if ctx & data.CX_QUOTED and not ctx & data.CX_NOTE_SPACE:
        self._emit_text(text)
    elif data.context & data.CX_ATTR_READY:
        data.padding_buffer["first"] += text
    elif data.context & data.CX_ATTR_VALUE:
        data.padding_buffer["after_eq"] += text

def _handle_tag_text(self, text):
    """Handle regular *text* inside of an HTML open tag.

    Recursively parses templates, wikilinks, and nested tags when the
    recursion budget allows; otherwise the text is emitted verbatim.
    """
    ahead = self._read(1)
    if not self._can_recurse() or text not in self.MARKERS:
        self._emit_text(text)
    elif text == "{" and ahead == "{":
        self._parse_template_or_argument()
    elif text == "[" and ahead == "[":
        self._parse_wikilink()
    elif text == "<":
        self._parse_tag()
    else:
        self._emit_text(text)

def _handle_tag_data(self, data, text):
    """Handle all sorts of *text* data inside of an HTML open tag.

    *text* is split on the tag-splitting pattern and each chunk is routed
    according to the current attribute-parsing context in *data*.
    """
    for chunk in self.tag_splitter.split(text):
        if not chunk:
            continue
        if data.context & data.CX_NAME:
            if chunk in self.MARKERS or chunk.isspace():
                self._fail_route()  # Tags must start with text, not spaces
            data.context = data.CX_NOTE_SPACE
        elif chunk.isspace():
            self._handle_tag_space(data, chunk)
            continue
        elif data.context & data.CX_NOTE_SPACE:
            if data.context & data.CX_QUOTED:
                # Non-space text right after a closing quote means the
                # "quote" was fake; rewind and re-parse the value unquoted.
                data.context = data.CX_ATTR_VALUE
                self._pop()
                self._head = data.reset - 1  # Will be auto-incremented
                return  # Break early
            self._fail_route()
        elif data.context & data.CX_ATTR_READY:
            data.context = data.CX_ATTR_NAME
            self._push(contexts.TAG_ATTR)
        elif data.context & data.CX_ATTR_NAME:
            if chunk == "=":
                data.context = data.CX_ATTR_VALUE | data.CX_NOTE_QUOTE
                self._emit(tokens.TagAttrEquals())
                continue
            if data.context & data.CX_NOTE_EQUALS:
                # A space (no "=") then more text: the previous attribute
                # had no value, so flush it and begin a new attribute.
                self._push_tag_buffer(data)
                data.context = data.CX_ATTR_NAME
                self._push(contexts.TAG_ATTR)
        elif data.context & data.CX_ATTR_VALUE:
            # A quote is escaped when preceded by exactly one backslash.
            escaped = self._read(-1) == "\\" and self._read(-2) != "\\"
            if data.context & data.CX_NOTE_QUOTE:
                data.context ^= data.CX_NOTE_QUOTE
                if chunk == '"' and not escaped:
                    # Value opens with a quote: gather it on a fresh stack.
                    data.context |= data.CX_QUOTED
                    self._push(self._context)
                    data.reset = self._head
                    continue
            elif data.context & data.CX_QUOTED:
                if chunk == '"' and not escaped:
                    data.context |= data.CX_NOTE_SPACE
                    continue
        self._handle_tag_text(chunk)

def _handle_tag_close_open(self, data, token):
    """Handle the closing of an open tag (``<foo>``).

    *token* is the token class to emit (``TagCloseOpen`` for ``>`` or
    ``TagCloseSelfclose`` for ``/>``); it carries the pending padding.
    """
    if data.context & (data.CX_ATTR_NAME | data.CX_ATTR_VALUE):
        # An attribute is still being collected; flush it before closing.
        self._push_tag_buffer(data)
    self._emit(token(padding=data.padding_buffer["first"]))
    self._head += 1

def _handle_tag_open_close(self):
    """Handle the opening of a closing tag (``</foo>``)."""
    self._emit(tokens.TagOpenClose())
    # The closing tag's name is parsed on its own stack so it can later be
    # compared against the opening tag's name.
    self._push(contexts.TAG_CLOSE)
    self._head += 1

def _handle_tag_close_close(self):
    """Handle the ending of a closing tag (``</foo>``).

    Fails the route unless the closing tag's name is a single text token
    matching the opening tag's name (case-insensitively, ignoring
    trailing whitespace); otherwise emits the close and pops the tag.
    """
    def normalize(tok):
        # Tag names compare case-insensitively; trailing space is padding.
        return tok.text.rstrip().lower()

    closing = self._pop()
    if len(closing) != 1 or (not isinstance(closing[0], tokens.Text) or
                             normalize(closing[0]) != normalize(self._stack[1])):
        self._fail_route()
    self._emit_all(closing)
    self._emit(tokens.TagCloseClose())
    return self._pop()

def _handle_blacklisted_tag(self):
    """Handle the body of an HTML tag that is parser-blacklisted.

    Everything up to the closing tag is emitted as plain text, with no
    recursive parsing of templates, links, or nested tags.
    """
    while True:
        this = self._read()
        following = self._read(1)
        self._head += 1
        if this == "<" and following == "/":
            self._handle_tag_open_close()
            return self._parse(push=False)
        elif this is self.END:
            self._fail_route()
        else:
            self._emit_text(this)

def _handle_single_only_tag_end(self):
    """Handle the end of an implicitly closing single-only HTML tag."""
    # Replace the TagCloseOpen just emitted with an implicit self-close
    # token carrying the same padding.
    padding = self._stack.pop().padding
    self._emit(tokens.TagCloseSelfclose(padding=padding, implicit=True))
    self._head -= 1  # Offset displacement done by _handle_tag_close_open
    return self._pop()

def _handle_single_tag_end(self):
    """Handle the stream end when inside a single-supporting HTML tag.

    Rewrites the tag's TagCloseOpen token in place as an implicit
    self-closing token, preserving its padding, then pops the stack.
    """
    index = next(i for i, tok in enumerate(self._stack)
                 if isinstance(tok, tokens.TagCloseOpen))
    original = self._stack[index]
    self._stack[index] = tokens.TagCloseSelfclose(padding=original.padding,
                                                  implicit=True)
    return self._pop()

def _really_parse_tag(self):
    """Actually parse an HTML tag, starting with the open (``<foo>``)."""
    data = _TagOpenData()
    self._push(contexts.TAG_OPEN)
    self._emit(tokens.TagOpenOpen())
    while True:
        this, next = self._read(), self._read(1)
        # The open tag may only be closed once a name has been read and we
        # are not inside an (unfinished) quoted attribute value.
        can_exit = (not data.context & (data.CX_QUOTED | data.CX_NAME) or
                    data.context & data.CX_NOTE_SPACE)
        if this is self.END:
            if self._context & contexts.TAG_ATTR:
                if data.context & data.CX_QUOTED:
                    # Unclosed attribute quote: reset, don't die
                    data.context = data.CX_ATTR_VALUE
                    self._pop()
                    self._head = data.reset
                    continue
                self._pop()
            self._fail_route()
        elif this == ">" and can_exit:
            self._handle_tag_close_open(data, tokens.TagCloseOpen)
            self._context = contexts.TAG_BODY
            if is_single_only(self._stack[1].text):
                # e.g. <br>: closes implicitly, has no body.
                return self._handle_single_only_tag_end()
            if is_parsable(self._stack[1].text):
                return self._parse(push=False)
            return self._handle_blacklisted_tag()
        elif this == "/" and next == ">" and can_exit:
            self._handle_tag_close_open(data, tokens.TagCloseSelfclose)
            return self._pop()
        else:
            self._handle_tag_data(data, this)
        self._head += 1

def _handle_invalid_tag_start(self):
    """Handle the (possible) start of an implicitly closing single tag.

    Called on ``</`` outside a tag body. Only single-only tags (e.g.
    ``</br>``) are accepted; anything else renders ``</`` as plain text.
    """
    reset = self._head + 1
    self._head += 2
    try:
        if not is_single_only(self.tag_splitter.split(self._read())[0]):
            raise BadRoute()
        tag = self._really_parse_tag()
    except BadRoute:
        self._head = reset
        self._emit_text("</")
    else:
        tag[0].invalid = True  # Set flag of TagOpenOpen
        self._emit_all(tag)

def _parse_tag(self):
    """Parse an HTML tag at the head of the wikicode string.

    Falls back to emitting a literal ``<`` when no valid tag is found.
    """
    start = self._head
    self._head += 1
    try:
        tag = self._really_parse_tag()
    except BadRoute:
        self._head = start
        self._emit_text("<")
        return
    self._emit_all(tag)

def _handle_end(self):
    """Handle the end of the stream of wikitext."""
    fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
            contexts.HEADING | contexts.COMMENT | contexts.TAG)
    double_fail = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
    if self._context & fail:
        if self._context & contexts.TAG_BODY:
            if is_single(self._stack[1].text):
                # Tags like <li> may legally be left unclosed.
                return self._handle_single_tag_end()
        if self._context & double_fail:
            # These contexts hold two stacks; discard the inner one first.
            self._pop()
        self._fail_route()
    return self._pop()

def _verify_safe(self, this):
"""Make sure we are not trying to write an invalid character."""
context = self._context
@@ -414,7 +654,7 @@ class Tokenizer(object):
elif this == "\n" or this == "[" or this == "}":
return False
return True
if context & contexts.TEMPLATE_NAME:
elif context & contexts.TEMPLATE_NAME:
if this == "{" or this == "}" or this == "[":
self._context |= contexts.FAIL_NEXT
return True
@@ -432,6 +672,8 @@ class Tokenizer(object):
elif this is self.END or not this.isspace():
self._context |= contexts.HAS_TEXT
return True
elif context & contexts.TAG_CLOSE:
return this != "<"
else:
if context & contexts.FAIL_ON_EQUALS:
if this == "=":
@@ -458,44 +700,38 @@ class Tokenizer(object):
self._context |= contexts.FAIL_ON_RBRACE
return True

def _parse(self, context=0):
def _parse(self, context=0, push=True):
"""Parse the wikicode string, using *context* for when to stop."""
self._push(context)
unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME |
contexts.TAG_CLOSE)
double_unsafe = (contexts.TEMPLATE_PARAM_KEY | contexts.TAG_CLOSE)
if push:
self._push(context)
while True:
this = self._read()
unsafe = (contexts.TEMPLATE_NAME | contexts.WIKILINK_TITLE |
contexts.TEMPLATE_PARAM_KEY | contexts.ARGUMENT_NAME)
if self._context & unsafe:
if not self._verify_safe(this):
if self._context & contexts.TEMPLATE_PARAM_KEY:
if self._context & double_unsafe:
self._pop()
self._fail_route()
if this not in self.MARKERS:
self._write_text(this)
self._emit_text(this)
self._head += 1
continue
if this is self.END:
fail = (contexts.TEMPLATE | contexts.ARGUMENT |
contexts.WIKILINK | contexts.HEADING |
contexts.COMMENT)
if self._context & contexts.TEMPLATE_PARAM_KEY:
self._pop()
if self._context & fail:
self._fail_route()
return self._pop()
return self._handle_end()
next = self._read(1)
if self._context & contexts.COMMENT:
if this == next == "-" and self._read(2) == ">":
return self._pop()
else:
self._write_text(this)
self._emit_text(this)
elif this == next == "{":
if self._can_recurse():
self._parse_template_or_argument()
if self._context & contexts.FAIL_NEXT:
self._context ^= contexts.FAIL_NEXT
else:
self._write_text("{")
self._emit_text("{")
elif this == "|" and self._context & contexts.TEMPLATE:
self._handle_template_param()
elif this == "=" and self._context & contexts.TEMPLATE_PARAM_KEY:
@@ -508,14 +744,12 @@ class Tokenizer(object):
if self._read(2) == "}":
return self._handle_argument_end()
else:
self._write_text("}")
self._emit_text("}")
elif this == next == "[":
if not self._context & contexts.WIKILINK_TITLE and self._can_recurse():
self._parse_wikilink()
if self._context & contexts.FAIL_NEXT:
self._context ^= contexts.FAIL_NEXT
else:
self._write_text("[")
self._emit_text("[")
elif this == "|" and self._context & contexts.WIKILINK_TITLE:
self._handle_wikilink_separator()
elif this == next == "]" and self._context & contexts.WIKILINK:
@@ -524,7 +758,7 @@ class Tokenizer(object):
if self._read(-1) in ("\n", self.START):
self._parse_heading()
else:
self._write_text("=")
self._emit_text("=")
elif this == "=" and self._context & contexts.HEADING:
return self._handle_heading_end()
elif this == "\n" and self._context & contexts.HEADING:
@@ -535,9 +769,21 @@ class Tokenizer(object):
if self._read(2) == self._read(3) == "-":
self._parse_comment()
else:
self._write_text(this)
self._emit_text(this)
elif this == "<" and next == "/" and self._read(2) is not self.END:
if self._context & contexts.TAG_BODY:
self._handle_tag_open_close()
else:
self._handle_invalid_tag_start()
elif this == "<":
if not self._context & contexts.TAG_CLOSE and self._can_recurse():
self._parse_tag()
else:
self._emit_text("<")
elif this == ">" and self._context & contexts.TAG_CLOSE:
return self._handle_tag_close_close()
else:
self._write_text(this)
self._emit_text(this)
self._head += 1

def tokenize(self, text):


+ 4
- 0
mwparserfromhell/parser/tokens.py View File

@@ -63,6 +63,10 @@ class Token(object):
def __delattr__(self, key):
    """Delete the value for *key* from the token's internal kwargs store."""
    del self._kwargs[key]

def get(self, key, default=None):
    """Same as :py:meth:`__getattr__`, but has a *default* if missing."""
    try:
        return self._kwargs[key]
    except KeyError:
        return default


def make(name):
"""Create a new Token class using ``type()`` and add it to ``__all__``."""


+ 76
- 0
mwparserfromhell/tag_defs.py View File

@@ -0,0 +1,76 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2013 Ben Kurtovic <ben.kurtovic@verizon.net>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""Contains data regarding certain HTML tags."""

from __future__ import unicode_literals

__all__ = ["get_wikicode", "is_parsable", "is_visible", "is_single",
           "is_single_only"]

# Extension tags whose bodies are not regular wikicode and therefore must
# not be fed back into the parser (enwiki extensions @ 2013-06-28).
PARSER_BLACKLIST = [
    "categorytree", "gallery", "hiero", "imagemap", "inputbox", "math",
    "nowiki", "pre", "score", "section", "source", "syntaxhighlight",
    "templatedata", "timeline"
]

# Extension tags that produce no visible text (enwiki extensions
# @ 2013-06-28).
INVISIBLE_TAGS = [
    "categorytree", "gallery", "imagemap", "inputbox", "math", "score",
    "section", "templatedata", "timeline"
]

# Tags that must never have a closing tag, and the superset that may omit
# one. [mediawiki/core.git]/includes/Sanitizer.php @ 87a0aef762
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]

# Wikicode markup equivalent to each HTML tag; "close" is absent when the
# wiki markup has no closing counterpart.
WIKICODE = {
    "i": {"open": "''", "close": "''"},
    "b": {"open": "'''", "close": "'''"},
    "ul": {"open": "*"},
    "ol": {"open": "#"},
    "dt": {"open": ";"},
    "dd": {"open": ":"},
    "hr": {"open": "----"},
}

def is_parsable(tag):
    """Return if the given *tag*'s contents should be passed to the parser."""
    return tag.lower() not in PARSER_BLACKLIST

def is_visible(tag):
    """Return whether or not the given *tag* contains visible text."""
    return tag.lower() not in INVISIBLE_TAGS

def is_single(tag):
    """Return whether or not the given *tag* can exist without a close tag."""
    return tag.lower() in SINGLE

def is_single_only(tag):
    """Return whether or not the given *tag* must exist without a close tag."""
    return tag.lower() in SINGLE_ONLY

def get_wikicode(tag):
    """Return the appropriate wikicode before and after the given *tag*."""
    codes = WIKICODE[tag.lower()]
    return (codes.get("open"), codes.get("close"))

+ 2
- 0
mwparserfromhell/utils.py View File

@@ -31,6 +31,8 @@ from .compat import bytes, str
from .nodes import Node
from .smart_list import SmartList

__all__ = ["parse_anything"]

def parse_anything(value):
"""Return a :py:class:`~.Wikicode` for *value*, allowing multiple types.



+ 17
- 1
tests/_test_tree_equality.py View File

@@ -91,7 +91,23 @@ class TreeEqualityTestCase(TestCase):

def assertTagNodeEqual(self, expected, actual):
    """Assert that two Tag nodes have the same data.

    Compares the tag name, optional contents, each attribute (name,
    optional value, quoting, padding), and the tag-level flags/padding.
    """
    self.assertWikicodeEqual(expected.tag, actual.tag)
    if expected.contents is not None:
        self.assertWikicodeEqual(expected.contents, actual.contents)
    length = len(expected.attributes)
    self.assertEqual(length, len(actual.attributes))
    for i in range(length):
        exp_attr = expected.attributes[i]
        act_attr = actual.attributes[i]
        self.assertWikicodeEqual(exp_attr.name, act_attr.name)
        if exp_attr.value is not None:
            self.assertWikicodeEqual(exp_attr.value, act_attr.value)
        self.assertIs(exp_attr.quoted, act_attr.quoted)
        # Fixed typo: was ``exp.attr.padding``, which raised AttributeError
        # (no local named ``exp``) instead of comparing the paddings.
        self.assertEqual(exp_attr.padding, act_attr.padding)
    self.assertIs(expected.showtag, actual.showtag)
    self.assertIs(expected.self_closing, actual.self_closing)
    self.assertEqual(expected.padding, actual.padding)
    self.assertWikicodeEqual(expected.closing_tag, actual.closing_tag)

def assertTemplateNodeEqual(self, expected, actual):
"""Assert that two Template nodes have the same data."""


+ 0
- 0
View File


+ 12
- 0
tests/test_builder.py View File

@@ -198,6 +198,18 @@ class TestBuilder(TreeEqualityTestCase):
for test, valid in tests:
self.assertWikicodeEqual(valid, self.builder.build(test))

def test_tag(self):
    """tests for building Tag nodes"""
    tests = [
        # Token stream for "<ref></ref>": open tag, no attributes or
        # padding, matching close tag.
        ([tokens.TagOpenOpen(), tokens.Text(text="ref"),
          tokens.TagCloseOpen(padding=""), tokens.TagOpenClose(),
          tokens.Text(text="ref"), tokens.TagCloseClose()],
         wrap([Tag(wraptext("ref"), wrap([]), [], True, False, "",
                   wraptext("ref"))])),
    ]
    for test, valid in tests:
        self.assertWikicodeEqual(valid, self.builder.build(test))

def test_integration(self):
"""a test for building a combination of templates together"""
# {{{{{{{{foo}}bar|baz=biz}}buzz}}usr|{{bin}}}}


+ 0
- 0
View File


+ 7
- 0
tests/tokenizer/integration.mwtest View File

@@ -33,6 +33,13 @@ output: [Text(text="&n"), CommentStart(), Text(text="foo"), CommentEnd(), Text(t

---

name: rich_tags
label: a HTML tag with tons of other things in it
input: "{{dubious claim}}<ref name={{abc}} foo="bar {{baz}}" abc={{de}}f ghi=j{{k}}{{l}} \n mno = "{{p}} [[q]] {{r}}">[[Source]]</ref>"
output: [TemplateOpen(), Text(text="dubious claim"), TemplateClose(), TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TemplateOpen(), Text(text="abc"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="foo"), TagAttrEquals(), TagAttrQuote(), Text(text="bar "), TemplateOpen(), Text(text="baz"), TemplateClose(), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="abc"), TagAttrEquals(), TemplateOpen(), Text(text="de"), TemplateClose(), Text(text="f"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="ghi"), TagAttrEquals(), Text(text="j"), TemplateOpen(), Text(text="k"), TemplateClose(), TemplateOpen(), Text(text="l"), TemplateClose(), TagAttrStart(pad_first=" \n ", pad_before_eq=" ", pad_after_eq=" "), Text(text="mno"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="p"), TemplateClose(), Text(text=" "), WikilinkOpen(), Text(text="q"), WikilinkClose(), Text(text=" "), TemplateOpen(), Text(text="r"), TemplateClose(), TagCloseOpen(padding=""), WikilinkOpen(), Text(text="Source"), WikilinkClose(), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: wildcard
label: a wildcard assortment of various things
input: "{{{{{{{{foo}}bar|baz=biz}}buzz}}usr|{{bin}}}}"


+ 529
- 0
tests/tokenizer/tags.mwtest View File

@@ -0,0 +1,529 @@
name: basic
label: a basic tag with an open and close
input: "<ref></ref>"
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: basic_selfclosing
label: a basic self-closing tag
input: "<ref/>"
output: [TagOpenOpen(), Text(text="ref"), TagCloseSelfclose(padding="")]

---

name: content
label: a tag with some content in the middle
input: "<ref>this is a reference</ref>"
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), Text(text="this is a reference"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: padded_open
label: a tag with some padding in the open tag
input: "<ref ></ref>"
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=" "), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: padded_close
label: a tag with some padding in the close tag
input: "<ref></ref >"
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref "), TagCloseClose()]

---

name: padded_selfclosing
label: a self-closing tag with padding
input: "<ref />"
output: [TagOpenOpen(), Text(text="ref"), TagCloseSelfclose(padding=" ")]

---

name: attribute
label: a tag with a single attribute
input: "<ref name></ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_value
label: a tag with a single attribute with a value
input: "<ref name=foo></ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_quoted
label: a tag with a single quoted attribute
input: "<ref name="foo bar"></ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_hyphen
label: a tag with a single attribute, containing a hyphen
input: "<ref name=foo-bar></ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_quoted_hyphen
label: a tag with a single quoted attribute, containing a hyphen
input: "<ref name="foo-bar"></ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo-bar"), TagCloseOpen(padding=""), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: attribute_selfclosing
label: a self-closing tag with a single attribute
input: "<ref name/>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagCloseSelfclose(padding="")]

---

name: attribute_selfclosing_value
label: a self-closing tag with a single attribute with a value
input: "<ref name=foo/>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), Text(text="foo"), TagCloseSelfclose(padding="")]

---

name: attribute_selfclosing_value_quoted
label: a self-closing tag with a single quoted attribute
input: "<ref name="foo"/>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="foo"), TagCloseSelfclose(padding="")]

---

name: nested_tag
label: a tag nested within the attributes of another
input: "<ref name=<span style="color: red;">foo</span>>citation</ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: nested_tag_quoted
label: a tag nested within the attributes of another, quoted
input: "<ref name="<span style="color: red;">foo</span>">citation</ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), TagAttrQuote(), Text(text="color: red;"), TagCloseOpen(padding=""), Text(text="foo"), TagOpenClose(), Text(text="span"), TagCloseClose(), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: nested_troll_tag
label: a bogus tag that appears to be nested within the attributes of another
input: "<ref name=</ ><//>>citation</ref>"
output: [Text(text="<ref name=</ ><//>>citation</ref>")]

---

name: nested_troll_tag_quoted
label: a bogus tag that appears to be nested within the attributes of another, quoted
input: "<ref name="</ ><//>">citation</ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="name"), TagAttrEquals(), TagAttrQuote(), Text(text="</ ><//>"), TagCloseOpen(padding=""), Text(text="citation"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: invalid_space_begin_open
label: invalid tag: a space at the beginning of the open tag
input: "< ref>test</ref>"
output: [Text(text="< ref>test</ref>")]

---

name: invalid_space_begin_close
label: invalid tag: a space at the beginning of the close tag
input: "<ref>test</ ref>"
output: [Text(text="<ref>test</ ref>")]

---

name: valid_space_end
label: valid tag: spaces at the ends of both the open and close tags
input: "<ref >test</ref >"
output: [TagOpenOpen(), Text(text="ref"), TagCloseOpen(padding=" "), Text(text="test"), TagOpenClose(), Text(text="ref "), TagCloseClose()]

---

name: invalid_template_ends
label: invalid tag: a template at the ends of both the open and close tags
input: "<ref {{foo}}>test</ref {{foo}}>"
output: [Text(text="<ref "), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text=">test</ref "), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text=">")]

---

name: invalid_template_ends_nospace
label: invalid tag: a template at the ends of both the open and close tags, without spacing
input: "<ref {{foo}}>test</ref{{foo}}>"
output: [Text(text="<ref "), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text=">test</ref"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text=">")]

---

name: valid_template_end_open
label: valid tag: a template at the end of the open tag
input: "<ref {{foo}}>test</ref>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding=""), Text(text="test"), TagOpenClose(), Text(text="ref"), TagCloseClose()]

---

name: valid_template_end_open_space_end_close
label: valid tag: a template at the end of the open tag; whitespace at the end of the close tag
input: "<ref {{foo}}>test</ref\n>"
output: [TagOpenOpen(), Text(text="ref"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), TemplateOpen(), Text(text="foo"), TemplateClose(), TagCloseOpen(padding=""), Text(text="test"), TagOpenClose(), Text(text="ref\n"), TagCloseClose()]

---

name: invalid_template_end_open_nospace
label: invalid tag: a template at the end of the open tag, without spacing
input: "<ref{{foo}}>test</ref>"
output: [Text(text="<ref"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text=">test</ref>")]

---

name: invalid_template_start_close
label: invalid tag: a template at the beginning of the close tag
input: "<ref>test</{{foo}}ref>"
output: [Text(text="<ref>test</"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text="ref>")]

---

name: invalid_template_start_open
label: invalid tag: a template at the beginning of the open tag
input: "<{{foo}}ref>test</ref>"
output: [Text(text="<"), TemplateOpen(), Text(text="foo"), TemplateClose(), Text(text="ref>test</ref>")]

---

name: unclosed_quote
label: a quoted attribute that is never closed
input: "<span style="foobar>stuff</span>"
output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foobar"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]

---

name: fake_quote
label: a fake quoted attribute
input: "<span style="foo"bar>stuff</span>"
output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo\"bar"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]

---

name: fake_quote_complex
label: a fake quoted attribute, with spaces and templates and links
input: "<span style="foo {{bar}}\n[[baz]]"buzz >stuff</span>"
output: [TagOpenOpen(), Text(text="span"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="style"), TagAttrEquals(), Text(text="\"foo"), TagAttrStart(pad_first=" ", pad_before_eq="\n", pad_after_eq=""), TemplateOpen(), Text(text="bar"), TemplateClose(), TagAttrStart(pad_first="", pad_before_eq=" ", pad_after_eq=""), WikilinkOpen(), Text(text="baz"), WikilinkClose(), Text(text="\"buzz"), TagCloseOpen(padding=""), Text(text="stuff"), TagOpenClose(), Text(text="span"), TagCloseClose()]

---

name: incomplete_lbracket
label: incomplete tags: just a left bracket
input: "<"
output: [Text(text="<")]

---

name: incomplete_lbracket_junk
label: incomplete tags: just a left bracket, surrounded by stuff
input: "foo<bar"
output: [Text(text="foo<bar")]

---

name: incomplete_unclosed_open
label: incomplete tags: an unclosed open tag
input: "junk <ref"
output: [Text(text="junk <ref")]

---

name: incomplete_unclosed_open_space
label: incomplete tags: an unclosed open tag, space
input: "junk <ref "
output: [Text(text="junk <ref ")]

---

name: incomplete_unclosed_open_unnamed_attr
label: incomplete tags: an unclosed open tag, unnamed attribute
input: "junk <ref name"
output: [Text(text="junk <ref name")]

---

name: incomplete_unclosed_open_attr_equals
label: incomplete tags: an unclosed open tag, attribute, equal sign
input: "junk <ref name="
output: [Text(text="junk <ref name=")]

---

name: incomplete_unclosed_open_attr_equals_quoted
label: incomplete tags: an unclosed open tag, attribute, equal sign, quote
input: "junk <ref name=""
output: [Text(text="junk <ref name=\"")]

---

name: incomplete_unclosed_open_attr
label: incomplete tags: an unclosed open tag, attribute with a key/value
input: "junk <ref name=foo"
output: [Text(text="junk <ref name=foo")]

---

name: incomplete_unclosed_open_attr_quoted
label: incomplete tags: an unclosed open tag, attribute with a key/value, quoted
input: "junk <ref name="foo""
output: [Text(text="junk <ref name=\"foo\"")]

---

name: incomplete_open
label: incomplete tags: an open tag
input: "junk <ref>"
output: [Text(text="junk <ref>")]

---

name: incomplete_open_unnamed_attr
label: incomplete tags: an open tag, unnamed attribute
input: "junk <ref name>"
output: [Text(text="junk <ref name>")]

---

name: incomplete_open_attr_equals
label: incomplete tags: an open tag, attribute, equal sign
input: "junk <ref name=>"
output: [Text(text="junk <ref name=>")]

---

name: incomplete_open_attr
label: incomplete tags: an open tag, attribute with a key/value
input: "junk <ref name=foo>"
output: [Text(text="junk <ref name=foo>")]

---

name: incomplete_open_attr_quoted
label: incomplete tags: an open tag, attribute with a key/value, quoted
input: "junk <ref name="foo">"
output: [Text(text="junk <ref name=\"foo\">")]

---

name: incomplete_open_text
label: incomplete tags: an open tag, text
input: "junk <ref>foo"
output: [Text(text="junk <ref>foo")]

---

name: incomplete_open_attr_text
label: incomplete tags: an open tag, attribute with a key/value, text
input: "junk <ref name=foo>bar"
output: [Text(text="junk <ref name=foo>bar")]

---

name: incomplete_open_text_lbracket
label: incomplete tags: an open tag, text, left open bracket
input: "junk <ref>bar<"
output: [Text(text="junk <ref>bar<")]

---

name: incomplete_open_text_lbracket_slash
label: incomplete tags: an open tag, text, left bracket, slash
input: "junk <ref>bar</"
output: [Text(text="junk <ref>bar</")]

---

name: incomplete_open_text_unclosed_close
label: incomplete tags: an open tag, text, unclosed close
input: "junk <ref>bar</ref"
output: [Text(text="junk <ref>bar</ref")]

---

name: incomplete_open_text_wrong_close
label: incomplete tags: an open tag, text, wrong close
input: "junk <ref>bar</span>"
output: [Text(text="junk <ref>bar</span>")]

---

name: incomplete_close
label: incomplete tags: a close tag
input: "junk </ref>"
output: [Text(text="junk </ref>")]

---

name: incomplete_no_tag_name_open
label: incomplete tags: no tag name within brackets; just an open
input: "junk <>"
output: [Text(text="junk <>")]

---

name: incomplete_no_tag_name_selfclosing
label: incomplete tags: no tag name within brackets; self-closing
input: "junk < />"
output: [Text(text="junk < />")]

---

name: incomplete_no_tag_name_open_close
label: incomplete tags: no tag name within brackets; open and close
input: "junk <></>"
output: [Text(text="junk <></>")]

---

name: backslash_premature_before
label: a backslash before a quote before a space
input: "<foo attribute="this is\\" quoted">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\" quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_premature_after
label: a backslash before a quote after a space
input: "<foo attribute="this is \\"quoted">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is \\\"quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_premature_middle
label: a backslash before a quote in the middle of a word
input: "<foo attribute="this i\\"s quoted">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this i\\\"s quoted"), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_adjacent
label: escaped quotes next to unescaped quotes
input: "<foo attribute="\\"this is quoted\\"">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="\\\"this is quoted\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_endquote
label: backslashes before the end quote, causing the attribute to become unquoted
input: "<foo attribute="this_is quoted\\">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), Text(text="\"this_is"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_double
label: two adjacent backslashes, which do *not* affect the quote
input: "<foo attribute="this is\\\\" quoted">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_triple
label: three adjacent backslashes, which do *not* affect the quote
input: "<foo attribute="this is\\\\\\" quoted">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="this is\\\\\\"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="quoted\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: backslash_unaffecting
label: backslashes near quotes, but not immediately adjacent, thus having no effect
input: "<foo attribute="\\quote\\d" also="quote\\d\\">blah</foo>"
output: [TagOpenOpen(), Text(text="foo"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attribute"), TagAttrEquals(), TagAttrQuote(), Text(text="\\quote\\d"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="also"), TagAttrEquals(), Text(text="\"quote\\d\\\""), TagCloseOpen(padding=""), Text(text="blah"), TagOpenClose(), Text(text="foo"), TagCloseClose()]

---

name: unparsable
label: a tag that should not be put through the normal parser
input: "{{t1}}<nowiki>{{t2}}</nowiki>{{t3}}"
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagCloseOpen(padding=""), Text(text="{{t2}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t3"), TemplateClose()]

---

name: unparsable_complex
label: a tag that should not be put through the normal parser; lots of stuff inside
input: "{{t1}}<pre>{{t2}}\n==Heading==\nThis is some text with a [[page|link]].</pre>{{t3}}"
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="pre"), TagCloseOpen(padding=""), Text(text="{{t2}}\n==Heading==\nThis is some text with a [[page|link]]."), TagOpenClose(), Text(text="pre"), TagCloseClose(), TemplateOpen(), Text(text="t3"), TemplateClose()]

---

name: unparsable_attributed
label: a tag that should not be put through the normal parser; parsed attributes
input: "{{t1}}<nowiki attr=val attr2="{{val2}}">{{t2}}</nowiki>{{t3}}"
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), TagOpenOpen(), Text(text="nowiki"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr"), TagAttrEquals(), Text(text="val"), TagAttrStart(pad_first=" ", pad_before_eq="", pad_after_eq=""), Text(text="attr2"), TagAttrEquals(), TagAttrQuote(), TemplateOpen(), Text(text="val2"), TemplateClose(), TagCloseOpen(padding=""), Text(text="{{t2}}"), TagOpenClose(), Text(text="nowiki"), TagCloseClose(), TemplateOpen(), Text(text="t3"), TemplateClose()]

---

name: unparsable_incomplete
label: a tag that should not be put through the normal parser; incomplete
input: "{{t1}}<nowiki>{{t2}}{{t3}}"
output: [TemplateOpen(), Text(text="t1"), TemplateClose(), Text(text="<nowiki>"), TemplateOpen(), Text(text="t2"), TemplateClose(), TemplateOpen(), Text(text="t3"), TemplateClose()]

---

name: single_open_close
label: a tag that supports being single; both an open and a close tag
input: "foo<li>bar{{baz}}</li>"
output: [Text(text="foo"), TagOpenOpen(), Text(text="li"), TagCloseOpen(padding=""), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose(), TagOpenClose(), Text(text="li"), TagCloseClose()]

---

name: single_open
label: a tag that supports being single; just an open tag
input: "foo<li>bar{{baz}}"
output: [Text(text="foo"), TagOpenOpen(), Text(text="li"), TagCloseSelfclose(padding="", implicit=True), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()]

---

name: single_selfclose
label: a tag that supports being single; a self-closing tag
input: "foo<li/>bar{{baz}}"
output: [Text(text="foo"), TagOpenOpen(), Text(text="li"), TagCloseSelfclose(padding=""), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()]

---

name: single_close
label: a tag that supports being single; just a close tag
input: "foo</li>bar{{baz}}"
output: [Text(text="foo</li>bar"), TemplateOpen(), Text(text="baz"), TemplateClose()]

---

name: single_only_open_close
label: a tag that can only be single; both an open and a close tag
input: "foo<br>bar{{baz}}</br>"
output: [Text(text="foo"), TagOpenOpen(), Text(text="br"), TagCloseSelfclose(padding="", implicit=True), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose(), TagOpenOpen(invalid=True), Text(text="br"), TagCloseSelfclose(padding="", implicit=True)]

---

name: single_only_open
label: a tag that can only be single; just an open tag
input: "foo<br>bar{{baz}}"
output: [Text(text="foo"), TagOpenOpen(), Text(text="br"), TagCloseSelfclose(padding="", implicit=True), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()]

---

name: single_only_selfclose
label: a tag that can only be single; a self-closing tag
input: "foo<br/>bar{{baz}}"
output: [Text(text="foo"), TagOpenOpen(), Text(text="br"), TagCloseSelfclose(padding=""), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()]

---

name: single_only_close
label: a tag that can only be single; just a close tag
input: "foo</br>bar{{baz}}"
output: [Text(text="foo"), TagOpenOpen(invalid=True), Text(text="br"), TagCloseSelfclose(padding="", implicit=True), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()]

---

name: single_only_double
label: a tag that can only be single; a tag with slashes at the beginning and end
input: "foo</br/>bar{{baz}}"
output: [Text(text="foo"), TagOpenOpen(invalid=True), Text(text="br"), TagCloseSelfclose(padding=""), Text(text="bar"), TemplateOpen(), Text(text="baz"), TemplateClose()]

+ 7
- 0
tests/tokenizer/text.mwtest View File

@@ -23,3 +23,10 @@ name: unicode2
label: additional unicode check for non-BMP codepoints
input: "𐌲𐌿𐍄𐌰𐍂𐌰𐌶𐌳𐌰"
output: [Text(text="𐌲𐌿𐍄𐌰𐍂𐌰𐌶𐌳𐌰")]

---

name: large
label: a lot of text, requiring multiple textbuffer blocks in the C tokenizer
input: "ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN"
output: [Text(text="ZWfsZYcZyhGbkDYJiguJuuhsNyHGFkFhnjkbLJyXIygTHqcXdhsDkEOTSIKYlBiohLIkiXxvyebUyCGvvBcYqFdtcftGmaAanKXEIyYSEKlTfEEbdGhdePVwVImOyKiHSzAEuGyEVRIKPZaNjQsYqpqARIQfvAklFtQyTJVGlLwjJIxYkiqmHBmdOvTyNqJRbMvouoqXRyOhYDwowtkcZGSOcyzVxibQdnzhDYbrgbatUrlOMRvFSzmLWHRihtXnddwYadPgFWUOxAzAgddJVDXHerawdkrRuWaEXfuwQSkQUmLEJUmrgXDVlXCpciaisfuOUjBldElygamkkXbewzLucKRnAEBimIIotXeslRRhnqQjrypnLQvvdCsKFWPVTZaHvzJMFEahDHWcCbyXgxFvknWjhVfiLSDuFhGoFxqSvhjnnRZLmCMhmWeOgSoanDEInKTWHnbpKyUlabLppITDFFxyWKAnUYJQIcmYnrvMmzmtYvsbCYbebgAhMFVVFAKUSvlkLFYluDpbpBaNFWyfXTaOdSBrfiHDTWGBTUCXMqVvRCIMrEjWpQaGsABkioGnveQWqBTDdRQlxQiUipwfyqAocMddXqdvTHhEwjEzMkOSWVPjJvDtClhYwpvRztPmRKCSpGIpXQqrYtTLmShFdpKtOxGtGOZYIdyUGPjdmyvhJTQMtgYJWUUZnecRjBfQXsyWQWikyONySLzLEqRFqcJYdRNFcGwWZtfZasfFWcvdsHRXoqKlKYihRAOJdrPBDdxksXFwKceQVncmFXfUfBsNgjKzoObVExSnRnjegeEhqxXzPmFcuiasViAFeaXrAxXhSfSyCILkKYpjxNeKynUmdcGAbwRwRnlAFbOSCafmzXddiNpLCFTHBELvArdXFpKUGpSHRekhrMedMRNkQzmSyFKjVwiWwCvbNWjgxJRzYeRxHiCCRMXktmKBxbxGZvOpvZIJOwvGIxcBLzsMFlDqAMLtScdsJtrbIUAvKfcdChXGnBzIxGxXMgxJhayrziaCswdpjJJJhkaYnGhHXqZwOzHFdhhUIEtfjERdLaSPRTDDMHpQtonNaIgXUYhjdbnnKppfMBxgNSOOXJAPtFjfAKnrRDrumZBpNhxMstqjTGBViRkDqbTdXYUirsedifGYzZpQkvdNhtFTOPgsYXYCwZHLcSLSfwfpQKtWfZuRUUryHJsbVsAOQcIJdSKKlOvCeEjUQNRPHKXuBJUjPuaAJJxcDMqyaufqfVwUmHLdjeYZzSiiGLHOTCInpVAalbXXTMLugLiwFiyPSuSFiyJUKVrWjbZAHaJtZnQmnvorRrxdPKThqXzNgTjszQiCoMczRnwGYJMERUWGXFyrSbAqsHmLwLlnJOJoXNsjVehQjVOpQOQJAZWwFZBlgyVIplzLTlFwumPgBLYrUIAJAcmvHPGfHfWQguCjfTYzxYfbohaLFAPwxFRrNuCdCzLlEbuhyYjCmuDBTJDMCdLpNRVqEALjnPSaBPsKWRCKNGwEMFpiEWbYZRwaMopjoUuBUvMpvyLfsPKDrfQLiFOQIWPtLIMoijUEUYfhykHrSKbTtrvjwIzHdWZDVwLIpNkloCqpzIsErxxKAFuFEjikWNYChqYqVslXMtoSWzNhbMuxYbzLfJIcPGoUeGPkGyPQNhDyrjgdKekzftFrRPTuyLYqCArkDcWHTrjPQHfoThBNnTQyMwLEWxEnBXLtzJmFVLGEPrdbEwlXpgYfnVnWoNXgPQKKyiXifpvrmJATzQOzYwFhliiYxlbnsEPKbHYUfJLrwYPfSUwTIHiEvBFMrEtVmqJobfcwsiiEudTIiAnrtuywgKLOiMYbEIOAOJdOXqroPjWnQQcTNxFvkIEIsuHLyhSqSphuSmlvknzydQEnebOreeZwOouXYKlObAkaWHhOdTFLoMCHOWrVKeXjcniaxtgCziKEqWOZUWHJQpcDJzYnnduDZrmxgjZroBRwoPBUTJMYipsgJwbTSlvMyXXdAmiEWGMiQxhGvHGPLOKeTxNaLnFVbWpiYIVyqN")]

Loading…
Cancel
Save