@@ -69,6 +69,8 @@ Local (stack-specific) contexts: | |||
* :py:const:`TAG_BODY` | |||
* :py:const:`TAG_CLOSE` | |||
* :py:const:`DL_TERM` | |||
* :py:const:`SAFETY_CHECK` | |||
* :py:const:`HAS_TEXT` | |||
@@ -115,12 +117,14 @@ TAG_BODY = 1 << 16 | |||
TAG_CLOSE = 1 << 17 | |||
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | |||
HAS_TEXT = 1 << 18 | |||
FAIL_ON_TEXT = 1 << 19 | |||
FAIL_NEXT = 1 << 20 | |||
FAIL_ON_LBRACE = 1 << 21 | |||
FAIL_ON_RBRACE = 1 << 22 | |||
FAIL_ON_EQUALS = 1 << 23 | |||
DL_TERM = 1 << 18 | |||
HAS_TEXT = 1 << 19 | |||
FAIL_ON_TEXT = 1 << 20 | |||
FAIL_NEXT = 1 << 21 | |||
FAIL_ON_LBRACE = 1 << 22 | |||
FAIL_ON_RBRACE = 1 << 23 | |||
FAIL_ON_EQUALS = 1 << 24 | |||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | |||
FAIL_ON_RBRACE + FAIL_ON_EQUALS) | |||
@@ -26,7 +26,7 @@ import re | |||
from . import contexts, tokens | |||
from ..compat import htmlentities | |||
from ..tag_defs import is_parsable, is_single, is_single_only | |||
from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only | |||
__all__ = ["Tokenizer"] | |||
@@ -629,20 +629,24 @@ class Tokenizer(object): | |||
else: | |||
self._emit_all(tag) | |||
def _parse_list(self): | |||
"""Parse a wiki-style list (``#``, ``*``, ``;``, ``:``).""" | |||
def emit(): | |||
self._emit(tokens.TagOpenOpen(wiki_markup=self._read())) | |||
self._emit_text("li") | |||
self._emit(tokens.TagCloseSelfclose()) | |||
def _handle_list_marker(self):
    """Emit a self-closing tag for the list marker at the head.

    The marker is whatever ``self._read()`` returns (one of ``#``, ``*``,
    ``;`` or ``:``). It is emitted as three tokens: a ``TagOpenOpen``
    carrying the original wiki markup, the matching HTML tag name as text,
    and a ``TagCloseSelfclose``.
    """
    marker = self._read()
    # A ";" opens a description-list term, so flag the context; the flag is
    # what lets a later ":" or newline be treated as the end of that term.
    starts_term = marker == ";"
    if starts_term:
        self._context |= contexts.DL_TERM
    open_token = tokens.TagOpenOpen(wiki_markup=marker)
    self._emit(open_token)
    self._emit_text(get_html_tag(marker))
    close_token = tokens.TagCloseSelfclose()
    self._emit(close_token)
emit() | |||
while self._read(1) in ("#", "*"): | |||
def _handle_list(self): | |||
"""Handle a wiki-style list (``#``, ``*``, ``;``, ``:``).""" | |||
self._handle_list_marker() | |||
while self._read(1) in ("#", "*", ";", ":"): | |||
self._head += 1 | |||
emit() | |||
self._handle_list_marker() | |||
def _parse_hr(self): | |||
"""Parse a wiki-style horizontal rule (``----``) at the string head.""" | |||
def _handle_hr(self): | |||
"""Handle a wiki-style horizontal rule (``----``) in the string.""" | |||
length = 4 | |||
self._head += 3 | |||
while self._read(1) == "-": | |||
@@ -652,6 +656,14 @@ class Tokenizer(object): | |||
self._emit_text("hr") | |||
self._emit(tokens.TagCloseSelfclose()) | |||
def _handle_dl_term(self):
    """Finish the term of a description list (``foo`` in ``;foo:bar``).

    Flips the ``DL_TERM`` bit of the current context, then looks at the
    character at the head: a ``:`` is handed to the list-marker handler,
    anything else results in a literal newline being emitted as text.
    """
    # XOR toggles the bit. NOTE(review): this presumably runs only while
    # DL_TERM is set, in which case the toggle clears it -- confirm callers.
    self._context ^= contexts.DL_TERM
    if self._read() != ":":
        self._emit_text("\n")
        return
    self._handle_list_marker()
def _handle_end(self): | |||
"""Handle the end of the stream of wikitext.""" | |||
fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | | |||
@@ -806,12 +818,14 @@ class Tokenizer(object): | |||
elif this == ">" and self._context & contexts.TAG_CLOSE: | |||
return self._handle_tag_close_close() | |||
elif self._read(-1) in ("\n", self.START): | |||
if this in ("#", "*"): | |||
self._parse_list() | |||
if this in ("#", "*", ";", ":"): | |||
self._handle_list() | |||
elif this == next == self._read(2) == self._read(3) == "-": | |||
self._parse_hr() | |||
self._handle_hr() | |||
else: | |||
self._emit_text(self._read()) | |||
self._emit_text(this) | |||
elif this in ("\n", ":") and self._context & contexts.DL_TERM: | |||
self._handle_dl_term() | |||
else: | |||
self._emit_text(this) | |||
self._head += 1 | |||
@@ -24,7 +24,8 @@ | |||
from __future__ import unicode_literals | |||
__all__ = ["is_parsable", "is_visible", "is_single", "is_single_only"] | |||
__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", | |||
"is_single_only"] | |||
PARSER_BLACKLIST = [ | |||
# enwiki extensions @ 2013-06-28 | |||
@@ -43,6 +44,17 @@ INVISIBLE_TAGS = [ | |||
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] | |||
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] | |||
# Maps each wiki list-marker character to the name of the HTML tag it
# stands for. Both list styles ("#" and "*") share the "li" item tag.
MARKUP_TO_HTML = {
    "#": "li",
    "*": "li",
    ";": "dt",
    ":": "dd",
}
def get_html_tag(markup):
    """Return the HTML tag name that the wiki-markup *markup* stands for.

    The lookup is a plain index into ``MARKUP_TO_HTML``, so markup that is
    not a key of that mapping raises ``KeyError``.
    """
    html_tag = MARKUP_TO_HTML[markup]
    return html_tag
def is_parsable(tag):
    """Return whether the contents of *tag* should be passed to the parser.

    The tag name is lower-cased before being checked against
    ``PARSER_BLACKLIST``; a tag is parsable when it is not in that list.
    """
    lowered = tag.lower()
    return lowered not in PARSER_BLACKLIST
@@ -281,7 +281,7 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tex | |||
name: dt_adjacent | |||
label: multiple adjacent dts | |||
input: ";\n;b\n;c\nd\n;e\nf" | |||
input: "a\n;b\n;c\nd\n;e\nf" | |||
output: [Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf")] | |||
--- | |||
@@ -309,7 +309,7 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tag | |||
name: complex_dd | |||
label: dd with a lot in it | |||
input: ": this is a :test of an [[description item|dd]] with {{plenty|of|stuff}}" | |||
input: ": this is a&nbsp;test of an [[description item|dd]] with {{plenty|of|stuff}}" | |||
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description item"), WikilinkSeparator(), Text(text="dd"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()] | |||
--- | |||
@@ -323,7 +323,7 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tex | |||
name: dd_adjacent | |||
label: multiple adjacent dds | |||
input: ":\n:b\n:c\nd\n:e\nf" | |||
input: "a\n:b\n:c\nd\n:e\nf" | |||
output: [Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf")] | |||
--- | |||
@@ -358,15 +358,15 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tex | |||
name: dt_dd_mix2 | |||
label: the correct usage of a dt/dd unit, as in a dl | |||
input: ";foo:bar" | |||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")] | |||
input: ";foo:bar:baz" | |||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")] | |||
--- | |||
name: dt_dd_mix3 | |||
label: another complex example of dts and dds | |||
input: ";:::;foo::;:bar;;" | |||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=";:bar;;")] | |||
label: another example of correct (but strange) dt/dd usage | |||
input: ":;;::foo:bar:baz" | |||
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")] | |||
--- | |||