@@ -69,6 +69,8 @@ Local (stack-specific) contexts: | |||||
* :py:const:`TAG_BODY` | * :py:const:`TAG_BODY` | ||||
* :py:const:`TAG_CLOSE` | * :py:const:`TAG_CLOSE` | ||||
* :py:const:`DL_TERM` | |||||
* :py:const:`SAFETY_CHECK` | * :py:const:`SAFETY_CHECK` | ||||
* :py:const:`HAS_TEXT` | * :py:const:`HAS_TEXT` | ||||
@@ -115,12 +117,14 @@ TAG_BODY = 1 << 16 | |||||
TAG_CLOSE = 1 << 17 | TAG_CLOSE = 1 << 17 | ||||
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE | ||||
HAS_TEXT = 1 << 18 | |||||
FAIL_ON_TEXT = 1 << 19 | |||||
FAIL_NEXT = 1 << 20 | |||||
FAIL_ON_LBRACE = 1 << 21 | |||||
FAIL_ON_RBRACE = 1 << 22 | |||||
FAIL_ON_EQUALS = 1 << 23 | |||||
DL_TERM = 1 << 18 | |||||
HAS_TEXT = 1 << 19 | |||||
FAIL_ON_TEXT = 1 << 20 | |||||
FAIL_NEXT = 1 << 21 | |||||
FAIL_ON_LBRACE = 1 << 22 | |||||
FAIL_ON_RBRACE = 1 << 23 | |||||
FAIL_ON_EQUALS = 1 << 24 | |||||
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + | ||||
FAIL_ON_RBRACE + FAIL_ON_EQUALS) | FAIL_ON_RBRACE + FAIL_ON_EQUALS) | ||||
@@ -26,7 +26,7 @@ import re | |||||
from . import contexts, tokens | from . import contexts, tokens | ||||
from ..compat import htmlentities | from ..compat import htmlentities | ||||
from ..tag_defs import is_parsable, is_single, is_single_only | |||||
from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only | |||||
__all__ = ["Tokenizer"] | __all__ = ["Tokenizer"] | ||||
@@ -629,20 +629,24 @@ class Tokenizer(object): | |||||
else: | else: | ||||
self._emit_all(tag) | self._emit_all(tag) | ||||
def _parse_list(self): | |||||
"""Parse a wiki-style list (``#``, ``*``, ``;``, ``:``).""" | |||||
def emit(): | |||||
self._emit(tokens.TagOpenOpen(wiki_markup=self._read())) | |||||
self._emit_text("li") | |||||
self._emit(tokens.TagCloseSelfclose()) | |||||
def _handle_list_marker(self):
    """Handle a list marker at the head (``#``, ``*``, ``;``, ``:``).

    The character under the head is taken as the marker. It is emitted as
    a self-closing tag: a ``TagOpenOpen`` token carrying the marker as its
    ``wiki_markup``, the HTML tag name looked up through ``get_html_tag``
    as text, and finally a ``TagCloseSelfclose`` token.
    """
    marker = self._read()
    if marker == ";":
        # Set the DL_TERM bit on the current context when the marker is
        # a ``;``.
        self._context |= contexts.DL_TERM
    opening = tokens.TagOpenOpen(wiki_markup=marker)
    self._emit(opening)
    html_tag = get_html_tag(marker)
    self._emit_text(html_tag)
    self._emit(tokens.TagCloseSelfclose())
emit() | |||||
while self._read(1) in ("#", "*"): | |||||
def _handle_list(self): | |||||
"""Handle a wiki-style list (``#``, ``*``, ``;``, ``:``).""" | |||||
self._handle_list_marker() | |||||
while self._read(1) in ("#", "*", ";", ":"): | |||||
self._head += 1 | self._head += 1 | ||||
emit() | |||||
self._handle_list_marker() | |||||
def _parse_hr(self): | |||||
"""Parse a wiki-style horizontal rule (``----``) at the string head.""" | |||||
def _handle_hr(self): | |||||
"""Handle a wiki-style horizontal rule (``----``) in the string.""" | |||||
length = 4 | length = 4 | ||||
self._head += 3 | self._head += 3 | ||||
while self._read(1) == "-": | while self._read(1) == "-": | ||||
@@ -652,6 +656,14 @@ class Tokenizer(object): | |||||
self._emit_text("hr") | self._emit_text("hr") | ||||
self._emit(tokens.TagCloseSelfclose()) | self._emit(tokens.TagCloseSelfclose()) | ||||
def _handle_dl_term(self):
    """Handle the term in a description list (``foo`` in ``;foo:bar``).

    The ``DL_TERM`` bit of the current context is flipped. If the
    character under the head is a ``:``, it is handled as a list marker;
    otherwise a newline is emitted as text.
    """
    # XOR flips the bit, so this clears DL_TERM whenever it was set on
    # entry.
    self._context ^= contexts.DL_TERM
    if self._read() != ":":
        self._emit_text("\n")
        return
    self._handle_list_marker()
def _handle_end(self): | def _handle_end(self): | ||||
"""Handle the end of the stream of wikitext.""" | """Handle the end of the stream of wikitext.""" | ||||
fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | | fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | | ||||
@@ -806,12 +818,14 @@ class Tokenizer(object): | |||||
elif this == ">" and self._context & contexts.TAG_CLOSE: | elif this == ">" and self._context & contexts.TAG_CLOSE: | ||||
return self._handle_tag_close_close() | return self._handle_tag_close_close() | ||||
elif self._read(-1) in ("\n", self.START): | elif self._read(-1) in ("\n", self.START): | ||||
if this in ("#", "*"): | |||||
self._parse_list() | |||||
if this in ("#", "*", ";", ":"): | |||||
self._handle_list() | |||||
elif this == next == self._read(2) == self._read(3) == "-": | elif this == next == self._read(2) == self._read(3) == "-": | ||||
self._parse_hr() | |||||
self._handle_hr() | |||||
else: | else: | ||||
self._emit_text(self._read()) | |||||
self._emit_text(this) | |||||
elif this in ("\n", ":") and self._context & contexts.DL_TERM: | |||||
self._handle_dl_term() | |||||
else: | else: | ||||
self._emit_text(this) | self._emit_text(this) | ||||
self._head += 1 | self._head += 1 | ||||
@@ -24,7 +24,8 @@ | |||||
from __future__ import unicode_literals | from __future__ import unicode_literals | ||||
__all__ = ["is_parsable", "is_visible", "is_single", "is_single_only"] | |||||
__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", | |||||
"is_single_only"] | |||||
PARSER_BLACKLIST = [ | PARSER_BLACKLIST = [ | ||||
# enwiki extensions @ 2013-06-28 | # enwiki extensions @ 2013-06-28 | ||||
@@ -43,6 +44,17 @@ INVISIBLE_TAGS = [ | |||||
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] | SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] | ||||
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] | SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] | ||||
# Wiki-markup list markers mapped to the HTML tag name each one stands for.
MARKUP_TO_HTML = {
    "#": "li",
    "*": "li",
    ";": "dt",
    ":": "dd",
}


def get_html_tag(markup):
    """Return the HTML tag associated with the given wiki-markup.

    The lookup goes straight through ``MARKUP_TO_HTML``, so a *markup*
    that is not one of its keys raises ``KeyError``.
    """
    html_tag = MARKUP_TO_HTML[markup]
    return html_tag
def is_parsable(tag):
    """Return if the given *tag*'s contents should be passed to the parser.

    The tag name is lower-cased before it is checked, and a tag listed in
    ``PARSER_BLACKLIST`` is reported as not parsable.
    """
    lowered = tag.lower()
    if lowered in PARSER_BLACKLIST:
        return False
    return True
@@ -281,7 +281,7 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tex | |||||
name: dt_adjacent | name: dt_adjacent | ||||
label: multiple adjacent dts | label: multiple adjacent dts | ||||
input: ";\n;b\n;c\nd\n;e\nf" | |||||
input: "a\n;b\n;c\nd\n;e\nf" | |||||
output: [Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf")] | output: [Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf")] | ||||
--- | --- | ||||
@@ -309,7 +309,7 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tag | |||||
name: complex_dd | name: complex_dd | ||||
label: dd with a lot in it | label: dd with a lot in it | ||||
input: ": this is a :test of an [[description item|dd]] with {{plenty|of|stuff}}" | |||||
input: ": this is a&nbsp;test of an [[description item|dd]] with {{plenty|of|stuff}}" | |||||
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description item"), WikilinkSeparator(), Text(text="dd"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()] | output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description item"), WikilinkSeparator(), Text(text="dd"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()] | ||||
--- | --- | ||||
@@ -323,7 +323,7 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tex | |||||
name: dd_adjacent | name: dd_adjacent | ||||
label: multiple adjacent dds | label: multiple adjacent dds | ||||
input: ":\n:b\n:c\nd\n:e\nf" | |||||
input: "a\n:b\n:c\nd\n:e\nf" | |||||
output: [Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf")] | output: [Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf")] | ||||
--- | --- | ||||
@@ -358,15 +358,15 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tex | |||||
name: dt_dd_mix2 | name: dt_dd_mix2 | ||||
label: the correct usage of a dt/dd unit, as in a dl | label: the correct usage of a dt/dd unit, as in a dl | ||||
input: ";foo:bar" | |||||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")] | |||||
input: ";foo:bar:baz" | |||||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")] | |||||
--- | --- | ||||
name: dt_dd_mix3 | name: dt_dd_mix3 | ||||
label: another complex example of dts and dds | |||||
input: ";:::;foo::;:bar;;" | |||||
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=";:bar;;")] | |||||
label: another example of correct (but strange) dt/dd usage | |||||
input: ":;;::foo:bar:baz" | |||||
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")] | |||||
--- | --- | ||||