From dd2a6f913b140fb9a1b81cfa7dbc41e5f5050b1c Mon Sep 17 00:00:00 2001 From: Ben Kurtovic Date: Fri, 9 Aug 2013 20:42:19 -0400 Subject: [PATCH] Add support for dt, dd. Refactor. Fix some broken tests. --- mwparserfromhell/parser/contexts.py | 16 +++++++----- mwparserfromhell/parser/tokenizer.py | 46 ++++++++++++++++++++++------------ mwparserfromhell/tag_defs.py | 14 ++++++++++- tests/tokenizer/tags_wikimarkup.mwtest | 16 ++++++------ 4 files changed, 61 insertions(+), 31 deletions(-) diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py index 211136c..2785708 100644 --- a/mwparserfromhell/parser/contexts.py +++ b/mwparserfromhell/parser/contexts.py @@ -69,6 +69,8 @@ Local (stack-specific) contexts: * :py:const:`TAG_BODY` * :py:const:`TAG_CLOSE` +* :py:const:`DL_TERM` + * :py:const:`SAFETY_CHECK` * :py:const:`HAS_TEXT` @@ -115,12 +117,14 @@ TAG_BODY = 1 << 16 TAG_CLOSE = 1 << 17 TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE -HAS_TEXT = 1 << 18 -FAIL_ON_TEXT = 1 << 19 -FAIL_NEXT = 1 << 20 -FAIL_ON_LBRACE = 1 << 21 -FAIL_ON_RBRACE = 1 << 22 -FAIL_ON_EQUALS = 1 << 23 +DL_TERM = 1 << 18 + +HAS_TEXT = 1 << 19 +FAIL_ON_TEXT = 1 << 20 +FAIL_NEXT = 1 << 21 +FAIL_ON_LBRACE = 1 << 22 +FAIL_ON_RBRACE = 1 << 23 +FAIL_ON_EQUALS = 1 << 24 SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE + FAIL_ON_RBRACE + FAIL_ON_EQUALS) diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py index f167db4..d3ce7bd 100644 --- a/mwparserfromhell/parser/tokenizer.py +++ b/mwparserfromhell/parser/tokenizer.py @@ -26,7 +26,7 @@ import re from . 
import contexts, tokens from ..compat import htmlentities -from ..tag_defs import is_parsable, is_single, is_single_only +from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only __all__ = ["Tokenizer"] @@ -629,20 +629,24 @@ class Tokenizer(object): else: self._emit_all(tag) - def _parse_list(self): - """Parse a wiki-style list (``#``, ``*``, ``;``, ``:``).""" - def emit(): - self._emit(tokens.TagOpenOpen(wiki_markup=self._read())) - self._emit_text("li") - self._emit(tokens.TagCloseSelfclose()) + def _handle_list_marker(self): + """Handle a list marker at the head (``#``, ``*``, ``;``, ``:``).""" + markup = self._read() + if markup == ";": + self._context |= contexts.DL_TERM + self._emit(tokens.TagOpenOpen(wiki_markup=markup)) + self._emit_text(get_html_tag(markup)) + self._emit(tokens.TagCloseSelfclose()) - emit() - while self._read(1) in ("#", "*"): + def _handle_list(self): + """Handle a wiki-style list (``#``, ``*``, ``;``, ``:``).""" + self._handle_list_marker() + while self._read(1) in ("#", "*", ";", ":"): self._head += 1 - emit() + self._handle_list_marker() - def _parse_hr(self): - """Parse a wiki-style horizontal rule (``----``) at the string head.""" + def _handle_hr(self): + """Handle a wiki-style horizontal rule (``----``) in the string.""" length = 4 self._head += 3 while self._read(1) == "-": @@ -652,6 +656,14 @@ class Tokenizer(object): self._emit_text("hr") self._emit(tokens.TagCloseSelfclose()) + def _handle_dl_term(self): + """Handle the term in a description list (``foo`` in ``;foo:bar``).""" + self._context ^= contexts.DL_TERM + if self._read() == ":": + self._handle_list_marker() + else: + self._emit_text("\n") + def _handle_end(self): """Handle the end of the stream of wikitext.""" fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK | @@ -806,12 +818,14 @@ class Tokenizer(object): elif this == ">" and self._context & contexts.TAG_CLOSE: return self._handle_tag_close_close() elif self._read(-1) in ("\n", 
self.START): - if this in ("#", "*"): - self._parse_list() + if this in ("#", "*", ";", ":"): + self._handle_list() elif this == next == self._read(2) == self._read(3) == "-": - self._parse_hr() + self._handle_hr() else: - self._emit_text(self._read()) + self._emit_text(this) + elif this in ("\n", ":") and self._context & contexts.DL_TERM: + self._handle_dl_term() else: self._emit_text(this) self._head += 1 diff --git a/mwparserfromhell/tag_defs.py b/mwparserfromhell/tag_defs.py index 94e0ac4..2395fc6 100644 --- a/mwparserfromhell/tag_defs.py +++ b/mwparserfromhell/tag_defs.py @@ -24,7 +24,8 @@ from __future__ import unicode_literals -__all__ = ["is_parsable", "is_visible", "is_single", "is_single_only"] +__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single", + "is_single_only"] PARSER_BLACKLIST = [ # enwiki extensions @ 2013-06-28 @@ -43,6 +44,17 @@ INVISIBLE_TAGS = [ SINGLE_ONLY = ["br", "hr", "meta", "link", "img"] SINGLE = SINGLE_ONLY + ["li", "dt", "dd"] +MARKUP_TO_HTML = { + "#": "li", + "*": "li", + ";": "dt", + ":": "dd" +} + +def get_html_tag(markup): + """Return the HTML tag associated with the given wiki-markup.""" + return MARKUP_TO_HTML[markup] + def is_parsable(tag): """Return if the given *tag*'s contents should be passed to the parser.""" return tag.lower() not in PARSER_BLACKLIST diff --git a/tests/tokenizer/tags_wikimarkup.mwtest b/tests/tokenizer/tags_wikimarkup.mwtest index 9ce71b6..c9664fb 100644 --- a/tests/tokenizer/tags_wikimarkup.mwtest +++ b/tests/tokenizer/tags_wikimarkup.mwtest @@ -281,7 +281,7 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tex name: dt_adjacent label: mdttiple adjacent dts -input: ";\n;b\n;c\nd\n;e\nf" +input: "a\n;b\n;c\nd\n;e\nf" output: [Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=";"), 
Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf")] --- @@ -309,7 +309,7 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tag name: complex_dd label: dd with a lot in it -input: ": this is a :test of an [[description item|dd]] with {{plenty|of|stuff}}" +input: ": this is a&nbsp;test of an [[description item|dd]] with {{plenty|of|stuff}}" output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description item"), WikilinkSeparator(), Text(text="dd"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()] --- @@ -323,7 +323,7 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tex name: dd_adjacent label: mddtiple adjacent dds -input: ":\n:b\n:c\nd\n:e\nf" +input: "a\n:b\n:c\nd\n:e\nf" output: [Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf")] --- @@ -358,15 +358,15 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tex name: dt_dd_mix2 label: the correct usage of a dt/dd unit, as in a dl -input: ";foo:bar" -output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")] +input: ";foo:bar:baz" +output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")] --- name: dt_dd_mix3 -label: another complex example of dts and dds -input: 
";:::;foo::;:bar;;" -output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=";:bar;;")] +label: another example of correct (but strange) dt/dd usage +input: ":;;::foo:bar:baz" +output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")] ---