Browse Source

Add support for dt, dd. Refactor. Fix some broken tests.

tags/v0.3
Ben Kurtovic 10 years ago
parent
commit
dd2a6f913b
4 changed files with 61 additions and 31 deletions
  1. +10
    -6
      mwparserfromhell/parser/contexts.py
  2. +30
    -16
      mwparserfromhell/parser/tokenizer.py
  3. +13
    -1
      mwparserfromhell/tag_defs.py
  4. +8
    -8
      tests/tokenizer/tags_wikimarkup.mwtest

+ 10
- 6
mwparserfromhell/parser/contexts.py View File

@@ -69,6 +69,8 @@ Local (stack-specific) contexts:
* :py:const:`TAG_BODY`
* :py:const:`TAG_CLOSE`

* :py:const:`DL_TERM`

* :py:const:`SAFETY_CHECK`

* :py:const:`HAS_TEXT`
@@ -115,12 +117,14 @@ TAG_BODY = 1 << 16
TAG_CLOSE = 1 << 17
TAG = TAG_OPEN + TAG_ATTR + TAG_BODY + TAG_CLOSE

HAS_TEXT = 1 << 18
FAIL_ON_TEXT = 1 << 19
FAIL_NEXT = 1 << 20
FAIL_ON_LBRACE = 1 << 21
FAIL_ON_RBRACE = 1 << 22
FAIL_ON_EQUALS = 1 << 23
DL_TERM = 1 << 18

HAS_TEXT = 1 << 19
FAIL_ON_TEXT = 1 << 20
FAIL_NEXT = 1 << 21
FAIL_ON_LBRACE = 1 << 22
FAIL_ON_RBRACE = 1 << 23
FAIL_ON_EQUALS = 1 << 24
SAFETY_CHECK = (HAS_TEXT + FAIL_ON_TEXT + FAIL_NEXT + FAIL_ON_LBRACE +
FAIL_ON_RBRACE + FAIL_ON_EQUALS)



+ 30
- 16
mwparserfromhell/parser/tokenizer.py View File

@@ -26,7 +26,7 @@ import re

from . import contexts, tokens
from ..compat import htmlentities
from ..tag_defs import is_parsable, is_single, is_single_only
from ..tag_defs import get_html_tag, is_parsable, is_single, is_single_only

__all__ = ["Tokenizer"]

@@ -629,20 +629,24 @@ class Tokenizer(object):
else:
self._emit_all(tag)

def _parse_list(self):
"""Parse a wiki-style list (``#``, ``*``, ``;``, ``:``)."""
def emit():
self._emit(tokens.TagOpenOpen(wiki_markup=self._read()))
self._emit_text("li")
self._emit(tokens.TagCloseSelfclose())
def _handle_list_marker(self):
    """Emit the self-closing tag tokens for the list marker at the head.

    The character at the head (``#``, ``*``, ``;`` or ``:``) is read and
    emitted as a ``TagOpenOpen`` token carrying it as ``wiki_markup``,
    followed by the matching HTML tag name as text and a
    ``TagCloseSelfclose`` token.
    """
    marker = self._read()
    # A ";" marker additionally sets the DL_TERM flag on the context; the
    # main parse loop checks that flag when it later sees ":" or a newline.
    if marker == ";":
        self._context = self._context | contexts.DL_TERM
    self._emit(tokens.TagOpenOpen(wiki_markup=marker))
    self._emit_text(get_html_tag(marker))
    self._emit(tokens.TagCloseSelfclose())

emit()
while self._read(1) in ("#", "*"):
def _handle_list(self):
"""Handle a wiki-style list (``#``, ``*``, ``;``, ``:``)."""
self._handle_list_marker()
while self._read(1) in ("#", "*", ";", ":"):
self._head += 1
emit()
self._handle_list_marker()

def _parse_hr(self):
"""Parse a wiki-style horizontal rule (``----``) at the string head."""
def _handle_hr(self):
"""Handle a wiki-style horizontal rule (``----``) in the string."""
length = 4
self._head += 3
while self._read(1) == "-":
@@ -652,6 +656,14 @@ class Tokenizer(object):
self._emit_text("hr")
self._emit(tokens.TagCloseSelfclose())

def _handle_dl_term(self):
    """Finish the term of a description list (``foo`` in ``;foo:bar``).

    The ``DL_TERM`` bit of the context is flipped. If the character at the
    head is ``:``, it is handled as a list marker; otherwise a newline is
    emitted as text.
    """
    # XOR flips the bit; the visible caller only invokes this while
    # DL_TERM is set, so in that path the flag ends up cleared.
    self._context = self._context ^ contexts.DL_TERM
    if self._read() != ":":
        self._emit_text("\n")
        return
    self._handle_list_marker()

def _handle_end(self):
"""Handle the end of the stream of wikitext."""
fail = (contexts.TEMPLATE | contexts.ARGUMENT | contexts.WIKILINK |
@@ -806,12 +818,14 @@ class Tokenizer(object):
elif this == ">" and self._context & contexts.TAG_CLOSE:
return self._handle_tag_close_close()
elif self._read(-1) in ("\n", self.START):
if this in ("#", "*"):
self._parse_list()
if this in ("#", "*", ";", ":"):
self._handle_list()
elif this == next == self._read(2) == self._read(3) == "-":
self._parse_hr()
self._handle_hr()
else:
self._emit_text(self._read())
self._emit_text(this)
elif this in ("\n", ":") and self._context & contexts.DL_TERM:
self._handle_dl_term()
else:
self._emit_text(this)
self._head += 1


+ 13
- 1
mwparserfromhell/tag_defs.py View File

@@ -24,7 +24,8 @@

from __future__ import unicode_literals

__all__ = ["is_parsable", "is_visible", "is_single", "is_single_only"]
__all__ = ["get_html_tag", "is_parsable", "is_visible", "is_single",
"is_single_only"]

PARSER_BLACKLIST = [
# enwiki extensions @ 2013-06-28
@@ -43,6 +44,17 @@ INVISIBLE_TAGS = [
SINGLE_ONLY = ["br", "hr", "meta", "link", "img"]
SINGLE = SINGLE_ONLY + ["li", "dt", "dd"]

# Wiki-markup list characters and the HTML tag name each one stands for.
MARKUP_TO_HTML = {"#": "li", "*": "li", ";": "dt", ":": "dd"}


def get_html_tag(markup):
    """Look up the HTML tag name for the wiki-markup character *markup*.

    Raises :py:exc:`KeyError` if *markup* is not a key of
    ``MARKUP_TO_HTML``.
    """
    html_tag = MARKUP_TO_HTML[markup]
    return html_tag

def is_parsable(tag):
    """Return whether the contents of *tag* should be handed to the parser.

    The check is case-insensitive: *tag* is lowercased and the result is
    ``True`` when that name is absent from ``PARSER_BLACKLIST``.
    """
    lowered = tag.lower()
    return lowered not in PARSER_BLACKLIST


+ 8
- 8
tests/tokenizer/tags_wikimarkup.mwtest View File

@@ -281,7 +281,7 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tex

name: dt_adjacent
label: multiple adjacent dts
input: ";\n;b\n;c\nd\n;e\nf"
input: "a\n;b\n;c\nd\n;e\nf"
output: [Text(text="a\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="e\nf")]

---
@@ -309,7 +309,7 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tag

name: complex_dd
label: dd with a lot in it
input: ": this is a&nbsp:test of an [[description item|dd]] with {{plenty|of|stuff}}"
input: ": this is a&nbsp;test of an [[description item|dd]] with {{plenty|of|stuff}}"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=" this is a"), HTMLEntityStart(), Text(text="nbsp"), HTMLEntityEnd(), Text(text="test of an "), WikilinkOpen(), Text(text="description item"), WikilinkSeparator(), Text(text="dd"), WikilinkClose(), Text(text=" with "), TemplateOpen(), Text(text="plenty"), TemplateParamSeparator(), Text(text="of"), TemplateParamSeparator(), Text(text="stuff"), TemplateClose()]

---
@@ -323,7 +323,7 @@ output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Tex

name: dd_adjacent
label: multiple adjacent dds
input: ":\n:b\n:c\nd\n:e\nf"
input: "a\n:b\n:c\nd\n:e\nf"
output: [Text(text="a\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="b\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="c\nd\n"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="e\nf")]

---
@@ -358,15 +358,15 @@ output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Tex

name: dt_dd_mix2
label: the correct usage of a dt/dd unit, as in a dl
input: ";foo:bar"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar")]
input: ";foo:bar:baz"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")]

---

name: dt_dd_mix3
label: another complex example of dts and dds
input: ";:::;foo::;:bar;;"
output: [TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text=";:bar;;")]
label: another example of correct (but strange) dt/dd usage
input: ":;;::foo:bar:baz"
output: [TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=";"), Text(text="dt"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="foo"), TagOpenOpen(wiki_markup=":"), Text(text="dd"), TagCloseSelfclose(), Text(text="bar:baz")]

---



Loading…
Cancel
Save