From d1a9ba9a34f544d241b7595655e74a68c5b3f60b Mon Sep 17 00:00:00 2001
From: Ben Kurtovic
Date: Sat, 1 Dec 2012 13:42:08 -0500
Subject: [PATCH] Starting tag work.

- Translation dict, contexts, parse_* and handle_* hooks in tokenizer.
---
 mwparserfromhell/nodes/tag.py        | 36 +++++++++++++++++
 mwparserfromhell/parser/contexts.py  | 65 +++++++++++++++++++-----------
 mwparserfromhell/parser/tokenizer.c  |  1 -
 mwparserfromhell/parser/tokenizer.py | 77 +++++++++++++++++++++++++++++++++++-
 4 files changed, 155 insertions(+), 24 deletions(-)

diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index 5873a49..c32f398 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -73,6 +73,42 @@ class Tag(Node):
     TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE))
     TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE
 
+    TRANSLATIONS = {
+        "i": TAG_ITALIC,
+        "em": TAG_ITALIC,
+        "b": TAG_BOLD,
+        "strong": TAG_BOLD,
+        "u": TAG_UNDERLINE,
+        "s": TAG_STRIKETHROUGH,
+        "ul": TAG_UNORDERED_LIST,
+        "ol": TAG_ORDERED_LIST,
+        "dt": TAG_DEF_TERM,
+        "dd": TAG_DEF_ITEM,
+        "blockquote": TAG_BLOCKQUOTE,
+        "hr": TAG_RULE,
+        "br": TAG_BREAK,
+        "abbr": TAG_ABBR,
+        "pre": TAG_PRE,
+        "tt": TAG_MONOSPACE,
+        "code": TAG_CODE,
+        "span": TAG_SPAN,
+        "div": TAG_DIV,
+        "font": TAG_FONT,
+        "small": TAG_SMALL,
+        "big": TAG_BIG,
+        "center": TAG_CENTER,
+        "ref": TAG_REF,
+        "gallery": TAG_GALLERY,
+        "math": TAG_MATH,
+        "nowiki": TAG_NOWIKI,
+        "noinclude": TAG_NOINCLUDE,
+        "includeonly": TAG_INCLUDEONLY,
+        "onlyinclude": TAG_ONLYINCLUDE,
+        "syntaxhighlight": TAG_SYNTAXHIGHLIGHT,
+        "source": TAG_SYNTAXHIGHLIGHT,
+        "poem": TAG_POEM,
+    }
+
     def __init__(self, type_, tag, contents=None, attrs=None, showtag=True,
                  self_closing=False, open_padding=0, close_padding=0):
         super(Tag, self).__init__()
diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py
index 9d41870..a67bd76 100644
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -62,35 +62,56 @@ Local (stack-specific) contexts:
 
 * :py:const:`COMMENT`
 
-Global contexts:
+* :py:const:`TAG`
 
-* :py:const:`GL_HEADING`
-"""
+    * :py:const:`TAG_OPEN`
+    * :py:const:`TAG_ATTR`
 
-# Local contexts:
+        * :py:const:`TAG_ATTR_NAME`
+        * :py:const:`TAG_ATTR_BODY`
+        * :py:const:`TAG_ATTR_BODY_QUOTED`
 
-TEMPLATE = 0b00000000000111
-TEMPLATE_NAME = 0b00000000000001
-TEMPLATE_PARAM_KEY = 0b00000000000010
-TEMPLATE_PARAM_VALUE = 0b00000000000100
+    * :py:const:`TAG_BODY`
+    * :py:const:`TAG_CLOSE`
 
-ARGUMENT = 0b00000000011000
-ARGUMENT_NAME = 0b00000000001000
-ARGUMENT_DEFAULT = 0b00000000010000
+Global contexts:
 
-WIKILINK = 0b00000001100000
-WIKILINK_TITLE = 0b00000000100000
-WIKILINK_TEXT = 0b00000001000000
+* :py:const:`GL_HEADING`
+"""
 
-HEADING = 0b01111110000000
-HEADING_LEVEL_1 = 0b00000010000000
-HEADING_LEVEL_2 = 0b00000100000000
-HEADING_LEVEL_3 = 0b00001000000000
-HEADING_LEVEL_4 = 0b00010000000000
-HEADING_LEVEL_5 = 0b00100000000000
-HEADING_LEVEL_6 = 0b01000000000000
+# Local contexts:
 
-COMMENT = 0b10000000000000
+TEMPLATE = 0b00000000000000000111
+TEMPLATE_NAME = 0b00000000000000000001
+TEMPLATE_PARAM_KEY = 0b00000000000000000010
+TEMPLATE_PARAM_VALUE = 0b00000000000000000100
+
+ARGUMENT = 0b00000000000000011000
+ARGUMENT_NAME = 0b00000000000000001000
+ARGUMENT_DEFAULT = 0b00000000000000010000
+
+WIKILINK = 0b00000000000001100000
+WIKILINK_TITLE = 0b00000000000000100000
+WIKILINK_TEXT = 0b00000000000001000000
+
+HEADING = 0b00000001111110000000
+HEADING_LEVEL_1 = 0b00000000000010000000
+HEADING_LEVEL_2 = 0b00000000000100000000
+HEADING_LEVEL_3 = 0b00000000001000000000
+HEADING_LEVEL_4 = 0b00000000010000000000
+HEADING_LEVEL_5 = 0b00000000100000000000
+HEADING_LEVEL_6 = 0b00000001000000000000
+
+COMMENT = 0b00000010000000000000
+
+TAG = 0b11111100000000000000
+TAG_OPEN = 0b00000100000000000000
+TAG_ATTR = 0b00111000000000000000
+TAG_ATTR_NAME = 0b00001000000000000000
+TAG_ATTR_BODY = 0b00010000000000000000
+TAG_ATTR_BODY_QUOTED = 0b00100000000000000000
+TAG_BODY = 0b01000000000000000000
+TAG_CLOSE = 0b10000000000000000000
 
 # Global contexts:
 
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index cc1b4dd..71b6cc3 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -767,7 +767,6 @@ Tokenizer_parse_heading(Tokenizer* self)
         self->global ^= GL_HEADING;
         return 0;
     }
-
     level = PyInt_FromSsize_t(heading->level);
     if (!level) {
         Py_DECREF(heading->title);
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 5b0e976..f640aa2 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -27,6 +27,7 @@ import string
 
 from . import contexts
 from . import tokens
+from ..nodes.tag import Tag
 from ..compat import htmlentities
 
 __all__ = ["Tokenizer"]
@@ -420,6 +421,57 @@ class Tokenizer(object):
         self._write(tokens.CommentEnd())
         self._head += 2
 
+    def _parse_tag(self):
+        """Parse an HTML tag at the head of the wikicode string."""
+        self._head += 1
+        reset = self._head
+        self._push()
+        try:
+            t_open, type_, self_close, o_pad = self._parse(contexts.TAG_OPEN)
+            if not self_close:
+                t_body = self._parse(contexts.TAG_BODY)
+                t_close, c_pad = self._parse(contexts.TAG_CLOSE)
+        except BadRoute:
+            self._head = reset
+            self._pop()
+            self._write_text("<")
+        else:
+            self._pop()
+            self._write(tokens.TagOpenOpen(type=type_, showtag=False))
+            self._write_all(t_open)
+            if self_close:
+                self._write(tokens.TagCloseSelfclose(padding=o_pad))
+            else:
+                self._write(tokens.TagCloseOpen(padding=o_pad))
+                self._write_all(t_body)
+                self._write(tokens.TagOpenClose())
+                self._write_all(t_close)
+                self._write(tokens.TagCloseClose(padding=c_pad))
+
+    def _handle_attribute(self):
+        if not self._context & contexts.TAG_ATTR:
+            pass ## check name is valid
+
+    def _handle_attribute_name(self):
+        ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED
+        pass
+
+    def _handle_quoted_attribute_close(self):
+        pass
+
+    def _handle_tag_close_open(self):
+        pass ## .padding
+
+    def _handle_tag_selfclose(self):
+        pass ## .padding
+
+    def _handle_tag_open_close(self):
+        pass
+
+    def _handle_tag_close_close(self):
+        ## check that the closing name is the same as the opening name
+        pass ## .padding
+
     def _parse(self, context=0):
         """Parse the wikicode string, using *context* for when to stop."""
         self._push(context)
@@ -432,7 +484,7 @@ class Tokenizer(object):
             if this is self.END:
                 fail = (contexts.TEMPLATE | contexts.ARGUMENT |
                         contexts.WIKILINK | contexts.HEADING |
-                        contexts.COMMENT)
+                        contexts.COMMENT | contexts.TAG)
                 if self._context & contexts.TEMPLATE_PARAM_KEY:
                     self._pop()
                 if self._context & fail:
@@ -484,6 +536,29 @@ class Tokenizer(object):
                     self._parse_comment()
                 else:
                     self._write_text(this)
+            elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY):
+                self._parse_tag()
+            elif this == " " and (self._context & contexts.TAG_OPEN and not
+                                  self._context & contexts.TAG_ATTR_BODY_QUOTED):
+                self._handle_attribute()
+            elif this == "=" and self._context & contexts.TAG_ATTR_NAME:
+                self._handle_attribute_name()
+            elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED:
+                self._handle_quoted_attribute_close()
+            elif this == "\n" and (self._context & contexts.TAG_OPEN and not
+                                   self._context & contexts.TAG_ATTR_BODY_QUOTED):
+                self._fail_route()
+            elif this == ">" and (self._context & contexts.TAG_OPEN and not
+                                  self._context & contexts.TAG_ATTR_BODY_QUOTED):
+                return self._handle_tag_close_open()
+            elif this == "/" and next == ">" and (
+                self._context & contexts.TAG_OPEN and not
+                self._context & contexts.TAG_ATTR_BODY_QUOTED):
+                return self._handle_tag_selfclose()
+            elif this == "<" and next == "/" and self._context & contexts.TAG_BODY:
+                self._handle_tag_open_close()
+            elif this == ">" and self._context & contexts.TAG_CLOSE:
+                self._handle_tag_close_close()
             else:
                 self._write_text(this)
             self._head += 1