From d1a9ba9a34f544d241b7595655e74a68c5b3f60b Mon Sep 17 00:00:00 2001
From: Ben Kurtovic <ben.kurtovic@verizon.net>
Date: Sat, 1 Dec 2012 13:42:08 -0500
Subject: [PATCH] Starting tag work.

- Translation dict, contexts, parse_* and handle_* hooks in tokenizer.
---
 mwparserfromhell/nodes/tag.py        | 36 +++++++++++++++++
 mwparserfromhell/parser/contexts.py  | 65 +++++++++++++++++++-----------
 mwparserfromhell/parser/tokenizer.c  |  1 -
 mwparserfromhell/parser/tokenizer.py | 77 +++++++++++++++++++++++++++++++++++-
 4 files changed, 155 insertions(+), 24 deletions(-)

diff --git a/mwparserfromhell/nodes/tag.py b/mwparserfromhell/nodes/tag.py
index 5873a49..c32f398 100644
--- a/mwparserfromhell/nodes/tag.py
+++ b/mwparserfromhell/nodes/tag.py
@@ -73,6 +73,42 @@ class Tag(Node):
     TAGS_INVISIBLE = set((TAG_REF, TAG_GALLERY, TAG_MATH, TAG_NOINCLUDE))
     TAGS_VISIBLE = set(range(300)) - TAGS_INVISIBLE
 
+    TRANSLATIONS = {  # lowercase tag name -> TAG_* type constant
+        "i": TAG_ITALIC,
+        "em": TAG_ITALIC,
+        "b": TAG_BOLD,
+        "strong": TAG_BOLD,
+        "u": TAG_UNDERLINE,
+        "s": TAG_STRIKETHROUGH,
+        "ul": TAG_UNORDERED_LIST,
+        "ol": TAG_ORDERED_LIST,
+        "dt": TAG_DEF_TERM,
+        "dd": TAG_DEF_ITEM,
+        "blockquote": TAG_BLOCKQUOTE,
+        "hr": TAG_RULE,
+        "br": TAG_BREAK,
+        "abbr": TAG_ABBR,
+        "pre": TAG_PRE,
+        "tt": TAG_MONOSPACE,
+        "code": TAG_CODE,
+        "span": TAG_SPAN,
+        "div": TAG_DIV,
+        "font": TAG_FONT,
+        "small": TAG_SMALL,
+        "big": TAG_BIG,
+        "center": TAG_CENTER,
+        "ref": TAG_REF,
+        "gallery": TAG_GALLERY,
+        "math": TAG_MATH,
+        "nowiki": TAG_NOWIKI,
+        "noinclude": TAG_NOINCLUDE,
+        "includeonly": TAG_INCLUDEONLY,
+        "onlyinclude": TAG_ONLYINCLUDE,
+        "syntaxhighlight": TAG_SYNTAXHIGHLIGHT,
+        "source": TAG_SYNTAXHIGHLIGHT,
+        "poem": TAG_POEM,
+    }
+
     def __init__(self, type_, tag, contents=None, attrs=None, showtag=True,
                  self_closing=False, open_padding=0, close_padding=0):
         super(Tag, self).__init__()
diff --git a/mwparserfromhell/parser/contexts.py b/mwparserfromhell/parser/contexts.py
index 9d41870..a67bd76 100644
--- a/mwparserfromhell/parser/contexts.py
+++ b/mwparserfromhell/parser/contexts.py
@@ -62,35 +62,56 @@ Local (stack-specific) contexts:
 
 * :py:const:`COMMENT`
 
-Global contexts:
+* :py:const:`TAG`
 
-* :py:const:`GL_HEADING`
-"""
+    * :py:const:`TAG_OPEN`
+    * :py:const:`TAG_ATTR`
 
-# Local contexts:
+        * :py:const:`TAG_ATTR_NAME`
+        * :py:const:`TAG_ATTR_BODY`
+        * :py:const:`TAG_ATTR_BODY_QUOTED`
 
-TEMPLATE =              0b00000000000111
-TEMPLATE_NAME =         0b00000000000001
-TEMPLATE_PARAM_KEY =    0b00000000000010
-TEMPLATE_PARAM_VALUE =  0b00000000000100
+    * :py:const:`TAG_BODY`
+    * :py:const:`TAG_CLOSE`
 
-ARGUMENT =              0b00000000011000
-ARGUMENT_NAME =         0b00000000001000
-ARGUMENT_DEFAULT =      0b00000000010000
+Global contexts:
 
-WIKILINK =              0b00000001100000
-WIKILINK_TITLE =        0b00000000100000
-WIKILINK_TEXT =         0b00000001000000
+* :py:const:`GL_HEADING`
+"""
 
-HEADING =               0b01111110000000
-HEADING_LEVEL_1 =       0b00000010000000
-HEADING_LEVEL_2 =       0b00000100000000
-HEADING_LEVEL_3 =       0b00001000000000
-HEADING_LEVEL_4 =       0b00010000000000
-HEADING_LEVEL_5 =       0b00100000000000
-HEADING_LEVEL_6 =       0b01000000000000
+# Local contexts:
 
-COMMENT =               0b10000000000000
+TEMPLATE =              0b00000000000000000111  # all TEMPLATE_* bits
+TEMPLATE_NAME =         0b00000000000000000001
+TEMPLATE_PARAM_KEY =    0b00000000000000000010
+TEMPLATE_PARAM_VALUE =  0b00000000000000000100
+
+ARGUMENT =              0b00000000000000011000  # all ARGUMENT_* bits
+ARGUMENT_NAME =         0b00000000000000001000
+ARGUMENT_DEFAULT =      0b00000000000000010000
+
+WIKILINK =              0b00000000000001100000  # all WIKILINK_* bits
+WIKILINK_TITLE =        0b00000000000000100000
+WIKILINK_TEXT =         0b00000000000001000000
+
+HEADING =               0b00000001111110000000  # all HEADING_LEVEL_* bits
+HEADING_LEVEL_1 =       0b00000000000010000000
+HEADING_LEVEL_2 =       0b00000000000100000000
+HEADING_LEVEL_3 =       0b00000000001000000000
+HEADING_LEVEL_4 =       0b00000000010000000000
+HEADING_LEVEL_5 =       0b00000000100000000000
+HEADING_LEVEL_6 =       0b00000001000000000000
+
+COMMENT =               0b00000010000000000000
+
+TAG =                   0b11111100000000000000  # all TAG_* bits below
+TAG_OPEN =              0b00000100000000000000
+TAG_ATTR =              0b00111000000000000000  # all TAG_ATTR_* bits
+TAG_ATTR_NAME =         0b00001000000000000000
+TAG_ATTR_BODY =         0b00010000000000000000
+TAG_ATTR_BODY_QUOTED =  0b00100000000000000000
+TAG_BODY =              0b01000000000000000000
+TAG_CLOSE =             0b10000000000000000000
 
 
 # Global contexts:
diff --git a/mwparserfromhell/parser/tokenizer.c b/mwparserfromhell/parser/tokenizer.c
index cc1b4dd..71b6cc3 100644
--- a/mwparserfromhell/parser/tokenizer.c
+++ b/mwparserfromhell/parser/tokenizer.c
@@ -767,7 +767,6 @@ Tokenizer_parse_heading(Tokenizer* self)
         self->global ^= GL_HEADING;
         return 0;
     }
-
     level = PyInt_FromSsize_t(heading->level);
     if (!level) {
         Py_DECREF(heading->title);
diff --git a/mwparserfromhell/parser/tokenizer.py b/mwparserfromhell/parser/tokenizer.py
index 5b0e976..f640aa2 100644
--- a/mwparserfromhell/parser/tokenizer.py
+++ b/mwparserfromhell/parser/tokenizer.py
@@ -27,6 +27,7 @@ import string
 
 from . import contexts
 from . import tokens
+from ..nodes.tag import Tag
 from ..compat import htmlentities
 
 __all__ = ["Tokenizer"]
@@ -420,6 +421,57 @@ class Tokenizer(object):
             self._write(tokens.CommentEnd())
             self._head += 2
 
+    def _parse_tag(self):
+        """Parse an HTML tag at the head of the wikicode string."""
+        reset = self._head  # index of "<"; the _parse loop steps past it
+        self._head += 1
+        self._push()
+        try:
+            t_open, type_, self_close, o_pad = self._parse(contexts.TAG_OPEN)
+            if not self_close:
+                t_body = self._parse(contexts.TAG_BODY)
+                t_close, c_pad = self._parse(contexts.TAG_CLOSE)
+        except BadRoute:
+            self._head = reset
+            self._pop()
+            self._write_text("<")  # not a tag after all: keep "<" as text
+        else:
+            self._pop()
+            self._write(tokens.TagOpenOpen(type=type_, showtag=False))
+            self._write_all(t_open)
+            if self_close:
+                self._write(tokens.TagCloseSelfclose(padding=o_pad))
+            else:
+                self._write(tokens.TagCloseOpen(padding=o_pad))
+                self._write_all(t_body)
+                self._write(tokens.TagOpenClose())
+                self._write_all(t_close)
+                self._write(tokens.TagCloseClose(padding=c_pad))
+
+    def _handle_attribute(self):  # stub: " " in TAG_OPEN, outside quotes
+        if not self._context & contexts.TAG_ATTR:
+            pass  ## check name is valid
+
+    def _handle_attribute_name(self):  # stub: "=" seen in TAG_ATTR_NAME
+        ## check if next character is a ", if so, set TAG_ATTR_BODY_QUOTED
+        pass
+
+    def _handle_quoted_attribute_close(self):
+        pass  ## stub: dispatched on '"' while in TAG_ATTR_BODY_QUOTED
+
+    def _handle_tag_close_open(self):  # stub: ">" that ends an opening tag
+        pass  ## .padding
+
+    def _handle_tag_selfclose(self):  # stub: "/>" that ends a self-closing tag
+        pass  ## .padding
+
+    def _handle_tag_open_close(self):  # stub: "</" that starts a closing tag
+        pass  ## TODO: make the "</" branch of _parse call this handler
+
+    def _handle_tag_close_close(self):  # stub: ">" seen in TAG_CLOSE
+        ## check that the closing name is the same as the opening name
+        pass  ## .padding
+
     def _parse(self, context=0):
         """Parse the wikicode string, using *context* for when to stop."""
         self._push(context)
@@ -432,7 +484,7 @@ class Tokenizer(object):
             if this is self.END:
                 fail = (contexts.TEMPLATE | contexts.ARGUMENT |
                         contexts.WIKILINK | contexts.HEADING |
-                        contexts.COMMENT)
+                        contexts.COMMENT | contexts.TAG)
                 if self._context & contexts.TEMPLATE_PARAM_KEY:
                     self._pop()
                 if self._context & fail:
@@ -484,6 +536,29 @@ class Tokenizer(object):
                     self._parse_comment()
                 else:
                     self._write_text(this)
+            elif this == "<" and next == "/" and self._context & contexts.TAG_BODY:
+                self._handle_tag_close_open()  # tested before the plain "<" case
+            elif this == "<" and not self._context & (contexts.TAG ^ contexts.TAG_BODY):
+                self._parse_tag()
+            elif this == " " and (self._context & contexts.TAG_OPEN and not
+                                  self._context & contexts.TAG_ATTR_BODY_QUOTED):
+                self._handle_attribute()
+            elif this == "=" and self._context & contexts.TAG_ATTR_NAME:
+                self._handle_attribute_name()
+            elif this == '"' and self._context & contexts.TAG_ATTR_BODY_QUOTED:
+                self._handle_quoted_attribute_close()
+            elif this == "\n" and (self._context & contexts.TAG_OPEN and not
+                                  self._context & contexts.TAG_ATTR_BODY_QUOTED):
+                self._fail_route()
+            elif this == ">" and (self._context & contexts.TAG_OPEN and not
+                                  self._context & contexts.TAG_ATTR_BODY_QUOTED):
+                return self._handle_tag_close_open()
+            elif this == "/" and next == ">" and (
+                            self._context & contexts.TAG_OPEN and not
+                            self._context & contexts.TAG_ATTR_BODY_QUOTED):
+                return self._handle_tag_selfclose()
+            elif this == ">" and self._context & contexts.TAG_CLOSE:
+                self._handle_tag_close_close()
             else:
                 self._write_text(this)
             self._head += 1